[llvm-branch-commits] [clang] [clang-tools-extra] [compiler-rt] [flang] [libc] [libcxx] [lldb] [llvm] [mlir] [DLCov] Origin-Tracking: Collect stack traces in DebugLoc (PR #143592)

Stephen Tozer via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Jun 11 01:51:52 PDT 2025


Martin Storsjö <martin at martin.st>,Florian Hahn
 <flo at fhahn.com>,Tomohiro Kashiwada <kikairoya at gmail.com>,Sumanth Gundapaneni
 <sumanth.gundapaneni at amd.com>,Amir Ayupov <aaupov at fb.com>,Peter Klausler
 <pklausler at nvidia.com>,Peter Klausler <pklausler at nvidia.com>,Peter Klausler
 <pklausler at nvidia.com>,Craig Topper <craig.topper at sifive.com>,S.
 VenkataKeerthy <31350914+svkeerthy at users.noreply.github.com>,Philip
 Reames <preames at rivosinc.com>,Jonas Devlieghere <jonas at devlieghere.com>,Florian
 Mayer <fmayer at google.com>,Zhen Wang
 <37195552+wangzpgi at users.noreply.github.com>,Amy Huang <akhuang at google.com>,Matt
 Arsenault <Matthew.Arsenault at amd.com>,Andy Kaylor <akaylor at nvidia.com>,George
 Burgess IV <george.burgess.iv at gmail.com>,Aiden Grossman
 <aidengrossman at google.com>,Ethan Luis McDonough
 <ethanluismcdonough at gmail.com>,Min-Yih Hsu <min.hsu at sifive.com>,Iris Shi
 <0.0 at owo.li>,Florian Mayer <fmayer at google.com>,Brox Chen <guochen2 at amd.com>,Peter
 Klausler <pklausler at nvidia.com>,quic_hchandel <quic_hchandel at quicinc.com>,
Valentin Clement,Jim Lin <jim at andestech.com>,Shafik
 Yaghmour <shafik.yaghmour at intel.com>,maflcko
 <6399679+maflcko at users.noreply.github.com>,Simon Pilgrim
 <llvm-dev at redking.me.uk>,Simon Pilgrim <llvm-dev at redking.me.uk>,
Björn Pettersson <bjorn.a.pettersson at ericsson.com>,Paschalis
 Mpeis <paschalis.mpeis at arm.com>,Baranov Victor <bar.victor.2002 at gmail.com>,Simon
 Pilgrim <llvm-dev at redking.me.uk>,Simon Pilgrim <llvm-dev at redking.me.uk>,Simon
 Pilgrim <llvm-dev at redking.me.uk>,David Green <david.green at arm.com>,Adrian
 Vogelsgesang <avogelsgesang at salesforce.com>,Iris Shi <0.0 at owo.li>,Kareem
 Ergawy <kareem.ergawy at amd.com>,CHANDRA GHALE <chandra.nitdgp at gmail.com>,Kareem
 Ergawy <kareem.ergawy at amd.com>,Jameson Nash <vtjnash at gmail.com>,Stephen
 Tozer <stephen.tozer at sony.com>,Stephen Tozer <stephen.tozer at sony.com>,Stephen
 Tozer <stephen.tozer at sony.com>
Message-ID:
In-Reply-To: <llvm.org/llvm/llvm-project/pull/143592 at github.com>


https://github.com/SLTozer updated https://github.com/llvm/llvm-project/pull/143592

From 44b928e0d578735572bcb264b70475e064b82022 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 10 Jun 2025 15:15:41 -0400
Subject: [PATCH 01/61] [Matrix] Propagate shape information through cast insts
 (#141869)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          |  56 ++++
 .../Transforms/LowerMatrixIntrinsics/unary.ll | 250 ++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index a719697806270..026f2fa96146a 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -33,8 +33,10 @@
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/MatrixBuilder.h"
@@ -249,6 +251,34 @@ static bool isUniformShape(Value *V) {
   if (I->isBinaryOp())
     return true;
 
+  if (auto *Cast = dyn_cast<CastInst>(V)) {
+    switch (Cast->getOpcode()) {
+    case llvm::Instruction::Trunc:
+    case llvm::Instruction::ZExt:
+    case llvm::Instruction::SExt:
+    case llvm::Instruction::FPToUI:
+    case llvm::Instruction::FPToSI:
+    case llvm::Instruction::UIToFP:
+    case llvm::Instruction::SIToFP:
+    case llvm::Instruction::FPTrunc:
+    case llvm::Instruction::FPExt:
+      return true;
+    case llvm::Instruction::AddrSpaceCast:
+    case CastInst::PtrToInt:
+    case CastInst::IntToPtr:
+      return false;
+    case CastInst::BitCast: {
+      if (auto *SrcVTy = dyn_cast<FixedVectorType>(Cast->getSrcTy()))
+        if (auto *DestVTy = dyn_cast<FixedVectorType>(Cast->getDestTy()))
+          return SrcVTy->getNumElements() == DestVTy->getNumElements();
+      return false;
+    }
+    case llvm::Instruction::CastOpsEnd:
+      llvm_unreachable("not an actual cast op");
+    }
+    llvm_unreachable("unhandled cast opcode");
+  }
+
   if (auto *II = dyn_cast<IntrinsicInst>(V))
     switch (II->getIntrinsicID()) {
     case Intrinsic::abs:
@@ -1112,6 +1142,8 @@ class LowerMatrixIntrinsics {
       Value *Op2;
       if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
         VisitBinaryOperator(BinOp, SI);
+      else if (auto *Cast = dyn_cast<CastInst>(Inst))
+        VisitCastInstruction(Cast, SI);
       else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
         VisitUnaryOperator(UnOp, SI);
       else if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Inst))
@@ -2262,6 +2294,30 @@ class LowerMatrixIntrinsics {
                      Builder);
   }
 
+  /// Lower cast instructions.
+  void VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) {
+    Value *Op = Inst->getOperand(0);
+
+    IRBuilder<> Builder(Inst);
+
+    MatrixTy Result;
+    MatrixTy M = getMatrix(Op, Shape, Builder);
+
+    Builder.setFastMathFlags(getFastMathFlags(Inst));
+
+    auto *OrigVTy = cast<VectorType>(Inst->getType());
+    auto *NewVTy = VectorType::get(OrigVTy->getElementType(),
+                                   ElementCount::getFixed(M.getStride()));
+
+    for (auto &Vector : M.vectors())
+      Result.addVector(Builder.CreateCast(Inst->getOpcode(), Vector, NewVTy));
+
+    finalizeLowering(Inst,
+                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                             Result.getNumVectors()),
+                     Builder);
+  }
+
   /// Helper to linearize a matrix expression tree into a string. Currently
   /// matrix expressions are linearized by starting at an expression leaf and
   /// linearizing bottom up.
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
new file mode 100644
index 0000000000000..a4bd516868bcd
--- /dev/null
+++ b/llvm/test/Transforms/LowerMatrixIntrinsics/unary.ll
@@ -0,0 +1,250 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define void @fneg_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fneg_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fneg <2 x float> [[COL_LOAD]]
+; CHECK-NEXT:    [[TMP2:%.*]] = fneg <2 x float> [[COL_LOAD1]]
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fneg <4 x float> %inv
+  call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @trunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @trunc_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i64> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <2 x i64> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = trunc <4 x i64> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @zext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @zext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i16>, ptr [[IN:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i16, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i16>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i16> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <2 x i16> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i16>, ptr %in
+  %op = zext <4 x i16> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @sext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i8>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i8, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i8>, ptr [[VEC_GEP]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sext <2 x i8> [[COL_LOAD]] to <2 x i16>
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <2 x i8> [[COL_LOAD1]] to <2 x i16>
+; CHECK-NEXT:    store <2 x i16> [[TMP1]], ptr [[OUT:%.*]], align 2
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i16, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i16> [[TMP2]], ptr [[VEC_GEP2]], align 2
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i8>, ptr %in
+  %op = sext <4 x i8> %inv to <4 x i16>
+  call void @llvm.matrix.column.major.store(<4 x i16> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fptoui_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptoui_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fptoui <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fptoui <4 x float> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fptosi_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptosi_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fptosi <2 x float> [[COL_LOAD]] to <2 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <2 x float> [[COL_LOAD1]] to <2 x i32>
+; CHECK-NEXT:    store <2 x i32> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i32> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fptosi <4 x float> %inv to <4 x i32>
+  call void @llvm.matrix.column.major.store(<4 x i32> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @uitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @uitofp_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = uitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = uitofp <4 x i64> %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @sitofp_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @sitofp_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i64, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = sitofp <2 x i64> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i64> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x i64>, ptr %in
+  %op = sitofp <4 x i64> %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fptrunc_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fptrunc_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD]] to <2 x float>
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc nnan <2 x double> [[COL_LOAD1]] to <2 x float>
+; CHECK-NEXT:    store <2 x float> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr float, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x float> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = fptrunc nnan <4 x double> %inv to <4 x float>
+  call void @llvm.matrix.column.major.store(<4 x float> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @fpext_2x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @fpext_2x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x float>, ptr [[IN:%.*]], align 16
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr float, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = fpext <2 x float> [[COL_LOAD]] to <2 x double>
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[COL_LOAD1]] to <2 x double>
+; CHECK-NEXT:    store <2 x double> [[TMP1]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[TMP2]], ptr [[VEC_GEP2]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x float>, ptr %in
+  %op = fpext <4 x float> %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_v4f64_to_v4i64(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_v4f64_to_v4i64(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x double> [[COL_LOAD]] to <2 x i64>
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <2 x double> [[COL_LOAD1]] to <2 x i64>
+; CHECK-NEXT:    store <2 x i64> [[TMP1]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i64, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = bitcast <4 x double> %inv to <4 x i64>
+  call void @llvm.matrix.column.major.store(<4 x i64> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_v4f64_to_v8i32(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_v4f64_to_v8i32(
+; CHECK-NEXT:    [[INV:%.*]] = load <4 x double>, ptr [[IN:%.*]], align 32
+; CHECK-NEXT:    [[OP:%.*]] = bitcast <4 x double> [[INV]] to <8 x i32>
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <8 x i32> [[OP]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <8 x i32> [[OP]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    store <4 x i32> [[SPLIT]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[OUT]], i64 4
+; CHECK-NEXT:    store <4 x i32> [[SPLIT1]], ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = load <4 x double>, ptr %in
+  %op = bitcast <4 x double> %inv to <8 x i32>
+  call void @llvm.matrix.column.major.store(<8 x i32> %op, ptr %out, i64 4, i1 false, i32 4, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_i256_to_v4i64(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_i256_to_v4i64(
+; CHECK-NEXT:    [[INV:%.*]] = load i256, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[OP:%.*]] = bitcast i256 [[INV]] to <4 x double>
+; CHECK-NEXT:    [[SPLIT:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[SPLIT1:%.*]] = shufflevector <4 x double> [[OP]], <4 x double> poison, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    store <2 x double> [[SPLIT]], ptr [[OUT:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[OUT]], i64 2
+; CHECK-NEXT:    store <2 x double> [[SPLIT1]], ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    ret void
+;
+  %inv = load i256, ptr %in
+  %op = bitcast i256 %inv to <4 x double>
+  call void @llvm.matrix.column.major.store(<4 x double> %op, ptr %out, i64 2, i1 false, i32 2, i32 2)
+  ret void
+}
+
+define void @bitcast_2x2_4i64_to_i256(ptr %in, ptr %out) {
+; CHECK-LABEL: @bitcast_2x2_4i64_to_i256(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load <2 x double>, ptr [[IN:%.*]], align 8
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr double, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> [[COL_LOAD1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[OP:%.*]] = bitcast <4 x double> [[TMP1]] to i256
+; CHECK-NEXT:    store i256 [[OP]], ptr [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+  %inv = call <4 x double> @llvm.matrix.column.major.load(ptr %in, i64 2, i1 false, i32 2, i32 2)
+  %op = bitcast <4 x double> %inv to i256
+  store i256 %op, ptr %out
+  ret void
+}
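
In short, the new isUniformShape handling treats a cast as shape-preserving
exactly when it produces one result element per operand element. A minimal
standalone sketch of that rule, assuming only LLVM's CastInst and
FixedVectorType APIs (an illustrative restatement, not the verbatim pass
code):

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Element-wise casts keep the matrix shape; pointer casts never carry one;
// a bitcast qualifies only between fixed vectors of equal element count.
static bool castPreservesShape(const CastInst *Cast) {
  switch (Cast->getOpcode()) {
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::UIToFP:
  case Instruction::SIToFP:
  case Instruction::FPTrunc:
  case Instruction::FPExt:
    return true; // one result element per operand element
  case Instruction::AddrSpaceCast:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
    return false; // no meaningful matrix shape to propagate
  case Instruction::BitCast:
    if (auto *SrcVTy = dyn_cast<FixedVectorType>(Cast->getSrcTy()))
      if (auto *DstVTy = dyn_cast<FixedVectorType>(Cast->getDestTy()))
        return SrcVTy->getNumElements() == DstVTy->getNumElements();
    return false; // e.g. <4 x double> -> <8 x i32> must be flattened
  default:
    return false;
  }
}

This matches the tests above: bitcast_2x2_v4f64_to_v4i64 keeps its shape and
lowers column-wise, while bitcast_2x2_v4f64_to_v8i32 falls back to
flattening the matrix first.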

From 616f83530f0215d077ad11d0a09faf62fa2daf5f Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Tue, 10 Jun 2025 15:22:42 -0400
Subject: [PATCH 02/61] [libc++] Move swap test to a .compile.pass.cpp
 (#143167)

---
 ...{swap_noexcept.pass.cpp => swap_noexcept.compile.pass.cpp} | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
 rename libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/{swap_noexcept.pass.cpp => swap_noexcept.compile.pass.cpp} (98%)

diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
similarity index 98%
rename from libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp
rename to libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
index b83ec3c3c1210..b50e67589471d 100644
--- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.pass.cpp
+++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp
@@ -52,7 +52,7 @@ struct some_alloc2 {
   typedef std::true_type is_always_equal;
 };
 
-int main(int, char**) {
+void f() {
   {
     typedef std::forward_list<MoveOnly> C;
     static_assert(noexcept(swap(std::declval<C&>(), std::declval<C&>())), "");
@@ -83,6 +83,4 @@ int main(int, char**) {
     static_assert(noexcept(swap(std::declval<C&>(), std::declval<C&>())), "");
   }
 #endif
-
-  return 0;
 }
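
For context on the rename: in the libcxx test suite a *.pass.cpp test is
compiled and executed, while a *.compile.pass.cpp test only has to compile,
so a test consisting entirely of static_asserts can drop main() and skip the
link-and-run step. A hypothetical compile-only test of the same shape
(illustrative, not part of this patch):

// my_trait.compile.pass.cpp -- compile-only, so no main() is needed.
#include <forward_list>
#include <utility>

void f() {
  using C = std::forward_list<int>;
  // The assertion is checked at compile time; nothing is executed.
  static_assert(noexcept(swap(std::declval<C&>(), std::declval<C&>())), "");
}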

From 77da1257b61c728a4d35dc518bfb758d0b1ddf26 Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 10 Jun 2025 12:31:35 -0700
Subject: [PATCH 03/61] [Matrix] Collect split/reshape stats, even without
 -debug-only=

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 026f2fa96146a..21683089a4693 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -57,10 +57,8 @@ using namespace PatternMatch;
 #define DEBUG_TYPE "lower-matrix-intrinsics"
 
 STATISTIC(FlattenedMatrices, "Number of matrix flattenings");
-#ifndef NDEBUG
 STATISTIC(ReshapedMatrices, "Number of matrix reshapes");
 STATISTIC(SplitMatrices, "Number of matrix splits");
-#endif
 
 static cl::opt<bool>
     FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
@@ -627,27 +625,29 @@ class LowerMatrixIntrinsics {
       SplitVecs.push_back(V);
     }
 
-    LLVM_DEBUG(if (Instruction *Inst = dyn_cast<Instruction>(MatrixVal)) {
+    if (Instruction *Inst = dyn_cast<Instruction>(MatrixVal)) {
       if (Found != Inst2ColumnMatrix.end()) {
         // FIXME: re: "at least": SplitVecs.size() doesn't count the shuffles
         // that embedInVector created.
-        dbgs() << "matrix reshape from " << Found->second.shape() << " to "
-               << SI << " using at least " << SplitVecs.size()
-               << " shuffles on behalf of:\n"
-               << *Inst << '\n';
+        LLVM_DEBUG(dbgs() << "matrix reshape from " << Found->second.shape()
+                          << " to " << SI << " using at least "
+                          << SplitVecs.size() << " shuffles on behalf of:\n"
+                          << *Inst << '\n');
         ReshapedMatrices++;
       } else if (!ShapeMap.contains(MatrixVal)) {
-        dbgs() << "splitting a " << SI << " matrix with " << SplitVecs.size()
-               << " shuffles beacuse we do not have a shape-aware lowering for "
-                  "its def:\n"
-               << *Inst << '\n';
+        LLVM_DEBUG(
+            dbgs()
+            << "splitting a " << SI << " matrix with " << SplitVecs.size()
+            << " shuffles beacuse we do not have a shape-aware lowering for "
+               "its def:\n"
+            << *Inst << '\n');
         SplitMatrices++;
       } else {
         // The ShapeMap has it, so it's a case where we're being lowered
         // before the def, and we expect that InstCombine will clean things up
         // afterward.
       }
-    });
+    }
 
     return {SplitVecs};
   }
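
The pattern this lands on is worth spelling out: STATISTIC counters are
compiled in whenever statistics are enabled (asserts builds, or
LLVM_FORCE_ENABLE_STATS), independently of -debug-only, so only the dbgs()
text needs to stay behind LLVM_DEBUG. A generic sketch with hypothetical
names (my-pass, NumReshapes, recordReshape), not the pass code:

#include "llvm/ADT/Statistic.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Debug.h"
using namespace llvm;

#define DEBUG_TYPE "my-pass"

STATISTIC(NumReshapes, "Number of reshapes");

static void recordReshape(const Instruction &I) {
  // The trace prints only under -debug-only=my-pass, but the counter
  // increments in any build where statistics are compiled in.
  LLVM_DEBUG(dbgs() << "reshape on behalf of:\n" << I << '\n');
  ++NumReshapes;
}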

From 8345d62478054d4ab97c6f28cfea6d1ecca837da Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Tue, 10 Jun 2025 15:36:37 -0400
Subject: [PATCH 04/61] [Matrix] Hoist finalizeLowering into caller. NFC
 (#143038)

---
 .../Scalar/LowerMatrixIntrinsics.cpp          | 144 ++++++++----------
 1 file changed, 66 insertions(+), 78 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 21683089a4693..eb81d2ea49673 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1134,26 +1134,28 @@ class LowerMatrixIntrinsics {
       if (FusedInsts.count(Inst))
         continue;
 
-      IRBuilder<> Builder(Inst);
-
       const ShapeInfo &SI = ShapeMap.at(Inst);
 
       Value *Op1;
       Value *Op2;
+      MatrixTy Result;
       if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
-        VisitBinaryOperator(BinOp, SI);
+        Result = VisitBinaryOperator(BinOp, SI);
       else if (auto *Cast = dyn_cast<CastInst>(Inst))
-        VisitCastInstruction(Cast, SI);
+        Result = VisitCastInstruction(Cast, SI);
       else if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
-        VisitUnaryOperator(UnOp, SI);
-      else if (IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Inst))
-        VisitIntrinsicInst(Intr, SI);
+        Result = VisitUnaryOperator(UnOp, SI);
+      else if (auto *Intr = dyn_cast<IntrinsicInst>(Inst))
+        Result = VisitIntrinsicInst(Intr, SI);
       else if (match(Inst, m_Load(m_Value(Op1))))
-        VisitLoad(cast<LoadInst>(Inst), SI, Op1, Builder);
+        Result = VisitLoad(cast<LoadInst>(Inst), SI, Op1);
       else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
-        VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2, Builder);
+        Result = VisitStore(cast<StoreInst>(Inst), SI, Op1, Op2);
       else
         continue;
+
+      IRBuilder<> Builder(Inst);
+      finalizeLowering(Inst, Result, Builder);
       Changed = true;
     }
 
@@ -1193,25 +1195,24 @@ class LowerMatrixIntrinsics {
   }
 
   /// Replace intrinsic calls.
-  void VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &Shape) {
-    switch (Inst->getIntrinsicID()) {
+  MatrixTy VisitIntrinsicInst(IntrinsicInst *Inst, const ShapeInfo &SI) {
+    assert(Inst->getCalledFunction() &&
+           Inst->getCalledFunction()->isIntrinsic());
+
+    switch (Inst->getCalledFunction()->getIntrinsicID()) {
     case Intrinsic::matrix_multiply:
-      LowerMultiply(Inst);
-      return;
+      return LowerMultiply(Inst);
     case Intrinsic::matrix_transpose:
-      LowerTranspose(Inst);
-      return;
+      return LowerTranspose(Inst);
     case Intrinsic::matrix_column_major_load:
-      LowerColumnMajorLoad(Inst);
-      return;
+      return LowerColumnMajorLoad(Inst);
     case Intrinsic::matrix_column_major_store:
-      LowerColumnMajorStore(Inst);
-      return;
+      return LowerColumnMajorStore(Inst);
     case Intrinsic::abs:
     case Intrinsic::fabs: {
       IRBuilder<> Builder(Inst);
       MatrixTy Result;
-      MatrixTy M = getMatrix(Inst->getOperand(0), Shape, Builder);
+      MatrixTy M = getMatrix(Inst->getOperand(0), SI, Builder);
       Builder.setFastMathFlags(getFastMathFlags(Inst));
 
       for (auto &Vector : M.vectors()) {
@@ -1229,16 +1230,14 @@ class LowerMatrixIntrinsics {
         }
       }
 
-      finalizeLowering(Inst,
-                       Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
-                                               Result.getNumVectors()),
-                       Builder);
-      return;
+      return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                     Result.getNumVectors());
     }
     default:
-      llvm_unreachable(
-          "only intrinsics supporting shape info should be seen here");
+      break;
     }
+    llvm_unreachable(
+        "only intrinsics supporting shape info should be seen here");
   }
 
   /// Compute the alignment for a column/row \p Idx with \p Stride between them.
@@ -1304,26 +1303,24 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lower a load instruction with shape information.
-  void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
-                 bool IsVolatile, ShapeInfo Shape) {
+  MatrixTy LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align,
+                     Value *Stride, bool IsVolatile, ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
-    finalizeLowering(Inst,
-                     loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
-                                Shape, Builder),
-                     Builder);
+    return loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile, Shape,
+                      Builder);
   }
 
   /// Lowers llvm.matrix.column.major.load.
   ///
   /// The intrinsic loads a matrix from memory using a stride between columns.
-  void LowerColumnMajorLoad(CallInst *Inst) {
+  MatrixTy LowerColumnMajorLoad(CallInst *Inst) {
     assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
            "Intrinsic only supports column-major layout!");
     Value *Ptr = Inst->getArgOperand(0);
     Value *Stride = Inst->getArgOperand(1);
-    LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
-              cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
-              {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+    return LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
+                     cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
+                     {Inst->getArgOperand(3), Inst->getArgOperand(4)});
   }
 
   /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
@@ -1366,28 +1363,27 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lower a store instruction with shape information.
-  void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
-                  Value *Stride, bool IsVolatile, ShapeInfo Shape) {
+  MatrixTy LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr,
+                      MaybeAlign A, Value *Stride, bool IsVolatile,
+                      ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
     auto StoreVal = getMatrix(Matrix, Shape, Builder);
-    finalizeLowering(Inst,
-                     storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
-                                 IsVolatile, Builder),
-                     Builder);
+    return storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride, IsVolatile,
+                       Builder);
   }
 
   /// Lowers llvm.matrix.column.major.store.
   ///
   /// The intrinsic stores a matrix back to memory using a stride between columns.
-  void LowerColumnMajorStore(CallInst *Inst) {
+  MatrixTy LowerColumnMajorStore(CallInst *Inst) {
     assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
            "Intrinsic only supports column-major layout!");
     Value *Matrix = Inst->getArgOperand(0);
     Value *Ptr = Inst->getArgOperand(1);
     Value *Stride = Inst->getArgOperand(2);
-    LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
-               cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
-               {Inst->getArgOperand(4), Inst->getArgOperand(5)});
+    return LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
+                      cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
+                      {Inst->getArgOperand(4), Inst->getArgOperand(5)});
   }
 
   // Set elements I..I+NumElts-1 to Block
@@ -2162,7 +2158,7 @@ class LowerMatrixIntrinsics {
   }
 
   /// Lowers llvm.matrix.multiply.
-  void LowerMultiply(CallInst *MatMul) {
+  MatrixTy LowerMultiply(CallInst *MatMul) {
     IRBuilder<> Builder(MatMul);
     auto *EltType = cast<FixedVectorType>(MatMul->getType())->getElementType();
     ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
@@ -2184,11 +2180,11 @@ class LowerMatrixIntrinsics {
 
     emitMatrixMultiply(Result, Lhs, Rhs, Builder, false, false,
                        getFastMathFlags(MatMul));
-    finalizeLowering(MatMul, Result, Builder);
+    return Result;
   }
 
   /// Lowers llvm.matrix.transpose.
-  void LowerTranspose(CallInst *Inst) {
+  MatrixTy LowerTranspose(CallInst *Inst) {
     MatrixTy Result;
     IRBuilder<> Builder(Inst);
     Value *InputVal = Inst->getArgOperand(0);
@@ -2218,28 +2214,26 @@ class LowerMatrixIntrinsics {
     // TODO: Improve estimate of operations needed for transposes. Currently we
     // just count the insertelement/extractelement instructions, but do not
     // account for later simplifications/combines.
-    finalizeLowering(
-        Inst,
-        Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns)
-            .addNumExposedTransposes(1),
-        Builder);
+    return Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns)
+        .addNumExposedTransposes(1);
   }
 
   /// Lower load instructions.
-  void VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr,
-                 IRBuilder<> &Builder) {
-    LowerLoad(Inst, Ptr, Inst->getAlign(), Builder.getInt64(SI.getStride()),
-              Inst->isVolatile(), SI);
+  MatrixTy VisitLoad(LoadInst *Inst, const ShapeInfo &SI, Value *Ptr) {
+    IRBuilder<> Builder(Inst);
+    return LowerLoad(Inst, Ptr, Inst->getAlign(),
+                     Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
   }
 
-  void VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
-                  Value *Ptr, IRBuilder<> &Builder) {
-    LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
-               Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
+  MatrixTy VisitStore(StoreInst *Inst, const ShapeInfo &SI, Value *StoredVal,
+                      Value *Ptr) {
+    IRBuilder<> Builder(Inst);
+    return LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
+                      Builder.getInt64(SI.getStride()), Inst->isVolatile(), SI);
   }
 
   /// Lower binary operators.
-  void VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitBinaryOperator(BinaryOperator *Inst, const ShapeInfo &SI) {
     Value *Lhs = Inst->getOperand(0);
     Value *Rhs = Inst->getOperand(1);
 
@@ -2258,14 +2252,12 @@ class LowerMatrixIntrinsics {
       Result.addVector(Builder.CreateBinOp(Inst->getOpcode(), A.getVector(I),
                                            B.getVector(I)));
 
-    finalizeLowering(Inst,
-                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
-                                             Result.getNumVectors()),
-                     Builder);
+    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                   Result.getNumVectors());
   }
 
   /// Lower unary operators.
-  void VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI) {
+  MatrixTy VisitUnaryOperator(UnaryOperator *Inst, const ShapeInfo &SI) {
     Value *Op = Inst->getOperand(0);
 
     IRBuilder<> Builder(Inst);
@@ -2288,14 +2280,12 @@ class LowerMatrixIntrinsics {
     for (unsigned I = 0; I < SI.getNumVectors(); ++I)
       Result.addVector(BuildVectorOp(M.getVector(I)));
 
-    finalizeLowering(Inst,
-                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
-                                             Result.getNumVectors()),
-                     Builder);
+    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                   Result.getNumVectors());
   }
 
   /// Lower cast instructions.
-  void VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) {
+  MatrixTy VisitCastInstruction(CastInst *Inst, const ShapeInfo &Shape) {
     Value *Op = Inst->getOperand(0);
 
     IRBuilder<> Builder(Inst);
@@ -2312,10 +2302,8 @@ class LowerMatrixIntrinsics {
     for (auto &Vector : M.vectors())
       Result.addVector(Builder.CreateCast(Inst->getOpcode(), Vector, NewVTy));
 
-    finalizeLowering(Inst,
-                     Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
-                                             Result.getNumVectors()),
-                     Builder);
+    return Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+                                   Result.getNumVectors());
   }
 
   /// Helper to linearize a matrix expression tree into a string. Currently
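
The shape of this NFC is the classic "hoist the shared epilogue" refactor:
every Visit*/Lower* used to end by calling finalizeLowering itself, and now
each one returns its MatrixTy so the dispatch loop finalizes exactly once. A
generic sketch with hypothetical names (not the pass code):

struct MatrixResult { unsigned NumComputeOps = 0; }; // hypothetical

MatrixResult visitBinary();           // visitors now return their result
MatrixResult visitCast();             // instead of finalizing it themselves
void finalize(const MatrixResult &);  // the formerly duplicated epilogue

MatrixResult dispatch(bool IsCast) {
  MatrixResult R = IsCast ? visitCast() : visitBinary();
  finalize(R); // runs once, in one place, for every visitor
  return R;
}

Besides removing duplication, this keeps the builder setup and the
bookkeeping in finalizeLowering from drifting apart across visitors.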

From e48731bc03419f133a85b50571a368a889c6dab2 Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2 at amd.com>
Date: Tue, 10 Jun 2025 15:36:44 -0400
Subject: [PATCH 05/61] [AMDGPU][True16][CodeGen] v_s_xxx_f16 t16 mode handling
 in movetoVALU process (#141152)

Add op_sel for v_s_xxx_f16 when moving them to VALU

Update a few related codegen tests for gfx12 in true16 mode
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  23 +
 llvm/test/CodeGen/AMDGPU/frem.ll              | 893 ++++++++++++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll     | 101 +-
 .../CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll     | 101 +-
 llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll     |  88 +-
 ...-to-valu-pseudo-scalar-trans-f16-fake16.ll | 113 +++
 ...-to-valu-pseudo-scalar-trans-f16-true16.ll | 123 +++
 .../move-to-valu-pseudo-scalar-trans.ll       | 112 +--
 8 files changed, 1425 insertions(+), 129 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 805f8e9acdca7..2ebf8b99e9d7b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7734,6 +7734,29 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     Inst.eraseFromParent();
     return;
   }
+  case AMDGPU::V_S_EXP_F16_e64:
+  case AMDGPU::V_S_LOG_F16_e64:
+  case AMDGPU::V_S_RCP_F16_e64:
+  case AMDGPU::V_S_RSQ_F16_e64:
+  case AMDGPU::V_S_SQRT_F16_e64: {
+    const DebugLoc &DL = Inst.getDebugLoc();
+    Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
+                                                    ? &AMDGPU::VGPR_16RegClass
+                                                    : &AMDGPU::VGPR_32RegClass);
+    auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+                        .addImm(0) // src0_modifiers
+                        .add(Inst.getOperand(2))
+                        .addImm(0)  // clamp
+                        .addImm(0); // omod
+    if (ST.useRealTrue16Insts())
+      NewInstr.addImm(0); // opsel0
+    MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+    legalizeOperandsVALUt16(*NewInstr, MRI);
+    legalizeOperands(*NewInstr, MDT);
+    addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+    Inst.eraseFromParent();
+    return;
+  }
   }
 
   if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
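
The key true16 wrinkle, condensed from the hunk above (same calls as shown
there, with comments added for exposition): on targets with real true16
instructions the VALU result lives in a 16-bit VGPR and the VOP3 encoding
takes a trailing op_sel immediate, so the SALU-to-VALU rewrite must adjust
both the register class and the operand list.

// true16: 16-bit destination class plus an op_sel operand;
// fake16: a plain 32-bit VGPR and no op_sel.
Register NewDst = MRI.createVirtualRegister(ST.useRealTrue16Insts()
                                                ? &AMDGPU::VGPR_16RegClass
                                                : &AMDGPU::VGPR_32RegClass);
auto NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
                    .addImm(0)               // src0_modifiers
                    .add(Inst.getOperand(2)) // src0
                    .addImm(0)               // clamp
                    .addImm(0);              // omod
if (ST.useRealTrue16Insts())
  NewInstr.addImm(0);                        // op_sel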
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 7a1351174733b..d3432daedadf8 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -8,6 +8,8 @@
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-TRUE16 %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1150,GFX1150-FAKE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-TRUE16 %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false -enable-misched=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX1200,GFX1200-FAKE16 %s
 
 define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1,
 ; SI-LABEL: frem_f16:
@@ -331,6 +333,82 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX1150-FAKE16-NEXT:    s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_f16:
+; GFX1200-TRUE16:       ; %bb.0:
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v1, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v0.l
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v1.l
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v0.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v3, v7, v4
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v5, v3, v6 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v3
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.h, v0.h, v1.l, v0.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v3.l, v0.h
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT:    s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_f16:
+; GFX1200-FAKE16:       ; %bb.0:
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v3, v1
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v2
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v3, v3, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v3, v5, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v5, -v2, v3, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v3, v3, v2, v1
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -537,6 +615,48 @@ define amdgpu_kernel void @fast_frem_f16(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX1150-FAKE16-NEXT:    s_endpgm
+;
+; GFX1200-TRUE16-LABEL: fast_frem_f16:
+; GFX1200-TRUE16:       ; %bb.0:
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT:    s_endpgm
+;
+; GFX1200-FAKE16-LABEL: fast_frem_f16:
+; GFX1200-FAKE16:       ; %bb.0:
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -743,6 +863,48 @@ define amdgpu_kernel void @unsafe_frem_f16(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
 ; GFX1150-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
 ; GFX1150-FAKE16-NEXT:    s_endpgm
+;
+; GFX1200-TRUE16-LABEL: unsafe_frem_f16:
+; GFX1200-TRUE16:       ; %bb.0:
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    global_load_d16_b16 v0, v2, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_d16_hi_b16 v0, v2, s[4:5] offset:8
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT:    v_rcp_f16_e32 v1.l, v0.h
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f16_e32 v1.l, v0.l, v1.l
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v1.l, v1.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v1, 0x8000, v1
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v0.l, v1.l, v0.h
+; GFX1200-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
+; GFX1200-TRUE16-NEXT:    s_endpgm
+;
+; GFX1200-FAKE16-LABEL: unsafe_frem_f16:
+; GFX1200-FAKE16:       ; %bb.0:
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    global_load_u16 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT:    global_load_u16 v2, v0, s[4:5] offset:8
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT:    v_rcp_f16_e32 v3, v2
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f16_e32 v3, v1, v3
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v3, v3
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v3, 0x8000, v3
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v1, v3, v2
+; GFX1200-FAKE16-NEXT:    global_store_b16 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr half, ptr addrspace(1) %in2, i32 4
    %r0 = load half, ptr addrspace(1) %in1, align 4
@@ -985,6 +1147,42 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: frem_f32:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_div_scale_f32 v4, null, v2, v2, v1
+; GFX1200-NEXT:    v_div_scale_f32 v3, vcc_lo, v1, v2, v1
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v5, v4
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v6, -v4, v5, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v5, v6, v5
+; GFX1200-NEXT:    v_mul_f32_e32 v6, v3, v5
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v7, -v4, v6, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v6, v7, v5
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v3, -v4, v6, v3
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    v_div_fmas_f32 v3, v3, v5, v6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v3, v3, v2, v1
+; GFX1200-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1142,6 +1340,27 @@ define amdgpu_kernel void @fast_frem_f32(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f32:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1299,6 +1518,27 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    v_fmac_f32_e32 v1, v3, v2
 ; GFX1150-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f32:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1200-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX1200-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v1, v3, v2
+; GFX1200-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1200-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %gep2 = getelementptr float, ptr addrspace(1) %in2, i32 4
    %r0 = load float, ptr addrspace(1) %in1, align 4
@@ -1551,6 +1791,39 @@ define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v12, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: frem_f64:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v12, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b64 v[0:1], v12, s[2:3]
+; GFX1200-NEXT:    global_load_b64 v[2:3], v12, s[4:5]
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_div_scale_f64 v[4:5], null, v[2:3], v[2:3], v[0:1]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f64_e32 v[6:7], v[4:5]
+; GFX1200-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT:    v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7]
+; GFX1200-NEXT:    v_div_scale_f64 v[8:9], vcc_lo, v[0:1], v[2:3], v[0:1]
+; GFX1200-NEXT:    v_mul_f64_e32 v[10:11], v[8:9], v[6:7]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9]
+; GFX1200-NEXT:    v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT:    global_store_b64 v12, v[0:1], s[0:1]
+; GFX1200-NEXT:    s_endpgm
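+; The f64 remainder uses the full-precision division sequence: v_div_scale_f64
+; and v_rcp_f64 start a Newton-Raphson refinement, v_div_fmas_f64 and
+; v_div_fixup_f64 produce the quotient, and v_trunc_f64 plus a final v_fma_f64
+; form x - trunc(x/y) * y.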
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1772,6 +2045,35 @@ define amdgpu_kernel void @fast_frem_f64(ptr addrspace(1) %out, ptr addrspace(1)
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: fast_frem_f64:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
+; GFX1200-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f64_e32 v[6:7], v[0:1], v[4:5]
+; GFX1200-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1200-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX1200-NEXT:    s_endpgm
                       ptr addrspace(1) %in2) #0 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -1993,6 +2295,35 @@ define amdgpu_kernel void @unsafe_frem_f64(ptr addrspace(1) %out, ptr addrspace(
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
 ; GFX1150-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: unsafe_frem_f64:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v10, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b64 v[0:1], v10, s[2:3]
+; GFX1200-NEXT:    global_load_b64 v[2:3], v10, s[4:5]
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_rcp_f64_e32 v[4:5], v[2:3]
+; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
+; GFX1200-NEXT:    v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f64_e32 v[6:7], v[0:1], v[4:5]
+; GFX1200-NEXT:    v_fma_f64 v[8:9], -v[2:3], v[6:7], v[0:1]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[4:5], v[8:9], v[4:5], v[6:7]
+; GFX1200-NEXT:    v_trunc_f64_e32 v[4:5], v[4:5]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1]
+; GFX1200-NEXT:    global_store_b64 v10, v[0:1], s[0:1]
+; GFX1200-NEXT:    s_endpgm
                              ptr addrspace(1) %in2) #1 {
    %r0 = load double, ptr addrspace(1) %in1, align 8
    %r1 = load double, ptr addrspace(1) %in2, align 8
@@ -2514,6 +2845,131 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
 ; GFX1150-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX1150-FAKE16-NEXT:    s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_v2f16:
+; GFX1200-TRUE16:       ; %bb.0:
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    global_load_b32 v2, v1, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_b32 v3, v1, s[4:5] offset:16
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.h
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v4, v3.h
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v5, -v3, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX1200-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v4, 0xff800000, v4
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v4, v0
+; GFX1200-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v5.l, v4.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v4.l, v0.l, v5.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v5, v3.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v5
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v6, v5
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v6, -v3, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v5, v0
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v2.l
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v2.l, v0.l, v3.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_pack_b32_f16 v0, v2.l, v4.l
+; GFX1200-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX1200-TRUE16-NEXT:    s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_v2f16:
+; GFX1200-FAKE16:       ; %bb.0:
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1200-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    global_load_b32 v1, v0, s[2:3]
+; GFX1200-FAKE16-NEXT:    global_load_b32 v2, v0, s[4:5] offset:16
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v4, v4, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v4, v7, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v7, -v2, v4, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v4, v6, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v4, v4, v5, v3
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v3, v4, v5
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v4, v1
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v4, v4, v5
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v4, v6, v5
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v6, -v2, v4, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v5, v6, v5
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v5, 0xff800000, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v4, v5, v4
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v4, v4, v2, v1
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v4, v4
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v4, 0x8000, v4
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v1, v4, v2
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v3
+; GFX1200-FAKE16-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1200-FAKE16-NEXT:    s_endpgm
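+; Each f16 lane is widened to f32 for the rcp-based refinement (v_fma_mix_f32
+; reads the f16 operands directly via op_sel), then v_div_fixup_f16 and
+; v_trunc_f16 finish in half precision before v_pack_b32_f16 repacks the
+; <2 x half> result.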
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x half>, ptr addrspace(1) %in1, align 8
@@ -3398,6 +3854,227 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
 ; GFX1150-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX1150-FAKE16-NEXT:    s_endpgm
+;
+; GFX1200-TRUE16-LABEL: frem_v4f16:
+; GFX1200-TRUE16:       ; %bb.0:
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
+; GFX1200-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-TRUE16-NEXT:    s_clause 0x1
+; GFX1200-TRUE16-NEXT:    global_load_b64 v[1:2], v5, s[2:3]
+; GFX1200-TRUE16-NEXT:    global_load_b64 v[3:4], v5, s[4:5] offset:32
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.h
+; GFX1200-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v3.h
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v3.l
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v3, v0, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1200-TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX1200-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v7.l, v6.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v6.l, v0.l, v7.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v7, v3.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v1.l
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v7
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v10, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v10, v7
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v8, -v8, v0, v9 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v7, v0
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v3.l, v1.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v3, v4.h
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.l, v6.l
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v3, v3
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.h
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v6, v3
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v6, -v4, v0, v2 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v3, v6, v3
+; GFX1200-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v3, 0xff800000, v3
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v3, v0
+; GFX1200-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v6.l, v3.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-TRUE16-NEXT:    v_fmac_f16_e32 v3.l, v0.l, v6.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v6, v4.l
+; GFX1200-TRUE16-NEXT:    v_cvt_f32_f16_e32 v0, v2.l
+; GFX1200-TRUE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_fmac_f32_e32 v0, v7, v6
+; GFX1200-TRUE16-NEXT:    v_fma_mix_f32 v7, -v4, v0, v2 op_sel_hi:[1,0,1]
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1200-TRUE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_add_f32_e32 v0, v6, v0
+; GFX1200-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_div_fixup_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1200-TRUE16-NEXT:    v_trunc_f16_e32 v0.l, v0.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x8000, v0
+; GFX1200-TRUE16-NEXT:    v_fma_f16 v0.l, v0.l, v4.l, v2.l
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v3.l
+; GFX1200-TRUE16-NEXT:    global_store_b64 v5, v[1:2], s[0:1]
+; GFX1200-TRUE16-NEXT:    s_endpgm
+;
+; GFX1200-FAKE16-LABEL: frem_v4f16:
+; GFX1200-FAKE16:       ; %bb.0:
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-FAKE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-FAKE16-NEXT:    s_clause 0x1
+; GFX1200-FAKE16-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX1200-FAKE16-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-FAKE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v0
+; GFX1200-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-FAKE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v2
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v8, v7
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v8
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v6, v9, v8
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v9, -v2, v6, v0 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v8, v9, v8
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v8, 0xff800000, v8
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v6, v8, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v6, v6, v7, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v5, v6, v7
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v0
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v6, v6, v7
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v6, v8, v7
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v8, -v2, v6, v0 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v6, v6, v2, v0
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v6, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v6, 0x8000, v6
+; GFX1200-FAKE16-NEXT:    v_fma_f16 v0, v6, v2, v0
+; GFX1200-FAKE16-NEXT:    v_lshrrev_b32_e32 v6, 16, v3
+; GFX1200-FAKE16-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v5
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v7, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v2
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v7, v7
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v7
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v5, v8, v7
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v8, -v3, v5, v1 op_sel:[1,0,1] op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v7, v8, v7
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v7, 0xff800000, v7
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v5, v7, v5
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v6, v2
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v2, v5, v6
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX1200-FAKE16-NEXT:    v_cvt_f32_f16_e32 v5, v1
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_rcp_f32_e32 v6, v6
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v5, v5, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_fmac_f32_e32 v5, v7, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fma_mix_f32 v7, -v3, v5, v1 op_sel_hi:[1,0,1]
+; GFX1200-FAKE16-NEXT:    v_mul_f32_e32 v6, v7, v6
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v6, 0xff800000, v6
+; GFX1200-FAKE16-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX1200-FAKE16-NEXT:    v_div_fixup_f16 v5, v5, v3, v1
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_trunc_f16_e32 v5, v5
+; GFX1200-FAKE16-NEXT:    v_xor_b32_e32 v5, 0x8000, v5
+; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-FAKE16-NEXT:    v_fmac_f16_e32 v1, v5, v3
+; GFX1200-FAKE16-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX1200-FAKE16-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1200-FAKE16-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x half>, ptr addrspace(1) %in2, i32 4
    %r0 = load <4 x half>, ptr addrspace(1) %in1, align 16
@@ -3758,6 +4435,65 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_fmac_f32_e32 v0, v3, v2
 ; GFX1150-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: frem_v2f32:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b64 v[0:1], v4, s[2:3]
+; GFX1200-NEXT:    global_load_b64 v[2:3], v4, s[4:5] offset:32
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_div_scale_f32 v6, null, v3, v3, v1
+; GFX1200-NEXT:    v_div_scale_f32 v5, vcc_lo, v1, v3, v1
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX1200-NEXT:    v_mul_f32_e32 v8, v5, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v9, -v6, v8, v5
+; GFX1200-NEXT:    v_fmac_f32_e32 v8, v9, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v5, -v6, v8, v5
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    v_div_fmas_f32 v5, v5, v7, v8
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v5, v5, v3, v1
+; GFX1200-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1200-NEXT:    v_fma_f32 v1, v5, v3, v1
+; GFX1200-NEXT:    v_div_scale_f32 v5, null, v2, v2, v0
+; GFX1200-NEXT:    v_div_scale_f32 v3, vcc_lo, v0, v2, v0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v6, v7, v6
+; GFX1200-NEXT:    v_mul_f32_e32 v7, v3, v6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v8, -v5, v7, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v3, v3, v2, v0
+; GFX1200-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v0, v3, v2
+; GFX1200-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
+; GFX1200-NEXT:    s_endpgm
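+; The IEEE f32 path runs the full div_scale/div_fmas sequence per lane; the
+; s_denorm_mode 15 / s_denorm_mode 12 pairs switch f32 denormal handling on
+; around each refinement core and then restore the kernel's mode.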
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x float>, ptr addrspace(1) %in1, align 8
@@ -4354,6 +5090,111 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_fmac_f32_e32 v0, v5, v4
 ; GFX1150-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: frem_v4f32:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v8, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
+; GFX1200-NEXT:    global_load_b128 v[4:7], v8, s[4:5] offset:64
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_div_scale_f32 v10, null, v7, v7, v3
+; GFX1200-NEXT:    v_div_scale_f32 v9, vcc_lo, v3, v7, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v11, v12, v11
+; GFX1200-NEXT:    v_mul_f32_e32 v12, v9, v11
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v13, -v10, v12, v9
+; GFX1200-NEXT:    v_fmac_f32_e32 v12, v13, v11
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v9, -v10, v12, v9
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    v_div_fmas_f32 v9, v9, v11, v12
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v9, v9, v7, v3
+; GFX1200-NEXT:    v_trunc_f32_e32 v9, v9
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
+; GFX1200-NEXT:    v_fma_f32 v3, v9, v7, v3
+; GFX1200-NEXT:    v_div_scale_f32 v9, null, v6, v6, v2
+; GFX1200-NEXT:    v_div_scale_f32 v7, vcc_lo, v2, v6, v2
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX1200-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX1200-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v7, v7, v6, v2
+; GFX1200-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1200-NEXT:    v_fma_f32 v2, v7, v6, v2
+; GFX1200-NEXT:    v_div_scale_f32 v7, null, v5, v5, v1
+; GFX1200-NEXT:    v_div_scale_f32 v6, vcc_lo, v1, v5, v1
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v9, v7
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v10, -v7, v9, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v9, v10, v9
+; GFX1200-NEXT:    v_mul_f32_e32 v10, v6, v9
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v11, -v7, v10, v6
+; GFX1200-NEXT:    v_fmac_f32_e32 v10, v11, v9
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v6, -v7, v10, v6
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v6, v6, v5, v1
+; GFX1200-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1200-NEXT:    v_fma_f32 v1, v6, v5, v1
+; GFX1200-NEXT:    v_div_scale_f32 v6, null, v4, v4, v0
+; GFX1200-NEXT:    v_div_scale_f32 v5, vcc_lo, v0, v4, v0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    v_fma_f32 v9, -v6, v7, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v7, v9, v7
+; GFX1200-NEXT:    v_mul_f32_e32 v9, v5, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v10, -v6, v9, v5
+; GFX1200-NEXT:    v_fmac_f32_e32 v9, v10, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v5, -v6, v9, v5
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    v_div_fmas_f32 v5, v5, v7, v9
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f32 v5, v5, v4, v0
+; GFX1200-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
+; GFX1200-NEXT:    v_fmac_f32_e32 v0, v5, v4
+; GFX1200-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX1200-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
    %r0 = load <4 x float>, ptr addrspace(1) %in1, align 16
@@ -4734,6 +5575,58 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
 ; GFX1150-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: frem_v2f64:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v16, 0
+; GFX1200-NEXT:    s_wait_kmcnt 0x0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b128 v[0:3], v16, s[2:3]
+; GFX1200-NEXT:    global_load_b128 v[4:7], v16, s[4:5] offset:64
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    v_div_scale_f64 v[8:9], null, v[6:7], v[6:7], v[2:3]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX1200-NEXT:    v_rcp_f64_e32 v[10:11], v[8:9]
+; GFX1200-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1200-NEXT:    v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11]
+; GFX1200-NEXT:    v_div_scale_f64 v[12:13], vcc_lo, v[2:3], v[6:7], v[2:3]
+; GFX1200-NEXT:    v_mul_f64_e32 v[14:15], v[12:13], v[10:11]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[8:9], -v[8:9], v[14:15], v[12:13]
+; GFX1200-NEXT:    v_div_fmas_f64 v[8:9], v[8:9], v[10:11], v[14:15]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3]
+; GFX1200-NEXT:    v_trunc_f64_e32 v[8:9], v[8:9]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3]
+; GFX1200-NEXT:    v_div_scale_f64 v[6:7], null, v[4:5], v[4:5], v[0:1]
+; GFX1200-NEXT:    v_rcp_f64_e32 v[8:9], v[6:7]
+; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1200-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0
+; GFX1200-NEXT:    v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9]
+; GFX1200-NEXT:    v_div_scale_f64 v[10:11], vcc_lo, v[0:1], v[4:5], v[0:1]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f64_e32 v[12:13], v[10:11], v[8:9]
+; GFX1200-NEXT:    v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11]
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13]
+; GFX1200-NEXT:    v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1]
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_trunc_f64_e32 v[6:7], v[6:7]
+; GFX1200-NEXT:    v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1]
+; GFX1200-NEXT:    global_store_b128 v16, v[0:3], s[0:1]
+; GFX1200-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x double>, ptr addrspace(1) %in2, i32 4
    %r0 = load <2 x double>, ptr addrspace(1) %in1, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index c6ea12dd61651..a2be749bb3c20 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -1,17 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
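+; gfx1200 is covered in both 16-bit register modes: +real-true16 operates on
+; register halves (v0.l/v0.h), while -real-true16 keeps f16 values in full
+; 32-bit VGPRs.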
 
 declare half @llvm.amdgcn.rcp.f16(half %a)
 
-; GCN-LABEL: {{^}}rcp_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; VI:  v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
-; GFX11-TRUE16:  v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
-; GFX11-FAKE16:  v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @rcp_f16(
+; GCN-LABEL: rcp_f16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s10, s6
+; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s2
+; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_rcp_f16_e32 v0, v0
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: rcp_f16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: rcp_f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: rcp_f16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_rcp_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: rcp_f16:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_rcp_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -20,3 +105,5 @@ entry:
   store half %r.val, ptr addrspace(1) %r
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 0924e9a5c2314..bf371478cc1dc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -1,17 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-FAKE16 %s
 
 declare half @llvm.amdgcn.rsq.f16(half %a)
 
-; GCN-LABEL: {{^}}rsq_f16
-; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
-; VI:  v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
-; GFX11-TRUE16:  v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l
-; GFX11-FAKE16:  v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]]
-; GCN: buffer_store_short v[[R_F16]]
-; GCN: s_endpgm
 define amdgpu_kernel void @rsq_f16(
+; GCN-LABEL: rsq_f16:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s10, s6
+; GCN-NEXT:    s_mov_b32 s11, s7
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s8, s2
+; GCN-NEXT:    s_mov_b32 s9, s3
+; GCN-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_rsq_f16_e32 v0, v0
+; GCN-NEXT:    buffer_store_short v0, off, s[4:7], 0
+; GCN-NEXT:    s_endpgm
+;
+; GFX11-TRUE16-LABEL: rsq_f16:
+; GFX11-TRUE16:       ; %bb.0: ; %entry
+; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-TRUE16-NEXT:    s_mov_b32 s10, s6
+; GFX11-TRUE16-NEXT:    s_mov_b32 s11, s7
+; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-TRUE16-NEXT:    s_mov_b32 s8, s2
+; GFX11-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX11-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
+; GFX11-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-TRUE16-NEXT:    s_endpgm
+;
+; GFX11-FAKE16-LABEL: rsq_f16:
+; GFX11-FAKE16:       ; %bb.0: ; %entry
+; GFX11-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX11-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX11-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX11-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX11-FAKE16-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX11-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX11-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX11-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX11-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], 0
+; GFX11-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-FAKE16-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
+; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: rsq_f16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_rsq_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: rsq_f16:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_rsq_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -20,3 +105,5 @@ entry:
   store half %r.val, ptr addrspace(1) %r
   ret void
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; VI: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 2996a4e22a3ef..8604feb3b492f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -1,8 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-TRUE16 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-FAKE16 %s
 
 declare half @llvm.sqrt.f16(half %a)
 declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -81,6 +83,42 @@ define amdgpu_kernel void @sqrt_f16(
 ; GFX11-FAKE16-NEXT:    v_sqrt_f16_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: sqrt_f16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: sqrt_f16:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -189,6 +227,50 @@ define amdgpu_kernel void @sqrt_v2f16(
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], 0
 ; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX12-TRUE16-LABEL: sqrt_v2f16:
+; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-TRUE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-TRUE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l
+; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.h, v1.l
+; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-TRUE16-NEXT:    s_endpgm
+;
+; GFX12-FAKE16-LABEL: sqrt_v2f16:
+; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1
+; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
+; GFX12-FAKE16-NEXT:    s_mov_b32 s10, s6
+; GFX12-FAKE16-NEXT:    s_mov_b32 s11, s7
+; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s2
+; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s3
+; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s0
+; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
+; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1
+; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v0, v0
+; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1)
+; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v1, v1
+; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
+; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7], null
+; GFX12-FAKE16-NEXT:    s_endpgm
     ptr addrspace(1) %r,
     ptr addrspace(1) %a) {
 entry:
@@ -197,5 +279,3 @@ entry:
   store <2 x half> %r.val, ptr addrspace(1) %r
   ret void
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
new file mode 100644
index 0000000000000..a6819359561b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-fake16.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: exp_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.exp2.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: log_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.log.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: rcp_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.rcp.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: rsq_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.rsq.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: sqrt_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.sqrt.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare half @llvm.amdgcn.log.f16(half)
+declare half @llvm.amdgcn.rcp.f16(half)
+declare half @llvm.amdgcn.rsq.f16(half)
+declare half @llvm.amdgcn.sqrt.f16(half)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
new file mode 100644
index 0000000000000..b1b5b6b62296d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans-f16-true16.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
+
+define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: exp_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+  ; CHECK-NEXT:   [[V_EXP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_EXP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_EXP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.exp2.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: log_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+  ; CHECK-NEXT:   [[V_LOG_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_LOG_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_LOG_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.log.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: rcp_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+  ; CHECK-NEXT:   [[V_RCP_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RCP_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RCP_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.rcp.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: rsq_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+  ; CHECK-NEXT:   [[V_RSQ_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_RSQ_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_RSQ_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.rsq.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
+  ; CHECK-LABEL: name: sqrt_f16
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
+  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+  ; CHECK-NEXT:   [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_:%[0-9]+]]:vgpr_16 = GLOBAL_LOAD_SHORT_D16_SADDR_t16 [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[GLOBAL_LOAD_SHORT_D16_SADDR_t16_]], %subreg.lo16, [[DEF]], %subreg.hi16
+  ; CHECK-NEXT:   [[V_SQRT_F16_t16_e64_:%[0-9]+]]:vgpr_16 = V_SQRT_F16_t16_e64 0, killed [[REG_SEQUENCE]].lo16, 0, 0, 0, implicit $mode, implicit $exec
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_SQRT_F16_t16_e64_]], %subreg.lo16, [[DEF1]], %subreg.hi16
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[REG_SEQUENCE1]]
+  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR_t16 [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
+  ; CHECK-NEXT:   S_ENDPGM 0
+  %val = load volatile half, ptr addrspace(1) %ptr
+  %res = call half @llvm.amdgcn.sqrt.f16(half %val)
+  store half %res, ptr addrspace(1) %ptr
+  ret void
+}
+
+declare half @llvm.amdgcn.exp2.f16(half)
+declare half @llvm.amdgcn.log.f16(half)
+declare half @llvm.amdgcn.rcp.f16(half)
+declare half @llvm.amdgcn.rsq.f16(half)
+declare half @llvm.amdgcn.sqrt.f16(half)
diff --git a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
index 9407c8a9ee436..56848eabf8607 100644
--- a/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
+++ b/llvm/test/CodeGen/AMDGPU/move-to-valu-pseudo-scalar-trans.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -stop-after=si-fix-sgpr-copies -verify-machineinstrs < %s | FileCheck %s
 
 define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
@@ -21,27 +21,6 @@ define amdgpu_kernel void @exp_f32(ptr addrspace(1) %ptr) {
   ret void
 }
 
-define amdgpu_kernel void @exp_f16(ptr addrspace(1) %ptr) {
-  ; CHECK-LABEL: name: exp_f16
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
-  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_EXP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_EXP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_EXP_F16_fake16_e64_]]
-  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   S_ENDPGM 0
-  %val = load volatile half, ptr addrspace(1) %ptr
-  %res = call half @llvm.amdgcn.exp2.f16(half %val)
-  store half %res, ptr addrspace(1) %ptr
-  ret void
-}
-
 define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
   ; CHECK-LABEL: name: log_f32
   ; CHECK: bb.0 (%ir-block.0):
@@ -62,27 +41,6 @@ define amdgpu_kernel void @log_f32(ptr addrspace(1) %ptr) {
   ret void
 }
 
-define amdgpu_kernel void @log_f16(ptr addrspace(1) %ptr) {
-  ; CHECK-LABEL: name: log_f16
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
-  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_LOG_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_LOG_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_LOG_F16_fake16_e64_]]
-  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   S_ENDPGM 0
-  %val = load volatile half, ptr addrspace(1) %ptr
-  %res = call half @llvm.amdgcn.log.f16(half %val)
-  store half %res, ptr addrspace(1) %ptr
-  ret void
-}
-
 define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
   ; CHECK-LABEL: name: rcp_f32
   ; CHECK: bb.0 (%ir-block.0):
@@ -103,27 +61,6 @@ define amdgpu_kernel void @rcp_f32(ptr addrspace(1) %ptr) {
   ret void
 }
 
-define amdgpu_kernel void @rcp_f16(ptr addrspace(1) %ptr) {
-  ; CHECK-LABEL: name: rcp_f16
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
-  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_RCP_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RCP_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RCP_F16_fake16_e64_]]
-  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   S_ENDPGM 0
-  %val = load volatile half, ptr addrspace(1) %ptr
-  %res = call half @llvm.amdgcn.rcp.f16(half %val)
-  store half %res, ptr addrspace(1) %ptr
-  ret void
-}
-
 define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
   ; CHECK-LABEL: name: rsq_f32
   ; CHECK: bb.0 (%ir-block.0):
@@ -144,27 +81,6 @@ define amdgpu_kernel void @rsq_f32(ptr addrspace(1) %ptr) {
   ret void
 }
 
-define amdgpu_kernel void @rsq_f16(ptr addrspace(1) %ptr) {
-  ; CHECK-LABEL: name: rsq_f16
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
-  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_RSQ_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_RSQ_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_RSQ_F16_fake16_e64_]]
-  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   S_ENDPGM 0
-  %val = load volatile half, ptr addrspace(1) %ptr
-  %res = call half @llvm.amdgcn.rsq.f16(half %val)
-  store half %res, ptr addrspace(1) %ptr
-  ret void
-}
-
 define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
   ; CHECK-LABEL: name: sqrt_f32
   ; CHECK: bb.0 (%ir-block.0):
@@ -185,34 +101,8 @@ define amdgpu_kernel void @sqrt_f32(ptr addrspace(1) %ptr) {
   ret void
 }
 
-define amdgpu_kernel void @sqrt_f16(ptr addrspace(1) %ptr) {
-  ; CHECK-LABEL: name: sqrt_f16
-  ; CHECK: bb.0 (%ir-block.0):
-  ; CHECK-NEXT:   liveins: $sgpr4_sgpr5
-  ; CHECK-NEXT: {{  $}}
-  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr4_sgpr5
-  ; CHECK-NEXT:   [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec_xnull = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.ptr.kernarg.offset, align 4, addrspace 4)
-  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-  ; CHECK-NEXT:   [[GLOBAL_LOAD_USHORT_SADDR:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_USHORT_SADDR [[S_LOAD_DWORDX2_IMM]], [[V_MOV_B32_e32_]], 0, 0, implicit $exec :: (volatile "amdgpu-noclobber" load (s16) from %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[V_SQRT_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_SQRT_F16_fake16_e64 0, [[GLOBAL_LOAD_USHORT_SADDR]], 0, 0, implicit $mode, implicit $exec
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_32_xexec = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[V_SQRT_F16_fake16_e64_]]
-  ; CHECK-NEXT:   GLOBAL_STORE_SHORT_SADDR [[V_MOV_B32_e32_]], killed [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s16) into %ir.ptr.load, addrspace 1)
-  ; CHECK-NEXT:   S_ENDPGM 0
-  %val = load volatile half, ptr addrspace(1) %ptr
-  %res = call half @llvm.amdgcn.sqrt.f16(half %val)
-  store half %res, ptr addrspace(1) %ptr
-  ret void
-}
-
 declare float @llvm.amdgcn.exp2.f32(float)
-declare half @llvm.amdgcn.exp2.f16(half)
 declare float @llvm.amdgcn.log.f32(float)
-declare half @llvm.amdgcn.log.f16(half)
 declare float @llvm.amdgcn.rcp.f32(float)
-declare half @llvm.amdgcn.rcp.f16(half)
 declare float @llvm.amdgcn.rsq.f32(float)
-declare half @llvm.amdgcn.rsq.f16(half)
 declare float @llvm.amdgcn.sqrt.f32(float)
-declare half @llvm.amdgcn.sqrt.f16(half)

>From be3c6a06d36d6210ce51bbeacc343f3590043b96 Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Tue, 10 Jun 2025 12:49:39 -0700
Subject: [PATCH 06/61] [Scalar] Fix a warning

This patch fixes:

  llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp:628:22: error:
  variable 'Inst' set but not used [-Werror,-Wunused-but-set-variable]
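
For context, a `(void)Var;` cast counts as a use, which is the conventional
way to silence this warning when a variable's only real use sits inside a
macro (here LLVM_DEBUG) that compiles away in release builds, as the diff
below does with `(void)Inst;`. A minimal sketch of the pattern, with
hypothetical names:

    #include <cstdio>

    #ifndef NDEBUG
    #define MY_DEBUG(X) X        // debug builds: run the statement
    #else
    #define MY_DEBUG(X)          // release builds: expands to nothing
    #endif

    void example(int Count) {
      int Total = Count * 2;     // only referenced inside MY_DEBUG
      MY_DEBUG(std::printf("Total = %d\n", Total));
      (void)Total;               // counts as a use; keeps -Werror release builds clean
    }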
---
 llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index eb81d2ea49673..a7072ea719292 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -641,6 +641,7 @@ class LowerMatrixIntrinsics {
             << " shuffles beacuse we do not have a shape-aware lowering for "
                "its def:\n"
             << *Inst << '\n');
+        (void)Inst;
         SplitMatrices++;
       } else {
         // The ShapeMap has it, so it's a case where we're being lowered

>From f72dd4eca8dac84d819abccde4447755a7d9420d Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 10 Jun 2025 12:50:16 -0700
Subject: [PATCH 07/61] [ConstantFolding] Fold scalable shufflevector of
 poison/undef. (#143475)

---
 llvm/lib/IR/ConstantFold.cpp                     |  7 +++----
 .../Transforms/InstSimplify/shufflevector.ll     | 16 ++++++++++++++++
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index b9db402fe9562..d4ad21e69e848 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -463,10 +463,9 @@ Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
     Constant *Elt =
         ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, 0));
 
-    if (Elt->isNullValue()) {
-      auto *VTy = VectorType::get(EltTy, MaskEltCount);
-      return ConstantAggregateZero::get(VTy);
-    } else if (!MaskEltCount.isScalable())
+    // For scalable vectors, make sure this doesn't fold back into a
+    // shufflevector.
+    if (!MaskEltCount.isScalable() || Elt->isNullValue() || isa<UndefValue>(Elt))
       return ConstantVector::getSplat(MaskEltCount, Elt);
   }
 
diff --git a/llvm/test/Transforms/InstSimplify/shufflevector.ll b/llvm/test/Transforms/InstSimplify/shufflevector.ll
index 0442bd721974b..feab024a29213 100644
--- a/llvm/test/Transforms/InstSimplify/shufflevector.ll
+++ b/llvm/test/Transforms/InstSimplify/shufflevector.ll
@@ -337,3 +337,19 @@ define <4 x i32> @not_fold_identity2(<4 x i32> %x) {
   %revshuf = shufflevector <4 x i32> %shuf, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   ret <4 x i32> %revshuf
 }
+
+define <vscale x 2 x i32> @scalable_splat_undef() {
+; CHECK-LABEL: @scalable_splat_undef(
+; CHECK-NEXT:    ret <vscale x 2 x i32> undef
+;
+  %shuf = shufflevector <vscale x 2 x i32> undef, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i32> %shuf
+}
+
+define <vscale x 2 x i32> @scalable_splat_poison() {
+; CHECK-LABEL: @scalable_splat_poison(
+; CHECK-NEXT:    ret <vscale x 2 x i32> poison
+;
+  %shuf = shufflevector <vscale x 2 x i32> poison, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i32> %shuf
+}

>From e0cc556ad4276c2c76e303c39616ad126bea56d6 Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 10 Jun 2025 12:52:39 -0700
Subject: [PATCH 08/61] [IndVars] Teach widenLoopCompare to use sext if narrow
 IV is positive and other operand is already sext. (#142703)

This prevents us from ending up with (zext (sext X)). The zext would
require an extra instruction on targets where zext isn't free, such as
RISC-V.
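
A C-level rendering of the new test added below (a sketch; it mirrors the
foo() function in iv-cmp-sext.ll) shows the shape of loop this targets: the
narrow IV is known non-negative and the other compare operand is already
sign-extended, so widening the compare with sext lets the two extensions
combine:

    // i is non-negative and x[i] is sign-extended from i16, so after
    // indvars widens i to i64 the compare can sign-extend both sides
    // instead of producing a zext(sext(x)) pair on RV64.
    void foo(short *x, int n) {
      for (int i = 0; i < n; ++i)
        if (i == x[i])
          x[i] = 0;
    }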
---
 llvm/lib/Transforms/Utils/SimplifyIndVar.cpp  | 14 ++--
 .../Transforms/IndVarSimplify/iv-cmp-sext.ll  | 66 +++++++++++++++++++
 2 files changed, 76 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/iv-cmp-sext.ll

diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index 3fe4621eca70d..43264cce73719 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1605,13 +1605,13 @@ bool WidenIV::widenLoopCompare(WidenIV::NarrowIVDefUse DU) {
   //
   //  - The signedness of the IV extension and comparison match
   //
-  //  - The narrow IV is always positive (and thus its sign extension is equal
-  //    to its zero extension).  For instance, let's say we're zero extending
-  //    %narrow for the following use
+  //  - The narrow IV is always non-negative (and thus its sign extension is
+  //    equal to its zero extension).  For instance, let's say we're zero
+  //    extending %narrow for the following use
   //
   //      icmp slt i32 %narrow, %val   ... (A)
   //
-  //    and %narrow is always positive.  Then
+  //    and %narrow is always non-negative.  Then
   //
   //      (A) == icmp slt i32 sext(%narrow), sext(%val)
   //          == icmp slt i32 zext(%narrow), sext(%val)
@@ -1630,6 +1630,12 @@ bool WidenIV::widenLoopCompare(WidenIV::NarrowIVDefUse DU) {
 
   // Widen the other operand of the compare, if necessary.
   if (CastWidth < IVWidth) {
+    // If the narrow IV is always non-negative and the other operand is sext,
+    // widen using sext so we can combine them. This works for all non-signed
+    // comparison predicates.
+    if (DU.NeverNegative && isa<SExtInst>(Op) && !Cmp->isSigned())
+      CmpPreferredSign = true;
+
     Value *ExtOp = createExtendInst(Op, WideType, CmpPreferredSign, Cmp);
     DU.NarrowUse->replaceUsesOfWith(Op, ExtOp);
   }
diff --git a/llvm/test/Transforms/IndVarSimplify/iv-cmp-sext.ll b/llvm/test/Transforms/IndVarSimplify/iv-cmp-sext.ll
new file mode 100644
index 0000000000000..34925a6cca955
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/iv-cmp-sext.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=indvars -S | FileCheck %s
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64"
+
+define void @foo(ptr %x, i32 %n) {
+; CHECK-LABEL: define void @foo(
+; CHECK-SAME: ptr [[X:%.*]], i32 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP10]], label %[[FOR_BODY_PREHEADER:.*]], label %[[FOR_COND_CLEANUP:.*]]
+; CHECK:       [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label %[[FOR_BODY:.*]]
+; CHECK:       [[FOR_COND_CLEANUP_LOOPEXIT:.*]]:
+; CHECK-NEXT:    br label %[[FOR_COND_CLEANUP]]
+; CHECK:       [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[FOR_BODY]]:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_INC:.*]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds nuw i16, ptr [[X]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[CONV]] to i64
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i64 [[INDVARS_IV]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP1]], label %[[IF_THEN:.*]], label %[[FOR_INC]]
+; CHECK:       [[IF_THEN]]:
+; CHECK-NEXT:    store i16 0, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    br label %[[FOR_INC]]
+; CHECK:       [[FOR_INC]]:
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label %[[FOR_BODY]], label %[[FOR_COND_CLEANUP_LOOPEXIT]]
+;
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %i.011 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
+  %idxprom = zext nneg i32 %i.011 to i64
+  %arrayidx = getelementptr inbounds nuw i16, ptr %x, i64 %idxprom
+  %0 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %0 to i32
+  %cmp1 = icmp eq i32 %i.011, %conv
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  store i16 0, ptr %arrayidx, align 2
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.011, 1
+  %cmp = icmp slt i32 %inc, %n
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}

>From 766b3016c4527a8b23772a421d71876fe186255f Mon Sep 17 00:00:00 2001
From: Kazu Hirata <kazu at google.com>
Date: Tue, 10 Jun 2025 12:54:42 -0700
Subject: [PATCH 09/61] [mlir] Fix a warning

This patch fixes:

  mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp:1066:62:
  error: missing field 'mergeOps' initializer
  [-Werror,-Wmissing-field-initializers]

  mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp:1076:75:
  error: missing field 'mergeOps' initializer
  [-Werror,-Wmissing-field-initializers]
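
For context, -Wmissing-field-initializers fires when an aggregate
initializer lists fewer initializers than the struct has fields; appending
an explicit {} for the newly added trailing member (mergeOps in the diff
below) value-initializes it and keeps -Werror builds clean. A minimal
sketch with hypothetical stand-in types:

    #include <vector>

    struct TilingResult {
      std::vector<int> tiledOps;
      std::vector<int> loops;
      std::vector<int> mergeOps; // newly added trailing field
    };

    TilingResult make() {
      // Without the trailing {}, this aggregate initialization would
      // warn under -Wmissing-field-initializers (an error with -Werror).
      return TilingResult{{1, 2}, {3}, {}};
    }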
---
 .../Dialect/SCF/Transforms/TileUsingInterface.cpp    | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index a0f9b599d1351..3f29dd3ac5e48 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1061,9 +1061,12 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
   if (loops.empty()) {
     // If loops are empty, the tiled op is used as the replacement for the
     // untiled op.
-    return scf::SCFTilingResult{tilingResult->tiledOps, initTensors, loops,
+    return scf::SCFTilingResult{tilingResult->tiledOps,
+                                initTensors,
+                                loops,
                                 tilingResult->tiledValues,
-                                tilingResult->generatedSlices};
+                                tilingResult->generatedSlices,
+                                {}};
   }
 
   auto loopResults = llvm::map_to_vector(loops.front()->getResults(),
@@ -1072,8 +1075,9 @@ mlir::scf::tileUsingSCF(RewriterBase &rewriter, TilingInterface op,
   // For the full reduction case, there is nothing more to do.
   if (options.reductionStrategy ==
       scf::SCFTilingOptions::ReductionTilingStrategy::FullReduction) {
-    return scf::SCFTilingResult{tilingResult->tiledOps, initTensors, loops,
-                                loopResults, tilingResult->generatedSlices};
+    return scf::SCFTilingResult{
+        tilingResult->tiledOps,        initTensors, loops, loopResults,
+        tilingResult->generatedSlices, {}};
   }
 
   // The results of the loop need to be merged.

>From b8f79f81a3fb559f93b30e1aef07cc965d2bf2b3 Mon Sep 17 00:00:00 2001
From: Nick Sarnie <nick.sarnie at intel.com>
Date: Tue, 10 Jun 2025 16:05:53 -0400
Subject: [PATCH 10/61] [Clang][Attr] Fix possible crash when trying to check
 for DeviceKernel spelling (#143546)

I didn't add a test because this can't be reproduced in this repo yet;
I could only reproduce it in Intel's fork, which has more SYCL support.

I also fixed some formatting.

Signed-off-by: Sarnie, Nick <nick.sarnie at intel.com>
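
The crash pattern being fixed is a method call through a possibly-null
pointer; the diff below short-circuits on the pointer before dereferencing
it. A minimal sketch of the shape of the fix, with simplified stand-in
types:

    #include <string_view>

    struct IdentifierInfo {
      std::string_view getName() const { return Name; }
      std::string_view Name;
    };

    struct AttributeCommonInfo {
      // May legitimately return null for some attribute spellings.
      const IdentifierInfo *getAttrName() const { return Name; }
      const IdentifierInfo *Name = nullptr;
    };

    bool isKernelSpelling(const AttributeCommonInfo &A) {
      // Check the pointer first; calling getName() unconditionally
      // would dereference null for attributes without a name.
      return A.getAttrName() && A.getAttrName()->getName() == "kernel";
    }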
---
 clang/include/clang/Basic/Attr.td | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index bd07bb1361a85..b8e5806d3c5e9 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1612,11 +1612,11 @@ def DeviceKernel : DeclOrTypeAttr {
    // list, but here we have the same spelling with underscores and without,
     // so handle that case manually.
       return A.getAttributeSpellingListIndex() == Keyword_kernel ||
-      A.getAttrName()->getName() == "kernel";
+      (A.getAttrName() && A.getAttrName()->getName() == "kernel");
     }
     static inline bool isOpenCLSpelling(const AttributeCommonInfo* A) {
-    if (!A) return false;
-    return isOpenCLSpelling(*A);
+      if (!A) return false;
+      return isOpenCLSpelling(*A);
     }
 }];
 }

>From 719e7bea8af3335f8d7c46205ce8357baf4fa11d Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Tue, 10 Jun 2025 13:02:16 -0700
Subject: [PATCH 11/61] [LoopPeel] Add tests for last iteration peeling of
 min/max intrinsics

---
 .../LoopUnroll/peel-last-iteration-minmax.ll  | 137 ++++++++++++++++++
 1 file changed, 137 insertions(+)
 create mode 100644 llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll

diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
new file mode 100644
index 0000000000000..cd098e123b5f6
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-minmax.ll
@@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -p loop-unroll -unroll-full-max-count=0 -S %s | FileCheck %s
+
+declare void @foo(i32)
+
+define i32 @umin_unit_step() {
+; CHECK-LABEL: define i32 @umin_unit_step() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
+; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.umin.i32(i32 [[SUB]], i32 1)
+; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %sub = sub i32 1024, %iv
+  %minmax = call i32 @llvm.umin(i32 %sub, i32 1)
+  call void @foo(i32 %minmax)
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp ne i32 %iv.next, 1024
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  %minmax.lcssa = phi i32 [ %minmax, %loop ]
+  ret i32 %minmax.lcssa
+}
+
+define i32 @smin_unit_step() {
+; CHECK-LABEL: define i32 @smin_unit_step() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
+; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.smin.i32(i32 [[SUB]], i32 1)
+; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %sub = sub i32 1024, %iv
+  %minmax = call i32 @llvm.smin(i32 %sub, i32 1)
+  call void @foo(i32 %minmax)
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp ne i32 %iv.next, 1024
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  %minmax.lcssa = phi i32 [ %minmax, %loop ]
+  ret i32 %minmax.lcssa
+}
+
+define i32 @smax_unit_step() {
+; CHECK-LABEL: define i32 @smax_unit_step() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
+; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[SUB]], i32 1)
+; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC_PEEL:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EC_PEEL]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %sub = sub i32 1024, %iv
+  %minmax = call i32 @llvm.smax(i32 %sub, i32 1)
+  call void @foo(i32 %minmax)
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp ne i32 %iv.next, 1024
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  %minmax.lcssa = phi i32 [ %minmax, %loop ]
+  ret i32 %minmax.lcssa
+}
+
+define i32 @umax_unit_step() {
+; CHECK-LABEL: define i32 @umax_unit_step() {
+; CHECK-NEXT:  [[ENTRY:.*]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1024, [[IV]]
+; CHECK-NEXT:    [[MINMAX:%.*]] = call i32 @llvm.umax.i32(i32 [[SUB]], i32 1)
+; CHECK-NEXT:    call void @foo(i32 [[MINMAX]])
+; CHECK-NEXT:    [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1
+; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    [[MINMAX_LCSSA:%.*]] = phi i32 [ [[MINMAX]], %[[LOOP]] ]
+; CHECK-NEXT:    ret i32 [[MINMAX_LCSSA]]
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %iv.next, %loop ]
+  %sub = sub i32 1024, %iv
+  %minmax = call i32 @llvm.umax(i32 %sub, i32 1)
+  call void @foo(i32 %minmax)
+  %iv.next = add nuw nsw i32 %iv, 1
+  %ec = icmp ne i32 %iv.next, 1024
+  br i1 %ec, label %loop, label %exit
+
+exit:
+  %minmax.lcssa = phi i32 [ %minmax, %loop ]
+  ret i32 %minmax.lcssa
+}
+

>From 94877ce1b4c514e839b49907efff95c0cf942656 Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002 at gmail.com>
Date: Tue, 10 Jun 2025 23:23:37 +0300
Subject: [PATCH 12/61] [clang-tidy][NFC] fix 'misc-use-internal-linkage' check
 warnings (#143482)

Ran the misc-use-internal-linkage check over the clang-tidy code and
fixed a couple of other clang-tidy warnings along the way.

Apart from issues in header files, all '.cpp' files in
`clang-tools-extra/clang-tidy` should now be clang-tidy clean.
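
The check flags file-local symbols that still have external linkage; the
three fixes this patch applies, sketched in miniature:

    // 1. Give file-local free functions internal linkage.
    static int helperUsedOnlyInThisFile() { return 42; }

    // 2. Wrap file-local types and AST matchers in an anonymous namespace.
    namespace {
    struct LocalOnlyHelper {};
    } // namespace

    // 3. Suppress the check where external linkage is intentional, e.g.
    //    the module anchors that force object files to be linked in.
    volatile int SomeModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)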
---
 .../clang-tidy/abseil/AbseilTidyModule.cpp    |  2 +-
 .../clang-tidy/abseil/CleanupCtadCheck.cpp    |  2 +-
 .../clang-tidy/abseil/StrCatAppendCheck.cpp   |  6 ++--
 .../abseil/StringFindStrContainsCheck.cpp     |  2 ++
 .../clang-tidy/altera/AlteraTidyModule.cpp    |  2 +-
 .../clang-tidy/android/AndroidTidyModule.cpp  |  2 +-
 .../clang-tidy/boost/BoostTidyModule.cpp      |  2 +-
 .../clang-tidy/bugprone/BranchCloneCheck.cpp  |  1 +
 .../bugprone/BugproneTidyModule.cpp           |  1 +
 .../DynamicStaticInitializersCheck.cpp        |  4 +++
 .../EasilySwappableParametersCheck.cpp        | 10 ++++--
 .../bugprone/IncDecInConditionsCheck.cpp      |  4 +++
 .../MultipleNewInOneExpressionCheck.cpp       |  4 +--
 .../bugprone/StandaloneEmptyCheck.cpp         |  3 +-
 .../bugprone/StringviewNullptrCheck.cpp       |  2 +-
 .../bugprone/UnhandledExceptionAtNewCheck.cpp |  3 ++
 .../clang-tidy/bugprone/UnusedRaiiCheck.cpp   |  4 +--
 .../clang-tidy/bugprone/UseAfterMoveCheck.cpp |  2 +-
 .../clang-tidy/cert/CERTTidyModule.cpp        |  2 +-
 .../concurrency/ConcurrencyTidyModule.cpp     |  1 +
 .../CppCoreGuidelinesTidyModule.cpp           |  1 +
 .../VirtualClassDestructorCheck.cpp           |  4 +++
 .../clang-tidy/darwin/DarwinTidyModule.cpp    |  2 +-
 .../clang-tidy/fuchsia/FuchsiaTidyModule.cpp  |  2 +-
 .../clang-tidy/google/GoogleTidyModule.cpp    |  2 +-
 .../clang-tidy/hicpp/HICPPTidyModule.cpp      |  2 +-
 .../linuxkernel/LinuxKernelTidyModule.cpp     |  1 +
 .../clang-tidy/llvm/LLVMTidyModule.cpp        |  2 +-
 .../PreferIsaOrDynCastInConditionalsCheck.cpp | 12 +++----
 .../llvmlibc/CalleeNamespaceCheck.cpp         |  2 +-
 .../llvmlibc/LLVMLibcTidyModule.cpp           |  1 +
 .../clang-tidy/misc/MiscTidyModule.cpp        |  2 +-
 .../clang-tidy/modernize/AvoidBindCheck.cpp   |  6 ++--
 .../clang-tidy/modernize/LoopConvertCheck.cpp |  6 ++--
 .../modernize/ModernizeTidyModule.cpp         |  1 +
 .../modernize/RedundantVoidArgCheck.cpp       |  3 +-
 .../modernize/UseConstraintsCheck.cpp         |  2 +-
 .../modernize/UseEqualsDefaultCheck.cpp       |  4 +--
 .../modernize/UseStartsEndsWithCheck.cpp      |  4 +++
 .../clang-tidy/mpi/MPITidyModule.cpp          |  2 +-
 .../clang-tidy/objc/NSDateFormatterCheck.cpp  |  2 +-
 .../clang-tidy/objc/ObjCTidyModule.cpp        |  2 +-
 .../clang-tidy/openmp/OpenMPTidyModule.cpp    |  2 +-
 .../performance/MoveConstArgCheck.cpp         |  6 ++--
 .../performance/PerformanceTidyModule.cpp     |  1 +
 .../clang-tidy/plugin/ClangTidyPlugin.cpp     |  1 +
 .../portability/PortabilityTidyModule.cpp     |  1 +
 .../readability/ContainerSizeEmptyCheck.cpp   | 31 ++++++++++---------
 .../ConvertMemberFunctionsToStatic.cpp        |  4 +++
 .../MakeMemberFunctionConstCheck.cpp          |  4 +++
 .../readability/ReadabilityTidyModule.cpp     |  1 +
 .../readability/RedundantDeclarationCheck.cpp |  4 +++
 .../RedundantInlineSpecifierCheck.cpp         |  4 +--
 .../clang-tidy/tool/ClangTidyMain.cpp         |  2 +-
 .../clang-tidy/utils/NamespaceAliaser.cpp     | 11 ++++---
 .../utils/TransformerClangTidyCheck.cpp       |  2 +-
 .../zircon/TemporaryObjectsCheck.cpp          |  4 +++
 .../clang-tidy/zircon/ZirconTidyModule.cpp    |  2 +-
 58 files changed, 130 insertions(+), 74 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp b/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp
index c5c42908d0c59..78605d59b4421 100644
--- a/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/abseil/AbseilTidyModule.cpp
@@ -80,6 +80,6 @@ static ClangTidyModuleRegistry::Add<AbseilModule> X("abseil-module",
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the AbseilModule.
-volatile int AbseilModuleAnchorSource = 0;
+volatile int AbseilModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp
index 0819642be0558..8063fc540cce5 100644
--- a/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp
+++ b/clang-tools-extra/clang-tidy/abseil/CleanupCtadCheck.cpp
@@ -18,7 +18,7 @@ using namespace ::clang::transformer;
 
 namespace clang::tidy::abseil {
 
-RewriteRuleWith<std::string> cleanupCtadCheckImpl() {
+static RewriteRuleWith<std::string> cleanupCtadCheckImpl() {
   auto WarningMessage = cat("prefer absl::Cleanup's class template argument "
                             "deduction pattern in C++17 and higher");
 
diff --git a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp
index 6025d3d66675a..ced92590be02e 100644
--- a/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp
+++ b/clang-tools-extra/clang-tidy/abseil/StrCatAppendCheck.cpp
@@ -17,7 +17,7 @@ namespace clang::tidy::abseil {
 namespace {
 // Skips any combination of temporary materialization, temporary binding and
 // implicit casting.
-AST_MATCHER_P(Stmt, IgnoringTemporaries, ast_matchers::internal::Matcher<Stmt>,
+AST_MATCHER_P(Stmt, ignoringTemporaries, ast_matchers::internal::Matcher<Stmt>,
               InnerMatcher) {
   const Stmt *E = &Node;
   while (true) {
@@ -43,7 +43,7 @@ void StrCatAppendCheck::registerMatchers(MatchFinder *Finder) {
   const auto StrCat = functionDecl(hasName("::absl::StrCat"));
   // The arguments of absl::StrCat are implicitly converted to AlphaNum. This
   // matches to the arguments because of that behavior.
-  const auto AlphaNum = IgnoringTemporaries(cxxConstructExpr(
+  const auto AlphaNum = ignoringTemporaries(cxxConstructExpr(
       argumentCountIs(1), hasType(cxxRecordDecl(hasName("::absl::AlphaNum"))),
       hasArgument(0, ignoringImpCasts(declRefExpr(to(equalsBoundNode("LHS")),
                                                   expr().bind("Arg0"))))));
@@ -62,7 +62,7 @@ void StrCatAppendCheck::registerMatchers(MatchFinder *Finder) {
                    hasOverloadedOperatorName("="),
                    hasArgument(0, declRefExpr(to(decl().bind("LHS")))),
                    hasArgument(
-                       1, IgnoringTemporaries(
+                       1, ignoringTemporaries(
                               callExpr(callee(StrCat), hasArgument(0, AlphaNum),
                                        unless(HasAnotherReferenceToLhs))
                                   .bind("Call"))))
diff --git a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp
index 6e89783bb51eb..0c2fe285ce060 100644
--- a/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/abseil/StringFindStrContainsCheck.cpp
@@ -29,7 +29,9 @@ using ::clang::transformer::makeRule;
 using ::clang::transformer::node;
 using ::clang::transformer::RewriteRuleWith;
 
+namespace {
 AST_MATCHER(Type, isCharType) { return Node.isCharType(); }
+} // namespace
 
 static const char DefaultStringLikeClasses[] = "::std::basic_string;"
                                                "::std::basic_string_view;"
diff --git a/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp b/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp
index 21610c7d1eced..02a43ba86d7bb 100644
--- a/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/altera/AlteraTidyModule.cpp
@@ -43,6 +43,6 @@ static ClangTidyModuleRegistry::Add<altera::AlteraModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the AlteraModule.
-volatile int AlteraModuleAnchorSource = 0;
+volatile int AlteraModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp b/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp
index 1cc17daded903..17efa10909d0a 100644
--- a/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/android/AndroidTidyModule.cpp
@@ -67,6 +67,6 @@ static ClangTidyModuleRegistry::Add<AndroidModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the AndroidModule.
-volatile int AndroidModuleAnchorSource = 0;
+volatile int AndroidModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp b/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp
index 79d0e380e402d..f414fe750d023 100644
--- a/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/boost/BoostTidyModule.cpp
@@ -32,6 +32,6 @@ static ClangTidyModuleRegistry::Add<BoostModule> X("boost-module",
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the BoostModule.
-volatile int BoostModuleAnchorSource = 0;
+volatile int BoostModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp
index 0b76cfb2ad8dc..a6cd68edda55e 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BranchCloneCheck.cpp
@@ -62,6 +62,7 @@ static bool isFallthroughSwitchBranch(const SwitchBranch &Branch) {
       return true; // Ignore sub-switches
     }
 
+    // NOLINTNEXTLINE(readability-identifier-naming) - FIXME
     bool TraverseSwitchCase(SwitchCase *, DataRecursionQueue * = nullptr) {
       return true; // Ignore cases
     }
diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
index 64f4a524daf0d..ed1fd138d8f1b 100644
--- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp
@@ -291,6 +291,7 @@ static ClangTidyModuleRegistry::Add<bugprone::BugproneModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the BugproneModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int BugproneModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp
index d99e17c691485..3fe028b94771d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/DynamicStaticInitializersCheck.cpp
@@ -15,6 +15,8 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::bugprone {
 
+namespace {
+
 AST_MATCHER(clang::VarDecl, hasConstantDeclaration) {
   const Expr *Init = Node.getInit();
   if (Init && !Init->isValueDependent()) {
@@ -25,6 +27,8 @@ AST_MATCHER(clang::VarDecl, hasConstantDeclaration) {
   return false;
 }
 
+} // namespace
+
 DynamicStaticInitializersCheck::DynamicStaticInitializersCheck(
     StringRef Name, ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context),
diff --git a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
index 10868129e76da..a179d4bf66b4d 100644
--- a/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/EasilySwappableParametersCheck.cpp
@@ -1497,11 +1497,13 @@ static MixableParameterRange modelMixingRange(
 
 } // namespace model
 
+namespace {
 /// Matches DeclRefExprs and their ignorable wrappers to ParmVarDecls.
 AST_MATCHER_FUNCTION(ast_matchers::internal::Matcher<Stmt>, paramRefExpr) {
   return expr(ignoringParenImpCasts(ignoringElidableConstructorCall(
       declRefExpr(to(parmVarDecl().bind("param"))))));
 }
+} // namespace
 
 namespace filter {
 
@@ -1574,8 +1576,8 @@ using ParamToSmallSetMap =
 /// Returns whether the sets mapped to the two elements in the map have at
 /// least one element in common.
 template <typename MapTy, typename ElemTy>
-bool lazyMapOfSetsIntersectionExists(const MapTy &Map, const ElemTy &E1,
-                                     const ElemTy &E2) {
+static bool lazyMapOfSetsIntersectionExists(const MapTy &Map, const ElemTy &E1,
+                                            const ElemTy &E2) {
   auto E1Iterator = Map.find(E1);
   auto E2Iterator = Map.find(E2);
   if (E1Iterator == Map.end() || E2Iterator == Map.end())
@@ -1882,6 +1884,8 @@ static bool prefixSuffixCoverUnderThreshold(std::size_t Threshold,
 
 } // namespace filter
 
+namespace {
+
 /// Matches functions that have at least the specified amount of parameters.
 AST_MATCHER_P(FunctionDecl, parameterCountGE, unsigned, N) {
   return Node.getNumParams() >= N;
@@ -1904,6 +1908,8 @@ AST_MATCHER(FunctionDecl, isOverloadedUnaryOrBinaryOperator) {
   }
 }
 
+} // namespace
+
 /// Returns the DefaultMinimumLength if the Value of requested minimum length
 /// is less than 2. Minimum lengths of 0 or 1 are not accepted.
 static inline unsigned clampMinimumLength(const unsigned Value) {
diff --git a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp
index 9b3b01eb02683..73bffe93146e6 100644
--- a/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/IncDecInConditionsCheck.cpp
@@ -15,6 +15,8 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::bugprone {
 
+namespace {
+
 AST_MATCHER(BinaryOperator, isLogicalOperator) { return Node.isLogicalOp(); }
 
 AST_MATCHER(UnaryOperator, isUnaryPrePostOperator) {
@@ -26,6 +28,8 @@ AST_MATCHER(CXXOperatorCallExpr, isPrePostOperator) {
          Node.getOperator() == OO_MinusMinus;
 }
 
+} // namespace
+
 void IncDecInConditionsCheck::registerMatchers(MatchFinder *Finder) {
   auto OperatorMatcher = expr(
       anyOf(binaryOperator(anyOf(isComparisonOperator(), isLogicalOperator())),
diff --git a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp
index 6d75a8262b5bc..b68888cb5b928 100644
--- a/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/MultipleNewInOneExpressionCheck.cpp
@@ -49,8 +49,6 @@ bool isExprValueStored(const Expr *E, ASTContext &C) {
   return isa<CallExpr, CXXConstructExpr>(ParentE);
 }
 
-} // namespace
-
 AST_MATCHER_P(CXXTryStmt, hasHandlerFor,
               ast_matchers::internal::Matcher<QualType>, InnerMatcher) {
   for (unsigned NH = Node.getNumHandlers(), I = 0; I < NH; ++I) {
@@ -74,6 +72,8 @@ AST_MATCHER(CXXNewExpr, mayThrow) {
   return !OperatorNew->getType()->castAs<FunctionProtoType>()->isNothrow();
 }
 
+} // namespace
+
 void MultipleNewInOneExpressionCheck::registerMatchers(MatchFinder *Finder) {
   auto BadAllocType =
       recordType(hasDeclaration(cxxRecordDecl(hasName("::std::bad_alloc"))));
diff --git a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp
index 682478ecead0b..5d9e91e0b82c7 100644
--- a/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/StandaloneEmptyCheck.cpp
@@ -46,7 +46,8 @@ using ast_matchers::stmtExpr;
 using ast_matchers::unless;
 using ast_matchers::voidType;
 
-const Expr *getCondition(const BoundNodes &Nodes, const StringRef NodeId) {
+static const Expr *getCondition(const BoundNodes &Nodes,
+                                const StringRef NodeId) {
   const auto *If = Nodes.getNodeAs<IfStmt>(NodeId);
   if (If != nullptr)
     return If->getCond();
diff --git a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp
index f944ae6c5b9e3..20789b3123e2f 100644
--- a/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/StringviewNullptrCheck.cpp
@@ -33,7 +33,7 @@ AST_MATCHER(clang::VarDecl, isDirectInitialization) {
 
 } // namespace
 
-RewriteRuleWith<std::string> stringviewNullptrCheckImpl() {
+static RewriteRuleWith<std::string> stringviewNullptrCheckImpl() {
   auto ConstructionWarning =
       cat("constructing basic_string_view from null is undefined; replace with "
           "the default constructor");
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp
index ca4562aa9a44f..5e220017c97f4 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnhandledExceptionAtNewCheck.cpp
@@ -12,6 +12,7 @@
 using namespace clang::ast_matchers;
 
 namespace clang::tidy::bugprone {
+namespace {
 
 AST_MATCHER_P(CXXTryStmt, hasHandlerFor,
               ast_matchers::internal::Matcher<QualType>, InnerMatcher) {
@@ -36,6 +37,8 @@ AST_MATCHER(CXXNewExpr, mayThrow) {
   return !OperatorNew->getType()->castAs<FunctionProtoType>()->isNothrow();
 }
 
+} // namespace
+
 UnhandledExceptionAtNewCheck::UnhandledExceptionAtNewCheck(
     StringRef Name, ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context) {}
diff --git a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp
index d96e7524172bd..b17d3868dd76a 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UnusedRaiiCheck.cpp
@@ -37,8 +37,8 @@ void UnusedRaiiCheck::registerMatchers(MatchFinder *Finder) {
 }
 
 template <typename T>
-void reportDiagnostic(DiagnosticBuilder D, const T *Node, SourceRange SR,
-                      bool DefaultConstruction) {
+static void reportDiagnostic(DiagnosticBuilder D, const T *Node, SourceRange SR,
+                             bool DefaultConstruction) {
   const char *Replacement = " give_me_a_name";
 
   // If this is a default ctor we have to remove the parens or we'll introduce a
diff --git a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp
index 960133159dbbf..1bcacf96a4129 100644
--- a/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/UseAfterMoveCheck.cpp
@@ -242,7 +242,7 @@ void UseAfterMoveFinder::getUsesAndReinits(
   });
 }
 
-bool isStandardSmartPointer(const ValueDecl *VD) {
+static bool isStandardSmartPointer(const ValueDecl *VD) {
   const Type *TheType = VD->getType().getNonReferenceType().getTypePtrOrNull();
   if (!TheType)
     return false;
diff --git a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
index 6614b10d4ce40..66fedabaf3ca6 100644
--- a/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cert/CERTTidyModule.cpp
@@ -358,6 +358,6 @@ static ClangTidyModuleRegistry::Add<cert::CERTModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the CERTModule.
-volatile int CERTModuleAnchorSource = 0;
+volatile int CERTModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp b/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp
index 8d74d0332df82..6c58c506dc903 100644
--- a/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/concurrency/ConcurrencyTidyModule.cpp
@@ -34,6 +34,7 @@ static ClangTidyModuleRegistry::Add<concurrency::ConcurrencyModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the ConcurrencyModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int ConcurrencyModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
index 6adef04264347..4dd9b0904f075 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/CppCoreGuidelinesTidyModule.cpp
@@ -156,6 +156,7 @@ static ClangTidyModuleRegistry::Add<CppCoreGuidelinesModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the CppCoreGuidelinesModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int CppCoreGuidelinesModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp
index aa70b3896f16d..e31d046565677 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/VirtualClassDestructorCheck.cpp
@@ -18,6 +18,8 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::cppcoreguidelines {
 
+namespace {
+
 AST_MATCHER(CXXRecordDecl, hasPublicVirtualOrProtectedNonVirtualDestructor) {
   // We need to call Node.getDestructor() instead of matching a
   // CXXDestructorDecl. Otherwise, tests will fail for class templates, since
@@ -33,6 +35,8 @@ AST_MATCHER(CXXRecordDecl, hasPublicVirtualOrProtectedNonVirtualDestructor) {
            !Destructor->isVirtual()));
 }
 
+} // namespace
+
 void VirtualClassDestructorCheck::registerMatchers(MatchFinder *Finder) {
   ast_matchers::internal::Matcher<CXXRecordDecl> InheritsVirtualMethod =
       hasAnyBase(hasType(cxxRecordDecl(has(cxxMethodDecl(isVirtual())))));
diff --git a/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp b/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp
index 616a2a2a85c56..bc8c91a9ed413 100644
--- a/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/darwin/DarwinTidyModule.cpp
@@ -32,6 +32,6 @@ static ClangTidyModuleRegistry::Add<darwin::DarwinModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the DarwinModule.
-volatile int DarwinModuleAnchorSource = 0;
+volatile int DarwinModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp b/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp
index 45a79b75e0937..d7a70b39bdc55 100644
--- a/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/fuchsia/FuchsiaTidyModule.cpp
@@ -52,6 +52,6 @@ static ClangTidyModuleRegistry::Add<FuchsiaModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the FuchsiaModule.
-volatile int FuchsiaModuleAnchorSource = 0;
+volatile int FuchsiaModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp b/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp
index 5b783c40ddb34..5343e2b3a5975 100644
--- a/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/google/GoogleTidyModule.cpp
@@ -100,6 +100,6 @@ static ClangTidyModuleRegistry::Add<GoogleModule> X("google-module",
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the GoogleModule.
-volatile int GoogleModuleAnchorSource = 0;
+volatile int GoogleModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp
index 38f3ab5ae85d3..65a56be3e5a05 100644
--- a/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/hicpp/HICPPTidyModule.cpp
@@ -119,6 +119,6 @@ static ClangTidyModuleRegistry::Add<HICPPModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the HICPPModule.
-volatile int HICPPModuleAnchorSource = 0;
+volatile int HICPPModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp b/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp
index e6d64ff61d261..b8b75b7ccaefe 100644
--- a/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/linuxkernel/LinuxKernelTidyModule.cpp
@@ -30,6 +30,7 @@ static ClangTidyModuleRegistry::Add<LinuxKernelModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the LinuxKernelModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int LinuxKernelModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp b/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp
index e749163699b34..ceebde1595e7f 100644
--- a/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/llvm/LLVMTidyModule.cpp
@@ -57,6 +57,6 @@ static ClangTidyModuleRegistry::Add<LLVMModule> X("llvm-module",
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the LLVMModule.
-volatile int LLVMModuleAnchorSource = 0;
+volatile int LLVMModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp
index 9333337777705..4c138bcc564d8 100644
--- a/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/llvm/PreferIsaOrDynCastInConditionalsCheck.cpp
@@ -14,12 +14,11 @@
 
 using namespace clang::ast_matchers;
 
-namespace clang {
-namespace ast_matchers {
-AST_MATCHER(Expr, isMacroID) { return Node.getExprLoc().isMacroID(); }
-} // namespace ast_matchers
+namespace clang::tidy::llvm_check {
 
-namespace tidy::llvm_check {
+namespace {
+AST_MATCHER(Expr, isMacroID) { return Node.getExprLoc().isMacroID(); }
+} // namespace
 
 void PreferIsaOrDynCastInConditionalsCheck::registerMatchers(
     MatchFinder *Finder) {
@@ -125,5 +124,4 @@ void PreferIsaOrDynCastInConditionalsCheck::check(
   }
 }
 
-} // namespace tidy::llvm_check
-} // namespace clang
+} // namespace clang::tidy::llvm_check
diff --git a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp
index d14cd2721d9a1..4bc4d5a4691f0 100644
--- a/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp
+++ b/clang-tools-extra/clang-tidy/llvmlibc/CalleeNamespaceCheck.cpp
@@ -19,7 +19,7 @@ namespace clang::tidy::llvm_libc {
 
 // Gets the outermost namespace of a DeclContext, right under the Translation
 // Unit.
-const DeclContext *getOutermostNamespace(const DeclContext *Decl) {
+static const DeclContext *getOutermostNamespace(const DeclContext *Decl) {
   const DeclContext *Parent = Decl->getParent();
   if (Parent->isTranslationUnit())
     return Decl;
diff --git a/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp b/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp
index 7f26840be7372..562d71a0891c4 100644
--- a/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/llvmlibc/LLVMLibcTidyModule.cpp
@@ -39,6 +39,7 @@ static ClangTidyModuleRegistry::Add<LLVMLibcModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the LLVMLibcModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int LLVMLibcModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp
index 54bcebca7e186..6ddebcbc0e152 100644
--- a/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/misc/MiscTidyModule.cpp
@@ -92,6 +92,6 @@ static ClangTidyModuleRegistry::Add<misc::MiscModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the MiscModule.
-volatile int MiscModuleAnchorSource = 0;
+volatile int MiscModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp
index a41524af56800..c9477327742d7 100644
--- a/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/AvoidBindCheck.cpp
@@ -452,8 +452,8 @@ static bool isFixitSupported(const CallableInfo &Callee,
   return true;
 }
 
-const FunctionDecl *getCallOperator(const CXXRecordDecl *Callable,
-                                    size_t NumArgs) {
+static const FunctionDecl *getCallOperator(const CXXRecordDecl *Callable,
+                                           size_t NumArgs) {
   std::vector<const FunctionDecl *> Candidates =
       findCandidateCallOperators(Callable, NumArgs);
   if (Candidates.size() != 1)
@@ -462,7 +462,7 @@ const FunctionDecl *getCallOperator(const CXXRecordDecl *Callable,
   return Candidates.front();
 }
 
-const FunctionDecl *
+static const FunctionDecl *
 getCallMethodDecl(const MatchFinder::MatchResult &Result, CallableType Type,
                   CallableMaterializationKind Materialization) {
 
diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
index eb11ae6162028..6c6c626ec4fed 100644
--- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp
@@ -114,7 +114,7 @@ arrayConditionMatcher(internal::Matcher<Expr> LimitExpr) {
 /// Client code will need to make sure that:
 ///   - The index variable is only used as an array index.
 ///   - All arrays indexed by the loop are the same.
-StatementMatcher makeArrayLoopMatcher() {
+static StatementMatcher makeArrayLoopMatcher() {
   StatementMatcher ArrayBoundMatcher =
       expr(hasType(isInteger())).bind(ConditionBoundName);
 
@@ -155,7 +155,7 @@ StatementMatcher makeArrayLoopMatcher() {
 ///
 /// Client code will need to make sure that:
 ///   - The two containers on which 'begin' and 'end' are called are the same.
-StatementMatcher makeIteratorLoopMatcher(bool IsReverse) {
+static StatementMatcher makeIteratorLoopMatcher(bool IsReverse) {
 
   auto BeginNameMatcher = IsReverse ? hasAnyName("rbegin", "crbegin")
                                     : hasAnyName("begin", "cbegin");
@@ -267,7 +267,7 @@ StatementMatcher makeIteratorLoopMatcher(bool IsReverse) {
 ///   - The index variable is only used in overloaded operator[] or
 ///     container.at().
 ///   - The container's iterators would not be invalidated during the loop.
-StatementMatcher makePseudoArrayLoopMatcher() {
+static StatementMatcher makePseudoArrayLoopMatcher() {
   // Test that the incoming type has a record declaration that has methods
   // called 'begin' and 'end'. If the incoming type is const, then make sure
   // these methods are also marked const.
diff --git a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
index e872759856f3c..0cf59b6e0216a 100644
--- a/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/ModernizeTidyModule.cpp
@@ -136,6 +136,7 @@ static ClangTidyModuleRegistry::Add<ModernizeModule> X("modernize-module",
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the ModernizeModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int ModernizeModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
index 53447c2b960f9..5eebccc366fd5 100644
--- a/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/RedundantVoidArgCheck.cpp
@@ -112,7 +112,8 @@ void RedundantVoidArgCheck::processFunctionDecl(
                              "function declaration");
 }
 
-bool isMacroIdentifier(const IdentifierTable &Idents, const Token &ProtoToken) {
+static bool isMacroIdentifier(const IdentifierTable &Idents,
+                              const Token &ProtoToken) {
   if (!ProtoToken.is(tok::TokenKind::raw_identifier))
     return false;
 
diff --git a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
index 6040cddf0e52a..9e4d184c4b6e1 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseConstraintsCheck.cpp
@@ -279,7 +279,7 @@ findInsertionForConstraint(const FunctionDecl *Function, ASTContext &Context) {
   return Body->getBeginLoc();
 }
 
-bool isPrimaryExpression(const Expr *Expression) {
+static bool isPrimaryExpression(const Expr *Expression) {
   // This function is an incomplete approximation of checking whether
   // an Expr is a primary expression. In particular, if this function
   // returns true, the expression is a primary expression. The converse
diff --git a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp
index 93151024064b4..b361ae4456538 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseEqualsDefaultCheck.cpp
@@ -48,8 +48,8 @@ static std::set<const Type *> getAllDirectBases(const CXXRecordDecl *Record) {
 /// Returns a matcher that matches member expressions where the base is
 /// the variable declared as \p Var and the accessed member is the one declared
 /// as \p Field.
-internal::Matcher<Expr> accessToFieldInVar(const FieldDecl *Field,
-                                           const ValueDecl *Var) {
+static internal::Matcher<Expr> accessToFieldInVar(const FieldDecl *Field,
+                                                  const ValueDecl *Var) {
   return ignoringImpCasts(
       memberExpr(hasObjectExpression(declRefExpr(to(varDecl(equalsNode(Var))))),
                  member(fieldDecl(equalsNode(Field)))));
diff --git a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp
index 2e059f24d47b6..2af67f7ccb4c1 100644
--- a/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp
+++ b/clang-tools-extra/clang-tidy/modernize/UseStartsEndsWithCheck.cpp
@@ -33,6 +33,8 @@ static bool isNegativeComparison(const Expr *ComparisonExpr) {
   return false;
 }
 
+namespace {
+
 struct NotLengthExprForStringNode {
   NotLengthExprForStringNode(std::string ID, DynTypedNode Node,
                              ASTContext *Context)
@@ -91,6 +93,8 @@ AST_MATCHER_P(Expr, lengthExprForStringNode, std::string, ID) {
       ID, DynTypedNode::create(Node), &(Finder->getASTContext())));
 }
 
+} // namespace
+
 UseStartsEndsWithCheck::UseStartsEndsWithCheck(StringRef Name,
                                                ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context) {}
diff --git a/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp b/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp
index 166d71e130db5..67ae101c18cb1 100644
--- a/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/mpi/MPITidyModule.cpp
@@ -31,6 +31,6 @@ static ClangTidyModuleRegistry::Add<mpi::MPIModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the MPIModule.
-volatile int MPIModuleAnchorSource = 0;
+volatile int MPIModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
index 5438c9c892e2e..79e9d97d9594b 100644
--- a/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
+++ b/clang-tools-extra/clang-tidy/objc/NSDateFormatterCheck.cpp
@@ -34,7 +34,7 @@ static char ValidDatePatternChars[] = {
 // A string pattern is valid if all the letters(a-z, A-Z) in it belong to the
 // set of reserved characters. See:
 // https://www.unicode.org/reports/tr35/tr35.html#Invalid_Patterns
-bool isValidDatePattern(StringRef Pattern) {
+static bool isValidDatePattern(StringRef Pattern) {
   return llvm::all_of(Pattern, [](const auto &PatternChar) {
     return !isalpha(PatternChar) ||
            llvm::is_contained(ValidDatePatternChars, PatternChar);
diff --git a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp
index 35113f80870f6..56ccf33a6362a 100644
--- a/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/objc/ObjCTidyModule.cpp
@@ -53,6 +53,6 @@ static ClangTidyModuleRegistry::Add<ObjCModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the ObjCModule.
-volatile int ObjCModuleAnchorSource = 0;
+volatile int ObjCModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp b/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp
index 37092f49b25d6..d9c9d90673408 100644
--- a/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/openmp/OpenMPTidyModule.cpp
@@ -34,6 +34,6 @@ static ClangTidyModuleRegistry::Add<OpenMPModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the OpenMPModule.
-volatile int OpenMPModuleAnchorSource = 0;
+volatile int OpenMPModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp
index 75ef7a697031e..f458e26d964b0 100644
--- a/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp
+++ b/clang-tools-extra/clang-tidy/performance/MoveConstArgCheck.cpp
@@ -78,9 +78,9 @@ void MoveConstArgCheck::registerMatchers(MatchFinder *Finder) {
       this);
 }
 
-bool isRValueReferenceParam(const Expr *Invocation,
-                            const QualType *InvocationParmType,
-                            const Expr *Arg) {
+static bool isRValueReferenceParam(const Expr *Invocation,
+                                   const QualType *InvocationParmType,
+                                   const Expr *Arg) {
   if (Invocation && (*InvocationParmType)->isRValueReferenceType() &&
       Arg->isLValue()) {
     if (!Invocation->getType()->isRecordType())
diff --git a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp
index 9e0fa6f88b36a..10ad9ec6fef4c 100644
--- a/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/performance/PerformanceTidyModule.cpp
@@ -81,6 +81,7 @@ static ClangTidyModuleRegistry::Add<PerformanceModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the PerformanceModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int PerformanceModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp b/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp
index 3caa222b9fa4c..651a63b3aa972 100644
--- a/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp
+++ b/clang-tools-extra/clang-tidy/plugin/ClangTidyPlugin.cpp
@@ -78,6 +78,7 @@ class ClangTidyPluginAction : public PluginASTAction {
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the clang-tidy plugin.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int ClangTidyPluginAnchorSource = 0;
 
 static clang::FrontendPluginRegistry::Add<clang::tidy::ClangTidyPluginAction>
diff --git a/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp
index a15cb36dfdaff..98853556588b3 100644
--- a/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/portability/PortabilityTidyModule.cpp
@@ -42,6 +42,7 @@ static ClangTidyModuleRegistry::Add<PortabilityModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the PortabilityModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int PortabilityModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp
index bf7a847dff103..ce736a8d16023 100644
--- a/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ContainerSizeEmptyCheck.cpp
@@ -16,29 +16,31 @@
 
 using namespace clang::ast_matchers;
 
-namespace clang {
-namespace ast_matchers {
+namespace clang::tidy::readability {
+
+namespace {
 
 AST_POLYMORPHIC_MATCHER_P2(hasAnyArgumentWithParam,
                            AST_POLYMORPHIC_SUPPORTED_TYPES(CallExpr,
                                                            CXXConstructExpr),
-                           internal::Matcher<Expr>, ArgMatcher,
-                           internal::Matcher<ParmVarDecl>, ParamMatcher) {
-  BoundNodesTreeBuilder Result;
+                           ast_matchers::internal::Matcher<Expr>, ArgMatcher,
+                           ast_matchers::internal::Matcher<ParmVarDecl>,
+                           ParamMatcher) {
+  ast_matchers::internal::BoundNodesTreeBuilder Result;
   // The first argument of an overloaded member operator is the implicit object
   // argument of the method which should not be matched against a parameter, so
   // we skip over it here.
-  BoundNodesTreeBuilder Matches;
+  ast_matchers::internal::BoundNodesTreeBuilder Matches;
   unsigned ArgIndex = cxxOperatorCallExpr(callee(cxxMethodDecl()))
                               .matches(Node, Finder, &Matches)
                           ? 1
                           : 0;
   int ParamIndex = 0;
   for (; ArgIndex < Node.getNumArgs(); ++ArgIndex) {
-    BoundNodesTreeBuilder ArgMatches(*Builder);
+    ast_matchers::internal::BoundNodesTreeBuilder ArgMatches(*Builder);
     if (ArgMatcher.matches(*(Node.getArg(ArgIndex)->IgnoreParenCasts()), Finder,
                            &ArgMatches)) {
-      BoundNodesTreeBuilder ParamMatches(ArgMatches);
+      ast_matchers::internal::BoundNodesTreeBuilder ParamMatches(ArgMatches);
       if (expr(anyOf(cxxConstructExpr(hasDeclaration(cxxConstructorDecl(
                          hasParameter(ParamIndex, ParamMatcher)))),
                      callExpr(callee(functionDecl(
@@ -80,9 +82,10 @@ AST_MATCHER(Expr, usedInBooleanContext) {
                      binaryOperator(hasAnyOperatorName("&&", "||")),
                      unaryOperator(hasOperatorName("!")).bind("NegOnSize"))))))
           .matches(Node, Finder, Builder);
-  Builder->removeBindings([ExprName](const BoundNodesMap &Nodes) {
-    return Nodes.getNode(ExprName).getNodeKind().isNone();
-  });
+  Builder->removeBindings(
+      [ExprName](const ast_matchers::internal::BoundNodesMap &Nodes) {
+        return Nodes.getNode(ExprName).getNodeKind().isNone();
+      });
   return Result;
 }
 
@@ -107,8 +110,7 @@ AST_MATCHER_P(UserDefinedLiteral, hasLiteral,
   return false;
 }
 
-} // namespace ast_matchers
-namespace tidy::readability {
+} // namespace
 
 using utils::isBinaryOrTernary;
 
@@ -407,5 +409,4 @@ void ContainerSizeEmptyCheck::check(const MatchFinder::MatchResult &Result) {
       << Container;
 }
 
-} // namespace tidy::readability
-} // namespace clang
+} // namespace clang::tidy::readability
diff --git a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp
index beca824c8c8ce..30df40bda57d8 100644
--- a/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ConvertMemberFunctionsToStatic.cpp
@@ -18,6 +18,8 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::readability {
 
+namespace {
+
 AST_MATCHER(CXXMethodDecl, isStatic) { return Node.isStatic(); }
 
 AST_MATCHER(CXXMethodDecl, hasTrivialBody) { return Node.hasTrivialBody(); }
@@ -74,6 +76,8 @@ AST_MATCHER(CXXMethodDecl, usesThis) {
   return UsageOfThis.Used;
 }
 
+} // namespace
+
 void ConvertMemberFunctionsToStatic::registerMatchers(MatchFinder *Finder) {
   Finder->addMatcher(
       cxxMethodDecl(
diff --git a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp
index d42fcba70e81b..85852c2c829a1 100644
--- a/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/MakeMemberFunctionConstCheck.cpp
@@ -17,6 +17,8 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::readability {
 
+namespace {
+
 AST_MATCHER(CXXMethodDecl, isStatic) { return Node.isStatic(); }
 
 AST_MATCHER(CXXMethodDecl, hasTrivialBody) { return Node.hasTrivialBody(); }
@@ -214,6 +216,8 @@ AST_MATCHER(CXXMethodDecl, usesThisAsConst) {
   return UsageOfThis.Usage == Const;
 }
 
+} // namespace
+
 void MakeMemberFunctionConstCheck::registerMatchers(MatchFinder *Finder) {
   Finder->addMatcher(
       traverse(
diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
index 4c0812f0e6793..d59b0312673b9 100644
--- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp
@@ -186,6 +186,7 @@ static ClangTidyModuleRegistry::Add<ReadabilityModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the ReadabilityModule.
+// NOLINTNEXTLINE(misc-use-internal-linkage)
 volatile int ReadabilityModuleAnchorSource = 0;
 
 } // namespace clang::tidy
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp
index 7850a6f29995f..053892dffc7b6 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/RedundantDeclarationCheck.cpp
@@ -15,10 +15,14 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::readability {
 
+namespace {
+
 AST_MATCHER(FunctionDecl, doesDeclarationForceExternallyVisibleDefinition) {
   return Node.doesDeclarationForceExternallyVisibleDefinition();
 }
 
+} // namespace
+
 RedundantDeclarationCheck::RedundantDeclarationCheck(StringRef Name,
                                                      ClangTidyContext *Context)
     : ClangTidyCheck(Name, Context),
diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp
index 1693e5c5e9cd4..969bf2d1add6b 100644
--- a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp
+++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp
@@ -38,8 +38,8 @@ AST_POLYMORPHIC_MATCHER(isInlineSpecified,
 AST_POLYMORPHIC_MATCHER_P(isInternalLinkage,
                           AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl,
                                                           VarDecl),
-                          bool, strictMode) {
-  if (!strictMode)
+                          bool, StrictMode) {
+  if (!StrictMode)
     return false;
   if (const auto *FD = dyn_cast<FunctionDecl>(&Node))
     return FD->getStorageClass() == SC_Static || FD->isInAnonymousNamespace();
diff --git a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
index dc36db4e0e8d9..4336c723bd7cd 100644
--- a/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
+++ b/clang-tools-extra/clang-tidy/tool/ClangTidyMain.cpp
@@ -447,7 +447,7 @@ createOptionsProvider(llvm::IntrusiveRefCntPtr<vfs::FileSystem> FS) {
       std::move(OverrideOptions), std::move(FS));
 }
 
-llvm::IntrusiveRefCntPtr<vfs::FileSystem>
+static llvm::IntrusiveRefCntPtr<vfs::FileSystem>
 getVfsFromFile(const std::string &OverlayFile,
                llvm::IntrusiveRefCntPtr<vfs::FileSystem> BaseFS) {
   llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Buffer =
diff --git a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp
index 4703ce1269819..22cf23fb2446e 100644
--- a/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp
+++ b/clang-tools-extra/clang-tidy/utils/NamespaceAliaser.cpp
@@ -16,15 +16,16 @@
 namespace clang::tidy::utils {
 
 using namespace ast_matchers;
+namespace {
+AST_MATCHER_P(NamespaceAliasDecl, hasTargetNamespace,
+              ast_matchers::internal::Matcher<NamespaceDecl>, InnerMatcher) {
+  return InnerMatcher.matches(*Node.getNamespace(), Finder, Builder);
+}
+} // namespace
 
 NamespaceAliaser::NamespaceAliaser(const SourceManager &SourceMgr)
     : SourceMgr(SourceMgr) {}
 
-AST_MATCHER_P(NamespaceAliasDecl, hasTargetNamespace,
-              ast_matchers::internal::Matcher<NamespaceDecl>, innerMatcher) {
-  return innerMatcher.matches(*Node.getNamespace(), Finder, Builder);
-}
-
 std::optional<FixItHint>
 NamespaceAliaser::createAlias(ASTContext &Context, const Stmt &Statement,
                               StringRef Namespace,
diff --git a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp
index a40433e38a041..7d84a4a9331b1 100644
--- a/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp
+++ b/clang-tools-extra/clang-tidy/utils/TransformerClangTidyCheck.cpp
@@ -30,7 +30,7 @@ static void verifyRule(const RewriteRuleWith<std::string> &Rule) {
 // If a string unintentionally containing '%' is passed as a diagnostic, Clang
 // will claim the string is ill-formed and assert-fail. This function escapes
 // such strings so they can be safely used in diagnostics.
-std::string escapeForDiagnostic(std::string ToEscape) {
+static std::string escapeForDiagnostic(std::string ToEscape) {
   // Optimize for the common case that the string does not contain `%` at the
   // cost of an extra scan over the string in the slow case.
   auto Pos = ToEscape.find('%');
diff --git a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp
index bb2976081a692..bb2c71913193b 100644
--- a/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp
+++ b/clang-tools-extra/clang-tidy/zircon/TemporaryObjectsCheck.cpp
@@ -17,11 +17,15 @@ using namespace clang::ast_matchers;
 
 namespace clang::tidy::zircon {
 
+namespace {
+
 AST_MATCHER_P(CXXRecordDecl, matchesAnyName, ArrayRef<StringRef>, Names) {
   std::string QualifiedName = Node.getQualifiedNameAsString();
   return llvm::is_contained(Names, QualifiedName);
 }
 
+} // namespace
+
 void TemporaryObjectsCheck::registerMatchers(MatchFinder *Finder) {
   // Matcher for default constructors.
   Finder->addMatcher(
diff --git a/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp b/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp
index 000680455d617..0eb5683a94e41 100644
--- a/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp
+++ b/clang-tools-extra/clang-tidy/zircon/ZirconTidyModule.cpp
@@ -32,6 +32,6 @@ static ClangTidyModuleRegistry::Add<ZirconModule>
 
 // This anchor is used to force the linker to link in the generated object file
 // and thus register the ZirconModule.
-volatile int ZirconModuleAnchorSource = 0;
+volatile int ZirconModuleAnchorSource = 0; // NOLINT(misc-use-internal-linkage)
 
 } // namespace clang::tidy

>From a7f495f170864e6bddc4bb29ae7fae293a7136aa Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Tue, 10 Jun 2025 13:30:31 -0700
Subject: [PATCH 13/61] [lldb] Revive TestSimulatorPlatform.py (#142244)

This test was incorrectly disabled and has bitrotted since then. This PR
fixes up the test and re-enables it.

 - Build against the system libc++ (which can target the simulator).
 - Bump the deployment target for iOS and tvOS on Apple Silicon.
 - Skip backdeploying to pre-Apple Silicon OS versions on Apple Silicon.
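
For context, the heart of the new availability check is a scan of the
"xcrun simctl list -j devices" output. Below is a condensed, standalone
sketch: the helper name simulator_runtime_available is made up for
illustration, and it assumes the common JSON shape where "devices" maps
runtime identifiers to device lists; the actual decorator in the hunk
below also handles an older list-of-dicts shape.

  import json
  import subprocess

  def simulator_runtime_available(platform_os):
      """Return True if an available simulator device exists whose
      runtime name contains platform_os ('ios', 'tvos' or 'watchos')."""
      out = subprocess.check_output(
          ["xcrun", "simctl", "list", "-j", "devices"]
      ).decode("utf-8")
      for runtime, devices in json.loads(out)["devices"].items():
          if platform_os not in runtime.lower():
              continue
          for device in devices:
              # Older simctl reports 'availability'; newer 'isAvailable'.
              if (device.get("availability") == "(available)"
                      or device.get("isAvailable")):
                  return True
      return False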
---
 .../Python/lldbsuite/test/decorators.py       | 54 +++++++++++++++++--
 .../macosx/simulator/TestSimulatorPlatform.py | 12 +++--
 2 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py
index 868e9f7e5eca0..a391319ca9b0e 100644
--- a/lldb/packages/Python/lldbsuite/test/decorators.py
+++ b/lldb/packages/Python/lldbsuite/test/decorators.py
@@ -9,6 +9,7 @@
 import sys
 import tempfile
 import subprocess
+import json
 
 # Third-party modules
 import unittest
@@ -451,24 +452,67 @@ def apple_simulator_test(platform):
     """
     Decorate the test as a test requiring a simulator for a specific platform.
 
-    Consider that a simulator is available if you have the corresponding SDK installed.
-    The SDK identifiers for simulators are iphonesimulator, appletvsimulator, watchsimulator
+    Consider that a simulator is available if you have the corresponding SDK
+    and runtime installed.
+
+    The SDK identifiers for simulators are iphonesimulator, appletvsimulator,
+    watchsimulator
     """
 
     def should_skip_simulator_test():
         if lldbplatformutil.getHostPlatform() not in ["darwin", "macosx"]:
             return "simulator tests are run only on darwin hosts."
+
+        # Make sure we recognize the platform.
+        mapping = {
+            "iphone": "ios",
+            "appletv": "tvos",
+            "watch": "watchos",
+        }
+        if platform not in mapping:
+            return "unknown simulator platform: {}".format(platform)
+
+        # Make sure we have an SDK.
         try:
             output = subprocess.check_output(
                 ["xcodebuild", "-showsdks"], stderr=subprocess.DEVNULL
             ).decode("utf-8")
-            if re.search("%ssimulator" % platform, output):
-                return None
-            else:
+            if not re.search("%ssimulator" % platform, output):
                 return "%s simulator is not supported on this system." % platform
         except subprocess.CalledProcessError:
             return "Simulators are unsupported on this system (xcodebuild failed)"
 
+        # Make sure we have a simulator runtime.
+        try:
+            sim_devices_str = subprocess.check_output(
+                ["xcrun", "simctl", "list", "-j", "devices"]
+            ).decode("utf-8")
+
+            sim_devices = json.loads(sim_devices_str)["devices"]
+            for simulator in sim_devices:
+                if isinstance(simulator, dict):
+                    runtime = simulator["name"]
+                    devices = simulator["devices"]
+                else:
+                    runtime = simulator
+                    devices = sim_devices[simulator]
+
+                if not mapping[platform] in runtime.lower():
+                    continue
+
+                for device in devices:
+                    if (
+                        "availability" in device
+                        and device["availability"] == "(available)"
+                    ):
+                        return None
+                    if "isAvailable" in device and device["isAvailable"]:
+                        return None
+
+            return "{} simulator is not supported on this system.".format(platform)
+        except (subprocess.CalledProcessError, json.decoder.JSONDecodeError):
+            return "Simulators are unsupported on this system (simctl failed)"
+
     return skipTestIfFn(should_skip_simulator_test)
 
 
diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
index faf2256b03a0d..74ba0ee6c83bb 100644
--- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
+++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py
@@ -39,15 +39,15 @@ def check_debugserver(self, log, expected_platform, expected_version):
         if expected_version:
             self.assertEqual(aout_info["min_version_os_sdk"], expected_version)
 
-    @skipIf(bugnumber="rdar://76995109")
     def run_with(self, arch, os, vers, env, expected_load_command):
         env_list = [env] if env else []
         triple = "-".join([arch, "apple", os + vers] + env_list)
         sdk = lldbutil.get_xcode_sdk(os, env)
 
-        version_min = ""
         if not vers:
             vers = lldbutil.get_xcode_sdk_version(sdk)
+
+        version_min = ""
         if env == "simulator":
             version_min = "-m{}-simulator-version-min={}".format(os, vers)
         elif os == "macosx":
@@ -56,11 +56,14 @@ def run_with(self, arch, os, vers, env, expected_load_command):
         sdk_root = lldbutil.get_xcode_sdk_root(sdk)
         clang = lldbutil.get_xcode_clang(sdk)
 
+        print(triple)
+
         self.build(
             dictionary={
                 "ARCH": arch,
                 "ARCH_CFLAGS": "-target {} {}".format(triple, version_min),
                 "SDKROOT": sdk_root,
+                "USE_SYSTEM_STDLIB": 1,
             },
             compiler=clang,
         )
@@ -146,6 +149,7 @@ def test_watchos_armv7k(self):
 
     @skipUnlessDarwin
     @skipIfDarwinEmbedded
+    @skipIf(archs=["arm64", "arm64e"])
     def test_lc_version_min_macosx(self):
         """Test running a back-deploying non-simulator MacOS X binary"""
         self.run_with(
@@ -198,7 +202,7 @@ def test_ios_backdeploy_apple_silicon(self):
         self.run_with(
             arch=self.getArchitecture(),
             os="ios",
-            vers="11.0",
+            vers="14.0",
             env="simulator",
             expected_load_command="LC_BUILD_VERSION",
         )
@@ -229,7 +233,7 @@ def test_tvos_backdeploy_apple_silicon(self):
         self.run_with(
             arch=self.getArchitecture(),
             os="tvos",
-            vers="11.0",
+            vers="14.0",
             env="simulator",
             expected_load_command="LC_BUILD_VERSION",
         )

>From d7282c56cd294a2eb4890e50c84e6eae6f7c6671 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= <martin at martin.st>
Date: Tue, 10 Jun 2025 23:34:26 +0300
Subject: [PATCH 14/61] [llvm-rc] Add support for multiplication and division
 in expressions (#143373)

This is supported by GNU windres. MS rc.exe does accept these
expressions, but doesn't evaluate them correctly; it only returns the
left-hand side.

This fixes one aspect of
https://github.com/llvm/llvm-project/issues/143157.
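
To make the precedence change concrete, here is a tiny standalone
evaluator re-rendering the new three-level grammar in Python (an
illustrative sketch only, not the RCParser implementation changed
below; the Expr class and method names are made up). Division is
truncating and both binary levels associate left-to-right, which is
what the new parser-expr.rc cases below rely on.

  import re

  class Expr:
      """Tiny evaluator for the new three-level grammar:
         Exp1 ::= Exp2 { ('+'|'-'|'|'|'&') Exp2 }
         Exp2 ::= Exp3 { ('*'|'/') Exp3 }        # the new, tighter level
         Exp3 ::= '-' Exp3 | '~' Exp3 | Int | '(' Exp1 ')'"""

      def __init__(self, text):
          self.toks = re.findall(r"\d+|\S", text)
          self.pos = 0

      def peek(self):
          return self.toks[self.pos] if self.pos < len(self.toks) else None

      def next(self):
          tok = self.peek()
          self.pos += 1
          return tok

      def exp1(self):
          result = self.exp2()
          while self.peek() in ("+", "-", "|", "&"):
              op, rhs = self.next(), self.exp2()
              if op == "+":   result += rhs
              elif op == "-": result -= rhs
              elif op == "|": result |= rhs
              else:           result &= rhs
          return result

      def exp2(self):
          result = self.exp3()
          while self.peek() in ("*", "/"):
              op, rhs = self.next(), self.exp3()
              # '//' stands in for C's truncating division; operands
              # here are non-negative, so floor == truncate.
              result = result * rhs if op == "*" else result // rhs
          return result

      def exp3(self):
          tok = self.next()
          if tok == "-":
              return -self.exp3()
          if tok == "~":
              return ~self.exp3()
          if tok == "(":
              result = self.exp1()
              self.next()  # consume ')'
              return result
          return int(tok)

  assert Expr("100/12/5*5").exp1() == 5  # left-associative, truncating
  assert Expr("1+2*3").exp1() == 7       # '*' binds tighter than '+'
  assert Expr("9/(1+3)").exp1() == 2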
---
 llvm/test/tools/llvm-rc/Inputs/parser-expr.rc |  5 ++
 llvm/test/tools/llvm-rc/Inputs/tokens.rc      |  1 +
 llvm/test/tools/llvm-rc/parser-expr.test      |  5 ++
 llvm/test/tools/llvm-rc/tokenizer.test        |  5 ++
 llvm/tools/llvm-rc/ResourceScriptParser.cpp   | 49 ++++++++++++++-----
 llvm/tools/llvm-rc/ResourceScriptParser.h     |  1 +
 llvm/tools/llvm-rc/ResourceScriptStmt.h       | 24 +++++++++
 llvm/tools/llvm-rc/ResourceScriptToken.cpp    | 12 ++++-
 llvm/tools/llvm-rc/ResourceScriptToken.h      |  7 ++-
 .../tools/llvm-rc/ResourceScriptTokenList.def |  2 +
 10 files changed, 97 insertions(+), 14 deletions(-)

diff --git a/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc b/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc
index 8e69c1cd1fa16..2f8e4b2d344a0 100644
--- a/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc
+++ b/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc
@@ -5,6 +5,11 @@ LANGUAGE 1|1&0, 0&0|1
 LANGUAGE 3+4-5, 3-4+5
 LANGUAGE 1+2|3, 3|1+2
 LANGUAGE 6&~5, 6&-8
+LANGUAGE 7/3, 7*3
+LANGUAGE 5/2*2, 5*3/2
+LANGUAGE 1+2*3, (1+2)*3
+LANGUAGE 100/12/5*5, 1+1+1+1*4
+LANGUAGE 9/(1+3), (4+5)/4
 LANGUAGE -1, --1
 LANGUAGE ----1, -----1
 LANGUAGE ~1, ~~1
diff --git a/llvm/test/tools/llvm-rc/Inputs/tokens.rc b/llvm/test/tools/llvm-rc/Inputs/tokens.rc
index 6a781202a7e37..20f77912477d9 100644
--- a/llvm/test/tools/llvm-rc/Inputs/tokens.rc
+++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc
@@ -1,4 +1,5 @@
 1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End
+1*3/4
 He11o LLVM
 identifier-with-dashes
 
diff --git a/llvm/test/tools/llvm-rc/parser-expr.test b/llvm/test/tools/llvm-rc/parser-expr.test
index ed6796529fdfa..14a299c9e3e96 100644
--- a/llvm/test/tools/llvm-rc/parser-expr.test
+++ b/llvm/test/tools/llvm-rc/parser-expr.test
@@ -7,6 +7,11 @@
 ; CHECK-NEXT:  Language: 2, Sublanguage: 4
 ; CHECK-NEXT:  Language: 3, Sublanguage: 5
 ; CHECK-NEXT:  Language: 2, Sublanguage: 0
+; CHECK-NEXT:  Language: 2, Sublanguage: 21
+; CHECK-NEXT:  Language: 4, Sublanguage: 7
+; CHECK-NEXT:  Language: 7, Sublanguage: 9
+; CHECK-NEXT:  Language: 5, Sublanguage: 7
+; CHECK-NEXT:  Language: 2, Sublanguage: 2
 ; CHECK-NEXT:  Language: 4294967295, Sublanguage: 1
 ; CHECK-NEXT:  Language: 1, Sublanguage: 4294967295
 ; CHECK-NEXT:  Language: 4294967294, Sublanguage: 1
diff --git a/llvm/test/tools/llvm-rc/tokenizer.test b/llvm/test/tools/llvm-rc/tokenizer.test
index 8486f8bd78690..3062e2bf64629 100644
--- a/llvm/test/tools/llvm-rc/tokenizer.test
+++ b/llvm/test/tools/llvm-rc/tokenizer.test
@@ -25,6 +25,11 @@
 ; CHECK-NEXT:  BlockEnd: }
 ; CHECK-NEXT:  BlockBegin: Begin
 ; CHECK-NEXT:  BlockEnd: End
+; CHECK-NEXT:  Int: 1; int value = 1
+; CHECK-NEXT:  Asterisk: *
+; CHECK-NEXT:  Int: 3; int value = 3
+; CHECK-NEXT:  Slash: /
+; CHECK-NEXT:  Int: 4; int value = 4
 ; CHECK-NEXT:  Identifier: He11o
 ; CHECK-NEXT:  Identifier: LLVM
 ; CHECK-NEXT:  Identifier: identifier-with-dashes
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
index 69798152c1f25..e4efc83c933b4 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp
@@ -132,12 +132,13 @@ void RCParser::consume() {
 //
 // The following grammar is used to parse the expressions Exp1:
 //   Exp1 ::= Exp2 || Exp1 + Exp2 || Exp1 - Exp2 || Exp1 | Exp2 || Exp1 & Exp2
-//   Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1).
-// (More conveniently, Exp1 is a non-empty sequence of Exp2 expressions,
-// separated by binary operators.)
+//   Exp2 ::= Exp3 || Exp3 * Exp3 || Exp3 / Exp3
+//   Exp3 ::= -Exp3 || ~Exp3 || not Exp3 || Int || (Exp1)
+// (More conveniently, Exp1 and Exp2 are non-empty sequences of Exp3
+// expressions, separated by binary operators.)
 //
-// Expressions of type Exp1 are read by parseIntExpr1(Inner) method, while Exp2
-// is read by parseIntExpr2().
+// Expressions of type Exp1 are read by the parseIntExpr1(Inner) method, Exp2
+// is read by parseIntExpr2(), and Exp3 by parseIntExpr3().
 //
 // The original Microsoft tool handles multiple unary operators incorrectly.
 // For example, in 16-bit little-endian integers:
@@ -158,7 +159,7 @@ Expected<IntWithNotMask> RCParser::parseIntExpr1() {
   ASSIGN_OR_RETURN(FirstResult, parseIntExpr2());
   IntWithNotMask Result = *FirstResult;
 
-  while (!isEof() && look().isBinaryOp()) {
+  while (!isEof() && look().isLowPrecedenceBinaryOp()) {
     auto OpToken = read();
     ASSIGN_OR_RETURN(NextResult, parseIntExpr2());
 
@@ -180,7 +181,7 @@ Expected<IntWithNotMask> RCParser::parseIntExpr1() {
       break;
 
     default:
-      llvm_unreachable("Already processed all binary ops.");
+      llvm_unreachable("Already processed all low precedence binary ops.");
     }
   }
 
@@ -188,7 +189,33 @@ Expected<IntWithNotMask> RCParser::parseIntExpr1() {
 }
 
 Expected<IntWithNotMask> RCParser::parseIntExpr2() {
-  // Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1).
+  // Exp2 ::= Exp3 || Exp3 * Exp3 || Exp3 / Exp3.
+  ASSIGN_OR_RETURN(FirstResult, parseIntExpr3());
+  IntWithNotMask Result = *FirstResult;
+
+  while (!isEof() && look().isHighPrecedenceBinaryOp()) {
+    auto OpToken = read();
+    ASSIGN_OR_RETURN(NextResult, parseIntExpr3());
+
+    switch (OpToken.kind()) {
+    case Kind::Asterisk:
+      Result *= *NextResult;
+      break;
+
+    case Kind::Slash:
+      Result /= *NextResult;
+      break;
+
+    default:
+      llvm_unreachable("Already processed all high precedence binary ops.");
+    }
+  }
+
+  return Result;
+}
+
+Expected<IntWithNotMask> RCParser::parseIntExpr3() {
+  // Exp3 ::= -Exp3 || ~Exp3 || not Exp3 || Int || (Exp1).
   static const char ErrorMsg[] = "'-', '~', integer or '('";
 
   if (isEof())
@@ -197,13 +224,13 @@ Expected<IntWithNotMask> RCParser::parseIntExpr2() {
   switch (look().kind()) {
   case Kind::Minus: {
     consume();
-    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    ASSIGN_OR_RETURN(Result, parseIntExpr3());
     return -(*Result);
   }
 
   case Kind::Tilde: {
     consume();
-    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    ASSIGN_OR_RETURN(Result, parseIntExpr3());
     return ~(*Result);
   }
 
@@ -220,7 +247,7 @@ Expected<IntWithNotMask> RCParser::parseIntExpr2() {
   case Kind::Identifier: {
     if (!read().value().equals_insensitive("not"))
       return getExpectedError(ErrorMsg, true);
-    ASSIGN_OR_RETURN(Result, parseIntExpr2());
+    ASSIGN_OR_RETURN(Result, parseIntExpr3());
     return IntWithNotMask(0, (*Result).getValue());
   }
 
diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h
index aa7f847187c49..1e7618c84142e 100644
--- a/llvm/tools/llvm-rc/ResourceScriptParser.h
+++ b/llvm/tools/llvm-rc/ResourceScriptParser.h
@@ -88,6 +88,7 @@ class RCParser {
   // Helper integer expression parsing methods.
   Expected<IntWithNotMask> parseIntExpr1();
   Expected<IntWithNotMask> parseIntExpr2();
+  Expected<IntWithNotMask> parseIntExpr3();
 
   // Advance the state by one, discarding the current token.
   // If the discarded token had an incorrect type, fail.
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h
index 8f099202c0b47..a81e384fda365 100644
--- a/llvm/tools/llvm-rc/ResourceScriptStmt.h
+++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h
@@ -49,6 +49,16 @@ class RCInt {
     return *this;
   }
 
+  RCInt &operator*=(const RCInt &Rhs) {
+    std::tie(Val, Long) = std::make_pair(Val * Rhs.Val, Long | Rhs.Long);
+    return *this;
+  }
+
+  RCInt &operator/=(const RCInt &Rhs) {
+    std::tie(Val, Long) = std::make_pair(Val / Rhs.Val, Long | Rhs.Long);
+    return *this;
+  }
+
   RCInt &operator|=(const RCInt &Rhs) {
     std::tie(Val, Long) = std::make_pair(Val | Rhs.Val, Long | Rhs.Long);
     return *this;
@@ -98,6 +108,20 @@ class IntWithNotMask {
     return *this;
   }
 
+  IntWithNotMask &operator*=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value *= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
+  IntWithNotMask &operator/=(const IntWithNotMask &Rhs) {
+    Value &= ~Rhs.NotMask;
+    Value /= Rhs.Value;
+    NotMask |= Rhs.NotMask;
+    return *this;
+  }
+
   IntWithNotMask &operator|=(const IntWithNotMask &Rhs) {
     Value &= ~Rhs.NotMask;
     Value |= Rhs.Value;
diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.cpp b/llvm/tools/llvm-rc/ResourceScriptToken.cpp
index aad1060c4a381..0070037e63e6a 100644
--- a/llvm/tools/llvm-rc/ResourceScriptToken.cpp
+++ b/llvm/tools/llvm-rc/ResourceScriptToken.cpp
@@ -64,7 +64,7 @@ StringRef RCToken::value() const { return TokenValue; }
 
 Kind RCToken::kind() const { return TokenKind; }
 
-bool RCToken::isBinaryOp() const {
+bool RCToken::isLowPrecedenceBinaryOp() const {
   switch (TokenKind) {
   case Kind::Plus:
   case Kind::Minus:
@@ -76,6 +76,16 @@ bool RCToken::isBinaryOp() const {
   }
 }
 
+bool RCToken::isHighPrecedenceBinaryOp() const {
+  switch (TokenKind) {
+  case Kind::Asterisk:
+  case Kind::Slash:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static Error getStringError(const Twine &message) {
   return make_error<StringError>("Error parsing file: " + message,
                                  inconvertibleErrorCode());
diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.h b/llvm/tools/llvm-rc/ResourceScriptToken.h
index 29f7502f89efd..3dcdfafd2d576 100644
--- a/llvm/tools/llvm-rc/ResourceScriptToken.h
+++ b/llvm/tools/llvm-rc/ResourceScriptToken.h
@@ -56,8 +56,11 @@ class RCToken {
   StringRef value() const;
   Kind kind() const;
 
-  // Check if a token describes a binary operator.
-  bool isBinaryOp() const;
+  // Check if a token describes a low precedence binary operator.
+  bool isLowPrecedenceBinaryOp() const;
+
+  // Check if a token describes a high precedence binary operator.
+  bool isHighPrecedenceBinaryOp() const;
 
 private:
   Kind TokenKind;
diff --git a/llvm/tools/llvm-rc/ResourceScriptTokenList.def b/llvm/tools/llvm-rc/ResourceScriptTokenList.def
index a61a96461f0fb..6ee13b2815d35 100644
--- a/llvm/tools/llvm-rc/ResourceScriptTokenList.def
+++ b/llvm/tools/llvm-rc/ResourceScriptTokenList.def
@@ -29,6 +29,8 @@ SHORT_TOKEN(BlockEnd, '}')     // End of the block; can also be END.
 SHORT_TOKEN(Comma, ',')        // Comma - resource arguments separator.
 SHORT_TOKEN(Plus, '+')         // Addition operator.
 SHORT_TOKEN(Minus, '-')        // Subtraction operator.
+SHORT_TOKEN(Asterisk, '*')     // Multiplication operator.
+SHORT_TOKEN(Slash, '/')        // Division operator.
 SHORT_TOKEN(Pipe, '|')         // Bitwise-OR operator.
 SHORT_TOKEN(Amp, '&')          // Bitwise-AND operator.
 SHORT_TOKEN(Tilde, '~')        // Bitwise-NOT operator.
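
For illustration (expressions invented here, not taken from the tests above):
with the new Exp2 level, '*' and '/' bind tighter than the low precedence
operators handled by Exp1, and parentheses reach back to Exp1 through Exp3,
so llvm-rc now evaluates:

  1 + 2 * 3    => 7    (* parsed at Exp2, + at Exp1)
  7 - 6 / 3    => 5    (integer division at Exp2)
  (1 + 2) * 3  => 9    ((Exp1) handled at Exp3)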

>From 62b3e89afc54a118d597a27185f6915a68e408a0 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Tue, 10 Jun 2025 21:37:03 +0100
Subject: [PATCH 15/61] [LV] Remove unused LoopBypassBlocks from ILV (NFC).

After recent refactorings to move parts of skeleton creation,
LoopBypassBlocks isn't used any more. Remove it.
---
 .../lib/Transforms/Vectorize/LoopVectorize.cpp | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 333e50ee98418..427c1460fcfc9 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -611,9 +611,6 @@ class InnerLoopVectorizer {
   /// Middle Block between the vector and the scalar.
   BasicBlock *LoopMiddleBlock = nullptr;
 
-  /// A list of all bypass blocks. The first block is the entry of the loop.
-  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
-
   /// Trip count of the original loop.
   Value *TripCount = nullptr;
 
@@ -2445,7 +2442,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
     setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
-  LoopBypassBlocks.push_back(TCCheckBlock);
 
   assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
              TCCheckBlock &&
@@ -2461,9 +2457,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
   assert((!Cost->OptForSize ||
           Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
          "Cannot SCEV check stride or overflow when optimizing for size");
-  assert(!LoopBypassBlocks.empty() &&
-         "Should already be a bypass block due to iteration count check");
-  LoopBypassBlocks.push_back(SCEVCheckBlock);
   AddedSafetyChecks = true;
 
   introduceCheckBlockInVPlan(SCEVCheckBlock);
@@ -2499,7 +2492,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
     });
   }
 
-  LoopBypassBlocks.push_back(MemCheckBlock);
 
   AddedSafetyChecks = true;
 
@@ -7557,8 +7549,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                    nullptr, "vector.ph");
 
   if (ForEpilogue) {
-    LoopBypassBlocks.push_back(TCCheckBlock);
-
     // Save the trip count so we don't have to regenerate it in the
     // vec.epilog.iter.check. This is safe to do because the trip count
     // generated here dominates the vector epilog iter check.
@@ -7619,13 +7609,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
 
   DT->changeImmediateDominator(LoopScalarPreHeader,
                                EPI.EpilogueIterationCountCheck);
-  // Keep track of bypass blocks, as they feed start values to the induction and
-  // reduction phis in the scalar loop preheader.
-  if (EPI.SCEVSafetyCheck)
-    LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
-  if (EPI.MemSafetyCheck)
-    LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
-  LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
 
   // The vec.epilog.iter.check block may contain Phi nodes from inductions or
   // reductions which merge control-flow from the latch block and the middle
@@ -7696,7 +7679,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
     setBranchWeights(BI, Weights, /*IsExpected=*/false);
   }
   ReplaceInstWithInst(Insert->getTerminator(), &BI);
-  LoopBypassBlocks.push_back(Insert);
 
   // A new entry block has been created for the epilogue VPlan. Hook it in, as
   // otherwise we would try to modify the entry to the main vector loop.

>From 830a74092adafa425db05e1c5120d3294f874777 Mon Sep 17 00:00:00 2001
From: Tomohiro Kashiwada <kikairoya at gmail.com>
Date: Wed, 11 Jun 2025 05:42:36 +0900
Subject: [PATCH 16/61] [Clang] [Cygwin] va_list must be treated like normal
 Windows (#143115)

Handling of va_list in the Cygwin environment must match the normal
Windows environment.

The existing test `test/CodeGen/ms_abi.c` seems relevant, but it
contains `__attribute__((sysv_abi))`, which is not supported on Cygwin.
The new test is based on the `__attribute__((ms_abi))` portion of that
test.

---------

Co-authored-by: jeremyd2019 <github at jdrake.com>
---
 clang/lib/Basic/Targets/X86.h           |  4 +++
 clang/test/CodeGen/X86/cygwin-varargs.c | 35 +++++++++++++++++++++++++
 2 files changed, 39 insertions(+)
 create mode 100644 clang/test/CodeGen/X86/cygwin-varargs.c

diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index 6f8a2365be256..ecb31ffa4750f 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -997,6 +997,10 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_64TargetInfo : public X86_64TargetInfo {
     if (Opts.CPlusPlus)
       Builder.defineMacro("_GNU_SOURCE");
   }
+
+  BuiltinVaListKind getBuiltinVaListKind() const override {
+    return TargetInfo::CharPtrBuiltinVaList;
+  }
 };
 
 class LLVM_LIBRARY_VISIBILITY DarwinX86_64TargetInfo
diff --git a/clang/test/CodeGen/X86/cygwin-varargs.c b/clang/test/CodeGen/X86/cygwin-varargs.c
new file mode 100644
index 0000000000000..4eea7d64bcb35
--- /dev/null
+++ b/clang/test/CodeGen/X86/cygwin-varargs.c
@@ -0,0 +1,35 @@
+// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm < %s | FileCheck %s
+// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm < %s | FileCheck %s
+
+struct foo {
+  int x;
+  float y;
+  char z;
+};
+// CHECK: %[[STRUCT_FOO:.*]] = type { i32, float, i8 }
+
+void f(int a, ...) {
+  // CHECK-LABEL: define dso_local void @f
+  __builtin_va_list ap;
+  __builtin_va_start(ap, a);
+  // CHECK: %[[AP:.*]] = alloca ptr
+  // CHECK: call void @llvm.va_start
+  int b = __builtin_va_arg(ap, int);
+  // CHECK: %[[AP_CUR:.*]] = load ptr, ptr %[[AP]]
+  // CHECK-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR]], i64 8
+  // CHECK-NEXT: store ptr %[[AP_NEXT]], ptr %[[AP]]
+  double _Complex c = __builtin_va_arg(ap, double _Complex);
+  // CHECK: %[[AP_CUR2:.*]] = load ptr, ptr %[[AP]]
+  // CHECK-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR2]], i64 8
+  // CHECK-NEXT: store ptr %[[AP_NEXT2]], ptr %[[AP]]
+  // CHECK-NEXT: load ptr, ptr %[[AP_CUR2]]
+  struct foo d = __builtin_va_arg(ap, struct foo);
+  // CHECK: %[[AP_CUR3:.*]] = load ptr, ptr %[[AP]]
+  // CHECK-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR3]], i64 8
+  // CHECK-NEXT: store ptr %[[AP_NEXT3]], ptr %[[AP]]
+  __builtin_va_list ap2;
+  __builtin_va_copy(ap2, ap);
+  // CHECK: call void @llvm.va_copy
+  __builtin_va_end(ap);
+  // CHECK: call void @llvm.va_end
+}

>From 13ccce28776d8ad27b0c6a92b5a452d62da05663 Mon Sep 17 00:00:00 2001
From: Sumanth Gundapaneni <sumanth.gundapaneni at amd.com>
Date: Tue, 10 Jun 2025 15:46:27 -0500
Subject: [PATCH 17/61] [SeparateConstOffsetFromGEP] Decompose constant xor
 operand if possible (#135788)

Try to transform XOR(A, B+C) into XOR(A,C) + B, where XOR(A,C) becomes
the base for memory operations. This transformation is valid under the
following conditions:
Check 1 - B and C are disjoint.
Check 2 - XOR(A,C) and B are disjoint.

This transformation is particularly beneficial for GEPs because disjoint
OR operations often map better to addressing modes than XOR. This can
enable further optimizations in the GEP offset folding pipeline.
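
An illustrative arithmetic check of the two conditions (a sketch for this
message only, not part of the patch; the constants come from the tests
below): with %base = and i64 %input, -8192, the low 13 bits of %base are
zero, and for %addr2 = xor i64 %base, 2080 the constant splits as
2080 = 2048 + 32 (B = 2048, C = 32):

  // Standalone check that base ^ 2080 == (base ^ 32) | 2048 whenever the
  // low 13 bits of base are clear; an OR of disjoint bits equals a "+".
  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint64_t input = 0; input < (1u << 16); ++input) {
      uint64_t base = input & ~uint64_t{8191};       // clear low 13 bits
      assert((2048 & 32) == 0);                      // Check 1: B, C disjoint
      assert(((base ^ 32) & 2048) == 0);             // Check 2: XOR(A,C), B disjoint
      assert((base ^ 2080) == ((base ^ 32) | 2048)); // the rewrite is exact
    }
  }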
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 193 +++++++++++++++++
 .../AMDGPU/xor-to-or-disjoint.ll              | 204 ++++++++++++++++++
 2 files changed, 397 insertions(+)
 create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b79203c0b3..6fae9f1dd2404 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,6 +174,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -190,6 +191,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
@@ -198,6 +200,8 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+#define DEBUG_TYPE "separate-offset-gep"
+
 static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
@@ -488,6 +492,42 @@ class SeparateConstOffsetFromGEP {
   DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
+/// A helper class that aims to convert xor operations into or operations when
+/// their operands are disjoint and the result is used in a GEP's index. This
+/// can then enable further GEP optimizations by effectively turning BaseVal |
+/// Const into BaseVal + Const when they are disjoint, which
+/// SeparateConstOffsetFromGEP can then process. This is a common pattern that
+/// sets up a grid of memory accesses across a wave where each thread accesses
+/// data at various offsets.
+class XorToOrDisjointTransformer {
+public:
+  XorToOrDisjointTransformer(Function &F, DominatorTree &DT,
+                             const DataLayout &DL)
+      : F(F), DT(DT), DL(DL) {}
+
+  bool run();
+
+private:
+  Function &F;
+  DominatorTree &DT;
+  const DataLayout &DL;
+  /// Maps a common operand to all Xor instructions
+  using XorOpList = SmallVector<std::pair<BinaryOperator *, APInt>, 8>;
+  using XorBaseValInst = DenseMap<Instruction *, XorOpList>;
+  XorBaseValInst XorGroups;
+
+  /// Checks if the given value has at least one GetElementPtr user
+  static bool hasGEPUser(const Value *V);
+
+  /// Helper function to check if BaseXor dominates all XORs in the group
+  bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup);
+
+  /// Processes a group of XOR instructions that share the same non-constant
+  /// base operand. Returns true if this group's processing modified the
+  /// function.
+  bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup);
+};
+
 } // end anonymous namespace
 
 char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
@@ -1223,6 +1263,154 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   return true;
 }
 
+// Helper function to check if an instruction has at least one GEP user
+bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) {
+  return llvm::any_of(V->users(), [](const User *U) {
+    return isa<llvm::GetElementPtrInst>(U);
+  });
+}
+
+bool XorToOrDisjointTransformer::dominatesAllXors(
+    BinaryOperator *BaseXor, const XorOpList &XorsInGroup) {
+  return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) {
+    BinaryOperator *XorInst = XorEntry.first;
+    // Do not evaluate the BaseXor, otherwise we end up cloning it.
+    return XorInst == BaseXor || DT.dominates(BaseXor, XorInst);
+  });
+}
+
+bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst,
+                                                 XorOpList &XorsInGroup) {
+  bool Changed = false;
+  if (XorsInGroup.size() <= 1)
+    return false;
+
+  // Sort XorsInGroup by the constant offset value in increasing order.
+  llvm::sort(XorsInGroup, [](const auto &A, const auto &B) {
+    return A.second.slt(B.second);
+  });
+
+  // Dominance check
+  // The "base" XOR for dominance purposes is the one with the smallest
+  // constant.
+  BinaryOperator *XorWithSmallConst = XorsInGroup[0].first;
+
+  if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE
+                      << ": Cloning and inserting XOR with smallest constant ("
+                      << *XorWithSmallConst
+                      << ") as it does not dominate all other XORs"
+                      << " in function " << F.getName() << "\n");
+
+    BinaryOperator *ClonedXor =
+        cast<BinaryOperator>(XorWithSmallConst->clone());
+    ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone");
+    ClonedXor->insertAfter(OriginalBaseInst);
+    LLVM_DEBUG(dbgs() << "  Cloned Inst: " << *ClonedXor << "\n");
+    Changed = true;
+    XorWithSmallConst = ClonedXor;
+  }
+
+  SmallVector<Instruction *, 8> InstructionsToErase;
+  const APInt SmallestConst =
+      cast<ConstantInt>(XorWithSmallConst->getOperand(1))->getValue();
+
+  // Main transformation loop: Iterate over the original XORs in the sorted
+  // group.
+  for (const auto &XorEntry : XorsInGroup) {
+    BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction
+    const APInt ConstOffsetVal = XorEntry.second;
+
+    // Do not process the one with smallest constant as it is the base.
+    if (XorInst == XorWithSmallConst)
+      continue;
+
+    // Disjointness Check 1
+    APInt NewConstVal = ConstOffsetVal - SmallestConst;
+    if ((NewConstVal & SmallestConst) != 0) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function "
+                        << F.getName() << ":\n"
+                        << "  New Const: " << NewConstVal
+                        << "  Smallest Const: " << SmallestConst
+                        << "  are not disjoint \n");
+      continue;
+    }
+
+    // Disjointness Check 2
+    if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL),
+                          0)) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE
+                        << ": Transforming XOR to OR (disjoint) in function "
+                        << F.getName() << ":\n"
+                        << "  Xor: " << *XorInst << "\n"
+                        << "  Base Val: " << *XorWithSmallConst << "\n"
+                        << "  New Const: " << NewConstVal << "\n");
+
+      auto *NewOrInst = BinaryOperator::CreateDisjointOr(
+          XorWithSmallConst,
+          ConstantInt::get(OriginalBaseInst->getType(), NewConstVal),
+          XorInst->getName() + ".or_disjoint", XorInst->getIterator());
+
+      NewOrInst->copyMetadata(*XorInst);
+      XorInst->replaceAllUsesWith(NewOrInst);
+      LLVM_DEBUG(dbgs() << "  New Inst: " << *NewOrInst << "\n");
+      InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion
+
+      Changed = true;
+    } else {
+      LLVM_DEBUG(
+          dbgs() << DEBUG_TYPE
+                 << ": Cannot transform XOR (not proven disjoint) in function "
+                 << F.getName() << ":\n"
+                 << "  Xor: " << *XorInst << "\n"
+                 << "  Base Val: " << *XorWithSmallConst << "\n"
+                 << "  New Const: " << NewConstVal << "\n");
+    }
+  }
+
+  for (Instruction *I : InstructionsToErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+// Try to transform XOR(A, B+C) into XOR(A,C) + B where XOR(A,C) becomes
+// the base for memory operations. This transformation is valid under the
+// following conditions:
+// Check 1 - B and C are disjoint.
+// Check 2 - XOR(A,C) and B are disjoint.
+//
+// This transformation is beneficial particularly for GEPs because:
+// 1. OR operations often map better to addressing modes than XOR
+// 2. Disjoint OR operations preserve the semantics of the original XOR
+// 3. This can enable further optimizations in the GEP offset folding pipeline
+bool XorToOrDisjointTransformer::run() {
+  bool Changed = false;
+
+  // Collect all candidate XORs
+  for (Instruction &I : instructions(F)) {
+    Instruction *Op0 = nullptr;
+    ConstantInt *C1 = nullptr;
+    BinaryOperator *MatchedXorOp = nullptr;
+
+    // Attempt to match the instruction 'I' as XOR operation.
+    if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)),
+                               m_BinOp(MatchedXorOp))) &&
+        hasGEPUser(MatchedXorOp))
+      XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue());
+  }
+
+  if (XorGroups.empty())
+    return false;
+
+  // Process each group of XORs
+  for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups)
+    if (processXorGroup(OriginalBaseInst, XorsInGroup))
+      Changed = true;
+
+  return Changed;
+}
+
 bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -1242,6 +1430,11 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
+
+  // Decompose xor in to "or disjoint" if possible.
+  XorToOrDisjointTransformer XorTransformer(F, *DT, *DL);
+  Changed |= XorTransformer.run();
+
   for (BasicBlock &B : F) {
     if (!DT->isReachableFromEntry(&B))
       continue;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
new file mode 100644
index 0000000000000..825227292fe14
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN: -S < %s | FileCheck %s
+
+
+; Test a simple case of xor to or disjoint transformation
+define half @test_basic_transformation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_basic_transformation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 32
+  %addr2 = xor i64 %base, 2080
+  %addr3 = xor i64 %base, 4128
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test the decreasing order of offset xor to or disjoint transformation
+define half @test_descending_offset_transformation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_descending_offset_transformation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 4128
+  %addr2 = xor i64 %base, 2080
+  %addr3 = xor i64 %base, 32
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test that %addr2 is not transformed to or disjoint.
+define half @test_no_transfomation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_no_transfomation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR2:%.*]] = xor i64 [[BASE]], 64
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 32
+  %addr2 = xor i64 %base, 64  ; Should not be transformed
+  %addr3 = xor i64 %base, 2080
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test case with xor instructions in different basic blocks
+define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) {
+; CHECK-LABEL: define half @test_dom_tree(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 16
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    br label %[[MERGE:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ]
+; CHECK-NEXT:    [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float
+; CHECK-NEXT:    [[VAL4_F:%.*]] = fpext half [[VAL4]] to float
+; CHECK-NEXT:    [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]]
+; CHECK-NEXT:    [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192   ; Clear low bits
+  %addr1 = xor i64 %base, 16
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %val1 = load half, ptr %gep1
+  br i1 %cond, label %then, label %else
+
+then:
+  %addr2 = xor i64 %base, 48
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %val2 = load half, ptr %gep2
+  br label %merge
+
+else:
+  %addr3 = xor i64 %base, 112
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val3 = load half, ptr %gep3
+  br label %merge
+
+merge:
+  %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ]
+  %addr4 = xor i64 %base, 240
+  %gep4 = getelementptr i8, ptr %ptr, i64 %addr4
+  %val4 = load half, ptr %gep4
+  %val1.f = fpext half %val1 to float
+  %val_from_branch.f = fpext half %val_from_branch to float
+  %val4.f = fpext half %val4 to float
+  %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f
+  %final_sum.f = fadd float %sum_intermediate.f, %val4.f
+  %result.h = fptrunc float %final_sum.f to half
+  ret half %result.h
+}
+

>From 0c774682889ae9b1b89cb9d4d796283f205b8a63 Mon Sep 17 00:00:00 2001
From: Amir Ayupov <aaupov at fb.com>
Date: Tue, 10 Jun 2025 14:31:22 -0700
Subject: [PATCH 18/61] [BOLT] Expose external entry count for functions
 (#141674)

Record the number of function invocations from external code - code
outside the binary, which may include JIT code and DSOs. Accounting for
external entry counts improves the fidelity of call graph flow
conservation analysis.
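
For illustration, with invented counts and only the keys visible in the
mapping change below, a YAML function profile now carries an optional
"extern" key next to "exec":

  - fid:     1
    hash:    0xDEADBEEF12345678
    exec:    250
    extern:  100
    nblocks: 4

Here "exec" counts all entries, while "extern" records the subset whose
source is not a known symbol in the binary (see the DataReader::parse
change below).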

Test Plan: updated shrinkwrapping.test
---
 bolt/include/bolt/Core/BinaryFunction.h        | 12 ++++++++++++
 bolt/include/bolt/Profile/DataReader.h         |  3 +++
 bolt/include/bolt/Profile/ProfileYAMLMapping.h |  2 ++
 bolt/lib/Core/BinaryFunction.cpp               |  2 ++
 bolt/lib/Passes/ProfileQualityStats.cpp        |  3 +++
 bolt/lib/Profile/DataAggregator.cpp            |  1 +
 bolt/lib/Profile/DataReader.cpp                |  6 ++++++
 bolt/lib/Profile/YAMLProfileReader.cpp         |  1 +
 bolt/lib/Profile/YAMLProfileWriter.cpp         |  1 +
 bolt/test/X86/shrinkwrapping.test              |  2 ++
 10 files changed, 33 insertions(+)

diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h
index 14957cba50174..ca8b786f4ab69 100644
--- a/bolt/include/bolt/Core/BinaryFunction.h
+++ b/bolt/include/bolt/Core/BinaryFunction.h
@@ -388,6 +388,10 @@ class BinaryFunction {
   /// The profile data for the number of times the function was executed.
   uint64_t ExecutionCount{COUNT_NO_PROFILE};
 
+  /// Profile data for the number of times this function was entered from
+  /// external code (DSO, JIT, etc).
+  uint64_t ExternEntryCount{0};
+
   /// Profile match ratio.
   float ProfileMatchRatio{0.0f};
 
@@ -1877,6 +1881,10 @@ class BinaryFunction {
     return *this;
   }
 
+  /// Set the profile data for the number of times the function was entered from
+  /// external code (DSO/JIT).
+  void setExternEntryCount(uint64_t Count) { ExternEntryCount = Count; }
+
   /// Adjust execution count for the function by a given \p Count. The value
   /// \p Count will be subtracted from the current function count.
   ///
@@ -1904,6 +1912,10 @@ class BinaryFunction {
   /// Return COUNT_NO_PROFILE if there's no profile info.
   uint64_t getExecutionCount() const { return ExecutionCount; }
 
+  /// Return the profile information about the number of times the function was
+  /// entered from external code (DSO/JIT).
+  uint64_t getExternEntryCount() const { return ExternEntryCount; }
+
   /// Return the raw profile information about the number of branch
   /// executions corresponding to this function.
   uint64_t getRawSampleCount() const { return RawSampleCount; }
diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h
index 3c770fed2598f..6f527ba3931d4 100644
--- a/bolt/include/bolt/Profile/DataReader.h
+++ b/bolt/include/bolt/Profile/DataReader.h
@@ -97,6 +97,9 @@ struct FuncBranchData {
   /// Total execution count for the function.
   int64_t ExecutionCount{0};
 
+  /// Total entry count from external code for the function.
+  uint64_t ExternEntryCount{0};
+
   /// Indicate if the data was used.
   bool Used{false};
 
diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
index a8d9a15311d94..41e2bd1651efd 100644
--- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h
+++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h
@@ -206,6 +206,7 @@ struct BinaryFunctionProfile {
   uint32_t Id{0};
   llvm::yaml::Hex64 Hash{0};
   uint64_t ExecCount{0};
+  uint64_t ExternEntryCount{0};
   std::vector<BinaryBasicBlockProfile> Blocks;
   std::vector<InlineTreeNode> InlineTree;
   bool Used{false};
@@ -218,6 +219,7 @@ template <> struct MappingTraits<bolt::BinaryFunctionProfile> {
     YamlIO.mapRequired("fid", BFP.Id);
     YamlIO.mapRequired("hash", BFP.Hash);
     YamlIO.mapRequired("exec", BFP.ExecCount);
+    YamlIO.mapOptional("extern", BFP.ExternEntryCount, 0);
     YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks);
     YamlIO.mapOptional("blocks", BFP.Blocks,
                        std::vector<bolt::BinaryBasicBlockProfile>());
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index 6d1969f5c6c30..b998d7160aae7 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -471,6 +471,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     OS << "\n  Sample Count: " << RawSampleCount;
     OS << "\n  Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f);
   }
+  if (ExternEntryCount)
+    OS << "\n  Extern Entry Count: " << ExternEntryCount;
 
   if (opts::PrintDynoStats && !getLayout().block_empty()) {
     OS << '\n';
diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp b/bolt/lib/Passes/ProfileQualityStats.cpp
index dfd74d3dd5719..64cc662c3ab29 100644
--- a/bolt/lib/Passes/ProfileQualityStats.cpp
+++ b/bolt/lib/Passes/ProfileQualityStats.cpp
@@ -532,6 +532,9 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) {
     std::vector<uint64_t> &MaxCountMap = TotalMaxCountMaps[FunctionNum];
     std::vector<uint64_t> &MinCountMap = TotalMinCountMaps[FunctionNum];
 
+    // Record external entry count into CallGraphIncomingFlows
+    CallGraphIncomingFlows[FunctionNum] += Function->getExternEntryCount();
+
     // Update MaxCountMap, MinCountMap, and CallGraphIncomingFlows
     auto recordCall = [&](const BinaryBasicBlock *SourceBB,
                           const MCSymbol *DestSymbol, uint64_t Count,
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 4022212bcf1b6..308346e5d02ce 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -2255,6 +2255,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC,
       YamlBF.Id = BF->getFunctionNumber();
       YamlBF.Hash = BAT->getBFHash(FuncAddress);
       YamlBF.ExecCount = BF->getKnownExecutionCount();
+      YamlBF.ExternEntryCount = BF->getExternEntryCount();
       YamlBF.NumBasicBlocks = BAT->getNumBasicBlocks(FuncAddress);
       const BoltAddressTranslation::BBHashMapTy &BlockMap =
           BAT->getBBHashMap(FuncAddress);
diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp
index c512394f26a3b..afe24216d7f5d 100644
--- a/bolt/lib/Profile/DataReader.cpp
+++ b/bolt/lib/Profile/DataReader.cpp
@@ -85,6 +85,7 @@ void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) {
   }
   llvm::stable_sort(Data);
   ExecutionCount += FBD.ExecutionCount;
+  ExternEntryCount += FBD.ExternEntryCount;
   for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) {
     assert(I->To.Name == FBD.Name);
     auto NewElmt = EntryData.insert(EntryData.end(), *I);
@@ -269,6 +270,7 @@ Error DataReader::preprocessProfile(BinaryContext &BC) {
     if (FuncBranchData *FuncData = getBranchDataForNames(Function.getNames())) {
       setBranchData(Function, FuncData);
       Function.ExecutionCount = FuncData->ExecutionCount;
+      Function.ExternEntryCount = FuncData->ExternEntryCount;
       FuncData->Used = true;
     }
   }
@@ -419,6 +421,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) {
       if (fetchProfileForOtherEntryPoints(BF)) {
         BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD);
         BF.ExecutionCount = FBD->ExecutionCount;
+        BF.ExternEntryCount = FBD->ExternEntryCount;
         BF.RawSampleCount = FBD->getNumExecutedBranches();
       }
       return;
@@ -449,6 +452,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) {
     setBranchData(BF, NewBranchData);
     NewBranchData->Used = true;
     BF.ExecutionCount = NewBranchData->ExecutionCount;
+    BF.ExternEntryCount = NewBranchData->ExternEntryCount;
     BF.ProfileMatchRatio = 1.0f;
     break;
   }
@@ -1190,6 +1194,8 @@ std::error_code DataReader::parse() {
     if (BI.To.IsSymbol && BI.To.Offset == 0) {
       I = GetOrCreateFuncEntry(BI.To.Name);
       I->second.ExecutionCount += BI.Branches;
+      if (!BI.From.IsSymbol)
+        I->second.ExternEntryCount += BI.Branches;
     }
   }
 
diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp
index 33ce40ac2eeec..086e47b661e10 100644
--- a/bolt/lib/Profile/YAMLProfileReader.cpp
+++ b/bolt/lib/Profile/YAMLProfileReader.cpp
@@ -176,6 +176,7 @@ bool YAMLProfileReader::parseFunctionProfile(
   uint64_t FunctionExecutionCount = 0;
 
   BF.setExecutionCount(YamlBF.ExecCount);
+  BF.setExternEntryCount(YamlBF.ExternEntryCount);
 
   uint64_t FuncRawBranchCount = 0;
   for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks)
diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp
index 0ae67a4d35595..1632aa1c6bfe2 100644
--- a/bolt/lib/Profile/YAMLProfileWriter.cpp
+++ b/bolt/lib/Profile/YAMLProfileWriter.cpp
@@ -226,6 +226,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS,
   YamlBF.Hash = BF.getHash();
   YamlBF.NumBasicBlocks = BF.size();
   YamlBF.ExecCount = BF.getKnownExecutionCount();
+  YamlBF.ExternEntryCount = BF.getExternEntryCount();
   DenseMap<const MCDecodedPseudoProbeInlineTree *, uint32_t> InlineTreeNodeId;
   if (PseudoProbeDecoder && BF.getGUID()) {
     std::tie(YamlBF.InlineTree, InlineTreeNodeId) =
diff --git a/bolt/test/X86/shrinkwrapping.test b/bolt/test/X86/shrinkwrapping.test
index 8581d7e0c0f7b..521b4561b3ba6 100644
--- a/bolt/test/X86/shrinkwrapping.test
+++ b/bolt/test/X86/shrinkwrapping.test
@@ -8,6 +8,7 @@ REQUIRES: shell
 
 RUN: %clangxx %cxxflags -no-pie %S/Inputs/exc4sw.S -o %t.exe -Wl,-q
 RUN: llvm-bolt %t.exe -o %t --relocs --frame-opt=all \
+RUN:   --print-only=main --print-cfg \
 RUN:   --data=%p/Inputs/exc4sw.fdata --reorder-blocks=cache 2>&1 | \
 RUN:   FileCheck %s --check-prefix=CHECK-BOLT
 
@@ -19,6 +20,7 @@ RUN: llvm-objdump --dwarf=frames %t | grep -A20 -e \
 RUN:   `llvm-nm --numeric-sort %t | grep main | tail -n 1 | cut -f1 -d' ' | \
 RUN:    tail -c9` 2>&1 | FileCheck %s --check-prefix=CHECK-OUTPUT
 
+CHECK-BOLT: Extern Entry Count: 100
 CHECK-BOLT: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops
 
 CHECK-INPUT:  DW_CFA_advance_loc: 2

>From 163c67ad3d1bf7af6590930d8f18700d65ad4564 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler at nvidia.com>
Date: Tue, 10 Jun 2025 14:44:19 -0700
Subject: [PATCH 19/61] [flang][runtime] Replace recursion with iterative work
 queue (#137727)

Recursion, both direct and indirect, prevents accurate stack size
calculation at link time for GPU device code. Restructure these
recursive (often mutually so) routines in the Fortran runtime with new
implementations based on an iterative work queue with
suspendable/resumable work tickets: Assign, Initialize, initializeClone,
Finalize, and Destroy.

Default derived type I/O is also recursive, but already disabled. It can
be added to this new framework later if the overall approach succeeds.

Note that derived type FINAL subroutine calls, defined assignments, and
defined I/O procedures all perform callbacks into user code, which may
well reenter the runtime library. This kind of recursion is not handled
by this change, although it may be possible to do so in the future using
thread-local work queues.

The effects of this restructuring on CPU performance are yet to be
measured.
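
As a rough standalone sketch of the Begin()/Continue() protocol documented
in work-queue.h below (all names except StatOk/StatContinue are invented,
and the real queue differs in storage, attributes, and error handling):

  #include <algorithm>
  #include <vector>

  enum Stat { StatOk = 0, StatContinue = 201 };

  struct Ticket {
    virtual ~Ticket() = default;
    virtual int Begin(std::vector<Ticket *> &q) = 0;    // runs exactly once
    virtual int Continue(std::vector<Ticket *> &q) = 0; // until != StatContinue
    bool begun = false;
  };

  // Always resume the most recently pushed ticket, so nested tickets run to
  // completion before their parent's Continue() is called again.
  int Run(std::vector<Ticket *> &queue) {
    while (!queue.empty()) {
      Ticket *t = queue.back();
      int status = t->begun ? t->Continue(queue)
                            : (t->begun = true, t->Begin(queue));
      if (status == StatContinue)
        continue; // t suspended itself; tickets it pushed now run first
      queue.erase(std::find(queue.begin(), queue.end(), t)); // t is done
      if (status != StatOk)
        return status; // any other status shuts down the whole queue
    }
    return StatOk;
  }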
---
 .../include/flang-rt/runtime/environment.h    |   3 +
 flang-rt/include/flang-rt/runtime/stat.h      |  10 +-
 flang-rt/include/flang-rt/runtime/type-info.h |   2 +
 .../include/flang-rt/runtime/work-queue.h     | 548 +++++++++++++++
 flang-rt/lib/runtime/CMakeLists.txt           |   2 +
 flang-rt/lib/runtime/assign.cpp               | 623 +++++++++++------
 flang-rt/lib/runtime/derived.cpp              | 517 +++++++-------
 flang-rt/lib/runtime/descriptor-io.cpp        | 651 +++++++++++++++++-
 flang-rt/lib/runtime/descriptor-io.h          | 620 +----------------
 flang-rt/lib/runtime/environment.cpp          |   4 +
 flang-rt/lib/runtime/namelist.cpp             |   1 +
 flang-rt/lib/runtime/tools.cpp                |   4 +-
 flang-rt/lib/runtime/type-info.cpp            |   6 +-
 flang-rt/lib/runtime/work-queue.cpp           | 161 +++++
 flang-rt/unittests/Runtime/ExternalIOTest.cpp |   2 +-
 flang/docs/Extensions.md                      |  10 +
 flang/include/flang/Runtime/assign.h          |   2 +-
 flang/include/flang/Semantics/tools.h         |   7 +-
 flang/lib/Semantics/runtime-type-info.cpp     |   4 +
 flang/lib/Semantics/tools.cpp                 |  32 +
 flang/module/__fortran_type_info.f90          |   3 +-
 flang/test/Lower/volatile-openmp.f90          |   8 +-
 flang/test/Semantics/typeinfo01.f90           |  30 +-
 flang/test/Semantics/typeinfo03.f90           |   2 +-
 flang/test/Semantics/typeinfo04.f90           |   8 +-
 flang/test/Semantics/typeinfo05.f90           |   4 +-
 flang/test/Semantics/typeinfo06.f90           |   4 +-
 flang/test/Semantics/typeinfo07.f90           |   8 +-
 flang/test/Semantics/typeinfo08.f90           |   2 +-
 flang/test/Semantics/typeinfo11.f90           |   2 +-
 flang/test/Semantics/typeinfo12.f90           |  67 ++
 31 files changed, 2227 insertions(+), 1120 deletions(-)
 create mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h
 create mode 100644 flang-rt/lib/runtime/work-queue.cpp
 create mode 100644 flang/test/Semantics/typeinfo12.f90

diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h
index 16258b3bbba9b..e579f6012ce86 100644
--- a/flang-rt/include/flang-rt/runtime/environment.h
+++ b/flang-rt/include/flang-rt/runtime/environment.h
@@ -64,6 +64,9 @@ struct ExecutionEnvironment {
   bool defaultUTF8{false}; // DEFAULT_UTF8
   bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION
 
+  enum InternalDebugging { WorkQueue = 1 };
+  int internalDebugging{0}; // FLANG_RT_DEBUG
+
   // CUDA related variables
   std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE
   bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED
diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h
index 070d0bf8673fb..dc372de53506a 100644
--- a/flang-rt/include/flang-rt/runtime/stat.h
+++ b/flang-rt/include/flang-rt/runtime/stat.h
@@ -24,7 +24,7 @@ class Terminator;
 enum Stat {
   StatOk = 0, // required to be zero by Fortran
 
-  // Interoperable STAT= codes
+  // Interoperable STAT= codes (>= 11)
   StatBaseNull = CFI_ERROR_BASE_ADDR_NULL,
   StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL,
   StatInvalidElemLen = CFI_INVALID_ELEM_LEN,
@@ -36,7 +36,7 @@ enum Stat {
   StatMemAllocation = CFI_ERROR_MEM_ALLOCATION,
   StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS,
 
-  // Standard STAT= values
+  // Standard STAT= values (>= 101)
   StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE,
   StatLocked = FORTRAN_RUNTIME_STAT_LOCKED,
   StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE,
@@ -49,10 +49,14 @@ enum Stat {
   // Additional "processor-defined" STAT= values
   StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER,
   StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG,
-  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT,
+  StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1
   StatMoveAllocSameAllocatable =
       FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE,
   StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION,
+
+  // Dummy status for work queue continuation, declared here to perhaps
+  // avoid collisions
+  StatContinue = 201
 };
 
 RT_API_ATTRS const char *StatErrorString(int);
diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h
index 5e79efde164f2..9bde3adba87f5 100644
--- a/flang-rt/include/flang-rt/runtime/type-info.h
+++ b/flang-rt/include/flang-rt/runtime/type-info.h
@@ -240,6 +240,7 @@ class DerivedType {
   RT_API_ATTRS bool noFinalizationNeeded() const {
     return noFinalizationNeeded_;
   }
+  RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; }
 
   RT_API_ATTRS std::size_t LenParameters() const {
     return lenParameterKind().Elements();
@@ -322,6 +323,7 @@ class DerivedType {
   bool noInitializationNeeded_{false};
   bool noDestructionNeeded_{false};
   bool noFinalizationNeeded_{false};
+  bool noDefinedAssignment_{false};
 };
 
 } // namespace Fortran::runtime::typeInfo
diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
new file mode 100644
index 0000000000000..878b18373e1d2
--- /dev/null
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -0,0 +1,548 @@
+//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Internal runtime utilities for work queues that replace the use of recursion
+// for better GPU device support.
+//
+// A work queue comprises a list of tickets.  Each ticket class has a Begin()
+// member function, which is called once, and a Continue() member function
+// that can be called zero or more times.  A ticket's execution terminates
+// when either of these member functions returns a status other than
+// StatContinue.  When that status is not StatOk, then the whole queue
+// is shut down.
+//
+// By returning StatContinue from its Continue() member function,
+// a ticket suspends its execution so that any nested tickets that it
+// may have created can be run to completion.  It is the responsibility
+// of each ticket class to maintain resumption information in its state
+// and manage its own progress.  Most ticket classes inherit from
+// class ComponentsOverElements, which implements an outer loop over all
+// components of a derived type, and an inner loop over all elements
+// of a descriptor, possibly with multiple phases of execution per element.
+//
+// Tickets are created by WorkQueue::Begin...() member functions.
+// There is one of these for each "top level" recursive function in the
+// Fortran runtime support library that has been restructured into this
+// ticket framework.
+//
+// When the work queue is running tickets, it always selects the last ticket
+// on the list for execution -- "work stack" might have been a more accurate
+// name for this framework.  This ticket may, while doing its job, create
+// new tickets, and since those are pushed after the active one, the first
+// such nested ticket will be the next one executed to completion -- i.e.,
+// the order of nested WorkQueue::Begin...() calls is respected.
+// Note that a ticket's Continue() member function won't be called again
+// until all nested tickets have run to completion and it is once again
+// the last ticket on the queue.
+//
+// Example for an assignment to a derived type:
+// 1. Assign() is called, and its work queue is created.  It calls
+//    WorkQueue::BeginAssign() and then WorkQueue::Run().
+// 2. Run calls AssignTicket::Begin(), which pushes a ticket via
+//    BeginFinalize() and returns StatContinue.
+// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called
+//    until one of them returns StatOk, which ends the finalization ticket.
+// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket
+//    and then returns StatOk, which ends the ticket.
+// 5. At this point, only one ticket remains.  DerivedAssignTicket::Begin()
+//    and ::Continue() are called until they are done (not StatContinue).
+//    Along the way, it may create nested AssignTickets for components,
+//    and suspend itself so that they may each run to completion.
+
+#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_
+#define FLANG_RT_RUNTIME_WORK_QUEUE_H_
+
+#include "flang-rt/runtime/connection.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/stat.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/api-attrs.h"
+#include "flang/Runtime/freestanding-tools.h"
+#include <flang/Common/variant.h>
+
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
+
+namespace Fortran::runtime {
+class Terminator;
+class WorkQueue;
+
+// Ticket worker base classes
+
+template <typename TICKET> class ImmediateTicketRunner {
+public:
+  RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket)
+      : ticket_{ticket} {}
+  RT_API_ATTRS int Run(WorkQueue &workQueue) {
+    int status{ticket_.Begin(workQueue)};
+    while (status == StatContinue) {
+      status = ticket_.Continue(workQueue);
+    }
+    return status;
+  }
+
+private:
+  TICKET &ticket_;
+};
+
+// Base class for ticket workers that operate elementwise over descriptors
+class Elementwise {
+protected:
+  RT_API_ATTRS Elementwise(
+      const Descriptor &instance, const Descriptor *from = nullptr)
+      : instance_{instance}, from_{from} {
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; }
+  RT_API_ATTRS void Advance() {
+    ++elementAt_;
+    instance_.IncrementSubscripts(subscripts_);
+    if (from_) {
+      from_->IncrementSubscripts(fromSubscripts_);
+    }
+  }
+  RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; }
+  RT_API_ATTRS void Reset() {
+    elementAt_ = 0;
+    instance_.GetLowerBounds(subscripts_);
+    if (from_) {
+      from_->GetLowerBounds(fromSubscripts_);
+    }
+  }
+
+  const Descriptor &instance_, *from_{nullptr};
+  std::size_t elements_{instance_.Elements()};
+  std::size_t elementAt_{0};
+  SubscriptValue subscripts_[common::maxRank];
+  SubscriptValue fromSubscripts_[common::maxRank];
+};
+
+// Base class for ticket workers that operate over derived type components.
+class Componentwise {
+protected:
+  RT_API_ATTRS Componentwise(const typeInfo::DerivedType &);
+  RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; }
+  RT_API_ATTRS void Advance() {
+    ++componentAt_;
+    GetComponent();
+  }
+  RT_API_ATTRS void SkipToEnd() {
+    component_ = nullptr;
+    componentAt_ = components_;
+  }
+  RT_API_ATTRS void Reset() {
+    component_ = nullptr;
+    componentAt_ = 0;
+    GetComponent();
+  }
+  RT_API_ATTRS void GetComponent();
+
+  const typeInfo::DerivedType &derived_;
+  std::size_t components_{0}, componentAt_{0};
+  const typeInfo::Component *component_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> componentDescriptor_;
+};
+
+// Base class for ticket workers that operate over derived type components
+// in an outer loop, and elements in an inner loop.
+class ComponentsOverElements : protected Componentwise, protected Elementwise {
+protected:
+  RT_API_ATTRS ComponentsOverElements(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Componentwise{derived}, Elementwise{instance, from} {
+    if (Elementwise::IsComplete()) {
+      Componentwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); }
+  RT_API_ATTRS void Advance() {
+    SkipToNextElement();
+    if (Elementwise::IsComplete()) {
+      Elementwise::Reset();
+      Componentwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Elementwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void Reset() {
+    phase_ = 0;
+    Elementwise::Reset();
+    Componentwise::Reset();
+  }
+
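+  // Sub-step within the current component/element; cleared on every advance
+  // so that Continue() can resume correctly after a child ticket runs.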
+  int phase_{0};
+};
+
+// Base class for ticket workers that operate over elements in an outer loop,
+// and type components in an inner loop.
+class ElementsOverComponents : protected Elementwise, protected Componentwise {
+protected:
+  RT_API_ATTRS ElementsOverComponents(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, const Descriptor *from = nullptr)
+      : Elementwise{instance, from}, Componentwise{derived} {
+    if (Componentwise::IsComplete()) {
+      Elementwise::SkipToEnd();
+    }
+  }
+  RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); }
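+  // Advances to the next component; once all components of the current
+  // element have been visited, restarts the components and moves on to the
+  // next element.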
+  RT_API_ATTRS void Advance() {
+    SkipToNextComponent();
+    if (Componentwise::IsComplete()) {
+      Componentwise::Reset();
+      Elementwise::Advance();
+    }
+  }
+  RT_API_ATTRS void SkipToNextComponent() {
+    phase_ = 0;
+    Componentwise::Advance();
+  }
+  RT_API_ATTRS void SkipToNextElement() {
+    phase_ = 0;
+    Componentwise::Reset();
+    Elementwise::Advance();
+  }
+
+  int phase_{0};
+};
+
+// Ticket worker classes
+
+// Implements derived type instance initialization
+class InitializeTicket : public ImmediateTicketRunner<InitializeTicket>,
+                         private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<InitializeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+};
+
+// Initializes one derived type instance from the value of another
+class InitializeCloneTicket
+    : public ImmediateTicketRunner<InitializeCloneTicket>,
+      private ComponentsOverElements {
+public:
+  RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg)
+      : ImmediateTicketRunner<InitializeCloneTicket>{*this},
+        ComponentsOverElements{original, derived}, clone_{clone},
+        hasStat_{hasStat}, errMsg_{errMsg} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const Descriptor &clone_;
+  bool hasStat_{false};
+  const Descriptor *errMsg_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> cloneComponentDescriptor_;
+};
+
+// Implements derived type instance finalization
+class FinalizeTicket : public ImmediateTicketRunner<FinalizeTicket>,
+                       private ComponentsOverElements {
+public:
+  RT_API_ATTRS FinalizeTicket(
+      const Descriptor &instance, const typeInfo::DerivedType &derived)
+      : ImmediateTicketRunner<FinalizeTicket>{*this},
+        ComponentsOverElements{instance, derived} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  const typeInfo::DerivedType *finalizableParentType_{nullptr};
+};
+
+// Implements derived type instance destruction
+class DestroyTicket : public ImmediateTicketRunner<DestroyTicket>,
+                      private ComponentsOverElements {
+public:
+  RT_API_ATTRS DestroyTicket(const Descriptor &instance,
+      const typeInfo::DerivedType &derived, bool finalize)
+      : ImmediateTicketRunner<DestroyTicket>{*this},
+        ComponentsOverElements{instance, derived}, finalize_{finalize} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  bool finalize_{false};
+};
+
+// Implements general intrinsic assignment
+class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
+public:
+  RT_API_ATTRS AssignTicket(
+      Descriptor &to, const Descriptor &from, int flags, MemmoveFct memmoveFct)
+      : ImmediateTicketRunner<AssignTicket>{*this}, to_{to}, from_{&from},
+        flags_{flags}, memmoveFct_{memmoveFct} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  RT_API_ATTRS bool IsSimpleMemmove() const {
+    return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() &&
+        from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes();
+  }
+  RT_API_ATTRS Descriptor &GetTempDescriptor();
+
+  Descriptor &to_;
+  const Descriptor *from_{nullptr};
+  int flags_{0}; // enum AssignFlags
+  MemmoveFct memmoveFct_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> tempDescriptor_;
+  const typeInfo::DerivedType *toDerived_{nullptr};
+  Descriptor *toDeallocate_{nullptr};
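+  // When set, tempDescriptor_ must stay live while child tickets run, so
+  // cleanup is deferred to a final Continue() pass (see done_).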
+  bool persist_{false};
+  bool done_{false};
+};
+
+// Implements derived type intrinsic assignment.
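+// IS_COMPONENTWISE selects the iteration order: components in the outer loop
+// when true, elements in the outer loop when false.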
+template <bool IS_COMPONENTWISE>
+class DerivedAssignTicket
+    : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
+      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+          ElementsOverComponents> {
+public:
+  using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      ElementsOverComponents>;
+  RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter)
+      : ImmediateTicketRunner<DerivedAssignTicket>{*this},
+        Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct},
+        deallocateAfter_{deallocateAfter} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  static constexpr bool isComponentwise_{IS_COMPONENTWISE};
+  bool toIsContiguous_{this->instance_.IsContiguous()};
+  bool fromIsContiguous_{this->from_->IsContiguous()};
+  int flags_{0};
+  MemmoveFct memmoveFct_{nullptr};
+  Descriptor *deallocateAfter_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> fromComponentDescriptor_;
+};
+
+namespace io::descr {
+
+template <io::Direction DIR>
+class DescriptorIoTicket
+    : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
+      private Elementwise {
+public:
+  RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DescriptorIoTicket>(*this),
+        Elementwise{descriptor}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &);
+  RT_API_ATTRS int Continue(WorkQueue &);
+  RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; }
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+  common::optional<typeInfo::SpecialBinding> nonTbpSpecial_;
+  const typeInfo::DerivedType *derived_{nullptr};
+  const typeInfo::SpecialBinding *special_{nullptr};
+  StaticDescriptor<common::maxRank, true, 0> elementDescriptor_;
+};
+
+template <io::Direction DIR>
+class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
+                        private ElementsOverComponents {
+public:
+  RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace)
+      : ImmediateTicketRunner<DerivedIoTicket>(*this),
+        ElementsOverComponents{descriptor, derived}, io_{io}, table_{table},
+        anyIoTookPlace_{anyIoTookPlace} {}
+  RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; }
+  RT_API_ATTRS int Continue(WorkQueue &);
+
+private:
+  io::IoStatementState &io_;
+  const io::NonTbpDefinedIoTable *table_{nullptr};
+  bool &anyIoTookPlace_;
+};
+
+} // namespace io::descr
+
+struct NullTicket {
+  RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; }
+  RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; }
+};
+
+struct Ticket {
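+  // Runs or resumes the alternative currently held in 'u'; 'begun' records
+  // whether its Begin() has been called yet.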
+  RT_API_ATTRS int Continue(WorkQueue &);
+  bool begun{false};
+  std::variant<NullTicket, InitializeTicket, InitializeCloneTicket,
+      FinalizeTicket, DestroyTicket, AssignTicket, DerivedAssignTicket<false>,
+      DerivedAssignTicket<true>,
+      io::descr::DescriptorIoTicket<io::Direction::Output>,
+      io::descr::DescriptorIoTicket<io::Direction::Input>,
+      io::descr::DerivedIoTicket<io::Direction::Output>,
+      io::descr::DerivedIoTicket<io::Direction::Input>>
+      u;
+};
+
+class WorkQueue {
+public:
+  RT_API_ATTRS explicit WorkQueue(Terminator &terminator)
+      : terminator_{terminator} {
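+    // Pre-link the statically allocated ticket nodes into a doubly linked
+    // list that seeds the free list (firstFree_).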
+    for (int j{1}; j < numStatic_; ++j) {
+      static_[j].previous = &static_[j - 1];
+      static_[j - 1].next = &static_[j];
+    }
+  }
+  RT_API_ATTRS ~WorkQueue();
+  RT_API_ATTRS Terminator &terminator() { return terminator_; }
+
+  // APIs for particular tasks.  These can return StatOk if the work is
+  // completed immediately.
+  RT_API_ATTRS int BeginInitialize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return InitializeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone,
+      const Descriptor &original, const typeInfo::DerivedType &derived,
+      bool hasStat, const Descriptor *errMsg) {
+    if (runTicketsImmediately_) {
+      return InitializeCloneTicket{clone, original, derived, hasStat, errMsg}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<InitializeCloneTicket>(
+          clone, original, derived, hasStat, errMsg);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginFinalize(
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived) {
+    if (runTicketsImmediately_) {
+      return FinalizeTicket{descriptor, derived}.Run(*this);
+    } else {
+      StartTicket().u.emplace<FinalizeTicket>(descriptor, derived);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor,
+      const typeInfo::DerivedType &derived, bool finalize) {
+    if (runTicketsImmediately_) {
+      return DestroyTicket{descriptor, derived, finalize}.Run(*this);
+    } else {
+      StartTicket().u.emplace<DestroyTicket>(descriptor, derived, finalize);
+      return StatContinue;
+    }
+  }
+  RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from,
+      int flags, MemmoveFct memmoveFct) {
+    if (runTicketsImmediately_) {
+      return AssignTicket{to, from, flags, memmoveFct}.Run(*this);
+    } else {
+      StartTicket().u.emplace<AssignTicket>(to, from, flags, memmoveFct);
+      return StatContinue;
+    }
+  }
+  template <bool IS_COMPONENTWISE>
+  RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from,
+      const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct,
+      Descriptor *deallocateAfter) {
+    if (runTicketsImmediately_) {
+      return DerivedAssignTicket<IS_COMPONENTWISE>{
+          to, from, derived, flags, memmoveFct, deallocateAfter}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<DerivedAssignTicket<IS_COMPONENTWISE>>(
+          to, from, derived, flags, memmoveFct, deallocateAfter);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
+      bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DescriptorIoTicket<DIR>{
+          io, descriptor, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DescriptorIoTicket<DIR>>(
+          io, descriptor, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+  template <io::Direction DIR>
+  RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io,
+      const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+      const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) {
+    if (runTicketsImmediately_) {
+      return io::descr::DerivedIoTicket<DIR>{
+          io, descriptor, derived, table, anyIoTookPlace}
+          .Run(*this);
+    } else {
+      StartTicket().u.emplace<io::descr::DerivedIoTicket<DIR>>(
+          io, descriptor, derived, table, anyIoTookPlace);
+      return StatContinue;
+    }
+  }
+
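+  // Runs queued tickets until the queue drains (or a ticket fails).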
+  RT_API_ATTRS int Run();
+
+private:
+#if RT_DEVICE_COMPILATION
+  // Always use the work queue on a GPU device to avoid recursion.
+  static constexpr bool runTicketsImmediately_{false};
+#else
+  // Avoid the work queue overhead on the host, unless the queue itself
+  // needs debugging, which is much easier there.
+  static constexpr bool runTicketsImmediately_{true};
+#endif
+
+  // Most uses of the work queue won't go very deep.
+  static constexpr int numStatic_{2};
+
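+  // Intrusive doubly linked list node; isStatic marks the preallocated nodes
+  // in static_[], as opposed to any dynamically allocated overflow nodes.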
+  struct TicketList {
+    bool isStatic{true};
+    Ticket ticket;
+    TicketList *previous{nullptr}, *next{nullptr};
+  };
+
+  RT_API_ATTRS Ticket &StartTicket();
+  RT_API_ATTRS void Stop();
+
+  Terminator &terminator_;
+  TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr};
+  TicketList static_[numStatic_];
+  TicketList *firstFree_{static_};
+};
+
+} // namespace Fortran::runtime
+#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_
diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt
index a3f63b4315644..332c0872e065f 100644
--- a/flang-rt/lib/runtime/CMakeLists.txt
+++ b/flang-rt/lib/runtime/CMakeLists.txt
@@ -68,6 +68,7 @@ set(supported_sources
   type-info.cpp
   unit.cpp
   utf.cpp
+  work-queue.cpp
 )
 
 # List of source not used for GPU offloading.
@@ -131,6 +132,7 @@ set(gpu_sources
   type-code.cpp
   type-info.cpp
   utf.cpp
+  work-queue.cpp
   complex-powi.cpp
   reduce.cpp
   reduction.cpp
diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp
index bf67b5dc8b645..41b130cc8f257 100644
--- a/flang-rt/lib/runtime/assign.cpp
+++ b/flang-rt/lib/runtime/assign.cpp
@@ -14,6 +14,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -102,11 +103,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS(
     toDim.SetByteStride(stride);
     stride *= toDim.Extent();
   }
-  int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))};
-  if (result == StatOk && derived && !derived->noInitializationNeeded()) {
-    result = ReturnError(terminator, Initialize(to, *derived, terminator));
-  }
-  return result;
+  return ReturnError(terminator, to.Allocate(kNoAsyncObject));
 }
 
 // least <= 0, most >= 0
@@ -231,6 +228,8 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
   }
 }
 
+RT_OFFLOAD_API_GROUP_BEGIN
+
 // Common implementation of assignments, both intrinsic assignments and
 // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not
 // be resolved in semantics.  Most assignment statements do not need any
@@ -244,275 +243,453 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to,
 // dealing with array constructors.
 RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from,
     Terminator &terminator, int flags, MemmoveFct memmoveFct) {
-  bool mustDeallocateLHS{(flags & DeallocateLHS) ||
-      MustDeallocateLHS(to, from, terminator, flags)};
-  DescriptorAddendum *toAddendum{to.Addendum()};
-  const typeInfo::DerivedType *toDerived{
-      toAddendum ? toAddendum->derivedType() : nullptr};
-  if (toDerived && (flags & NeedFinalization) &&
-      toDerived->noFinalizationNeeded()) {
-    flags &= ~NeedFinalization;
-  }
-  std::size_t toElementBytes{to.ElementBytes()};
-  std::size_t fromElementBytes{from.ElementBytes()};
-  // The following lambda definition violates the coding style,
-  // but cuda-11.8 nvcc hits an internal error with the brace initialization.
-  auto isSimpleMemmove = [&]() {
-    return !toDerived && to.rank() == from.rank() && to.IsContiguous() &&
-        from.IsContiguous() && toElementBytes == fromElementBytes;
-  };
-  StaticDescriptor<maxRank, true, 10 /*?*/> deferredDeallocStatDesc;
-  Descriptor *deferDeallocation{nullptr};
-  if (MayAlias(to, from)) {
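+  // Assignment is now expressed as an AssignTicket run on a WorkQueue: on the
+  // host it executes immediately, while on the device it is deferred to avoid
+  // deep recursion.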
+  WorkQueue workQueue{terminator};
+  if (workQueue.BeginAssign(to, from, flags, memmoveFct) == StatContinue) {
+    workQueue.Run();
+  }
+}
+
+RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) {
+  bool mustDeallocateLHS{(flags_ & DeallocateLHS) ||
+      MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)};
+  DescriptorAddendum *toAddendum{to_.Addendum()};
+  toDerived_ = toAddendum ? toAddendum->derivedType() : nullptr;
+  if (toDerived_ && (flags_ & NeedFinalization) &&
+      toDerived_->noFinalizationNeeded()) {
+    flags_ &= ~NeedFinalization;
+  }
+  if (MayAlias(to_, *from_)) {
     if (mustDeallocateLHS) {
-      deferDeallocation = &deferredDeallocStatDesc.descriptor();
+      // Convert the LHS into a temporary, then make it look deallocated.
+      toDeallocate_ = &tempDescriptor_.descriptor();
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
       std::memcpy(
-          reinterpret_cast<void *>(deferDeallocation), &to, to.SizeInBytes());
-      to.set_base_addr(nullptr);
-    } else if (!isSimpleMemmove()) {
+          reinterpret_cast<void *>(toDeallocate_), &to_, to_.SizeInBytes());
+      to_.set_base_addr(nullptr);
+      if (toDerived_ && (flags_ & NeedFinalization)) {
+        if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)};
+            status != StatOk && status != StatContinue) {
+          return status;
+        }
+        flags_ &= ~NeedFinalization;
+      }
+    } else if (!IsSimpleMemmove()) {
       // Handle LHS/RHS aliasing by copying RHS into a temp, then
       // recursively assigning from that temp.
-      auto descBytes{from.SizeInBytes()};
-      StaticDescriptor<maxRank, true, 16> staticDesc;
-      Descriptor &newFrom{staticDesc.descriptor()};
-      std::memcpy(reinterpret_cast<void *>(&newFrom), &from, descBytes);
+      auto descBytes{from_->SizeInBytes()};
+      Descriptor &newFrom{tempDescriptor_.descriptor()};
+      persist_ = true; // tempDescriptor_ state must outlive child tickets
+      std::memcpy(reinterpret_cast<void *>(&newFrom), from_, descBytes);
       // Pretend the temporary descriptor is for an ALLOCATABLE
       // entity, otherwise, the Deallocate() below will not
       // free the descriptor memory.
       newFrom.raw().attribute = CFI_attribute_allocatable;
-      auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))};
-      if (stat == StatOk) {
-        if (HasDynamicComponent(from)) {
-          // If 'from' has allocatable/automatic component, we cannot
-          // just make a shallow copy of the descriptor member.
-          // This will still leave data overlap in 'to' and 'newFrom'.
-          // For example:
-          //   type t
-          //     character, allocatable :: c(:)
-          //   end type t
-          //   type(t) :: x(3)
-          //   x(2:3) = x(1:2)
-          // We have to make a deep copy into 'newFrom' in this case.
-          RTNAME(AssignTemporary)
-          (newFrom, from, terminator.sourceFileName(), terminator.sourceLine());
-        } else {
-          ShallowCopy(newFrom, from, true, from.IsContiguous());
+      if (int stat{ReturnError(
+              workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))};
+          stat != StatOk) {
+        return stat;
+      }
+      if (HasDynamicComponent(*from_)) {
+        // If 'from' has an allocatable/automatic component, we cannot
+        // just make a shallow copy of the descriptor member.
+        // This will still leave data overlap in 'to' and 'newFrom'.
+        // For example:
+        //   type t
+        //     character, allocatable :: c(:)
+        //   end type t
+        //   type(t) :: x(3)
+        //   x(2:3) = x(1:2)
+        // We have to make a deep copy into 'newFrom' in this case.
+        if (const DescriptorAddendum *addendum{newFrom.Addendum()}) {
+          if (const auto *derived{addendum->derivedType()}) {
+            if (!derived->noInitializationNeeded()) {
+              if (int status{workQueue.BeginInitialize(newFrom, *derived)};
+                  status != StatOk && status != StatContinue) {
+                return status;
+              }
+            }
+          }
+        }
+        static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS};
+        if (int status{workQueue.BeginAssign(
+                newFrom, *from_, nestedFlags, memmoveFct_)};
+            status != StatOk && status != StatContinue) {
+          return status;
         }
-        Assign(to, newFrom, terminator,
-            flags &
-                (NeedFinalization | ComponentCanBeDefinedAssignment |
-                    ExplicitLengthCharacterLHS | CanBeDefinedAssignment));
-        newFrom.Deallocate();
+      } else {
+        ShallowCopy(newFrom, *from_, true, from_->IsContiguous());
       }
-      return;
+      from_ = &newFrom;
+      flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment |
+          ExplicitLengthCharacterLHS | CanBeDefinedAssignment;
+      toDeallocate_ = &newFrom;
     }
   }
-  if (to.IsAllocatable()) {
+  if (to_.IsAllocatable()) {
     if (mustDeallocateLHS) {
-      if (deferDeallocation) {
-        if ((flags & NeedFinalization) && toDerived) {
-          Finalize(*deferDeallocation, *toDerived, &terminator);
-          flags &= ~NeedFinalization;
-        }
-      } else {
-        to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false,
-            &terminator);
-        flags &= ~NeedFinalization;
+      if (!toDeallocate_ && to_.IsAllocated()) {
+        toDeallocate_ = &to_;
       }
-    } else if (to.rank() != from.rank() && !to.IsAllocated()) {
-      terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to "
-                       "unallocated allocatable",
-          to.rank(), from.rank());
+    } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) {
+      workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in "
+                                   "assignment to unallocated allocatable",
+          to_.rank(), from_->rank());
     }
-    if (!to.IsAllocated()) {
-      if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) {
-        return;
+  } else if (!to_.IsAllocated()) {
+    workQueue.terminator().Crash(
+        "Assign: left-hand side variable is neither allocated nor allocatable");
+  }
+  if (toDerived_ && to_.IsAllocated()) {
+    // Schedule finalization or destruction of the LHS.
+    if (flags_ & NeedFinalization) {
+      if (int status{workQueue.BeginFinalize(to_, *toDerived_)};
+          status != StatOk && status != StatContinue) {
+        return status;
+      }
+    } else if (!toDerived_->noDestructionNeeded()) {
+      if (int status{
+              workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
-      flags &= ~NeedFinalization;
-      toElementBytes = to.ElementBytes(); // may have changed
-      toDerived = toAddendum ? toAddendum->derivedType() : nullptr;
     }
   }
-  if (toDerived && (flags & CanBeDefinedAssignment)) {
-    // Check for a user-defined assignment type-bound procedure;
-    // see 10.2.1.4-5.  A user-defined assignment TBP defines all of
-    // the semantics, including allocatable (re)allocation and any
-    // finalization.
-    //
-    // Note that the aliasing and LHS (re)allocation handling above
-    // needs to run even with CanBeDefinedAssignment flag, when
-    // the Assign() is invoked recursively for component-per-component
-    // assignments.
-    if (to.rank() == 0) {
-      if (const auto *special{toDerived->FindSpecialBinding(
+  return StatContinue;
+}
+
+RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) {
+  if (done_) {
+    // All child tickets are complete; can release this ticket's state.
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+    }
+    return StatOk;
+  }
+  // All necessary finalization or destruction that was initiated by Begin()
+  // has been completed.  Deallocation may be pending, and if it's for the LHS,
+  // do it now so that the LHS gets reallocated.
+  if (toDeallocate_ == &to_) {
+    toDeallocate_ = nullptr;
+    to_.Deallocate();
+  }
+  // Allocate the LHS if needed
+  if (!to_.IsAllocated()) {
+    if (int stat{
+            AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)};
+        stat != StatOk) {
+      return stat;
+    }
+    const auto *addendum{to_.Addendum()};
+    toDerived_ = addendum ? addendum->derivedType() : nullptr;
+    if (toDerived_ && !toDerived_->noInitializationNeeded()) {
+      if (int status{workQueue.BeginInitialize(to_, *toDerived_)};
+          status != StatOk) {
+        return status;
+      }
+    }
+  }
+  // Check for a user-defined assignment type-bound procedure;
+  // see 10.2.1.4-5.
+  // Note that the aliasing and LHS (re)allocation handling above
+  // needs to run even with the CanBeDefinedAssignment flag, since
+  // Assign() can be invoked recursively for component-wise assignments.
+  if (toDerived_ && (flags_ & CanBeDefinedAssignment)) {
+    if (to_.rank() == 0) {
+      if (const auto *special{toDerived_->FindSpecialBinding(
               typeInfo::SpecialBinding::Which::ScalarAssignment)}) {
-        return DoScalarDefinedAssignment(to, from, *special);
+        DoScalarDefinedAssignment(to_, *from_, *special);
+        done_ = true;
+        return StatContinue;
       }
     }
-    if (const auto *special{toDerived->FindSpecialBinding(
+    if (const auto *special{toDerived_->FindSpecialBinding(
             typeInfo::SpecialBinding::Which::ElementalAssignment)}) {
-      return DoElementalDefinedAssignment(to, from, *toDerived, *special);
+      DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special);
+      done_ = true;
+      return StatContinue;
     }
   }
-  SubscriptValue toAt[maxRank];
-  to.GetLowerBounds(toAt);
-  // Scalar expansion of the RHS is implied by using the same empty
-  // subscript values on each (seemingly) elemental reference into
-  // "from".
-  SubscriptValue fromAt[maxRank];
-  from.GetLowerBounds(fromAt);
-  std::size_t toElements{to.Elements()};
-  if (from.rank() > 0 && toElements != from.Elements()) {
-    terminator.Crash("Assign: mismatching element counts in array assignment "
-                     "(to %zd, from %zd)",
-        toElements, from.Elements());
+  // Intrinsic assignment
+  std::size_t toElements{to_.Elements()};
+  if (from_->rank() > 0 && toElements != from_->Elements()) {
+    workQueue.terminator().Crash("Assign: mismatching element counts in array "
+                                 "assignment (to %zd, from %zd)",
+        toElements, from_->Elements());
   }
-  if (to.type() != from.type()) {
-    terminator.Crash("Assign: mismatching types (to code %d != from code %d)",
-        to.type().raw(), from.type().raw());
+  if (to_.type() != from_->type()) {
+    workQueue.terminator().Crash(
+        "Assign: mismatching types (to code %d != from code %d)",
+        to_.type().raw(), from_->type().raw());
   }
-  if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) {
-    terminator.Crash("Assign: mismatching non-character element sizes (to %zd "
-                     "bytes != from %zd bytes)",
+  std::size_t toElementBytes{to_.ElementBytes()};
+  std::size_t fromElementBytes{from_->ElementBytes()};
+  if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) {
+    workQueue.terminator().Crash("Assign: mismatching non-character element "
+                                 "sizes (to %zd bytes != from %zd bytes)",
         toElementBytes, fromElementBytes);
   }
-  if (const typeInfo::DerivedType *
-      updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) {
-    // Derived type intrinsic assignment, which is componentwise and elementwise
-    // for all components, including parent components (10.2.1.2-3).
-    // The target is first finalized if still necessary (7.5.6.3(1))
-    if (flags & NeedFinalization) {
-      Finalize(to, *updatedToDerived, &terminator);
-    } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) {
-      Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator);
-    }
-    // Copy the data components (incl. the parent) first.
-    const Descriptor &componentDesc{updatedToDerived->component()};
-    std::size_t numComponents{componentDesc.Elements()};
-    for (std::size_t j{0}; j < toElements;
-         ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-      for (std::size_t k{0}; k < numComponents; ++k) {
-        const auto &comp{
-            *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
-                k)}; // TODO: exploit contiguity here
-        // Use PolymorphicLHS for components so that the right things happen
-        // when the components are polymorphic; when they're not, they're both
-        // not, and their declared types will match.
-        int nestedFlags{MaybeReallocate | PolymorphicLHS};
-        if (flags & ComponentCanBeDefinedAssignment) {
-          nestedFlags |=
-              CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
-        }
-        switch (comp.genre()) {
-        case typeInfo::Component::Genre::Data:
-          if (comp.category() == TypeCategory::Derived) {
-            StaticDescriptor<maxRank, true, 10 /*?*/> statDesc[2];
-            Descriptor &toCompDesc{statDesc[0].descriptor()};
-            Descriptor &fromCompDesc{statDesc[1].descriptor()};
-            comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt);
-            comp.CreatePointerDescriptor(
-                fromCompDesc, from, terminator, fromAt);
-            Assign(toCompDesc, fromCompDesc, terminator, nestedFlags);
-          } else { // Component has intrinsic type; simply copy raw bytes
-            std::size_t componentByteSize{comp.SizeInBytes(to)};
-            memmoveFct(to.Element<char>(toAt) + comp.offset(),
-                from.Element<const char>(fromAt) + comp.offset(),
-                componentByteSize);
-          }
-          break;
-        case typeInfo::Component::Genre::Pointer: {
-          std::size_t componentByteSize{comp.SizeInBytes(to)};
-          memmoveFct(to.Element<char>(toAt) + comp.offset(),
-              from.Element<const char>(fromAt) + comp.offset(),
-              componentByteSize);
-        } break;
-        case typeInfo::Component::Genre::Allocatable:
-        case typeInfo::Component::Genre::Automatic: {
-          auto *toDesc{reinterpret_cast<Descriptor *>(
-              to.Element<char>(toAt) + comp.offset())};
-          const auto *fromDesc{reinterpret_cast<const Descriptor *>(
-              from.Element<char>(fromAt) + comp.offset())};
-          // Allocatable components of the LHS are unconditionally
-          // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
-          // unlike a "top-level" assignment to a variable, where
-          // deallocation is optional.
-          //
-          // Be careful not to destroy/reallocate the LHS, if there is
-          // overlap between LHS and RHS (it seems that partial overlap
-          // is not possible, though).
-          // Invoke Assign() recursively to deal with potential aliasing.
-          if (toDesc->IsAllocatable()) {
-            if (!fromDesc->IsAllocated()) {
-              // No aliasing.
-              //
-              // If to is not allocated, the Destroy() call is a no-op.
-              // This is just a shortcut, because the recursive Assign()
-              // below would initiate the destruction for to.
-              // No finalization is required.
-              toDesc->Destroy(
-                  /*finalize=*/false, /*destroyPointers=*/false, &terminator);
-              continue; // F'2018 10.2.1.3(13)(2)
-            }
-          }
-          // Force LHS deallocation with DeallocateLHS flag.
-          // The actual deallocation may be avoided, if the existing
-          // location can be reoccupied.
-          Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS);
-        } break;
-        }
+  if (toDerived_) {
+    if (toDerived_->noDefinedAssignment()) { // componentwise
+      if (int status{workQueue.BeginDerivedAssign<true>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
-      // Copy procedure pointer components
-      const Descriptor &procPtrDesc{updatedToDerived->procPtr()};
-      std::size_t numProcPtrs{procPtrDesc.Elements()};
-      for (std::size_t k{0}; k < numProcPtrs; ++k) {
-        const auto &procPtr{
-            *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(
-                k)};
-        memmoveFct(to.Element<char>(toAt) + procPtr.offset,
-            from.Element<const char>(fromAt) + procPtr.offset,
-            sizeof(typeInfo::ProcedurePointer));
+    } else { // elementwise
+      if (int status{workQueue.BeginDerivedAssign<false>(
+              to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)};
+          status != StatOk && status != StatContinue) {
+        return status;
       }
     }
-  } else { // intrinsic type, intrinsic assignment
-    if (isSimpleMemmove()) {
-      memmoveFct(to.raw().base_addr, from.raw().base_addr,
-          toElements * toElementBytes);
-    } else if (toElementBytes > fromElementBytes) { // blank padding
-      switch (to.type().raw()) {
+    toDeallocate_ = nullptr;
+  } else if (IsSimpleMemmove()) {
+    memmoveFct_(to_.raw().base_addr, from_->raw().base_addr,
+        toElements * toElementBytes);
+  } else {
+    // Scalar expansion of the RHS is implied by using the same empty
+    // subscript values on each (seemingly) elemental reference into
+    // "from".
+    SubscriptValue toAt[maxRank];
+    to_.GetLowerBounds(toAt);
+    SubscriptValue fromAt[maxRank];
+    from_->GetLowerBounds(fromAt);
+    if (toElementBytes > fromElementBytes) { // blank padding
+      switch (to_.type().raw()) {
       case CFI_type_signed_char:
       case CFI_type_char:
-        BlankPadCharacterAssignment<char>(to, from, toAt, fromAt, toElements,
+        BlankPadCharacterAssignment<char>(to_, *from_, toAt, fromAt, toElements,
             toElementBytes, fromElementBytes);
         break;
       case CFI_type_char16_t:
-        BlankPadCharacterAssignment<char16_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char16_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       case CFI_type_char32_t:
-        BlankPadCharacterAssignment<char32_t>(to, from, toAt, fromAt,
+        BlankPadCharacterAssignment<char32_t>(to_, *from_, toAt, fromAt,
             toElements, toElementBytes, fromElementBytes);
         break;
       default:
-        terminator.Crash("unexpected type code %d in blank padded Assign()",
-            to.type().raw());
+        workQueue.terminator().Crash(
+            "unexpected type code %d in blank padded Assign()",
+            to_.type().raw());
       }
     } else { // elemental copies, possibly with character truncation
       for (std::size_t n{toElements}; n-- > 0;
-          to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) {
-        memmoveFct(to.Element<char>(toAt), from.Element<const char>(fromAt),
+          to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) {
+        memmoveFct_(to_.Element<char>(toAt), from_->Element<const char>(fromAt),
             toElementBytes);
       }
     }
   }
-  if (deferDeallocation) {
-    // deferDeallocation is used only when LHS is an allocatable.
-    // The finalization has already been run for it.
-    deferDeallocation->Destroy(
-        /*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (persist_) {
+    done_ = true;
+    return StatContinue;
+  } else {
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+      toDeallocate_ = nullptr;
+    }
+    return StatOk;
   }
 }
 
-RT_OFFLOAD_API_GROUP_BEGIN
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Begin(
+    WorkQueue &workQueue) {
+  if (toIsContiguous_ && fromIsContiguous_ &&
+      this->derived_.noDestructionNeeded() &&
+      this->derived_.noDefinedAssignment() &&
+      this->instance_.rank() == this->from_->rank()) {
+    if (std::size_t elementBytes{this->instance_.ElementBytes()};
+        elementBytes == this->from_->ElementBytes()) {
+      // Fastest path.  Both LHS and RHS are contiguous, RHS is not a scalar
+      // to be expanded, the types have the same size, and there are no
+      // allocatable components or defined ASSIGNMENT(=) at any level.
+      memmoveFct_(this->instance_.template OffsetElement<char>(),
+          this->from_->template OffsetElement<const char>(),
+          this->instance_.Elements() * elementBytes);
+      return StatOk;
+    }
+  }
+  // Use PolymorphicLHS for components so that the right things happen
+  // when the components are polymorphic; when they're not, they're both
+  // not, and their declared types will match.
+  int nestedFlags{MaybeReallocate | PolymorphicLHS};
+  if (flags_ & ComponentCanBeDefinedAssignment) {
+    nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
+  }
+  flags_ = nestedFlags;
+  // Copy procedure pointer components
+  const Descriptor &procPtrDesc{this->derived_.procPtr()};
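+  // IsComplete() holds when there are no data components (or no elements); in
+  // that case the element loop below must be reset and driven explicitly.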
+  bool noDataComponents{this->IsComplete()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &procPtr{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (noDataComponents) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        memmoveFct_(this->instance_.template ElementComponent<char>(
+                        this->subscripts_, procPtr.offset),
+            this->from_->template ElementComponent<const char>(
+                this->fromSubscripts_, procPtr.offset),
+            sizeof(typeInfo::ProcedurePointer));
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  if (noDataComponents) {
+    return StatOk;
+  }
+  return StatContinue;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Begin(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Begin(WorkQueue &);
+
+template <bool IS_COMPONENTWISE>
+RT_API_ATTRS int DerivedAssignTicket<IS_COMPONENTWISE>::Continue(
+    WorkQueue &workQueue) {
+  while (!this->IsComplete()) {
+    // Copy the data components (incl. the parent) first.
+    switch (this->component_->genre()) {
+    case typeInfo::Component::Genre::Data:
+      if (this->component_->category() == TypeCategory::Derived) {
+        Descriptor &toCompDesc{this->componentDescriptor_.descriptor()};
+        Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()};
+        this->component_->CreatePointerDescriptor(toCompDesc, this->instance_,
+            workQueue.terminator(), this->subscripts_);
+        this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_,
+            workQueue.terminator(), this->fromSubscripts_);
+        this->Advance();
+        if (int status{workQueue.BeginAssign(
+                toCompDesc, fromCompDesc, flags_, memmoveFct_)};
+            status != StatOk) {
+          return status;
+        }
+      } else { // Component has intrinsic type; simply copy raw bytes
+        std::size_t componentByteSize{
+            this->component_->SizeInBytes(this->instance_)};
+        if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+          std::size_t offset{this->component_->offset()};
+          char *to{this->instance_.template OffsetElement<char>(offset)};
+          const char *from{
+              this->from_->template OffsetElement<const char>(offset)};
+          std::size_t toElementStride{this->instance_.ElementBytes()};
+          std::size_t fromElementStride{
+              this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+          if (toElementStride == fromElementStride &&
+              toElementStride == componentByteSize) {
+            memmoveFct_(to, from, this->elements_ * componentByteSize);
+          } else {
+            for (std::size_t n{this->elements_}; n--;
+                to += toElementStride, from += fromElementStride) {
+              memmoveFct_(to, from, componentByteSize);
+            }
+          }
+          this->Componentwise::Advance();
+        } else {
+          memmoveFct_(
+              this->instance_.template Element<char>(this->subscripts_) +
+                  this->component_->offset(),
+              this->from_->template Element<const char>(this->fromSubscripts_) +
+                  this->component_->offset(),
+              componentByteSize);
+          this->Advance();
+        }
+      }
+      break;
+    case typeInfo::Component::Genre::Pointer: {
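+      // Data pointer components are copied as raw descriptor bytes, which
+      // carries over the pointer association.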
+      std::size_t componentByteSize{
+          this->component_->SizeInBytes(this->instance_)};
+      if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) {
+        std::size_t offset{this->component_->offset()};
+        char *to{this->instance_.template OffsetElement<char>(offset)};
+        const char *from{
+            this->from_->template OffsetElement<const char>(offset)};
+        std::size_t toElementStride{this->instance_.ElementBytes()};
+        std::size_t fromElementStride{
+            this->from_->rank() == 0 ? 0 : this->from_->ElementBytes()};
+        if (toElementStride == fromElementStride &&
+            toElementStride == componentByteSize) {
+          memmoveFct_(to, from, this->elements_ * componentByteSize);
+        } else {
+          for (std::size_t n{this->elements_}; n--;
+              to += toElementStride, from += fromElementStride) {
+            memmoveFct_(to, from, componentByteSize);
+          }
+        }
+        this->Componentwise::Advance();
+      } else {
+        memmoveFct_(this->instance_.template Element<char>(this->subscripts_) +
+                this->component_->offset(),
+            this->from_->template Element<const char>(this->fromSubscripts_) +
+                this->component_->offset(),
+            componentByteSize);
+        this->Advance();
+      }
+    } break;
+    case typeInfo::Component::Genre::Allocatable:
+    case typeInfo::Component::Genre::Automatic: {
+      auto *toDesc{reinterpret_cast<Descriptor *>(
+          this->instance_.template Element<char>(this->subscripts_) +
+          this->component_->offset())};
+      const auto *fromDesc{reinterpret_cast<const Descriptor *>(
+          this->from_->template Element<char>(this->fromSubscripts_) +
+          this->component_->offset())};
+      if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) {
+        if (toDesc->IsAllocated()) {
+          if (this->phase_ == 0) {
+            this->phase_++;
+            if (const auto *componentDerived{this->component_->derivedType()};
+                componentDerived && !componentDerived->noDestructionNeeded()) {
+              if (int status{workQueue.BeginDestroy(
+                      *toDesc, *componentDerived, /*finalize=*/false)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
+          toDesc->Deallocate();
+        }
+        this->Advance();
+      } else {
+        // Allocatable components of the LHS are unconditionally
+        // deallocated before assignment (F'2018 10.2.1.3(13)(1)),
+        // unlike a "top-level" assignment to a variable, where
+        // deallocation is optional.
+        this->Advance();
+        int nestedFlags{flags_};
+        if (this->derived_.noFinalizationNeeded() &&
+            this->derived_.noInitializationNeeded() &&
+            this->derived_.noDestructionNeeded()) {
+          // The actual deallocation may be avoided if the existing
+          // location can be reoccupied.
+        } else {
+          // Force LHS deallocation with DeallocateLHS flag.
+          nestedFlags |= DeallocateLHS;
+        }
+        if (int status{workQueue.BeginAssign(
+                *toDesc, *fromDesc, nestedFlags, memmoveFct_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    } break;
+    }
+  }
+  if (deallocateAfter_) {
+    deallocateAfter_->Deallocate();
+  }
+  return StatOk;
+}
+template RT_API_ATTRS int DerivedAssignTicket<false>::Continue(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket<true>::Continue(WorkQueue &);
 
 RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc,
     const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) {
@@ -582,7 +759,6 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from,
       }
     }
   }
-
   Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS);
 }
 
@@ -599,7 +775,6 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var,
 void RTDEF(CopyOutAssign)(
     Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) {
   Terminator terminator{sourceFile, sourceLine};
-
   // Copyout from the temporary must not cause any finalizations
   // for LHS. The variable must be properly initialized already.
   if (var) {
diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp
index 35037036f63e7..8ab737c701b01 100644
--- a/flang-rt/lib/runtime/derived.cpp
+++ b/flang-rt/lib/runtime/derived.cpp
@@ -12,6 +12,7 @@
 #include "flang-rt/runtime/terminator.h"
 #include "flang-rt/runtime/tools.h"
 #include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
 
 namespace Fortran::runtime {
 
@@ -30,180 +31,193 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank],
 }
 
 RT_API_ATTRS int Initialize(const Descriptor &instance,
-    const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat,
-    const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{instance.Elements()};
-  int stat{StatOk};
-  // Initialize data components in each element; the per-element iterations
-  // constitute the inner loops, not the outer ones
-  std::size_t myComponents{componentDesc.Elements()};
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &allocDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(allocDesc, instance, terminator);
+    const typeInfo::DerivedType &derived, Terminator &terminator, bool,
+    const Descriptor *) {
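+  // Initialization now runs as an InitializeTicket; the hasStat/errMsg
+  // arguments are no longer used by this entry point.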
+  WorkQueue workQueue{terminator};
+  int status{workQueue.BeginInitialize(instance, derived)};
+  return status == StatContinue ? workQueue.Run() : status;
+}
+
+RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) {
+  // Initialize procedure pointer components in each element
+  const Descriptor &procPtrDesc{derived_.procPtr()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    bool noDataComponents{IsComplete()};
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &comp{
+          *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
+      // Loop only over elements
+      if (noDataComponents) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        auto &pptr{*instance_.ElementComponent<typeInfo::ProcedurePointer>(
+            subscripts_, comp.offset)};
+        pptr = comp.procInitialization;
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    Elementwise::Reset();
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) {
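+  // Visit the remaining data components; default initialization of nested
+  // derived type components is delegated to child InitializeTickets.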
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      // Establish allocatable descriptors
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &allocDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            allocDesc, instance_, workQueue.terminator());
         allocDesc.raw().attribute = CFI_attribute_allocatable;
-        if (comp.genre() == typeInfo::Component::Genre::Automatic) {
-          stat = ReturnError(
-              terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) {
-              if (const auto *derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  stat = Initialize(
-                      allocDesc, *derived, terminator, hasStat, errMsg);
-                }
-              }
-            }
-          }
-          if (stat != StatOk) {
-            break;
-          }
-        }
       }
-    } else if (const void *init{comp.initialization()}) {
+      SkipToNextComponent();
+    } else if (const void *init{component_->initialization()}) {
       // Explicit initialization of data pointers and
       // non-allocatable non-automatic components
-      std::size_t bytes{comp.SizeInBytes(instance)};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        char *ptr{instance.ElementComponent<char>(at, comp.offset())};
+      std::size_t bytes{component_->SizeInBytes(instance_)};
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        char *ptr{instance_.ElementComponent<char>(
+            subscripts_, component_->offset())};
         std::memcpy(ptr, init, bytes);
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Pointer) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Pointer) {
       // Data pointers without explicit initialization are established
       // so that they are valid right-hand side targets of pointer
       // assignment statements.
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        Descriptor &ptrDesc{
-            *instance.ElementComponent<Descriptor>(at, comp.offset())};
-        comp.EstablishDescriptor(ptrDesc, instance, terminator);
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        Descriptor &ptrDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        component_->EstablishDescriptor(
+            ptrDesc, instance_, workQueue.terminator());
         ptrDesc.raw().attribute = CFI_attribute_pointer;
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) {
+      SkipToNextComponent();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noInitializationNeeded()) {
       // Default initialization of non-pointer non-allocatable/automatic
-      // data component.  Handles parent component's elements.  Recursive.
+      // data component.  Handles parent component's elements.
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, instance);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            instance.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = Initialize(compDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
-        }
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginInitialize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  // Initialize procedure pointer components in each element
-  const Descriptor &procPtrDesc{derived.procPtr()};
-  std::size_t myProcPtrs{procPtrDesc.Elements()};
-  for (std::size_t k{0}; k < myProcPtrs; ++k) {
-    const auto &comp{
-        *procPtrDesc.ZeroBasedIndexedElement<typeInfo::ProcPtrComponent>(k)};
-    SubscriptValue at[maxRank];
-    instance.GetLowerBounds(at);
-    for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) {
-      auto &pptr{*instance.ElementComponent<typeInfo::ProcedurePointer>(
-          at, comp.offset)};
-      pptr = comp.procInitialization;
-    }
-  }
-  return stat;
+  return StatOk;
 }
 
 RT_API_ATTRS int InitializeClone(const Descriptor &clone,
-    const Descriptor &orig, const typeInfo::DerivedType &derived,
+    const Descriptor &original, const typeInfo::DerivedType &derived,
     Terminator &terminator, bool hasStat, const Descriptor *errMsg) {
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t elements{orig.Elements()};
-  int stat{StatOk};
-
-  // Skip pointers and unallocated variables.
-  if (orig.IsPointer() || !orig.IsAllocated()) {
-    return stat;
+  if (original.IsPointer() || !original.IsAllocated()) {
+    return StatOk; // nothing to do
+  } else {
+    WorkQueue workQueue{terminator};
+    int status{workQueue.BeginInitializeClone(
+        clone, original, derived, hasStat, errMsg)};
+    return status == StatContinue ? workQueue.Run() : status;
   }
-  // Initialize each data component.
-  std::size_t components{componentDesc.Elements()};
-  for (std::size_t i{0}; i < components; ++i) {
-    const typeInfo::Component &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(i)};
-    SubscriptValue at[maxRank];
-    orig.GetLowerBounds(at);
-    // Allocate allocatable components that are also allocated in the original
-    // object.
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable) {
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
-        Descriptor &origDesc{
-            *orig.ElementComponent<Descriptor>(at, comp.offset())};
-        Descriptor &cloneDesc{
-            *clone.ElementComponent<Descriptor>(at, comp.offset())};
-        if (origDesc.IsAllocated()) {
+}
+
+RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) {
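+  // phase_ tracks the resumable steps for each allocatable component element:
+  // (0) allocate the clone and default-initialize it, then (1) recursively
+  // clone its own allocatable components.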
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable) {
+      Descriptor &origDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (origDesc.IsAllocated()) {
+        Descriptor &cloneDesc{*clone_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        if (phase_ == 0) {
+          ++phase_;
           cloneDesc.ApplyMold(origDesc, origDesc.rank());
-          stat = ReturnError(
-              terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat);
-          if (stat == StatOk) {
-            if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) {
-              if (const typeInfo::DerivedType *
-                  derived{addendum->derivedType()}) {
-                if (!derived->noInitializationNeeded()) {
-                  // Perform default initialization for the allocated element.
-                  stat = Initialize(
-                      cloneDesc, *derived, terminator, hasStat, errMsg);
-                }
-                // Initialize derived type's allocatables.
-                if (stat == StatOk) {
-                  stat = InitializeClone(cloneDesc, origDesc, *derived,
-                      terminator, hasStat, errMsg);
+          if (int stat{ReturnError(workQueue.terminator(),
+                  cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)};
+              stat != StatOk) {
+            return stat;
+          }
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              if (!derived->noInitializationNeeded()) {
+                // Perform default initialization for the allocated element.
+                if (int status{workQueue.BeginInitialize(cloneDesc, *derived)};
+                    status != StatOk) {
+                  return status;
                 }
               }
             }
           }
         }
-        if (stat != StatOk) {
-          break;
+        if (phase_ == 1) {
+          ++phase_;
+          if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) {
+            if (const typeInfo::DerivedType *derived{addendum->derivedType()}) {
+              // Initialize derived type's allocatables.
+              if (int status{workQueue.BeginInitializeClone(
+                      cloneDesc, origDesc, *derived, hasStat_, errMsg_)};
+                  status != StatOk) {
+                return status;
+              }
+            }
+          }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType()) {
-      // Handle nested derived types.
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, orig);
-      // Data components don't have descriptors, allocate them.
-      StaticDescriptor<maxRank, true, 0> origStaticDesc;
-      StaticDescriptor<maxRank, true, 0> cloneStaticDesc;
-      Descriptor &origDesc{origStaticDesc.descriptor()};
-      Descriptor &cloneDesc{cloneStaticDesc.descriptor()};
-      // Initialize each element.
-      for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (component_->derivedType()) {
+        // Handle nested derived types.
+        const typeInfo::DerivedType &compType{*component_->derivedType()};
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &origDesc{componentDescriptor_.descriptor()};
+        Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()};
         origDesc.Establish(compType,
-            orig.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
         cloneDesc.Establish(compType,
-            clone.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        stat = InitializeClone(
-            cloneDesc, origDesc, compType, terminator, hasStat, errMsg);
-        if (stat != StatOk) {
-          break;
+            clone_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginInitializeClone(
+                cloneDesc, origDesc, compType, hasStat_, errMsg_)};
+            status != StatOk) {
+          return status;
         }
+      } else {
+        SkipToNextComponent();
       }
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatOk;
+}
+
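+// The phase_ counter above is what makes a ticket resumable: when a nested
+// Begin...() call returns a status other than StatOk, Continue() returns
+// early, and the next call re-enters the same element without repeating the
+// steps that already ran. The shape of the pattern, as a sketch:
+//
+//   while (!IsComplete()) {
+//     if (phase_ == 0) {
+//       ++phase_;
+//       /* step 1: may enqueue nested work and return early */
+//     }
+//     if (phase_ == 1) {
+//       ++phase_;
+//       /* step 2: likewise */
+//     }
+//     Advance(); /* next element; phase_ starts over at 0 */
+//   }
+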
+// Fortran 2018 subclause 7.5.6.2
+RT_API_ATTRS void Finalize(const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived, Terminator *terminator) {
+  if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) {
+    Terminator stubTerminator{"Finalize() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) {
+      workQueue.Run();
     }
   }
-  return stat;
 }
 
 static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
@@ -221,7 +235,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal(
 }
 
 static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
+    const typeInfo::DerivedType &derived, Terminator &terminator) {
   if (const auto *special{FindFinal(derived, descriptor.rank())}) {
     if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) {
       std::size_t elements{descriptor.Elements()};
@@ -258,9 +272,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
         copy = descriptor;
         copy.set_base_addr(nullptr);
         copy.raw().attribute = CFI_attribute_allocatable;
-        Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0};
-        RUNTIME_CHECK(terminator ? *terminator : stubTerminator,
-            copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
+        RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS);
         ShallowCopyDiscontiguousToContiguous(copy, descriptor);
         argDescriptor = &copy;
       }
@@ -284,87 +296,94 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor,
   }
 }
 
-// Fortran 2018 subclause 7.5.6.2
-RT_API_ATTRS void Finalize(const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) {
-    return;
-  }
-  CallFinalSubroutine(descriptor, derived, terminator);
-  const auto *parentType{derived.GetParentType()};
-  bool recurse{parentType && !parentType->noFinalizationNeeded()};
+RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) {
+  CallFinalSubroutine(instance_, derived_, workQueue.terminator());
   // If there's a finalizable parent component, handle it last, as required
   // by the Fortran standard (7.5.6.2), and do so recursively with the same
   // descriptor so that the rank is preserved.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  for (auto k{recurse ? std::size_t{1}
-                      /* skip first component, it's the parent */
-                      : 0};
-       k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    SubscriptValue at[maxRank];
-    descriptor.GetLowerBounds(at);
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable &&
-        comp.category() == TypeCategory::Derived) {
+  finalizableParentType_ = derived_.GetParentType();
+  if (finalizableParentType_) {
+    if (finalizableParentType_->noFinalizationNeeded()) {
+      finalizableParentType_ = nullptr;
+    } else {
+      SkipToNextComponent();
+    }
+  }
+  return StatContinue;
+}
+
+RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable &&
+        component_->category() == TypeCategory::Derived) {
       // Component may be polymorphic or unlimited polymorphic. Need to use the
       // dynamic type to check whether finalization is needed.
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        const Descriptor &compDesc{
-            *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (compDesc.IsAllocated()) {
-          if (const DescriptorAddendum * addendum{compDesc.Addendum()}) {
-            if (const typeInfo::DerivedType *
-                compDynamicType{addendum->derivedType()}) {
-              if (!compDynamicType->noFinalizationNeeded()) {
-                Finalize(compDesc, *compDynamicType, terminator);
+      const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (const DescriptorAddendum *addendum{compDesc.Addendum()}) {
+          if (const typeInfo::DerivedType *compDynamicType{
+                  addendum->derivedType()}) {
+            if (!compDynamicType->noFinalizationNeeded()) {
+              if (int status{
+                      workQueue.BeginFinalize(compDesc, *compDynamicType)};
+                  status != StatOk) {
+                return status;
               }
             }
           }
         }
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      if (const typeInfo::DerivedType * compType{comp.derivedType()}) {
-        if (!compType->noFinalizationNeeded()) {
-          for (std::size_t j{0}; j++ < elements;
-               descriptor.IncrementSubscripts(at)) {
-            const Descriptor &compDesc{
-                *descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-            if (compDesc.IsAllocated()) {
-              Finalize(compDesc, *compType, terminator);
-            }
+    } else if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      if (const typeInfo::DerivedType *compType{component_->derivedType()};
+          compType && !compType->noFinalizationNeeded()) {
+        const Descriptor &compDesc{*instance_.ElementComponent<Descriptor>(
+            subscripts_, component_->offset())};
+        Advance();
+        if (compDesc.IsAllocated()) {
+          if (int status{workQueue.BeginFinalize(compDesc, *compType)};
+              status != StatOk) {
+            return status;
           }
         }
+      } else {
+        SkipToNextComponent();
       }
-    } else if (comp.genre() == typeInfo::Component::Genre::Data &&
-        comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) {
+    } else if (component_->genre() == typeInfo::Component::Genre::Data &&
+        component_->derivedType() &&
+        !component_->derivedType()->noFinalizationNeeded()) {
       SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
-        compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Finalize(compDesc, compType, terminator);
+      GetComponentExtents(extents, *component_, instance_);
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      const typeInfo::DerivedType &compType{*component_->derivedType()};
+      compDesc.Establish(compType,
+          instance_.ElementComponent<char>(subscripts_, component_->offset()),
+          component_->rank(), extents);
+      Advance();
+      if (int status{workQueue.BeginFinalize(compDesc, compType)};
+          status != StatOk) {
+        return status;
       }
+    } else {
+      SkipToNextComponent();
     }
   }
-  if (recurse) {
-    StaticDescriptor<maxRank, true, 8 /*?*/> statDesc;
-    Descriptor &tmpDesc{statDesc.descriptor()};
-    tmpDesc = descriptor;
+  // Last, handle the parent component, if there is one and it is finalizable.
+  if (finalizableParentType_) {
+    Descriptor &tmpDesc{componentDescriptor_.descriptor()};
+    tmpDesc = instance_;
     tmpDesc.raw().attribute = CFI_attribute_pointer;
-    tmpDesc.Addendum()->set_derivedType(parentType);
-    tmpDesc.raw().elem_len = parentType->sizeInBytes();
-    Finalize(tmpDesc, *parentType, terminator);
+    tmpDesc.Addendum()->set_derivedType(finalizableParentType_);
+    tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes();
+    const auto &parentType{*finalizableParentType_};
+    finalizableParentType_ = nullptr;
+    // Don't return StatOk here if the nested Finalize is still running;
+    // it needs this->componentDescriptor_.
+    return workQueue.BeginFinalize(tmpDesc, parentType);
   }
+  return StatOk;
 }
 
 // The order of finalization follows Fortran 2018 7.5.6.2, with
@@ -373,51 +392,71 @@ RT_API_ATTRS void Finalize(const Descriptor &descriptor,
 // preceding any deallocation.
 RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize,
     const typeInfo::DerivedType &derived, Terminator *terminator) {
-  if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) {
-    return;
+  if (!derived.noDestructionNeeded() && descriptor.IsAllocated()) {
+    Terminator stubTerminator{"Destroy() in Fortran runtime", 0};
+    WorkQueue workQueue{terminator ? *terminator : stubTerminator};
+    if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) {
+      workQueue.Run();
+    }
   }
-  if (finalize && !derived.noFinalizationNeeded()) {
-    Finalize(descriptor, derived, terminator);
+}
+
+RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) {
+  if (finalize_ && !derived_.noFinalizationNeeded()) {
+    if (int status{workQueue.BeginFinalize(instance_, derived_)};
+        status != StatOk && status != StatContinue) {
+      return status;
+    }
   }
+  return StatContinue;
+}
+
+RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) {
   // Deallocate all direct and indirect allocatable and automatic components.
   // Contrary to finalization, the order of deallocation does not matter.
-  const Descriptor &componentDesc{derived.component()};
-  std::size_t myComponents{componentDesc.Elements()};
-  std::size_t elements{descriptor.Elements()};
-  SubscriptValue at[maxRank];
-  descriptor.GetLowerBounds(at);
-  for (std::size_t k{0}; k < myComponents; ++k) {
-    const auto &comp{
-        *componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(k)};
-    const bool destroyComp{
-        comp.derivedType() && !comp.derivedType()->noDestructionNeeded()};
-    if (comp.genre() == typeInfo::Component::Genre::Allocatable ||
-        comp.genre() == typeInfo::Component::Genre::Automatic) {
-      for (std::size_t j{0}; j < elements; ++j) {
-        Descriptor *d{
-            descriptor.ElementComponent<Descriptor>(at, comp.offset())};
-        if (destroyComp) {
-          Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator);
+  while (!IsComplete()) {
+    const auto *componentDerived{component_->derivedType()};
+    if (component_->genre() == typeInfo::Component::Genre::Allocatable ||
+        component_->genre() == typeInfo::Component::Genre::Automatic) {
+      Descriptor *d{instance_.ElementComponent<Descriptor>(
+          subscripts_, component_->offset())};
+      if (d->IsAllocated()) {
+        if (phase_ == 0) {
+          ++phase_;
+          if (componentDerived && !componentDerived->noDestructionNeeded()) {
+            if (int status{workQueue.BeginDestroy(
+                    *d, *componentDerived, /*finalize=*/false)};
+                status != StatOk) {
+              return status;
+            }
+          }
         }
         d->Deallocate();
-        descriptor.IncrementSubscripts(at);
       }
-    } else if (destroyComp &&
-        comp.genre() == typeInfo::Component::Genre::Data) {
-      SubscriptValue extents[maxRank];
-      GetComponentExtents(extents, comp, descriptor);
-      StaticDescriptor<maxRank, true, 0> staticDescriptor;
-      Descriptor &compDesc{staticDescriptor.descriptor()};
-      const typeInfo::DerivedType &compType{*comp.derivedType()};
-      for (std::size_t j{0}; j++ < elements;
-           descriptor.IncrementSubscripts(at)) {
+      Advance();
+    } else if (component_->genre() == typeInfo::Component::Genre::Data) {
+      if (!componentDerived || componentDerived->noDestructionNeeded()) {
+        SkipToNextComponent();
+      } else {
+        SubscriptValue extents[maxRank];
+        GetComponentExtents(extents, *component_, instance_);
+        Descriptor &compDesc{componentDescriptor_.descriptor()};
+        const typeInfo::DerivedType &compType{*componentDerived};
         compDesc.Establish(compType,
-            descriptor.ElementComponent<char>(at, comp.offset()), comp.rank(),
-            extents);
-        Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator);
+            instance_.ElementComponent<char>(subscripts_, component_->offset()),
+            component_->rank(), extents);
+        Advance();
+        if (int status{workQueue.BeginDestroy(
+                compDesc, *componentDerived, /*finalize=*/false)};
+            status != StatOk) {
+          return status;
+        }
       }
+    } else {
+      SkipToNextComponent();
     }
   }
+  return StatOk;
 }
 
 RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) {
diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp
index 3db1455af52fe..364724b89ba0d 100644
--- a/flang-rt/lib/runtime/descriptor-io.cpp
+++ b/flang-rt/lib/runtime/descriptor-io.cpp
@@ -7,15 +7,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "descriptor-io.h"
+#include "edit-input.h"
+#include "edit-output.h"
+#include "unit.h"
+#include "flang-rt/runtime/descriptor.h"
+#include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/namelist.h"
+#include "flang-rt/runtime/terminator.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang-rt/runtime/work-queue.h"
+#include "flang/Common/optional.h"
 #include "flang/Common/restorer.h"
+#include "flang/Common/uint128.h"
+#include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/freestanding-tools.h"
 
+// Implementation of I/O data list item transfers based on descriptors.
+// (All I/O items come through here so that the code is exercised for test;
+// some scalar I/O data transfer APIs could be changed to bypass their use
+// of descriptors in the future for better efficiency.)
+
 namespace Fortran::runtime::io::descr {
 RT_OFFLOAD_API_GROUP_BEGIN
 
+template <typename A>
+inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
+    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
+  A *p{descriptor.Element<A>(subscripts)};
+  if (!p) {
+    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
+                                 "address or subscripts out of range");
+  }
+  return *p;
+}
+
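+// The per-category helpers later in this file all traverse a (possibly
+// discontiguous) array the same way, using ExtractElement<> together with
+// GetLowerBounds()/IncrementSubscripts(). The shared loop shape, sketched
+// for an element type A:
+//
+//   SubscriptValue subscripts[maxRank];
+//   descriptor.GetLowerBounds(subscripts);
+//   for (std::size_t j{0}; j < descriptor.Elements(); ++j) {
+//     A &x{ExtractElement<A>(io, descriptor, subscripts)};
+//     /* edit-directed transfer of x */
+//     descriptor.IncrementSubscripts(subscripts);
+//   }
+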
 // Defined formatted I/O (maybe)
-Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
+static RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
+    IoStatementState &io, const Descriptor &descriptor,
+    const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special,
     const SubscriptValue subscripts[]) {
   Fortran::common::optional<DataEdit> peek{
@@ -104,8 +133,8 @@ Fortran::common::optional<bool> DefinedFormattedIo(IoStatementState &io,
 }
 
 // Defined unformatted I/O
-bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
-    const typeInfo::DerivedType &derived,
+static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io,
+    const Descriptor &descriptor, const typeInfo::DerivedType &derived,
     const typeInfo::SpecialBinding &special) {
   // Unformatted I/O must have an external unit (or child thereof).
   IoErrorHandler &handler{io.GetIoErrorHandler()};
@@ -152,5 +181,619 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor,
   return handler.GetIoStat() == IostatOk;
 }
 
+// Per-category descriptor-based I/O templates
+
+// TODO (perhaps as a nontrivial but small starter project): implement
+// automatic repetition counts, like "10*3.14159", for list-directed and
+// NAMELIST array output.
+
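+// One possible approach to the TODO above, assuming a run-length scan over
+// values already gathered contiguously (RunLength is hypothetical, not part
+// of this change):
+//
+//   template <typename A>
+//   inline RT_API_ATTRS std::size_t RunLength(const A *p, std::size_t n) {
+//     std::size_t k{1}; // length of the run of values equal to p[0]
+//     while (k < n && p[k] == p[0]) {
+//       ++k;
+//     }
+//     return k; // emit "k*value" rather than k separate values when k > 1
+//   }
+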
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
+    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditIntegerInput(
+                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedIntegerIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedRealIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    if (auto edit{io.GetNextDataEdit()}) {
+      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
+      if constexpr (DIR == Direction::Output) {
+        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
+          return false;
+        }
+      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+        io.GetIoErrorHandler().Crash(
+            "FormattedRealIO: subscripts out of bounds");
+      }
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedComplexIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  bool isListOutput{
+      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
+  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
+    if (isListOutput) {
+      DataEdit rEdit, iEdit;
+      rEdit.descriptor = DataEdit::ListDirectedRealPart;
+      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
+      rEdit.modes = iEdit.modes = io.mutableModes();
+      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
+          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
+        return false;
+      }
+    } else {
+      for (int k{0}; k < 2; ++k, ++x) {
+        auto edit{io.GetNextDataEdit()};
+        if (!edit) {
+          return false;
+        } else if constexpr (DIR == Direction::Output) {
+          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
+            return false;
+          }
+        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
+          break;
+        } else if (EditRealInput<KIND>(
+                       io, *edit, reinterpret_cast<void *>(x))) {
+          anyInput = true;
+        } else {
+          return anyInput && edit->IsNamelist();
+        }
+      }
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedComplexIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <typename A, Direction DIR>
+inline RT_API_ATTRS bool FormattedCharacterIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditCharacterOutput(io, *edit, x, length)) {
+          return false;
+        }
+      } else { // input
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          if (EditCharacterInput(io, *edit, x, length)) {
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedCharacterIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <int KIND, Direction DIR>
+inline RT_API_ATTRS bool FormattedLogicalIO(
+    IoStatementState &io, const Descriptor &descriptor) {
+  std::size_t numElements{descriptor.Elements()};
+  SubscriptValue subscripts[maxRank];
+  descriptor.GetLowerBounds(subscripts);
+  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
+  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
+  bool anyInput{false};
+  for (std::size_t j{0}; j < numElements; ++j) {
+    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
+    if (listOutput) {
+      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
+        return false;
+      }
+    } else if (auto edit{io.GetNextDataEdit()}) {
+      if constexpr (DIR == Direction::Output) {
+        if (!EditLogicalOutput(io, *edit, x != 0)) {
+          return false;
+        }
+      } else {
+        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
+          bool truth{};
+          if (EditLogicalInput(io, *edit, truth)) {
+            x = truth;
+            anyInput = true;
+          } else {
+            return anyInput && edit->IsNamelist();
+          }
+        }
+      }
+    } else {
+      return false;
+    }
+    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
+      io.GetIoErrorHandler().Crash(
+          "FormattedLogicalIO: subscripts out of bounds");
+    }
+  }
+  return true;
+}
+
+template <Direction DIR>
+RT_API_ATTRS int DerivedIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  while (!IsComplete()) {
+    if (component_->genre() == typeInfo::Component::Genre::Data) {
+      // Create a descriptor for the component
+      Descriptor &compDesc{componentDescriptor_.descriptor()};
+      component_->CreatePointerDescriptor(
+          compDesc, instance_, io_.GetIoErrorHandler(), subscripts_);
+      Advance();
+      if (int status{workQueue.BeginDescriptorIo<DIR>(
+              io_, compDesc, table_, anyIoTookPlace_)};
+          status != StatOk) {
+        return status;
+      }
+    } else {
+      // Component is itself a descriptor
+      char *pointer{
+          instance_.Element<char>(subscripts_) + component_->offset()};
+      const Descriptor &compDesc{
+          *reinterpret_cast<const Descriptor *>(pointer)};
+      Advance();
+      if (compDesc.IsAllocated()) {
+        if (int status{workQueue.BeginDescriptorIo<DIR>(
+                io_, compDesc, table_, anyIoTookPlace_)};
+            status != StatOk) {
+          return status;
+        }
+      }
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DerivedIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DerivedIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Begin(WorkQueue &workQueue) {
+  IoErrorHandler &handler{io_.GetIoErrorHandler()};
+  if (handler.InError()) {
+    return handler.GetIoStat();
+  }
+  if (!io_.get_if<IoDirectionState<DIR>>()) {
+    handler.Crash("DescriptorIO() called for wrong I/O direction");
+    return handler.GetIoStat();
+  }
+  if constexpr (DIR == Direction::Input) {
+    if (!io_.BeginReadingRecord()) {
+      return StatOk;
+    }
+  }
+  if (!io_.get_if<FormattedIoStatementState<DIR>>()) {
+    // Unformatted I/O
+    IoErrorHandler &handler{io_.GetIoErrorHandler()};
+    const DescriptorAddendum *addendum{instance_.Addendum()};
+    if (const typeInfo::DerivedType *type{
+            addendum ? addendum->derivedType() : nullptr}) {
+      // derived type unformatted I/O
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*type,
+                DIR == Direction::Input
+                    ? common::DefinedIo::ReadUnformatted
+                    : common::DefinedIo::WriteUnformatted)}) {
+          if (definedIo->subroutine) {
+            typeInfo::SpecialBinding special{DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                    : typeInfo::SpecialBinding::Which::WriteUnformatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false};
+            if (DefinedUnformattedIo(io_, instance_, *type, special)) {
+              anyIoTookPlace_ = true;
+              return StatOk;
+            }
+          } else {
+            int status{workQueue.BeginDerivedIo<DIR>(
+                io_, instance_, *type, table_, anyIoTookPlace_)};
+            return status == StatContinue ? StatOk : status; // done here
+          }
+        }
+      }
+      if (const typeInfo::SpecialBinding *special{
+              type->FindSpecialBinding(DIR == Direction::Input
+                      ? typeInfo::SpecialBinding::Which::ReadUnformatted
+                      : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
+        if (!table_ || !table_->ignoreNonTbpEntries || special->isTypeBound()) {
+          // defined derived type unformatted I/O
+          if (DefinedUnformattedIo(io_, instance_, *type, *special)) {
+            anyIoTookPlace_ = true;
+            return StatOk;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+      // Default derived type unformatted I/O
+      // TODO: If no component at any level has defined READ or WRITE
+      // (as appropriate), the elements are contiguous, and no byte swapping
+      // is active, do a block transfer via the code below.
+      int status{workQueue.BeginDerivedIo<DIR>(
+          io_, instance_, *type, table_, anyIoTookPlace_)};
+      return status == StatContinue ? StatOk : status; // done here
+    } else {
+      // intrinsic type unformatted I/O
+      auto *externalUnf{io_.get_if<ExternalUnformattedIoStatementState<DIR>>()};
+      ChildUnformattedIoStatementState<DIR> *childUnf{nullptr};
+      InquireIOLengthState *inq{nullptr};
+      bool swapEndianness{false};
+      if (externalUnf) {
+        swapEndianness = externalUnf->unit().swapEndianness();
+      } else {
+        childUnf = io_.get_if<ChildUnformattedIoStatementState<DIR>>();
+        if (!childUnf) {
+          inq = DIR == Direction::Output ? io_.get_if<InquireIOLengthState>()
+                                         : nullptr;
+          RUNTIME_CHECK(handler, inq != nullptr);
+        }
+      }
+      std::size_t elementBytes{instance_.ElementBytes()};
+      std::size_t swappingBytes{elementBytes};
+      if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) {
+        // Byte swapping units can be smaller than elements, namely
+        // for COMPLEX and CHARACTER.
+        if (maybeCatAndKind->first == TypeCategory::Character) {
+          // swap each character position independently
+          swappingBytes = maybeCatAndKind->second; // kind
+        } else if (maybeCatAndKind->first == TypeCategory::Complex) {
+          // swap real and imaginary components independently
+          swappingBytes /= 2;
+        }
+      }
+      using CharType =
+          std::conditional_t<DIR == Direction::Output, const char, char>;
+      auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
+        if constexpr (DIR == Direction::Output) {
+          return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
+              : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
+                             : inq->Emit(&x, totalBytes, swappingBytes);
+        } else {
+          return externalUnf
+              ? externalUnf->Receive(&x, totalBytes, swappingBytes)
+              : childUnf->Receive(&x, totalBytes, swappingBytes);
+        }
+      }};
+      if (!swapEndianness &&
+          instance_.IsContiguous()) { // contiguous unformatted I/O
+        char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+        if (Transfer(x, elements_ * elementBytes)) {
+          anyIoTookPlace_ = true;
+        } else {
+          return IostatEnd;
+        }
+      } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
+        for (; !IsComplete(); Advance()) {
+          char &x{ExtractElement<char>(io_, instance_, subscripts_)};
+          if (Transfer(x, elementBytes)) {
+            anyIoTookPlace_ = true;
+          } else {
+            return IostatEnd;
+          }
+        }
+      }
+    }
+    // Unformatted I/O never needs to call Continue().
+    return StatOk;
+  }
+  // Formatted I/O
+  if (auto catAndKind{instance_.type().GetCategoryAndKind()}) {
+    TypeCategory cat{catAndKind->first};
+    int kind{catAndKind->second};
+    bool any{false};
+    switch (cat) {
+    case TypeCategory::Integer:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, true);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, true);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, true);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, true);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, true);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Unsigned:
+      switch (kind) {
+      case 1:
+        any = FormattedIntegerIO<1, DIR>(io_, instance_, false);
+        break;
+      case 2:
+        any = FormattedIntegerIO<2, DIR>(io_, instance_, false);
+        break;
+      case 4:
+        any = FormattedIntegerIO<4, DIR>(io_, instance_, false);
+        break;
+      case 8:
+        any = FormattedIntegerIO<8, DIR>(io_, instance_, false);
+        break;
+      case 16:
+        any = FormattedIntegerIO<16, DIR>(io_, instance_, false);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Real:
+      switch (kind) {
+      case 2:
+        any = FormattedRealIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedRealIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedRealIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedRealIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedRealIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedRealIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Complex:
+      switch (kind) {
+      case 2:
+        any = FormattedComplexIO<2, DIR>(io_, instance_);
+        break;
+      case 3:
+        any = FormattedComplexIO<3, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedComplexIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedComplexIO<8, DIR>(io_, instance_);
+        break;
+      case 10:
+        any = FormattedComplexIO<10, DIR>(io_, instance_);
+        break;
+      // TODO: case double/double
+      case 16:
+        any = FormattedComplexIO<16, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Character:
+      switch (kind) {
+      case 1:
+        any = FormattedCharacterIO<char, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedCharacterIO<char16_t, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedCharacterIO<char32_t, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Logical:
+      switch (kind) {
+      case 1:
+        any = FormattedLogicalIO<1, DIR>(io_, instance_);
+        break;
+      case 2:
+        any = FormattedLogicalIO<2, DIR>(io_, instance_);
+        break;
+      case 4:
+        any = FormattedLogicalIO<4, DIR>(io_, instance_);
+        break;
+      case 8:
+        any = FormattedLogicalIO<8, DIR>(io_, instance_);
+        break;
+      default:
+        handler.Crash(
+            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
+        return IostatEnd;
+      }
+      break;
+    case TypeCategory::Derived: {
+      // Derived type information must be present for formatted I/O.
+      IoErrorHandler &handler{io_.GetIoErrorHandler()};
+      const DescriptorAddendum *addendum{instance_.Addendum()};
+      RUNTIME_CHECK(handler, addendum != nullptr);
+      derived_ = addendum->derivedType();
+      RUNTIME_CHECK(handler, derived_ != nullptr);
+      if (table_) {
+        if (const auto *definedIo{table_->Find(*derived_,
+                DIR == Direction::Input ? common::DefinedIo::ReadFormatted
+                                        : common::DefinedIo::WriteFormatted)}) {
+          if (definedIo->subroutine) {
+            nonTbpSpecial_.emplace(DIR == Direction::Input
+                    ? typeInfo::SpecialBinding::Which::ReadFormatted
+                    : typeInfo::SpecialBinding::Which::WriteFormatted,
+                definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
+                false);
+            special_ = &*nonTbpSpecial_;
+          }
+        }
+      }
+      if (!special_) {
+        if (const typeInfo::SpecialBinding *binding{
+                derived_->FindSpecialBinding(DIR == Direction::Input
+                        ? typeInfo::SpecialBinding::Which::ReadFormatted
+                        : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
+          if (!table_ || !table_->ignoreNonTbpEntries ||
+              binding->isTypeBound()) {
+            special_ = binding;
+          }
+        }
+      }
+      return StatContinue;
+    }
+    }
+    if (any) {
+      anyIoTookPlace_ = true;
+    } else {
+      return IostatEnd;
+    }
+  } else {
+    handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
+        static_cast<int>(instance_.type().raw()));
+    return handler.GetIoStat();
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Begin(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Begin(
+    WorkQueue &);
+
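+// A concrete illustration of the swappingBytes logic in Begin() above: a
+// COMPLEX(KIND=8) element is 16 bytes but must be byte-reversed as two
+// independent 8-byte units, and a CHARACTER(KIND=2) element as a sequence
+// of 2-byte units. A sketch of that per-unit reversal (the real work is done
+// by the Emit()/Receive() paths, which receive swappingBytes):
+//
+//   void SwapUnits(char *element, std::size_t elementBytes,
+//       std::size_t swappingBytes) {
+//     for (std::size_t at{0}; at + swappingBytes <= elementBytes;
+//          at += swappingBytes) {
+//       std::reverse(element + at, element + at + swappingBytes);
+//     }
+//   }
+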
+template <Direction DIR>
+RT_API_ATTRS int DescriptorIoTicket<DIR>::Continue(WorkQueue &workQueue) {
+  // Only derived type formatted I/O gets here.
+  while (!IsComplete()) {
+    if (special_) {
+      if (auto defined{DefinedFormattedIo(
+              io_, instance_, *derived_, *special_, subscripts_)}) {
+        anyIoTookPlace_ |= *defined;
+        Advance();
+        continue;
+      }
+    }
+    Descriptor &elementDesc{elementDescriptor_.descriptor()};
+    elementDesc.Establish(
+        *derived_, nullptr, 0, nullptr, CFI_attribute_pointer);
+    elementDesc.set_base_addr(instance_.Element<char>(subscripts_));
+    Advance();
+    if (int status{workQueue.BeginDerivedIo<DIR>(
+            io_, elementDesc, *derived_, table_, anyIoTookPlace_)};
+        status != StatOk) {
+      return status;
+    }
+  }
+  return StatOk;
+}
+
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Output>::Continue(
+    WorkQueue &);
+template RT_API_ATTRS int DescriptorIoTicket<Direction::Input>::Continue(
+    WorkQueue &);
+
+template <Direction DIR>
+RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
+    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
+  bool anyIoTookPlace{false};
+  WorkQueue workQueue{io.GetIoErrorHandler()};
+  if (workQueue.BeginDescriptorIo<DIR>(io, descriptor, table, anyIoTookPlace) ==
+      StatContinue) {
+    workQueue.Run();
+  }
+  return anyIoTookPlace;
+}
+
+template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+
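+// Caller's-eye usage sketch (the statement state and item descriptor come
+// from the data transfer implementation, not from this file):
+//
+//   if (!descr::DescriptorIO<Direction::Output>(io, itemDescriptor)) {
+//     // nothing was transferred; the I/O error handler records why
+//   }
+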
 RT_OFFLOAD_API_GROUP_END
 } // namespace Fortran::runtime::io::descr
diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h
index eb60f106c9203..88ad59bd24b53 100644
--- a/flang-rt/lib/runtime/descriptor-io.h
+++ b/flang-rt/lib/runtime/descriptor-io.h
@@ -9,619 +9,27 @@
 #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
 
-// Implementation of I/O data list item transfers based on descriptors.
-// (All I/O items come through here so that the code is exercised for test;
-// some scalar I/O data transfer APIs could be changed to bypass their use
-// of descriptors in the future for better efficiency.)
+#include "flang-rt/runtime/connection.h"
 
-#include "edit-input.h"
-#include "edit-output.h"
-#include "unit.h"
-#include "flang-rt/runtime/descriptor.h"
-#include "flang-rt/runtime/io-stmt.h"
-#include "flang-rt/runtime/namelist.h"
-#include "flang-rt/runtime/terminator.h"
-#include "flang-rt/runtime/type-info.h"
-#include "flang/Common/optional.h"
-#include "flang/Common/uint128.h"
-#include "flang/Runtime/cpp-type.h"
+namespace Fortran::runtime {
+class Descriptor;
+} // namespace Fortran::runtime
 
-namespace Fortran::runtime::io::descr {
-template <typename A>
-inline RT_API_ATTRS A &ExtractElement(IoStatementState &io,
-    const Descriptor &descriptor, const SubscriptValue subscripts[]) {
-  A *p{descriptor.Element<A>(subscripts)};
-  if (!p) {
-    io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base "
-                                 "address or subscripts out of range");
-  }
-  return *p;
-}
-
-// Per-category descriptor-based I/O templates
-
-// TODO (perhaps as a nontrivial but small starter project): implement
-// automatic repetition counts, like "10*3.14159", for list-directed and
-// NAMELIST array output.
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io,
-    const Descriptor &descriptor, [[maybe_unused]] bool isSigned) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using IntType = CppTypeFor<common::TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!EditIntegerOutput<KIND>(io, *edit, x, isSigned)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditIntegerInput(
-                io, *edit, reinterpret_cast<void *>(&x), KIND, isSigned)) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedIntegerIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedRealIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    if (auto edit{io.GetNextDataEdit()}) {
-      RawType &x{ExtractElement<RawType>(io, descriptor, subscripts)};
-      if constexpr (DIR == Direction::Output) {
-        if (!RealOutputEditing<KIND>{io, x}.Edit(*edit)) {
-          return false;
-        }
-      } else if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-        if (EditRealInput<KIND>(io, *edit, reinterpret_cast<void *>(&x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-      if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-        io.GetIoErrorHandler().Crash(
-            "FormattedRealIO: subscripts out of bounds");
-      }
-    } else {
-      return false;
-    }
-  }
-  return true;
-}
+namespace Fortran::runtime::io {
+class IoStatementState;
+struct NonTbpDefinedIoTable;
+} // namespace Fortran::runtime::io
 
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedComplexIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  bool isListOutput{
-      io.get_if<ListDirectedStatementState<Direction::Output>>() != nullptr};
-  using RawType = typename RealOutputEditing<KIND>::BinaryFloatingPoint;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    RawType *x{&ExtractElement<RawType>(io, descriptor, subscripts)};
-    if (isListOutput) {
-      DataEdit rEdit, iEdit;
-      rEdit.descriptor = DataEdit::ListDirectedRealPart;
-      iEdit.descriptor = DataEdit::ListDirectedImaginaryPart;
-      rEdit.modes = iEdit.modes = io.mutableModes();
-      if (!RealOutputEditing<KIND>{io, x[0]}.Edit(rEdit) ||
-          !RealOutputEditing<KIND>{io, x[1]}.Edit(iEdit)) {
-        return false;
-      }
-    } else {
-      for (int k{0}; k < 2; ++k, ++x) {
-        auto edit{io.GetNextDataEdit()};
-        if (!edit) {
-          return false;
-        } else if constexpr (DIR == Direction::Output) {
-          if (!RealOutputEditing<KIND>{io, *x}.Edit(*edit)) {
-            return false;
-          }
-        } else if (edit->descriptor == DataEdit::ListDirectedNullValue) {
-          break;
-        } else if (EditRealInput<KIND>(
-                       io, *edit, reinterpret_cast<void *>(x))) {
-          anyInput = true;
-        } else {
-          return anyInput && edit->IsNamelist();
-        }
-      }
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedComplexIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <typename A, Direction DIR>
-inline RT_API_ATTRS bool FormattedCharacterIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t length{descriptor.ElementBytes() / sizeof(A)};
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    A *x{&ExtractElement<A>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditCharacterOutput(io, *edit, x, length)) {
-          return false;
-        }
-      } else { // input
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          if (EditCharacterInput(io, *edit, x, length)) {
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedCharacterIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
-
-template <int KIND, Direction DIR>
-inline RT_API_ATTRS bool FormattedLogicalIO(
-    IoStatementState &io, const Descriptor &descriptor) {
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  auto *listOutput{io.get_if<ListDirectedStatementState<Direction::Output>>()};
-  using IntType = CppTypeFor<TypeCategory::Integer, KIND>;
-  bool anyInput{false};
-  for (std::size_t j{0}; j < numElements; ++j) {
-    IntType &x{ExtractElement<IntType>(io, descriptor, subscripts)};
-    if (listOutput) {
-      if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) {
-        return false;
-      }
-    } else if (auto edit{io.GetNextDataEdit()}) {
-      if constexpr (DIR == Direction::Output) {
-        if (!EditLogicalOutput(io, *edit, x != 0)) {
-          return false;
-        }
-      } else {
-        if (edit->descriptor != DataEdit::ListDirectedNullValue) {
-          bool truth{};
-          if (EditLogicalInput(io, *edit, truth)) {
-            x = truth;
-            anyInput = true;
-          } else {
-            return anyInput && edit->IsNamelist();
-          }
-        }
-      }
-    } else {
-      return false;
-    }
-    if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) {
-      io.GetIoErrorHandler().Crash(
-          "FormattedLogicalIO: subscripts out of bounds");
-    }
-  }
-  return true;
-}
+namespace Fortran::runtime::io::descr {
 
 template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
+RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &,
     const NonTbpDefinedIoTable * = nullptr);
 
-// For intrinsic (not defined) derived type I/O, formatted & unformatted
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io,
-    const typeInfo::Component &component, const Descriptor &origDescriptor,
-    const SubscriptValue origSubscripts[], Terminator &terminator,
-    const NonTbpDefinedIoTable *table) {
-#if !defined(RT_DEVICE_AVOID_RECURSION)
-  if (component.genre() == typeInfo::Component::Genre::Data) {
-    // Create a descriptor for the component
-    StaticDescriptor<maxRank, true, 16 /*?*/> statDesc;
-    Descriptor &desc{statDesc.descriptor()};
-    component.CreatePointerDescriptor(
-        desc, origDescriptor, terminator, origSubscripts);
-    return DescriptorIO<DIR>(io, desc, table);
-  } else {
-    // Component is itself a descriptor
-    char *pointer{
-        origDescriptor.Element<char>(origSubscripts) + component.offset()};
-    const Descriptor &compDesc{*reinterpret_cast<const Descriptor *>(pointer)};
-    return compDesc.IsAllocated() && DescriptorIO<DIR>(io, compDesc, table);
-  }
-#else
-  terminator.Crash("not yet implemented: component IO");
-#endif
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  SubscriptValue at[maxRank];
-  compArray.GetLowerBounds(at);
-  for (std::size_t k{0}; k < numComponents;
-       ++k, compArray.IncrementSubscripts(at)) {
-    const typeInfo::Component &component{
-        *compArray.Element<typeInfo::Component>(at)};
-    if (!DefaultComponentIO<DIR>(
-            io, component, descriptor, subscripts, handler, table)) {
-      // Return true for NAMELIST input if any component appeared.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && k > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io,
-    const Descriptor &descriptor, const typeInfo::DerivedType &type,
-    const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const Descriptor &compArray{type.component()};
-  RUNTIME_CHECK(handler, compArray.rank() == 1);
-  std::size_t numComponents{compArray.Elements()};
-  std::size_t numElements{descriptor.Elements()};
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    SubscriptValue at[maxRank];
-    compArray.GetLowerBounds(at);
-    for (std::size_t k{0}; k < numComponents;
-         ++k, compArray.IncrementSubscripts(at)) {
-      const typeInfo::Component &component{
-          *compArray.Element<typeInfo::Component>(at)};
-      if (!DefaultComponentIO<DIR>(
-              io, component, descriptor, subscripts, handler, table)) {
-        return false;
-      }
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS Fortran::common::optional<bool> DefinedFormattedIo(
-    IoStatementState &, const Descriptor &, const typeInfo::DerivedType &,
-    const typeInfo::SpecialBinding &, const SubscriptValue[]);
-
-template <Direction DIR>
-static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  // Derived type information must be present for formatted I/O.
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  RUNTIME_CHECK(handler, addendum != nullptr);
-  const typeInfo::DerivedType *type{addendum->derivedType()};
-  RUNTIME_CHECK(handler, type != nullptr);
-  Fortran::common::optional<typeInfo::SpecialBinding> nonTbpSpecial;
-  const typeInfo::SpecialBinding *special{nullptr};
-  if (table) {
-    if (const auto *definedIo{table->Find(*type,
-            DIR == Direction::Input ? common::DefinedIo::ReadFormatted
-                                    : common::DefinedIo::WriteFormatted)}) {
-      if (definedIo->subroutine) {
-        nonTbpSpecial.emplace(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted,
-            definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-            false);
-        special = &*nonTbpSpecial;
-      }
-    }
-  }
-  if (!special) {
-    if (const typeInfo::SpecialBinding *
-        binding{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadFormatted
-                : typeInfo::SpecialBinding::Which::WriteFormatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) {
-        special = binding;
-      }
-    }
-  }
-  SubscriptValue subscripts[maxRank];
-  descriptor.GetLowerBounds(subscripts);
-  std::size_t numElements{descriptor.Elements()};
-  for (std::size_t j{0}; j < numElements;
-       ++j, descriptor.IncrementSubscripts(subscripts)) {
-    Fortran::common::optional<bool> result;
-    if (special) {
-      result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts);
-    }
-    if (!result) {
-      result = DefaultComponentwiseFormattedIO<DIR>(
-          io, descriptor, *type, table, subscripts);
-    }
-    if (!result.value()) {
-      // Return true for NAMELIST input if we got anything.
-      auto *listInput{
-          io.get_if<ListDirectedStatementState<Direction::Input>>()};
-      return DIR == Direction::Input && j > 0 && listInput &&
-          listInput->inNamelistSequence();
-    }
-  }
-  return true;
-}
-
-RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &,
-    const typeInfo::DerivedType &, const typeInfo::SpecialBinding &);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Output>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
+extern template RT_API_ATTRS bool DescriptorIO<Direction::Input>(
+    IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *);
 
-// Unformatted I/O
-template <Direction DIR>
-static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  const DescriptorAddendum *addendum{descriptor.Addendum()};
-  if (const typeInfo::DerivedType *
-      type{addendum ? addendum->derivedType() : nullptr}) {
-    // derived type unformatted I/O
-    if (table) {
-      if (const auto *definedIo{table->Find(*type,
-              DIR == Direction::Input ? common::DefinedIo::ReadUnformatted
-                                      : common::DefinedIo::WriteUnformatted)}) {
-        if (definedIo->subroutine) {
-          typeInfo::SpecialBinding special{DIR == Direction::Input
-                  ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                  : typeInfo::SpecialBinding::Which::WriteUnformatted,
-              definedIo->subroutine, definedIo->isDtvArgPolymorphic, false,
-              false};
-          if (Fortran::common::optional<bool> wasDefined{
-                  DefinedUnformattedIo(io, descriptor, *type, special)}) {
-            return *wasDefined;
-          }
-        } else {
-          return DefaultComponentwiseUnformattedIO<DIR>(
-              io, descriptor, *type, table);
-        }
-      }
-    }
-    if (const typeInfo::SpecialBinding *
-        special{type->FindSpecialBinding(DIR == Direction::Input
-                ? typeInfo::SpecialBinding::Which::ReadUnformatted
-                : typeInfo::SpecialBinding::Which::WriteUnformatted)}) {
-      if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) {
-        // defined derived type unformatted I/O
-        return DefinedUnformattedIo(io, descriptor, *type, *special);
-      }
-    }
-    // Default derived type unformatted I/O
-    // TODO: If no component at any level has defined READ or WRITE
-    // (as appropriate), the elements are contiguous, and no byte swapping
-    // is active, do a block transfer via the code below.
-    return DefaultComponentwiseUnformattedIO<DIR>(io, descriptor, *type, table);
-  } else {
-    // intrinsic type unformatted I/O
-    auto *externalUnf{io.get_if<ExternalUnformattedIoStatementState<DIR>>()};
-    auto *childUnf{io.get_if<ChildUnformattedIoStatementState<DIR>>()};
-    auto *inq{
-        DIR == Direction::Output ? io.get_if<InquireIOLengthState>() : nullptr};
-    RUNTIME_CHECK(handler, externalUnf || childUnf || inq);
-    std::size_t elementBytes{descriptor.ElementBytes()};
-    std::size_t numElements{descriptor.Elements()};
-    std::size_t swappingBytes{elementBytes};
-    if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) {
-      // Byte swapping units can be smaller than elements, namely
-      // for COMPLEX and CHARACTER.
-      if (maybeCatAndKind->first == TypeCategory::Character) {
-        // swap each character position independently
-        swappingBytes = maybeCatAndKind->second; // kind
-      } else if (maybeCatAndKind->first == TypeCategory::Complex) {
-        // swap real and imaginary components independently
-        swappingBytes /= 2;
-      }
-    }
-    SubscriptValue subscripts[maxRank];
-    descriptor.GetLowerBounds(subscripts);
-    using CharType =
-        std::conditional_t<DIR == Direction::Output, const char, char>;
-    auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool {
-      if constexpr (DIR == Direction::Output) {
-        return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes)
-            : childUnf     ? childUnf->Emit(&x, totalBytes, swappingBytes)
-                           : inq->Emit(&x, totalBytes, swappingBytes);
-      } else {
-        return externalUnf ? externalUnf->Receive(&x, totalBytes, swappingBytes)
-                           : childUnf->Receive(&x, totalBytes, swappingBytes);
-      }
-    }};
-    bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()};
-    if (!swapEndianness &&
-        descriptor.IsContiguous()) { // contiguous unformatted I/O
-      char &x{ExtractElement<char>(io, descriptor, subscripts)};
-      return Transfer(x, numElements * elementBytes);
-    } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O
-      for (std::size_t j{0}; j < numElements; ++j) {
-        char &x{ExtractElement<char>(io, descriptor, subscripts)};
-        if (!Transfer(x, elementBytes)) {
-          return false;
-        }
-        if (!descriptor.IncrementSubscripts(subscripts) &&
-            j + 1 < numElements) {
-          handler.Crash("DescriptorIO: subscripts out of bounds");
-        }
-      }
-      return true;
-    }
-  }
-}
-
-template <Direction DIR>
-static RT_API_ATTRS bool DescriptorIO(IoStatementState &io,
-    const Descriptor &descriptor, const NonTbpDefinedIoTable *table) {
-  IoErrorHandler &handler{io.GetIoErrorHandler()};
-  if (handler.InError()) {
-    return false;
-  }
-  if (!io.get_if<IoDirectionState<DIR>>()) {
-    handler.Crash("DescriptorIO() called for wrong I/O direction");
-    return false;
-  }
-  if constexpr (DIR == Direction::Input) {
-    if (!io.BeginReadingRecord()) {
-      return false;
-    }
-  }
-  if (!io.get_if<FormattedIoStatementState<DIR>>()) {
-    return UnformattedDescriptorIO<DIR>(io, descriptor, table);
-  }
-  if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) {
-    TypeCategory cat{catAndKind->first};
-    int kind{catAndKind->second};
-    switch (cat) {
-    case TypeCategory::Integer:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, true);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, true);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, true);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, true);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, true);
-      default:
-        handler.Crash(
-            "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Unsigned:
-      switch (kind) {
-      case 1:
-        return FormattedIntegerIO<1, DIR>(io, descriptor, false);
-      case 2:
-        return FormattedIntegerIO<2, DIR>(io, descriptor, false);
-      case 4:
-        return FormattedIntegerIO<4, DIR>(io, descriptor, false);
-      case 8:
-        return FormattedIntegerIO<8, DIR>(io, descriptor, false);
-      case 16:
-        return FormattedIntegerIO<16, DIR>(io, descriptor, false);
-      default:
-        handler.Crash(
-            "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Real:
-      switch (kind) {
-      case 2:
-        return FormattedRealIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedRealIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedRealIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedRealIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedRealIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedRealIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: REAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Complex:
-      switch (kind) {
-      case 2:
-        return FormattedComplexIO<2, DIR>(io, descriptor);
-      case 3:
-        return FormattedComplexIO<3, DIR>(io, descriptor);
-      case 4:
-        return FormattedComplexIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedComplexIO<8, DIR>(io, descriptor);
-      case 10:
-        return FormattedComplexIO<10, DIR>(io, descriptor);
-      // TODO: case double/double
-      case 16:
-        return FormattedComplexIO<16, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Character:
-      switch (kind) {
-      case 1:
-        return FormattedCharacterIO<char, DIR>(io, descriptor);
-      case 2:
-        return FormattedCharacterIO<char16_t, DIR>(io, descriptor);
-      case 4:
-        return FormattedCharacterIO<char32_t, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Logical:
-      switch (kind) {
-      case 1:
-        return FormattedLogicalIO<1, DIR>(io, descriptor);
-      case 2:
-        return FormattedLogicalIO<2, DIR>(io, descriptor);
-      case 4:
-        return FormattedLogicalIO<4, DIR>(io, descriptor);
-      case 8:
-        return FormattedLogicalIO<8, DIR>(io, descriptor);
-      default:
-        handler.Crash(
-            "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind);
-        return false;
-      }
-    case TypeCategory::Derived:
-      return FormattedDerivedTypeIO<DIR>(io, descriptor, table);
-    }
-  }
-  handler.Crash("DescriptorIO: bad type code (%d) in descriptor",
-      static_cast<int>(descriptor.type().raw()));
-  return false;
-}
 } // namespace Fortran::runtime::io::descr
 #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_
diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp
index 1d5304254ed0e..0f0564403c0e2 100644
--- a/flang-rt/lib/runtime/environment.cpp
+++ b/flang-rt/lib/runtime/environment.cpp
@@ -143,6 +143,10 @@ void ExecutionEnvironment::Configure(int ac, const char *av[],
     }
   }
 
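+  // Read FLANG_RT_DEBUG as a base-10 integer that enables internal
+  // runtime debugging output.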
+  if (auto *x{std::getenv("FLANG_RT_DEBUG")}) {
+    internalDebugging = std::strtol(x, nullptr, 10);
+  }
+
   if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) {
     char *end;
     auto n{std::strtoul(x, &end, 10)};
diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp
index b0cf2180fc6d4..1bef387a9771f 100644
--- a/flang-rt/lib/runtime/namelist.cpp
+++ b/flang-rt/lib/runtime/namelist.cpp
@@ -10,6 +10,7 @@
 #include "descriptor-io.h"
 #include "flang-rt/runtime/emit-encoded.h"
 #include "flang-rt/runtime/io-stmt.h"
+#include "flang-rt/runtime/type-info.h"
 #include "flang/Runtime/io-api.h"
 #include <algorithm>
 #include <cstring>
diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp
index b08195cd31e05..24d05f369fcbe 100644
--- a/flang-rt/lib/runtime/tools.cpp
+++ b/flang-rt/lib/runtime/tools.cpp
@@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from,
 // Doing the recursion upwards instead of downwards puts the more common
 // cases earlier in the if-chain and has a tangible impact on performance.
 template <typename P, int RANK> struct ShallowCopyRankSpecialize {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     if (to.rank() == RANK && from.rank() == RANK) {
       ShallowCopyInner<P, RANK>(to, from, toIsContiguous, fromIsContiguous);
@@ -217,7 +217,7 @@ template <typename P, int RANK> struct ShallowCopyRankSpecialize {
 };
 
 template <typename P> struct ShallowCopyRankSpecialize<P, maxRank + 1> {
-  static bool execute(const Descriptor &to, const Descriptor &from,
+  static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from,
       bool toIsContiguous, bool fromIsContiguous) {
     return false;
   }
diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp
index 82182696d70c6..451213202acef 100644
--- a/flang-rt/lib/runtime/type-info.cpp
+++ b/flang-rt/lib/runtime/type-info.cpp
@@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor,
     const SubscriptValue *subscripts) const {
   RUNTIME_CHECK(terminator, genre_ == Genre::Data);
   EstablishDescriptor(descriptor, container, terminator);
+  std::size_t offset{offset_};
   if (subscripts) {
-    descriptor.set_base_addr(container.Element<char>(subscripts) + offset_);
-  } else {
-    descriptor.set_base_addr(container.OffsetElement<char>() + offset_);
+    offset += container.SubscriptsToByteOffset(subscripts);
   }
+  descriptor.set_base_addr(container.OffsetElement<char>() + offset);
   descriptor.raw().attribute = CFI_attribute_pointer;
 }
 
diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp
new file mode 100644
index 0000000000000..a508ecb637102
--- /dev/null
+++ b/flang-rt/lib/runtime/work-queue.cpp
@@ -0,0 +1,161 @@
+//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "flang-rt/runtime/work-queue.h"
+#include "flang-rt/runtime/environment.h"
+#include "flang-rt/runtime/memory.h"
+#include "flang-rt/runtime/type-info.h"
+#include "flang/Common/visit.h"
+
+namespace Fortran::runtime {
+
+#if !defined(RT_DEVICE_COMPILATION)
+// When this is false, the FLANG_RT_DEBUG debug output code below is
+// compiled out.
+static constexpr bool enableDebugOutput{false};
+#endif
+
+RT_OFFLOAD_API_GROUP_BEGIN
+
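+// Componentwise walks the components of a derived type by indexing the
+// component descriptor in its runtime type information.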
+RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived)
+    : derived_{derived}, components_{derived_.component().Elements()} {
+  GetComponent();
+}
+
+RT_API_ATTRS void Componentwise::GetComponent() {
+  if (IsComplete()) {
+    component_ = nullptr;
+  } else {
+    const Descriptor &componentDesc{derived_.component()};
+    component_ = componentDesc.ZeroBasedIndexedElement<typeInfo::Component>(
+        componentAt_);
+  }
+}
+
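+// Dispatches to the active ticket alternative: Begin() on the first call,
+// Continue() on every later call.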
+RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) {
+  if (!begun) {
+    begun = true;
+    return common::visit(
+        [&workQueue](
+            auto &specificTicket) { return specificTicket.Begin(workQueue); },
+        u);
+  } else {
+    return common::visit(
+        [&workQueue](auto &specificTicket) {
+          return specificTicket.Continue(workQueue);
+        },
+        u);
+  }
+}
+
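+// Returns any still-active tickets to the free list, then releases every
+// dynamically allocated list node.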
+RT_API_ATTRS WorkQueue::~WorkQueue() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+  while (firstFree_) {
+    TicketList *next{firstFree_->next};
+    if (!firstFree_->isStatic) {
+      FreeMemory(firstFree_);
+    }
+    firstFree_ = next;
+  }
+}
+
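+// Takes a node from the free list, allocating a new one if none is
+// available, and links it into the active list: directly after insertAfter_
+// when a running ticket starts a nested ticket, otherwise at the tail.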
+RT_API_ATTRS Ticket &WorkQueue::StartTicket() {
+  if (!firstFree_) {
+    void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))};
+    firstFree_ = new (p) TicketList;
+    firstFree_->isStatic = false;
+  }
+  TicketList *newTicket{firstFree_};
+  if ((firstFree_ = newTicket->next)) {
+    firstFree_->previous = nullptr;
+  }
+  TicketList *after{insertAfter_ ? insertAfter_->next : nullptr};
+  if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) {
+    newTicket->previous->next = newTicket;
+  } else {
+    first_ = newTicket;
+  }
+  if ((newTicket->next = after)) {
+    after->previous = newTicket;
+  } else {
+    last_ = newTicket;
+  }
+  newTicket->ticket.begun = false;
+#if !defined(RT_DEVICE_COMPILATION)
+  if (enableDebugOutput &&
+      (executionEnvironment.internalDebugging &
+          ExecutionEnvironment::WorkQueue)) {
+    std::fprintf(stderr, "WQ: new ticket\n");
+  }
+#endif
+  return newTicket->ticket;
+}
+
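+// Repeatedly runs the ticket at the tail of the active list.  A ticket that
+// returns StatOk is unlinked and recycled onto the free list; StatContinue
+// leaves it queued for another pass; any other status stops the queue and
+// is returned to the caller.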
+RT_API_ATTRS int WorkQueue::Run() {
+  while (last_) {
+    TicketList *at{last_};
+    insertAfter_ = last_;
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(),
+          at->ticket.begun ? "Continue" : "Begin");
+    }
+#endif
+    int stat{at->ticket.Continue(*this)};
+#if !defined(RT_DEVICE_COMPILATION)
+    if (enableDebugOutput &&
+        (executionEnvironment.internalDebugging &
+            ExecutionEnvironment::WorkQueue)) {
+      std::fprintf(stderr, "WQ: ... stat %d\n", stat);
+    }
+#endif
+    insertAfter_ = nullptr;
+    if (stat == StatOk) {
+      if (at->previous) {
+        at->previous->next = at->next;
+      } else {
+        first_ = at->next;
+      }
+      if (at->next) {
+        at->next->previous = at->previous;
+      } else {
+        last_ = at->previous;
+      }
+      if ((at->next = firstFree_)) {
+        at->next->previous = at;
+      }
+      at->previous = nullptr;
+      firstFree_ = at;
+    } else if (stat != StatContinue) {
+      Stop();
+      return stat;
+    }
+  }
+  return StatOk;
+}
+
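+// Abandons all remaining tickets, moving them to the free list unrun.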
+RT_API_ATTRS void WorkQueue::Stop() {
+  if (last_) {
+    if ((last_->next = firstFree_)) {
+      last_->next->previous = last_;
+    }
+    firstFree_ = first_;
+    first_ = last_ = nullptr;
+  }
+}
+
+RT_OFFLOAD_API_GROUP_END
+
+} // namespace Fortran::runtime
diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
index 3833e48be3dd6..6c148b1de6f82 100644
--- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp
+++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp
@@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) {
   io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__);
   for (int j{1}; j <= 3; ++j) {
     ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc))
-        << "OutputDescriptor() for InquireIoLength";
+        << "OutputDescriptor() for InquireIoLength " << j;
   }
   ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength";
   ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk)
diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md
index 78d871c593e1d..871749934810c 100644
--- a/flang/docs/Extensions.md
+++ b/flang/docs/Extensions.md
@@ -858,6 +858,16 @@ print *, [(j,j=1,10)]
   warning since such values may have become defined by the time the nested
   expression's value is required.
 
+* Intrinsic assignment of arrays is defined elementally, and intrinsic
+  assignment of derived type components is defined componentwise.
+  However, when intrinsic assignment takes place for an array of derived
+  type, the order of the loop nesting is not defined.
+  Some compilers will loop over the elements, assigning all of the components
+  of each element before proceeding to the next element.
+  This compiler loops over all of the components, and assigns all of
+  the elements for each component before proceeding to the next component.
+  A program using defined assignment might be able to detect the difference.
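+  For example, in a minimal sketch like the one below, where the component
+  type carries a defined assignment that reports its operand, the output
+  order exposes the loop nesting: component-first traversal (this compiler)
+  prints 1, 3, 2, 4, while element-first traversal prints 1, 2, 3, 4.
+  ```fortran
+  module m
+    type :: counter
+      integer :: n = 0
+    contains
+      procedure :: assign => assign_counter
+      generic :: assignment(=) => assign
+    end type
+    type :: pair
+      type(counter) :: x, y
+    end type
+  contains
+    subroutine assign_counter(lhs, rhs)
+      class(counter), intent(out) :: lhs
+      type(counter), intent(in) :: rhs
+      lhs%n = rhs%n
+      print *, 'assigned', rhs%n
+    end subroutine
+  end module
+  program demo
+    use m
+    type(pair) :: a(2), b(2)
+    b(1)%x%n = 1; b(1)%y%n = 2; b(2)%x%n = 3; b(2)%y%n = 4
+    a = b ! intrinsic assignment; calls assign_counter per component
+  end program
+  ```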
+
 ## De Facto Standard Features
 
 * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the
diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h
index bc80997a1bec2..eb1f63184a177 100644
--- a/flang/include/flang/Runtime/assign.h
+++ b/flang/include/flang/Runtime/assign.h
@@ -38,7 +38,7 @@ enum AssignFlags {
   ComponentCanBeDefinedAssignment = 1 << 3,
   ExplicitLengthCharacterLHS = 1 << 4,
   PolymorphicLHS = 1 << 5,
-  DeallocateLHS = 1 << 6
+  DeallocateLHS = 1 << 6,
 };
 
 #ifdef RT_DEVICE_COMPILATION
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 3839bc1d2a215..79f7032aac312 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -182,9 +182,12 @@ const Symbol *HasImpureFinal(
     const Symbol &, std::optional<int> rank = std::nullopt);
 // Is this type finalizable or does it contain any polymorphic allocatable
 // ultimate components?
-bool MayRequireFinalization(const DerivedTypeSpec &derived);
+bool MayRequireFinalization(const DerivedTypeSpec &);
 // Does this type have an allocatable direct component?
-bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived);
+bool HasAllocatableDirectComponent(const DerivedTypeSpec &);
+// Does this type have any defined assignment at any level (or any polymorphic
+// allocatable)?
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &);
 
 bool IsInBlankCommon(const Symbol &);
 bool IsAssumedLengthCharacter(const Symbol &);
diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index ccc5e37c840a9..2a862e0e2858b 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -661,6 +661,10 @@ const Symbol *RuntimeTableBuilder::DescribeType(
     AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s,
         IntExpr<1>(
             derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec)));
+    // Similarly, a flag to enable optimized runtime assignment.
+    AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s,
+        IntExpr<1>(
+            derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec)));
   }
   dtObject.get<ObjectEntityDetails>().set_init(MaybeExpr{
       StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))});
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 1d1e3ac044166..3247addc905ba 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -813,6 +813,38 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) {
   return std::any_of(directs.begin(), directs.end(), IsAllocatable);
 }
 
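+// Recursive helper for MayHaveDefinedAssignment() below; the "checked" set
+// prevents rescanning a scope and guards against cycles through recursive
+// component types.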
+static bool MayHaveDefinedAssignment(
+    const DerivedTypeSpec &derived, std::set<const Scope *> &checked) {
+  if (const Scope *scope{derived.GetScope()};
+      scope && checked.find(scope) == checked.end()) {
+    checked.insert(scope);
+    for (const auto &[_, symbolRef] : *scope) {
+      if (const auto *generic{symbolRef->detailsIf<GenericDetails>()}) {
+        if (generic->kind().IsAssignment()) {
+          return true;
+        }
+      } else if (symbolRef->has<ObjectEntityDetails>() &&
+          !IsPointer(*symbolRef)) {
+        if (const DeclTypeSpec *type{symbolRef->GetType()}) {
+          if (type->IsPolymorphic()) {
+            return true;
+          } else if (const DerivedTypeSpec *derived{type->AsDerived()}) {
+            if (MayHaveDefinedAssignment(*derived, checked)) {
+              return true;
+            }
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) {
+  std::set<const Scope *> checked;
+  return MayHaveDefinedAssignment(derived, checked);
+}
+
 bool IsAssumedLengthCharacter(const Symbol &symbol) {
   if (const DeclTypeSpec * type{symbol.GetType()}) {
     return type->category() == DeclTypeSpec::Character &&
diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90
index b30a6bf697563..7226b06504d28 100644
--- a/flang/module/__fortran_type_info.f90
+++ b/flang/module/__fortran_type_info.f90
@@ -52,7 +52,8 @@
     integer(1) :: noInitializationNeeded ! 1 if no component w/ init
     integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final
     integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable
-    integer(1) :: __padding0(4)
+    integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=)
+    integer(1) :: __padding0(3)
   end type
 
   type :: Binding
diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90
index 28f0bf78f33c9..2e05b652822b5 100644
--- a/flang/test/Lower/volatile-openmp.f90
+++ b/flang/test/Lower/volatile-openmp.f90
@@ -23,11 +23,11 @@
 ! CHECK:           %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>
 ! CHECK:           %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>>) -> !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>
 ! CHECK:           %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs<volatile>, uniq_name = "_QFEcontainer"} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>, !fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>)
-! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
+! CHECK:           %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>
 ! CHECK:           %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1>
-! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
-! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>
-! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>)
+! CHECK:           %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.c.t"} : (!fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref<!fir.array<1x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>)
+! CHECK:           %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>
+! CHECK:           %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFE.dt.t"} : (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref<!fir.type<_QM__fortran_type_infoTderivedtype{binding:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTbinding{proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>}>>>>,name:!fir.box<!fir.ptr<!fir.char<1,?>>>,sizeinbytes:i64,uninstantiated:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,kindparameter:!fir.box<!fir.ptr<!fir.array<?xi64>>>,lenparameterkind:!fir.box<!fir.ptr<!fir.array<?xi8>>>,component:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box<!fir.ptr<!fir.type<_QM__fortran_type_infoTderivedtype>>>,lenvalue:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,bounds:!fir.box<!fir.ptr<!fir.array<?x?x!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTprocptrcomponent{name:!fir.box<!fir.ptr<!fir.char<1,?>>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box<!fir.ptr<!fir.array<?x!fir.type<_QM__fortran_type_infoTspecialbinding{{[<]?}}{which:i8,isargdescriptorset:i8,istypebound:i8,isargcontiguousset:i8,__padding0:!fir.array<4xi8>,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>)
 ! CHECK:           %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"}   {fortran_attrs = #fir.var_attrs<pointer>} : (!fir.ref<!fir.type<_QFTt{array:!fir.box<!fir.ptr<!fir.array<?xi32>>>}>, volatile>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>, volatile>
 ! CHECK:           %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box<!fir.ptr<!fir.array<?xi32>>>, index) -> (index, index, index)
diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90
index d228cd2a84ca4..7dc92504aeebf 100644
--- a/flang/test/Semantics/typeinfo01.f90
+++ b/flang/test/Semantics/typeinfo01.f90
@@ -8,7 +8,7 @@ module m01
   end type
 !CHECK: Module scope: m01
 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
 !CHECK: DerivedType scope: t1
@@ -23,8 +23,8 @@ module m02
   end type
 !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end module
 
 module m03
@@ -35,7 +35,7 @@ module m03
   type(kpdt(4)) :: x
 !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
 !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL())
-!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8]
 end module
 
@@ -49,7 +49,7 @@ module m04
   subroutine s1(x)
     class(tbps), intent(in) :: x
   end subroutine
-!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)]
 end module
 
@@ -61,7 +61,7 @@ module m05
   subroutine s1(x)
     class(t), intent(in) :: x
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)]
 end module
 
@@ -85,8 +85,8 @@ subroutine s2(x, y)
     class(t), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -113,8 +113,8 @@ subroutine s2(x, y)
     class(t2), intent(in) :: y
   end subroutine
 !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
@@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y)
     class(t), intent(out) :: x
     class(t), intent(in) :: y
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)]
 end module
@@ -155,7 +155,7 @@ impure elemental subroutine s3(x)
   subroutine s4(x)
     type(t), contiguous :: x(:,:,:)
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)]
 end module
 
@@ -197,7 +197,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)]
 !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)]
 end module
@@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg)
     integer, intent(out) :: iostat
     character(len=*), intent(inout) :: iomsg
   end subroutine
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)]
 end module
 
@@ -263,7 +263,7 @@ module m11
 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())]
 !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target)
 !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer
-!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1]
 !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer
 !CHECK: pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4)
diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90
index f0c0a817da4a4..e2552d0a21d6f 100644
--- a/flang/test/Semantics/typeinfo03.f90
+++ b/flang/test/Semantics/typeinfo03.f90
@@ -6,4 +6,4 @@ module m
     class(*), pointer :: sp, ap(:)
   end type
 end module
-!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90
index de8464321a409..94dd2199db35a 100644
--- a/flang/test/Semantics/typeinfo04.f90
+++ b/flang/test/Semantics/typeinfo04.f90
@@ -7,18 +7,18 @@ module m
    contains
     final :: final
   end type
-!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
   type, abstract :: t1
   end type
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t2
     real, allocatable :: a(:)
   end type
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type, abstract :: t3
     type(finalizable) :: x
   end type
-!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1)
  contains
   impure elemental subroutine final(x)
     type(finalizable), intent(in out) :: x
diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90
index 2a7f12a153eb8..df1aecf3821de 100644
--- a/flang/test/Semantics/typeinfo05.f90
+++ b/flang/test/Semantics/typeinfo05.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), pointer :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90
index 2385709a8eb44..22f37b1a4369d 100644
--- a/flang/test/Semantics/typeinfo06.f90
+++ b/flang/test/Semantics/typeinfo06.f90
@@ -7,10 +7,10 @@ program main
   type t1
     type(t2), allocatable :: b
   end type t1
-!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
   type :: t2
     type(t1) :: a
   end type t2
-! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
+! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 end program main
 
diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90
index e8766d9811db8..ab20d6f601106 100644
--- a/flang/test/Semantics/typeinfo07.f90
+++ b/flang/test/Semantics/typeinfo07.f90
@@ -16,7 +16,7 @@
     type(t_container_extension) :: wrapper
   end type
 end
-! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
-! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1)
-! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90
index 689cf469dee3b..391a66f3d6664 100644
--- a/flang/test/Semantics/typeinfo08.f90
+++ b/flang/test/Semantics/typeinfo08.f90
@@ -13,7 +13,7 @@ module m
 
 !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes
 !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())]
-!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1)
+!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
 !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1]
 !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s"
 !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1"
diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90
index 92efc8f9ea54b..08e0b95abb763 100644
--- a/flang/test/Semantics/typeinfo11.f90
+++ b/flang/test/Semantics/typeinfo11.f90
@@ -14,4 +14,4 @@
 type(t2) x
 end
 
-!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1)
+!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90
new file mode 100644
index 0000000000000..6b23b63d28b1d
--- /dev/null
+++ b/flang/test/Semantics/typeinfo12.f90
@@ -0,0 +1,67 @@
+!RUN: bbc --dump-symbols %s | FileCheck %s
+!Check "nodefinedassignment" settings.
+
+module m01
+
+  type hasAsst1
+   contains
+    procedure asst1
+    generic :: assignment(=) => asst1
+  end type
+!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type hasAsst2 ! no defined assignment relevant to the runtime
+  end type
+  interface assignment(=)
+    procedure asst2
+  end interface
+!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test1
+    type(hasAsst1) c
+  end type
+!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type test2
+    type(hasAsst2) c
+  end type
+!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test3
+    type(hasAsst1), pointer :: p
+  end type
+!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test4
+    type(hasAsst2), pointer :: p
+  end type
+!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type, extends(hasAsst1) :: test5
+  end type
+!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1)
+
+  type, extends(hasAsst2) :: test6
+  end type
+!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test7
+    type(test7), allocatable :: c
+  end type
+!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1)
+
+  type test8
+    class(test8), allocatable :: c
+  end type
+!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1)
+
+ contains
+  impure elemental subroutine asst1(left, right)
+    class(hasAsst1), intent(out) :: left
+    class(hasAsst1), intent(in) :: right
+  end
+  impure elemental subroutine asst2(left, right)
+    class(hasAsst2), intent(out) :: left
+    class(hasAsst2), intent(in) :: right
+  end
+end

>From b994a4c04f38d8cfb13f3dbf3d99146cb778443e Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler at nvidia.com>
Date: Tue, 10 Jun 2025 14:44:41 -0700
Subject: [PATCH 20/61] [flang][NFC] Clean up code in two new functions
 (#142037)

Two recently-added functions in Semantics/tools.h need some cleaning up
to conform to the coding style of the project. One of them should
actually be in Parser/tools.{h,cpp}; the other doesn't need to be
defined in the header.
---
 flang/include/flang/Parser/tools.h          |  3 +++
 flang/include/flang/Semantics/tools.h       | 26 ++-------------------
 flang/lib/Lower/OpenACC.cpp                 |  4 ++--
 flang/lib/Lower/OpenMP/OpenMP.cpp           |  4 ++--
 flang/lib/Parser/tools.cpp                  |  5 ++++
 flang/lib/Semantics/check-omp-structure.cpp |  8 +++----
 flang/lib/Semantics/tools.cpp               | 14 +++++++++++
 7 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/flang/include/flang/Parser/tools.h b/flang/include/flang/Parser/tools.h
index f1ead11734fa0..447bccd5d35a6 100644
--- a/flang/include/flang/Parser/tools.h
+++ b/flang/include/flang/Parser/tools.h
@@ -250,5 +250,8 @@ template <typename A> std::optional<CharBlock> GetLastSource(A &x) {
   return GetSourceHelper<false>::GetSource(const_cast<const A &>(x));
 }
 
+// Checks whether the assignment statement has a single variable on the RHS.
+bool CheckForSingleVariableOnRHS(const AssignmentStmt &);
+
 } // namespace Fortran::parser
 #endif // FORTRAN_PARSER_TOOLS_H_
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index 79f7032aac312..51df7c40f5b8b 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -756,29 +756,7 @@ std::string GetCommonBlockObjectName(const Symbol &, bool underscoring);
 // Check for ambiguous USE associations
 bool HadUseError(SemanticsContext &, SourceName at, const Symbol *);
 
-/// Checks if the assignment statement has a single variable on the RHS.
-inline bool checkForSingleVariableOnRHS(
-    const Fortran::parser::AssignmentStmt &assignmentStmt) {
-  const Fortran::parser::Expr &expr{
-      std::get<Fortran::parser::Expr>(assignmentStmt.t)};
-  const Fortran::common::Indirection<Fortran::parser::Designator> *designator =
-      std::get_if<Fortran::common::Indirection<Fortran::parser::Designator>>(
-          &expr.u);
-  return designator != nullptr;
-}
-
-/// Checks if the symbol on the LHS is present in the RHS expression.
-inline bool checkForSymbolMatch(const Fortran::semantics::SomeExpr *lhs,
-    const Fortran::semantics::SomeExpr *rhs) {
-  auto lhsSyms{Fortran::evaluate::GetSymbolVector(*lhs)};
-  const Fortran::semantics::Symbol &lhsSymbol{*lhsSyms.front()};
-  for (const Fortran::semantics::Symbol &symbol :
-      Fortran::evaluate::GetSymbolVector(*rhs)) {
-    if (lhsSymbol == symbol) {
-      return true;
-    }
-  }
-  return false;
-}
+// Checks whether the symbol on the LHS is present in the RHS expression.
+bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs);
 } // namespace Fortran::semantics
 #endif // FORTRAN_SEMANTICS_TOOLS_H_
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 02dba22c29c7f..c10e1777614cd 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -653,8 +653,8 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter,
   firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0)));
   mlir::Block &block = atomicCaptureOp->getRegion(0).back();
   firOpBuilder.setInsertionPointToStart(&block);
-  if (Fortran::semantics::checkForSingleVariableOnRHS(stmt1)) {
-    if (Fortran::semantics::checkForSymbolMatch(
+  if (Fortran::parser::CheckForSingleVariableOnRHS(stmt1)) {
+    if (Fortran::semantics::CheckForSymbolMatch(
             Fortran::semantics::GetExpr(stmt2Var),
             Fortran::semantics::GetExpr(stmt2Expr))) {
       // Atomic capture construct is of the form [capture-stmt, update-stmt]
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 6892e571e62a3..784749bba5a0c 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -3200,8 +3200,8 @@ static void genAtomicCapture(lower::AbstractConverter &converter,
   firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0)));
   mlir::Block &block = atomicCaptureOp->getRegion(0).back();
   firOpBuilder.setInsertionPointToStart(&block);
-  if (semantics::checkForSingleVariableOnRHS(stmt1)) {
-    if (semantics::checkForSymbolMatch(semantics::GetExpr(stmt2Var),
+  if (parser::CheckForSingleVariableOnRHS(stmt1)) {
+    if (semantics::CheckForSymbolMatch(semantics::GetExpr(stmt2Var),
                                        semantics::GetExpr(stmt2Expr))) {
       // Atomic capture construct is of the form [capture-stmt, update-stmt]
       const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr);
diff --git a/flang/lib/Parser/tools.cpp b/flang/lib/Parser/tools.cpp
index 6e5f1ed2fc66f..264ca520f38b8 100644
--- a/flang/lib/Parser/tools.cpp
+++ b/flang/lib/Parser/tools.cpp
@@ -174,4 +174,9 @@ const CoindexedNamedObject *GetCoindexedNamedObject(
       },
       allocateObject.u);
 }
+
+bool CheckForSingleVariableOnRHS(const AssignmentStmt &assignmentStmt) {
+  return Unwrap<Designator>(std::get<Expr>(assignmentStmt.t)) != nullptr;
+}
+
 } // namespace Fortran::parser
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index bdd078c33da92..31fcbb9683202 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2933,9 +2933,9 @@ void OmpStructureChecker::CheckAtomicCaptureConstruct(
   const auto *e2 = GetExpr(context_, stmt2Expr);
 
   if (e1 && v1 && e2 && v2) {
-    if (semantics::checkForSingleVariableOnRHS(stmt1)) {
+    if (parser::CheckForSingleVariableOnRHS(stmt1)) {
       CheckAtomicCaptureStmt(stmt1);
-      if (semantics::checkForSymbolMatch(v2, e2)) {
+      if (CheckForSymbolMatch(v2, e2)) {
         // ATOMIC CAPTURE construct is of the form [capture-stmt, update-stmt]
         CheckAtomicUpdateStmt(stmt2);
       } else {
@@ -2947,8 +2947,8 @@ void OmpStructureChecker::CheckAtomicCaptureConstruct(
             "Captured variable/array element/derived-type component %s expected to be assigned in the second statement of ATOMIC CAPTURE construct"_err_en_US,
             stmt1Expr.source);
       }
-    } else if (semantics::checkForSymbolMatch(v1, e1) &&
-        semantics::checkForSingleVariableOnRHS(stmt2)) {
+    } else if (CheckForSymbolMatch(v1, e1) &&
+        parser::CheckForSingleVariableOnRHS(stmt2)) {
       // ATOMIC CAPTURE construct is of the form [update-stmt, capture-stmt]
       CheckAtomicUpdateStmt(stmt1);
       CheckAtomicCaptureStmt(stmt2);
diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp
index 3247addc905ba..ea5ab2d455b54 100644
--- a/flang/lib/Semantics/tools.cpp
+++ b/flang/lib/Semantics/tools.cpp
@@ -1788,4 +1788,18 @@ bool HadUseError(
   }
 }
 
+bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs) {
+  if (lhs && rhs) {
+    if (SymbolVector lhsSymbols{evaluate::GetSymbolVector(*lhs)};
+        !lhsSymbols.empty()) {
+      const Symbol &first{*lhsSymbols.front()};
+      for (const Symbol &symbol : evaluate::GetSymbolVector(*rhs)) {
+        if (first == symbol) {
+          return true;
+        }
+      }
+    }
+  }
+  return false;
+}
 } // namespace Fortran::semantics

>From 54e72d15bc09e9e6464792711b8c475f92a759e2 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler at nvidia.com>
Date: Tue, 10 Jun 2025 14:44:59 -0700
Subject: [PATCH 21/61] [flang] Ensure overrides of special procedures
 (#142465)

When a derived type declares a generic procedure binding of interest to
the runtime library, such as for ASSIGNMENT(=), it overrides any binding
that might have been present for the parent type.

Fixes https://github.com/llvm/llvm-project/issues/142414.
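
The root cause is map insertion semantics: emplace is a no-op when the
key is already present, so a child type's override could never replace
the parent's entry in the specials table, while insert_or_assign
overwrites the mapped value. A minimal standalone illustration with
simplified types (not the patch code itself):

  #include <cassert>
  #include <map>
  #include <string>

  int main() {
    std::map<int, std::string> specials;
    specials.emplace(2, "baseAssign");        // parent's ASSIGNMENT(=) entry
    specials.emplace(2, "override");          // no-op: key 2 already present
    assert(specials.at(2) == "baseAssign");   // the override was dropped
    specials.insert_or_assign(2, "override"); // replaces the mapped value
    assert(specials.at(2) == "override");
    return 0;
  }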
---
 flang/lib/Semantics/runtime-type-info.cpp |  4 ++--
 flang/test/Semantics/typeinfo13.f90       | 26 +++++++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Semantics/typeinfo13.f90

diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp
index 2a862e0e2858b..4c186f4874152 100644
--- a/flang/lib/Semantics/runtime-type-info.cpp
+++ b/flang/lib/Semantics/runtime-type-info.cpp
@@ -1067,7 +1067,7 @@ RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope,
     specials =
         DescribeSpecialGenerics(*parentScope, thisScope, derivedTypeSpec);
   }
-  for (auto pair : dtScope) {
+  for (const auto &pair : dtScope) {
     const Symbol &symbol{*pair.second};
     if (const auto *generic{symbol.detailsIf<GenericDetails>()}) {
       DescribeSpecialGeneric(*generic, specials, thisScope, derivedTypeSpec);
@@ -1245,7 +1245,7 @@ void RuntimeTableBuilder::DescribeSpecialProc(
     AddValue(values, specialSchema_, procCompName,
         SomeExpr{evaluate::ProcedureDesignator{specific}});
     // index might already be present in the case of an override
-    specials.emplace(*index,
+    specials.insert_or_assign(*index,
         evaluate::StructureConstructor{
             DEREF(specialSchema_.AsDerived()), std::move(values)});
   }
diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90
new file mode 100644
index 0000000000000..cf4abf9e38181
--- /dev/null
+++ b/flang/test/Semantics/typeinfo13.f90
@@ -0,0 +1,26 @@
+!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s
+!Ensure ASSIGNMENT(=) overrides are applied to the special procedures table.
+module m
+  type base
+   contains
+    procedure :: baseAssign
+    generic :: assignment(=) => baseAssign
+  end type
+  type, extends(base) :: child
+   contains
+    procedure :: override
+    generic :: assignment(=) => override
+  end type
+ contains
+  impure elemental subroutine baseAssign(to, from)
+    class(base), intent(out) :: to
+    type(base), intent(in) :: from
+  end
+  impure elemental subroutine override(to, from)
+    class(child), intent(out) :: to
+    type(child), intent(in) :: from
+  end
+end
+
+!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=override)]
+!CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)]

>From 2f9dfdfb35bdb10334b09476a47dc1d93beea96c Mon Sep 17 00:00:00 2001
From: Craig Topper <craig.topper at sifive.com>
Date: Tue, 10 Jun 2025 15:11:44 -0700
Subject: [PATCH 22/61] [IR] Simplify scalable vector handling in
 ShuffleVectorInst::getShuffleMask. NFC (#143596)

Combine the scalable vector UndefValue check with the earlier
ConstantAggregateZero handling for fixed and scalable vectors.

Assert that the rest of the code is only reached for fixed vectors.

Use append instead of resize since we know the size is increasing.
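
As a sketch of the consolidated early-out (an assumed free-standing
helper, not the patch code): both accepted scalable-mask constants
expand to a run of a single value, and append states the grow-only
intent directly where resize would obscure it.

  #include "llvm/ADT/SmallVector.h"

  // Expand a zeroinitializer or undef shuffle mask to its splat form;
  // undef lanes are encoded as -1, zeroinitializer lanes as 0.
  static void expandSplatMask(bool IsUndef, unsigned KnownMinElts,
                              llvm::SmallVectorImpl<int> &Result) {
    Result.append(KnownMinElts, IsUndef ? -1 : 0);
  }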
---
 llvm/lib/IR/Instructions.cpp | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index b29969657e7fc..2d89ec1b0a8d3 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -1854,23 +1854,18 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask,
                                        SmallVectorImpl<int> &Result) {
   ElementCount EC = cast<VectorType>(Mask->getType())->getElementCount();
 
-  if (isa<ConstantAggregateZero>(Mask)) {
-    Result.resize(EC.getKnownMinValue(), 0);
+  if (isa<ConstantAggregateZero>(Mask) || isa<UndefValue>(Mask)) {
+    int MaskVal = isa<UndefValue>(Mask) ? -1 : 0;
+    Result.append(EC.getKnownMinValue(), MaskVal);
     return;
   }
 
-  Result.reserve(EC.getKnownMinValue());
+  assert(!EC.isScalable() &&
+         "Scalable vector shuffle mask must be undef or zeroinitializer");
 
-  if (EC.isScalable()) {
-    assert((isa<ConstantAggregateZero>(Mask) || isa<UndefValue>(Mask)) &&
-           "Scalable vector shuffle mask must be undef or zeroinitializer");
-    int MaskVal = isa<UndefValue>(Mask) ? -1 : 0;
-    for (unsigned I = 0; I < EC.getKnownMinValue(); ++I)
-      Result.emplace_back(MaskVal);
-    return;
-  }
+  unsigned NumElts = EC.getFixedValue();
 
-  unsigned NumElts = EC.getKnownMinValue();
+  Result.reserve(NumElts);
 
   if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask)) {
     for (unsigned i = 0; i != NumElts; ++i)

>From 32649e017eaa609fa556b6d6d74bb73abf37214d Mon Sep 17 00:00:00 2001
From: "S. VenkataKeerthy" <31350914+svkeerthy at users.noreply.github.com>
Date: Tue, 10 Jun 2025 15:12:16 -0700
Subject: [PATCH 23/61] [IR2Vec] Exposing Embedding as a data type wrapped
 around std::vector<double> (#143197)

Currently `Embedding` is `std::vector<double>`. This PR makes it a data type wrapped around `std::vector<double>` so that it can overload basic arithmetic operators and expose comparison operations. This simplifies its use here and in the passes that perform operations on `Embedding`.

(Tracking issue - #141817)
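
A usage sketch of the new interface (the function name is illustrative;
the signatures come from the header changes in this patch):

  #include "llvm/Analysis/IR2Vec.h"

  void embeddingExample() {
    using llvm::ir2vec::Embedding;
    Embedding A(/*Size=*/3, /*InitialValue=*/1.0); // {1.0, 1.0, 1.0}
    Embedding B{0.5, 0.5, 0.5};
    A += B;                 // element-wise add: {1.5, 1.5, 1.5}
    A.scaleAndAdd(B, 2.0f); // A += B * 2.0f    : {2.5, 2.5, 2.5}
    bool Close = A.approximatelyEquals({2.5, 2.5, 2.5}); // default 1e-6
    (void)Close;
  }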
---
 llvm/include/llvm/Analysis/IR2Vec.h    |  69 ++++++--
 llvm/lib/Analysis/IR2Vec.cpp           |  69 +++++---
 llvm/unittests/Analysis/IR2VecTest.cpp | 208 ++++++++++++++++++++-----
 3 files changed, 274 insertions(+), 72 deletions(-)

diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h
index 9fd1b0ae8e248..8bf21b0e75d67 100644
--- a/llvm/include/llvm/Analysis/IR2Vec.h
+++ b/llvm/include/llvm/Analysis/IR2Vec.h
@@ -53,7 +53,63 @@ class raw_ostream;
 enum class IR2VecKind { Symbolic };
 
 namespace ir2vec {
-using Embedding = std::vector<double>;
+/// Embedding is a datatype that wraps std::vector<double>. It provides
+/// additional functionality for arithmetic and comparison operations.
+/// It is meant to be used *like* std::vector<double> but is more restrictive
+/// in the sense that it does not allow the user to change the size of the
+/// embedding vector. The dimension of the embedding is fixed at the time of
+/// construction of Embedding object. But the elements can be modified in-place.
+struct Embedding {
+private:
+  std::vector<double> Data;
+
+public:
+  Embedding() = default;
+  Embedding(const std::vector<double> &V) : Data(V) {}
+  Embedding(std::vector<double> &&V) : Data(std::move(V)) {}
+  Embedding(std::initializer_list<double> IL) : Data(IL) {}
+
+  explicit Embedding(size_t Size) : Data(Size) {}
+  Embedding(size_t Size, double InitialValue) : Data(Size, InitialValue) {}
+
+  size_t size() const { return Data.size(); }
+  bool empty() const { return Data.empty(); }
+
+  double &operator[](size_t Itr) {
+    assert(Itr < Data.size() && "Index out of bounds");
+    return Data[Itr];
+  }
+
+  const double &operator[](size_t Itr) const {
+    assert(Itr < Data.size() && "Index out of bounds");
+    return Data[Itr];
+  }
+
+  using iterator = typename std::vector<double>::iterator;
+  using const_iterator = typename std::vector<double>::const_iterator;
+
+  iterator begin() { return Data.begin(); }
+  iterator end() { return Data.end(); }
+  const_iterator begin() const { return Data.begin(); }
+  const_iterator end() const { return Data.end(); }
+  const_iterator cbegin() const { return Data.cbegin(); }
+  const_iterator cend() const { return Data.cend(); }
+
+  const std::vector<double> &getData() const { return Data; }
+
+  /// Arithmetic operators
+  Embedding &operator+=(const Embedding &RHS);
+  Embedding &operator-=(const Embedding &RHS);
+
+  /// Adds Src Embedding scaled by Factor with the called Embedding.
+  /// Called_Embedding += Src * Factor
+  Embedding &scaleAndAdd(const Embedding &Src, float Factor);
+
+  /// Returns true if the embedding is approximately equal to the RHS embedding
+  /// within the specified tolerance.
+  bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-6) const;
+};
+
 using InstEmbeddingsMap = DenseMap<const Instruction *, Embedding>;
 using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
 // FIXME: Current the keys are strings. This can be changed to
@@ -61,8 +117,8 @@ using BBEmbeddingsMap = DenseMap<const BasicBlock *, Embedding>;
 using Vocab = std::map<std::string, Embedding>;
 
 /// Embedder provides the interface to generate embeddings (vector
-/// representations) for instructions, basic blocks, and functions. The vector
-/// representations are generated using IR2Vec algorithms.
+/// representations) for instructions, basic blocks, and functions. The
+/// vector representations are generated using IR2Vec algorithms.
 ///
 /// The Embedder class is an abstract class and it is intended to be
 /// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware.
@@ -99,13 +155,6 @@ class Embedder {
   /// zero vector.
   Embedding lookupVocab(const std::string &Key) const;
 
-  /// Adds two vectors: Dst += Src
-  static void addVectors(Embedding &Dst, const Embedding &Src);
-
-  /// Adds Src vector scaled by Factor to Dst vector: Dst += Src * Factor
-  static void addScaledVector(Embedding &Dst, const Embedding &Src,
-                              float Factor);
-
 public:
   virtual ~Embedder() = default;
 
diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp
index 490db5fdcdf99..25ce35d4ace37 100644
--- a/llvm/lib/Analysis/IR2Vec.cpp
+++ b/llvm/lib/Analysis/IR2Vec.cpp
@@ -55,6 +55,51 @@ static cl::opt<float> ArgWeight("ir2vec-arg-weight", cl::Optional,
 
 AnalysisKey IR2VecVocabAnalysis::Key;
 
+namespace llvm::json {
+inline bool fromJSON(const llvm::json::Value &E, Embedding &Out,
+                     llvm::json::Path P) {
+  std::vector<double> TempOut;
+  if (!llvm::json::fromJSON(E, TempOut, P))
+    return false;
+  Out = Embedding(std::move(TempOut));
+  return true;
+}
+} // namespace llvm::json
+
+//===----------------------------------------------------------------------===//
+// Embedding
+//===----------------------------------------------------------------------===//
+
+Embedding &Embedding::operator+=(const Embedding &RHS) {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
+                 std::plus<double>());
+  return *this;
+}
+
+Embedding &Embedding::operator-=(const Embedding &RHS) {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  std::transform(this->begin(), this->end(), RHS.begin(), this->begin(),
+                 std::minus<double>());
+  return *this;
+}
+
+Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) {
+  assert(this->size() == Src.size() && "Vectors must have the same dimension");
+  for (size_t Itr = 0; Itr < this->size(); ++Itr)
+    (*this)[Itr] += Src[Itr] * Factor;
+  return *this;
+}
+
+bool Embedding::approximatelyEquals(const Embedding &RHS,
+                                    double Tolerance) const {
+  assert(this->size() == RHS.size() && "Vectors must have the same dimension");
+  for (size_t Itr = 0; Itr < this->size(); ++Itr)
+    if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance)
+      return false;
+  return true;
+}
+
 //===----------------------------------------------------------------------===//
 // Embedder and its subclasses
 //===----------------------------------------------------------------------===//
@@ -73,20 +118,6 @@ Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) {
   return make_error<StringError>("Unknown IR2VecKind", errc::invalid_argument);
 }
 
-void Embedder::addVectors(Embedding &Dst, const Embedding &Src) {
-  assert(Dst.size() == Src.size() && "Vectors must have the same dimension");
-  std::transform(Dst.begin(), Dst.end(), Src.begin(), Dst.begin(),
-                 std::plus<double>());
-}
-
-void Embedder::addScaledVector(Embedding &Dst, const Embedding &Src,
-                               float Factor) {
-  assert(Dst.size() == Src.size() && "Vectors must have the same dimension");
-  for (size_t i = 0; i < Dst.size(); ++i) {
-    Dst[i] += Src[i] * Factor;
-  }
-}
-
 // FIXME: Currently lookups are string based. Use numeric Keys
 // for efficiency
 Embedding Embedder::lookupVocab(const std::string &Key) const {
@@ -164,20 +195,20 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const {
     Embedding InstVector(Dimension, 0);
 
     const auto OpcVec = lookupVocab(I.getOpcodeName());
-    addScaledVector(InstVector, OpcVec, OpcWeight);
+    InstVector.scaleAndAdd(OpcVec, OpcWeight);
 
     // FIXME: Currently lookups are string based. Use numeric Keys
     // for efficiency.
     const auto Type = I.getType();
     const auto TypeVec = getTypeEmbedding(Type);
-    addScaledVector(InstVector, TypeVec, TypeWeight);
+    InstVector.scaleAndAdd(TypeVec, TypeWeight);
 
     for (const auto &Op : I.operands()) {
       const auto OperandVec = getOperandEmbedding(Op.get());
-      addScaledVector(InstVector, OperandVec, ArgWeight);
+      InstVector.scaleAndAdd(OperandVec, ArgWeight);
     }
     InstVecMap[&I] = InstVector;
-    addVectors(BBVector, InstVector);
+    BBVector += InstVector;
   }
   BBVecMap[&BB] = BBVector;
 }
@@ -187,7 +218,7 @@ void SymbolicEmbedder::computeEmbeddings() const {
     return;
   for (const auto &BB : F) {
     computeEmbeddings(BB);
-    addVectors(FuncVector, BBVecMap[&BB]);
+    FuncVector += BBVecMap[&BB];
   }
 }
 
diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp
index 9e47b2cd8bedd..053b9f75e7a66 100644
--- a/llvm/unittests/Analysis/IR2VecTest.cpp
+++ b/llvm/unittests/Analysis/IR2VecTest.cpp
@@ -14,6 +14,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/JSON.h"
 
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
@@ -32,89 +33,209 @@ class TestableEmbedder : public Embedder {
   void computeEmbeddings() const override {}
   void computeEmbeddings(const BasicBlock &BB) const override {}
   using Embedder::lookupVocab;
-  static void addVectors(Embedding &Dst, const Embedding &Src) {
-    Embedder::addVectors(Dst, Src);
+};
+
+TEST(EmbeddingTest, ConstructorsAndAccessors) {
+  // Default constructor
+  {
+    Embedding E;
+    EXPECT_TRUE(E.empty());
+    EXPECT_EQ(E.size(), 0u);
   }
-  static void addScaledVector(Embedding &Dst, const Embedding &Src,
-                              float Factor) {
-    Embedder::addScaledVector(Dst, Src, Factor);
+
+  // Constructor with const std::vector<double>&
+  {
+    std::vector<double> Data = {1.0, 2.0, 3.0};
+    Embedding E(Data);
+    EXPECT_FALSE(E.empty());
+    ASSERT_THAT(E, SizeIs(3u));
+    EXPECT_THAT(E.getData(), ElementsAre(1.0, 2.0, 3.0));
+    EXPECT_EQ(E[0], 1.0);
+    EXPECT_EQ(E[1], 2.0);
+    EXPECT_EQ(E[2], 3.0);
   }
-};
 
-TEST(IR2VecTest, CreateSymbolicEmbedder) {
-  Vocab V = {{"foo", {1.0, 2.0}}};
+  // Constructor with std::vector<double>&&
+  {
+    Embedding E(std::vector<double>({4.0, 5.0}));
+    ASSERT_THAT(E, SizeIs(2u));
+    EXPECT_THAT(E.getData(), ElementsAre(4.0, 5.0));
+  }
 
-  LLVMContext Ctx;
-  Module M("M", Ctx);
-  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
-  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+  // Constructor with std::initializer_list<double>
+  {
+    Embedding E({6.0, 7.0, 8.0, 9.0});
+    ASSERT_THAT(E, SizeIs(4u));
+    EXPECT_THAT(E.getData(), ElementsAre(6.0, 7.0, 8.0, 9.0));
+    EXPECT_EQ(E[0], 6.0);
+    E[0] = 6.5;
+    EXPECT_EQ(E[0], 6.5);
+  }
 
-  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
-  EXPECT_TRUE(static_cast<bool>(Result));
+  // Constructor with size_t
+  {
+    Embedding E(5);
+    ASSERT_THAT(E, SizeIs(5u));
+    EXPECT_THAT(E.getData(), ElementsAre(0.0, 0.0, 0.0, 0.0, 0.0));
+  }
 
-  auto *Emb = Result->get();
-  EXPECT_NE(Emb, nullptr);
-}
+  // Constructor with size_t and double
+  {
+    Embedding E(5, 1.5);
+    ASSERT_THAT(E, SizeIs(5u));
+    EXPECT_THAT(E.getData(), ElementsAre(1.5, 1.5, 1.5, 1.5, 1.5));
+  }
 
-TEST(IR2VecTest, CreateInvalidMode) {
-  Vocab V = {{"foo", {1.0, 2.0}}};
+  // Test iterators
+  {
+    Embedding E({6.5, 7.0, 8.0, 9.0});
+    std::vector<double> VecE;
+    for (double Val : E) {
+      VecE.push_back(Val);
+    }
+    EXPECT_THAT(VecE, ElementsAre(6.5, 7.0, 8.0, 9.0));
+
+    const Embedding CE = E;
+    std::vector<double> VecCE;
+    for (const double &Val : CE) {
+      VecCE.push_back(Val);
+    }
+    EXPECT_THAT(VecCE, ElementsAre(6.5, 7.0, 8.0, 9.0));
+
+    EXPECT_EQ(*E.begin(), 6.5);
+    EXPECT_EQ(*(E.end() - 1), 9.0);
+    EXPECT_EQ(*CE.cbegin(), 6.5);
+    EXPECT_EQ(*(CE.cend() - 1), 9.0);
+  }
+}
 
-  LLVMContext Ctx;
-  Module M("M", Ctx);
-  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
-  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+TEST(EmbeddingTest, AddVectors) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {0.5, 1.5, -1.0};
 
-  // static_cast an invalid int to IR2VecKind
-  auto Result = Embedder::create(static_cast<IR2VecKind>(-1), *F, V);
-  EXPECT_FALSE(static_cast<bool>(Result));
+  E1 += E2;
+  EXPECT_THAT(E1, ElementsAre(1.5, 3.5, 2.0));
 
-  std::string ErrMsg;
-  llvm::handleAllErrors(
-      Result.takeError(),
-      [&](const llvm::ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
-  EXPECT_NE(ErrMsg.find("Unknown IR2VecKind"), std::string::npos);
+  // Check that E2 is unchanged
+  EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
-TEST(IR2VecTest, AddVectors) {
+TEST(EmbeddingTest, SubtractVectors) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {0.5, 1.5, -1.0};
 
-  TestableEmbedder::addVectors(E1, E2);
-  EXPECT_THAT(E1, ElementsAre(1.5, 3.5, 2.0));
+  E1 -= E2;
+  EXPECT_THAT(E1, ElementsAre(0.5, 0.5, 4.0));
 
   // Check that E2 is unchanged
   EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0));
 }
 
-TEST(IR2VecTest, AddScaledVector) {
+TEST(EmbeddingTest, AddScaledVector) {
   Embedding E1 = {1.0, 2.0, 3.0};
   Embedding E2 = {2.0, 0.5, -1.0};
 
-  TestableEmbedder::addScaledVector(E1, E2, 0.5f);
+  E1.scaleAndAdd(E2, 0.5f);
   EXPECT_THAT(E1, ElementsAre(2.0, 2.25, 2.5));
 
   // Check that E2 is unchanged
   EXPECT_THAT(E2, ElementsAre(2.0, 0.5, -1.0));
 }
 
+TEST(EmbeddingTest, ApproximatelyEqual) {
+  Embedding E1 = {1.0, 2.0, 3.0};
+  Embedding E2 = {1.0000001, 2.0000001, 3.0000001};
+  EXPECT_TRUE(E1.approximatelyEquals(E2)); // Diff = 1e-7
+
+  Embedding E3 = {1.00002, 2.00002, 3.00002}; // Diff = 2e-5
+  EXPECT_FALSE(E1.approximatelyEquals(E3));
+  EXPECT_TRUE(E1.approximatelyEquals(E3, 3e-5));
+
+  Embedding E_clearly_within = {1.0000005, 2.0000005, 3.0000005}; // Diff = 5e-7
+  EXPECT_TRUE(E1.approximatelyEquals(E_clearly_within));
+
+  Embedding E_clearly_outside = {1.00001, 2.00001, 3.00001}; // Diff = 1e-5
+  EXPECT_FALSE(E1.approximatelyEquals(E_clearly_outside));
+
+  Embedding E4 = {1.0, 2.0, 3.5}; // Large diff
+  EXPECT_FALSE(E1.approximatelyEquals(E4, 0.01));
+
+  Embedding E5 = {1.0, 2.0, 3.0};
+  EXPECT_TRUE(E1.approximatelyEquals(E5, 0.0));
+  EXPECT_TRUE(E1.approximatelyEquals(E5));
+}
+
 #if GTEST_HAS_DEATH_TEST
 #ifndef NDEBUG
-TEST(IR2VecTest, MismatchedDimensionsAddVectors) {
+TEST(EmbeddingTest, AccessOutOfBounds) {
+  Embedding E = {1.0, 2.0, 3.0};
+  EXPECT_DEATH(E[3], "Index out of bounds");
+  EXPECT_DEATH(E[-1], "Index out of bounds");
+  EXPECT_DEATH(E[4] = 4.0, "Index out of bounds");
+}
+
+TEST(EmbeddingTest, MismatchedDimensionsAddVectors) {
   Embedding E1 = {1.0, 2.0};
   Embedding E2 = {1.0};
-  EXPECT_DEATH(TestableEmbedder::addVectors(E1, E2),
-               "Vectors must have the same dimension");
+  EXPECT_DEATH(E1 += E2, "Vectors must have the same dimension");
+}
+
+TEST(EmbeddingTest, MismatchedDimensionsSubtractVectors) {
+  Embedding E1 = {1.0, 2.0};
+  Embedding E2 = {1.0};
+  EXPECT_DEATH(E1 -= E2, "Vectors must have the same dimension");
 }
 
-TEST(IR2VecTest, MismatchedDimensionsAddScaledVector) {
+TEST(EmbeddingTest, MismatchedDimensionsAddScaledVector) {
   Embedding E1 = {1.0, 2.0};
   Embedding E2 = {1.0};
-  EXPECT_DEATH(TestableEmbedder::addScaledVector(E1, E2, 1.0f),
+  EXPECT_DEATH(E1.scaleAndAdd(E2, 1.0f),
+               "Vectors must have the same dimension");
+}
+
+TEST(EmbeddingTest, MismatchedDimensionsApproximatelyEqual) {
+  Embedding E1 = {1.0, 2.0};
+  Embedding E2 = {1.010};
+  EXPECT_DEATH(E1.approximatelyEquals(E2),
                "Vectors must have the same dimension");
 }
 #endif // NDEBUG
 #endif // GTEST_HAS_DEATH_TEST
 
+TEST(IR2VecTest, CreateSymbolicEmbedder) {
+  Vocab V = {{"foo", {1.0, 2.0}}};
+
+  LLVMContext Ctx;
+  Module M("M", Ctx);
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+
+  auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V);
+  EXPECT_TRUE(static_cast<bool>(Result));
+
+  auto *Emb = Result->get();
+  EXPECT_NE(Emb, nullptr);
+}
+
+TEST(IR2VecTest, CreateInvalidMode) {
+  Vocab V = {{"foo", {1.0, 2.0}}};
+
+  LLVMContext Ctx;
+  Module M("M", Ctx);
+  FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
+  Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
+
+  // static_cast an invalid int to IR2VecKind
+  auto Result = Embedder::create(static_cast<IR2VecKind>(-1), *F, V);
+  EXPECT_FALSE(static_cast<bool>(Result));
+
+  std::string ErrMsg;
+  llvm::handleAllErrors(
+      Result.takeError(),
+      [&](const llvm::ErrorInfoBase &EIB) { ErrMsg = EIB.message(); });
+  EXPECT_NE(ErrMsg.find("Unknown IR2VecKind"), std::string::npos);
+}
+
 TEST(IR2VecTest, LookupVocab) {
   Vocab V = {{"foo", {1.0, 2.0}}, {"bar", {3.0, 4.0}}};
   LLVMContext Ctx;
@@ -136,8 +257,9 @@ TEST(IR2VecTest, ZeroDimensionEmbedding) {
   Embedding E1;
   Embedding E2;
   // Should be no-op, but not crash
-  TestableEmbedder::addVectors(E1, E2);
-  TestableEmbedder::addScaledVector(E1, E2, 1.0f);
+  E1 += E2;
+  E1 -= E2;
+  E1.scaleAndAdd(E2, 1.0f);
   EXPECT_TRUE(E1.empty());
 }
 

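To make the refactor above concrete, here is a minimal usage sketch of the
new Embedding API. It assumes only the declarations from this patch; the
driver function itself is hypothetical and not part of the change:

```cpp
#include "llvm/Analysis/IR2Vec.h"
#include <cassert>

using namespace llvm::ir2vec;

int main() {
  Embedding A = {1.0, 2.0, 3.0}; // initializer_list constructor
  Embedding B(3, 0.5);           // dimension 3, every element 0.5

  A += B;                 // element-wise add: A is now {1.5, 2.5, 3.5}
  A.scaleAndAdd(B, 2.0f); // A += B * 2.0:     A is now {2.5, 3.5, 4.5}

  // Comparison within the default tolerance of 1e-6.
  assert(A.approximatelyEquals(Embedding({2.5, 3.5, 4.5})));
  return 0;
}
```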
>From 3a2bcd96e22721312c9d340c9122a3988dc1e222 Mon Sep 17 00:00:00 2001
From: Philip Reames <preames at rivosinc.com>
Date: Tue, 10 Jun 2025 15:26:54 -0700
Subject: [PATCH 24/61] [RISCV][TTI] Allow partial reduce with mismatched
 extends (#143608)

This depends on the recently added partial_reduce_sumla node for lowering,
but at this point we have all the parts.
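
For illustration, this is roughly the kind of mixed-extend dot-product loop
that the relaxed check lets the vectorizer turn into a partial reduction;
the function below is a hypothetical sketch, not a test from this patch:

```cpp
#include <cstdint>

// One operand is zero-extended (unsigned), the other sign-extended
// (signed): the "mismatched extends" case the cost model used to reject.
int32_t dot_su(const uint8_t *a, const int8_t *b, int n) {
  int32_t sum = 0;
  for (int i = 0; i < n; ++i)
    sum += int32_t(a[i]) * int32_t(b[i]); // zext * sext, i32 accumulate
  return sum;
}
```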
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |   3 +-
 .../RISCV/partial-reduce-dot-product.ll       | 439 ++++++++++++------
 2 files changed, 296 insertions(+), 146 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ff822dec232a9..d5ea0c5d52293 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -305,8 +305,7 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost(
   if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
       Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
       InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
-      OpAExtend != OpBExtend || !AccumType->isIntegerTy(32) ||
-      !VF.isKnownMultipleOf(4))
+      !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
     return InstructionCost::getInvalid();
 
   Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
index 847c4ba0bebfc..8c29da02b813c 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll
@@ -351,79 +351,153 @@ for.exit:                        ; preds = %for.body
 
 
 define i32 @vqdotsu(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @vqdotsu(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
+; V-LABEL: define i32 @vqdotsu(
+; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; V-NEXT:  entry:
+; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; V:       vector.ph:
+; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; V:       vector.body:
+; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; V-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT:    [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; V-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; V-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; V-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+; V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; V:       middle.block:
+; V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; V-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; V-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; V:       scalar.ph:
+;
+; ZVQDOTQ-LABEL: define i32 @vqdotsu(
+; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ZVQDOTQ-NEXT:  entry:
+; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; ZVQDOTQ:       vector.ph:
+; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ZVQDOTQ:       vector.body:
+; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT:    [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT:    [[TMP11:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+; ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVQDOTQ-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; ZVQDOTQ:       middle.block:
+; ZVQDOTQ-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+; ZVQDOTQ-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; ZVQDOTQ:       scalar.ph:
 ;
-; FIXED-LABEL: define i32 @vqdotsu(
-; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; FIXED:       vector.body:
-; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
-; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-NEXT:    [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-NEXT:    [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
-; FIXED-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
-; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
-; FIXED:       middle.block:
-; FIXED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
-; FIXED-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; FIXED:       scalar.ph:
+; FIXED-V-LABEL: define i32 @vqdotsu(
+; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-V-NEXT:  entry:
+; FIXED-V-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-V:       vector.ph:
+; FIXED-V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-V:       vector.body:
+; FIXED-V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-V-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-V-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-V-NEXT:    [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-V-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-V-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
+; FIXED-V-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
+; FIXED-V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED-V:       middle.block:
+; FIXED-V-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
+; FIXED-V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
+; FIXED-V-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V:       scalar.ph:
+;
+; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu(
+; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-ZVQDOTQ-NEXT:  entry:
+; FIXED-ZVQDOTQ-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-ZVQDOTQ:       vector.ph:
+; FIXED-ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-ZVQDOTQ:       vector.body:
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-ZVQDOTQ-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-ZVQDOTQ-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; FIXED-ZVQDOTQ:       middle.block:
+; FIXED-ZVQDOTQ-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; FIXED-ZVQDOTQ-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ:       scalar.ph:
 ;
 entry:
   br label %for.body
@@ -448,79 +522,153 @@ for.exit:                        ; preds = %for.body
 }
 
 define i32 @vqdotsu2(ptr %a, ptr %b) #0 {
-; CHECK-LABEL: define i32 @vqdotsu2(
-; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK:       vector.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
-; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
-; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
-; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
-; CHECK-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
-; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
-; CHECK-NEXT:    [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
-; CHECK-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
-; CHECK-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; CHECK:       middle.block:
-; CHECK-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
-; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
-; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK:       scalar.ph:
+; V-LABEL: define i32 @vqdotsu2(
+; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; V-NEXT:  entry:
+; V-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; V-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; V-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; V:       vector.ph:
+; V-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; V-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; V-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; V-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; V-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; V:       vector.body:
+; V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; V-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; V-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; V-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; V-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; V-NEXT:    [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; V-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; V-NEXT:    [[TMP13]] = add <vscale x 4 x i32> [[TMP12]], [[VEC_PHI]]
+; V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; V:       middle.block:
+; V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP13]])
+; V-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; V-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; V:       scalar.ph:
+;
+; ZVQDOTQ-LABEL: define i32 @vqdotsu2(
+; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; ZVQDOTQ-NEXT:  entry:
+; ZVQDOTQ-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVQDOTQ-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; ZVQDOTQ-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; ZVQDOTQ:       vector.ph:
+; ZVQDOTQ-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVQDOTQ-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; ZVQDOTQ-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; ZVQDOTQ-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVQDOTQ-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; ZVQDOTQ:       vector.body:
+; ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 1 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[TMP7]], align 1
+; ZVQDOTQ-NEXT:    [[TMP8:%.*]] = sext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; ZVQDOTQ-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0
+; ZVQDOTQ-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x i8>, ptr [[TMP10]], align 1
+; ZVQDOTQ-NEXT:    [[TMP11:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD1]] to <vscale x 4 x i32>
+; ZVQDOTQ-NEXT:    [[TMP12:%.*]] = mul <vscale x 4 x i32> [[TMP11]], [[TMP8]]
+; ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <vscale x 1 x i32> @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32(<vscale x 1 x i32> [[VEC_PHI]], <vscale x 4 x i32> [[TMP12]])
+; ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVQDOTQ-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; ZVQDOTQ:       middle.block:
+; ZVQDOTQ-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> [[PARTIAL_REDUCE]])
+; ZVQDOTQ-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; ZVQDOTQ-NEXT:    br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; ZVQDOTQ:       scalar.ph:
 ;
-; FIXED-LABEL: define i32 @vqdotsu2(
-; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
-; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
-; FIXED:       vector.body:
-; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
-; FIXED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
-; FIXED-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
-; FIXED-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
-; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; FIXED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
-; FIXED-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
-; FIXED-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
-; FIXED-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
-; FIXED-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
-; FIXED-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
-; FIXED-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
-; FIXED-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
-; FIXED-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
-; FIXED-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
-; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
-; FIXED:       middle.block:
-; FIXED-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
-; FIXED-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
-; FIXED-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
-; FIXED:       scalar.ph:
+; FIXED-V-LABEL: define i32 @vqdotsu2(
+; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-V-NEXT:  entry:
+; FIXED-V-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-V:       vector.ph:
+; FIXED-V-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-V:       vector.body:
+; FIXED-V-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; FIXED-V-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-V-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-V-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-V-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-V-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-V-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-V-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-V-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-V-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-V-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-V-NEXT:    [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]]
+; FIXED-V-NEXT:    [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]]
+; FIXED-V-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-V-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-V-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED-V:       middle.block:
+; FIXED-V-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]]
+; FIXED-V-NEXT:    [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]])
+; FIXED-V-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-V:       scalar.ph:
+;
+; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu2(
+; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
+; FIXED-ZVQDOTQ-NEXT:  entry:
+; FIXED-ZVQDOTQ-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-ZVQDOTQ:       vector.ph:
+; FIXED-ZVQDOTQ-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-ZVQDOTQ:       vector.body:
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0
+; FIXED-ZVQDOTQ-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1
+; FIXED-ZVQDOTQ-NEXT:    [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32>
+; FIXED-ZVQDOTQ-NEXT:    [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]]
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]])
+; FIXED-ZVQDOTQ-NEXT:    [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]])
+; FIXED-ZVQDOTQ-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; FIXED-ZVQDOTQ-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; FIXED-ZVQDOTQ-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; FIXED-ZVQDOTQ:       middle.block:
+; FIXED-ZVQDOTQ-NEXT:    [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; FIXED-ZVQDOTQ-NEXT:    [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]])
+; FIXED-ZVQDOTQ-NEXT:    br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]]
+; FIXED-ZVQDOTQ:       scalar.ph:
 ;
 entry:
   br label %for.body
@@ -543,3 +691,6 @@ for.body:                                         ; preds = %for.body, %entry
 for.exit:                        ; preds = %for.body
   ret i32 %add
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
+; FIXED: {{.*}}

>From c7063380205d8776e281f7a6603119aa8ea28c12 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas at devlieghere.com>
Date: Tue, 10 Jun 2025 15:34:54 -0700
Subject: [PATCH 25/61] [lldb] Fix `target stop-hook add` help output

The help output for `target stop-hook add` references the non-existent
option `--one-line-command`. The correct option is `--one-liner`:

```
-o <one-line-command> ( --one-liner <one-line-command> )
     Add a command for the stop hook.  Can be specified more than once,
     and commands will be run in the order they appear.
```

This commit fixes the help text.

rdar://152730660
---
 lldb/source/Commands/CommandObjectTarget.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp
index 21b21954bbc90..a4ced37649ea0 100644
--- a/lldb/source/Commands/CommandObjectTarget.cpp
+++ b/lldb/source/Commands/CommandObjectTarget.cpp
@@ -4885,9 +4885,9 @@ class CommandObjectTargetStopHookAdd : public CommandObjectParsed,
 Command Based stop-hooks:
 -------------------------
   Stop hooks can run a list of lldb commands by providing one or more
-  --one-line-command options.  The commands will get run in the order they are
-  added.  Or you can provide no commands, in which case you will enter a
-  command editor where you can enter the commands to be run.
+  --one-liner options.  The commands will get run in the order they are added.
+  Or you can provide no commands, in which case you will enter a command editor
+  where you can enter the commands to be run.
 
 Python Based Stop Hooks:
 ------------------------

>From 32d2b6ba4797584743d4764b25af0ae6f6c3d063 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer at google.com>
Date: Tue, 10 Jun 2025 15:58:53 -0700
Subject: [PATCH 26/61] [HWASAN] Disable LSan test on Android (#143625)

Android HWASan does not support LSan.
---
 compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp b/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp
index b6e486b291f3a..91acd28a1a5ff 100644
--- a/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp
+++ b/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp
@@ -1,6 +1,9 @@
 // Make sure dlerror is not classified as a leak even if we use dynamic TLS.
 // This is currently not implemented, so this test is XFAIL.
 
+// Android HWASan does not support LSan.
+// UNSUPPORTED: android
+
 // RUN: %clangxx_hwasan -O0 %s -o %t && HWASAN_OPTIONS=detect_leaks=1 %run %t
 
 #include <assert.h>

>From 48122a797710a05b5b8620f6051e9716a8e5a6c3 Mon Sep 17 00:00:00 2001
From: Zhen Wang <37195552+wangzpgi at users.noreply.github.com>
Date: Tue, 10 Jun 2025 16:15:12 -0700
Subject: [PATCH 27/61] [flang][cuda] Fix CUDA generic resolution for VALUE
 arguments in device procedures (#140952)

For actual arguments that have the VALUE attribute inside device routines, treat them as if they have the device attribute.
---
 flang/lib/Semantics/check-call.cpp |  7 +++++++
 flang/test/Semantics/cuf21.cuf     | 11 +++++++----
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp
index dfc2ddbacf071..6f2503285013d 100644
--- a/flang/lib/Semantics/check-call.cpp
+++ b/flang/lib/Semantics/check-call.cpp
@@ -1033,6 +1033,13 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy,
               *actualDataAttr == common::CUDADataAttr::Managed)) {
         actualDataAttr = common::CUDADataAttr::Device;
       }
+      // For device procedures, treat actual arguments with VALUE attribute as
+      // device data
+      if (!actualDataAttr && actualLastSymbol && IsValue(*actualLastSymbol) &&
+          (*procedure.cudaSubprogramAttrs ==
+              common::CUDASubprogramAttrs::Device)) {
+        actualDataAttr = common::CUDADataAttr::Device;
+      }
     }
     if (dummyDataAttr == common::CUDADataAttr::Device &&
         (dummyIsAssumedShape || dummyIsAssumedRank) &&
diff --git a/flang/test/Semantics/cuf21.cuf b/flang/test/Semantics/cuf21.cuf
index b8b99a8d1d9be..077657c8a52d5 100644
--- a/flang/test/Semantics/cuf21.cuf
+++ b/flang/test/Semantics/cuf21.cuf
@@ -9,19 +9,22 @@ module mlocModule
   end interface maxlocUpdate
 contains
 
-  attributes(global) subroutine maxlocPartialMaskR_32F1D()
+  attributes(global) subroutine maxlocPartialMaskR_32F1D(back)
     implicit none
+    logical, intent(in), value :: back
     real(4) :: mval
 
-    call maxlocUpdate(mval)
+    call maxlocUpdate(mval, back)
 
   end subroutine maxlocPartialMaskR_32F1D
 
-  attributes(device) subroutine maxlocUpdateR_32F(mval)
+  attributes(device) subroutine maxlocUpdateR_32F(mval, back)
     real(4) :: mval
+    logical :: back
   end subroutine maxlocUpdateR_32F
 
-  attributes(device) subroutine maxlocUpdateR_64F(mval)
+  attributes(device) subroutine maxlocUpdateR_64F(mval, back)
     real(8) :: mval
+    logical :: back
   end subroutine maxlocUpdateR_64F
 end module

>From 1bf4702d2bbaad522886dfbab913a8dd6efe3b85 Mon Sep 17 00:00:00 2001
From: Amy Huang <akhuang at google.com>
Date: Tue, 10 Jun 2025 16:18:53 -0700
Subject: [PATCH 28/61] Disable prctl test when building for arm or riscv.
 (#143627)

I'm setting up a buildbot for arm32 using qemu, and qemu doesn't support
PR_GET_THP_DISABLE.
Disable the test for now while we figure out what to do about that.

Also disable it for riscv, because we may do the same for riscv buildbots.
---
 libc/test/src/sys/prctl/linux/CMakeLists.txt | 6 ++++++
 libc/test/src/sys/prctl/linux/prctl_test.cpp | 1 +
 2 files changed, 7 insertions(+)

diff --git a/libc/test/src/sys/prctl/linux/CMakeLists.txt b/libc/test/src/sys/prctl/linux/CMakeLists.txt
index b06e1c8087008..d02900e1857a0 100644
--- a/libc/test/src/sys/prctl/linux/CMakeLists.txt
+++ b/libc/test/src/sys/prctl/linux/CMakeLists.txt
@@ -1,5 +1,10 @@
 add_custom_target(libc_sys_prctl_unittests)
 
+# Temporarily disable this test while setting up arm and riscv buildbots
+# using qemu, since PR_GET_THP_DISABLE is not supported on qemu.
+if (NOT (LIBC_TARGET_ARCHITECTURE_IS_ARM OR
+	 LIBC_TARGET_ARCHITECTURE_IS_RISCV32 OR
+	 LIBC_TARGET_ARCHITECTURE_IS_RISCV64))
 add_libc_unittest(
   prctl_test
   SUITE
@@ -13,3 +18,4 @@ add_libc_unittest(
     libc.test.UnitTest.ErrnoCheckingTest
     libc.test.UnitTest.ErrnoSetterMatcher
 )
+endif()
diff --git a/libc/test/src/sys/prctl/linux/prctl_test.cpp b/libc/test/src/sys/prctl/linux/prctl_test.cpp
index 374c905e0ef8a..76b829c82d1be 100644
--- a/libc/test/src/sys/prctl/linux/prctl_test.cpp
+++ b/libc/test/src/sys/prctl/linux/prctl_test.cpp
@@ -34,6 +34,7 @@ TEST_F(LlvmLibcSysPrctlTest, GetSetName) {
 TEST_F(LlvmLibcSysPrctlTest, GetTHPDisable) {
   // Manually check errno since the return value logic here is not
   // covered in ErrnoSetterMatcher.
+  // Note that PR_GET_THP_DISABLE is not supported by QEMU.
   int ret = LIBC_NAMESPACE::prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0);
   ASSERT_ERRNO_SUCCESS();
   // PR_GET_THP_DISABLE returns (as the function result) the current

>From ad479ddb343c2756e6eed0f2999bbdb88a65c7c5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 11 Jun 2025 08:49:13 +0900
Subject: [PATCH 29/61] Revert "[SeparateConstOffsetFromGEP] Decompose constant
 xor operand if possible (#135788)"

This reverts commit 13ccce28776d8ad27b0c6a92b5a452d62da05663.

The tests are on non-canonical IR, and the patch adds an extra, unrelated
pre-processing step to the pass. I'm assuming this is a workaround
for the known-bits recursion depth limit in instcombine.
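
For context, the reverted transform relied on a simple bit-level identity:
when a base value and a constant have no set bits in common, xor, or, and
add all agree, which is what allowed `Base ^ Const` to be rewritten as a
disjoint or (and ultimately folded as an addition by the GEP splitter).
A minimal sketch of the identity, with hypothetical values:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Base = 0xF0, Const = 0x0C;       // disjoint: Base & Const == 0
  assert((Base & Const) == 0u);
  assert((Base ^ Const) == (Base | Const)); // xor equals or when disjoint
  assert((Base ^ Const) == Base + Const);   // ...and both equal add
  return 0;
}
```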
---
 .../Scalar/SeparateConstOffsetFromGEP.cpp     | 193 -----------------
 .../AMDGPU/xor-to-or-disjoint.ll              | 204 ------------------
 2 files changed, 397 deletions(-)
 delete mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll

diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6fae9f1dd2404..320b79203c0b3 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,7 +174,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -191,7 +190,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
@@ -200,8 +198,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-#define DEBUG_TYPE "separate-offset-gep"
-
 static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
@@ -492,42 +488,6 @@ class SeparateConstOffsetFromGEP {
   DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
-/// A helper class that aims to convert xor operations into or operations when
-/// their operands are disjoint and the result is used in a GEP's index. This
-/// can then enable further GEP optimizations by effectively turning BaseVal |
-/// Const into BaseVal + Const when they are disjoint, which
-/// SeparateConstOffsetFromGEP can then process. This is a common pattern that
-/// sets up a grid of memory accesses across a wave where each thread acesses
-/// data at various offsets.
-class XorToOrDisjointTransformer {
-public:
-  XorToOrDisjointTransformer(Function &F, DominatorTree &DT,
-                             const DataLayout &DL)
-      : F(F), DT(DT), DL(DL) {}
-
-  bool run();
-
-private:
-  Function &F;
-  DominatorTree &DT;
-  const DataLayout &DL;
-  /// Maps a common operand to all Xor instructions
-  using XorOpList = SmallVector<std::pair<BinaryOperator *, APInt>, 8>;
-  using XorBaseValInst = DenseMap<Instruction *, XorOpList>;
-  XorBaseValInst XorGroups;
-
-  /// Checks if the given value has at least one GetElementPtr user
-  static bool hasGEPUser(const Value *V);
-
-  /// Helper function to check if BaseXor dominates all XORs in the group
-  bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup);
-
-  /// Processes a group of XOR instructions that share the same non-constant
-  /// base operand. Returns true if this group's processing modified the
-  /// function.
-  bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup);
-};
-
 } // end anonymous namespace
 
 char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
@@ -1263,154 +1223,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   return true;
 }
 
-// Helper function to check if an instruction has at least one GEP user
-bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) {
-  return llvm::any_of(V->users(), [](const User *U) {
-    return isa<llvm::GetElementPtrInst>(U);
-  });
-}
-
-bool XorToOrDisjointTransformer::dominatesAllXors(
-    BinaryOperator *BaseXor, const XorOpList &XorsInGroup) {
-  return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) {
-    BinaryOperator *XorInst = XorEntry.first;
-    // Do not evaluate the BaseXor, otherwise we end up cloning it.
-    return XorInst == BaseXor || DT.dominates(BaseXor, XorInst);
-  });
-}
-
-bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst,
-                                                 XorOpList &XorsInGroup) {
-  bool Changed = false;
-  if (XorsInGroup.size() <= 1)
-    return false;
-
-  // Sort XorsInGroup by the constant offset value in increasing order.
-  llvm::sort(XorsInGroup, [](const auto &A, const auto &B) {
-    return A.second.slt(B.second);
-  });
-
-  // Dominance check
-  // The "base" XOR for dominance purposes is the one with the smallest
-  // constant.
-  BinaryOperator *XorWithSmallConst = XorsInGroup[0].first;
-
-  if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE
-                      << ": Cloning and inserting XOR with smallest constant ("
-                      << *XorWithSmallConst
-                      << ") as it does not dominate all other XORs"
-                      << " in function " << F.getName() << "\n");
-
-    BinaryOperator *ClonedXor =
-        cast<BinaryOperator>(XorWithSmallConst->clone());
-    ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone");
-    ClonedXor->insertAfter(OriginalBaseInst);
-    LLVM_DEBUG(dbgs() << "  Cloned Inst: " << *ClonedXor << "\n");
-    Changed = true;
-    XorWithSmallConst = ClonedXor;
-  }
-
-  SmallVector<Instruction *, 8> InstructionsToErase;
-  const APInt SmallestConst =
-      cast<ConstantInt>(XorWithSmallConst->getOperand(1))->getValue();
-
-  // Main transformation loop: Iterate over the original XORs in the sorted
-  // group.
-  for (const auto &XorEntry : XorsInGroup) {
-    BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction
-    const APInt ConstOffsetVal = XorEntry.second;
-
-    // Do not process the one with smallest constant as it is the base.
-    if (XorInst == XorWithSmallConst)
-      continue;
-
-    // Disjointness Check 1
-    APInt NewConstVal = ConstOffsetVal - SmallestConst;
-    if ((NewConstVal & SmallestConst) != 0) {
-      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function "
-                        << F.getName() << ":\n"
-                        << "  New Const: " << NewConstVal
-                        << "  Smallest Const: " << SmallestConst
-                        << "  are not disjoint \n");
-      continue;
-    }
-
-    // Disjointness Check 2
-    if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL),
-                          0)) {
-      LLVM_DEBUG(dbgs() << DEBUG_TYPE
-                        << ": Transforming XOR to OR (disjoint) in function "
-                        << F.getName() << ":\n"
-                        << "  Xor: " << *XorInst << "\n"
-                        << "  Base Val: " << *XorWithSmallConst << "\n"
-                        << "  New Const: " << NewConstVal << "\n");
-
-      auto *NewOrInst = BinaryOperator::CreateDisjointOr(
-          XorWithSmallConst,
-          ConstantInt::get(OriginalBaseInst->getType(), NewConstVal),
-          XorInst->getName() + ".or_disjoint", XorInst->getIterator());
-
-      NewOrInst->copyMetadata(*XorInst);
-      XorInst->replaceAllUsesWith(NewOrInst);
-      LLVM_DEBUG(dbgs() << "  New Inst: " << *NewOrInst << "\n");
-      InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion
-
-      Changed = true;
-    } else {
-      LLVM_DEBUG(
-          dbgs() << DEBUG_TYPE
-                 << ": Cannot transform XOR (not proven disjoint) in function "
-                 << F.getName() << ":\n"
-                 << "  Xor: " << *XorInst << "\n"
-                 << "  Base Val: " << *XorWithSmallConst << "\n"
-                 << "  New Const: " << NewConstVal << "\n");
-    }
-  }
-
-  for (Instruction *I : InstructionsToErase)
-    I->eraseFromParent();
-
-  return Changed;
-}
-
-// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
-// the base for memory operations. This transformation is true under the
-// following conditions
-// Check 1 -  B and C are disjoint.
-// Check 2 - XOR(A,C) and B are disjoint.
-//
-// This transformation is beneficial particularly for GEPs because:
-// 1. OR operations often map better to addressing modes than XOR
-// 2. Disjoint OR operations preserve the semantics of the original XOR
-// 3. This can enable further optimizations in the GEP offset folding pipeline
-bool XorToOrDisjointTransformer::run() {
-  bool Changed = false;
-
-  // Collect all candidate XORs
-  for (Instruction &I : instructions(F)) {
-    Instruction *Op0 = nullptr;
-    ConstantInt *C1 = nullptr;
-    BinaryOperator *MatchedXorOp = nullptr;
-
-    // Attempt to match the instruction 'I' as XOR operation.
-    if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)),
-                               m_BinOp(MatchedXorOp))) &&
-        hasGEPUser(MatchedXorOp))
-      XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue());
-  }
-
-  if (XorGroups.empty())
-    return false;
-
-  // Process each group of XORs
-  for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups)
-    if (processXorGroup(OriginalBaseInst, XorsInGroup))
-      Changed = true;
-
-  return Changed;
-}
-
 bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -1430,11 +1242,6 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
-
-  // Decompose xor in to "or disjoint" if possible.
-  XorToOrDisjointTransformer XorTransformer(F, *DT, *DL);
-  Changed |= XorTransformer.run();
-
   for (BasicBlock &B : F) {
     if (!DT->isReachableFromEntry(&B))
       continue;
diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
deleted file mode 100644
index 825227292fe14..0000000000000
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
+++ /dev/null
@@ -1,204 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
-; RUN: -S < %s | FileCheck %s
-
-
-; Test a simple case of xor to or disjoint transformation
-define half @test_basic_transformation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_basic_transformation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 32
-  %addr2 = xor i64 %base, 2080
-  %addr3 = xor i64 %base, 4128
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test the decreasing order of offset xor to or disjoint transformation
-define half @test_descending_offset_transformation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_descending_offset_transformation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 4128
-  %addr2 = xor i64 %base, 2080
-  %addr3 = xor i64 %base, 32
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test that %addr2 is not transformed to or disjoint.
-define half @test_no_transfomation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_no_transfomation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR2:%.*]] = xor i64 [[BASE]], 64
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 32
-  %addr2 = xor i64 %base, 64  ; Should not be transformed
-  %addr3 = xor i64 %base, 2080
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test case with xor instructions in different basic blocks
-define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) {
-; CHECK-LABEL: define half @test_dom_tree(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 16
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    br label %[[MERGE:.*]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    br label %[[MERGE]]
-; CHECK:       [[MERGE]]:
-; CHECK-NEXT:    [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ]
-; CHECK-NEXT:    [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float
-; CHECK-NEXT:    [[VAL4_F:%.*]] = fpext half [[VAL4]] to float
-; CHECK-NEXT:    [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]]
-; CHECK-NEXT:    [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192   ; Clear low bits
-  %addr1 = xor i64 %base,16
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %val1 = load half, ptr %gep1
-  br i1 %cond, label %then, label %else
-
-then:
-  %addr2 = xor i64 %base, 48
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %val2 = load half, ptr %gep2
-  br label %merge
-
-else:
-  %addr3 = xor i64 %base, 112
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val3 = load half, ptr %gep3
-  br label %merge
-
-merge:
-  %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ]
-  %addr4 = xor i64 %base, 240
-  %gep4 = getelementptr i8, ptr %ptr, i64 %addr4
-  %val4 = load half, ptr %gep4
-  %val1.f = fpext half %val1 to float
-  %val_from_branch.f = fpext half %val_from_branch to float
-  %val4.f = fpext half %val4 to float
-  %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f
-  %final_sum.f = fadd float %sum_intermediate.f, %val4.f
-  %result.h = fptrunc float %final_sum.f to half
-  ret half %result.h
-}
-

>From b9329fe88e47741d9c20ab92f892ac52457e6195 Mon Sep 17 00:00:00 2001
From: Andy Kaylor <akaylor at nvidia.com>
Date: Tue, 10 Jun 2025 16:50:29 -0700
Subject: [PATCH 30/61] [CIR] Upstream support for calling constructors
 (#143579)

This change adds support for calling C++ constructors. Support for
actually defining a constructor is still missing and will be added in a
later change.
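
As a rough sketch of the lowering this enables (plain C++ for illustration, not the CIR API; Struk_C1 is a hypothetical stand-in for the mangled complete-object constructor @_ZN5StrukC1Ev exercised by the new test): the constructor call is arranged like an ordinary call whose first argument is the implicit 'this' pointer and whose result type is void.

struct Struk {
  int a;
};

// Hypothetical lowered form: the constructor becomes a callee taking the
// implicit 'this' pointer as its first (and here only) argument and
// returning void. The body here is arbitrary, since defining constructors
// is not supported yet per the note above.
void Struk_C1(Struk *self) { self->a = 0; }

void baz() {
  Struk s;       // storage, like the cir.alloca in the test
  Struk_C1(&s);  // lowered form: cir.call @_ZN5StrukC1Ev(%s_addr)
}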
---
 clang/include/clang/CIR/MissingFeatures.h     |  3 +
 clang/lib/CIR/CodeGen/CIRGenCall.cpp          | 99 +++++++++++++++++--
 clang/lib/CIR/CodeGen/CIRGenClass.cpp         | 74 ++++++++++++++
 clang/lib/CIR/CodeGen/CIRGenExpr.cpp          | 51 ++++++++++
 clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp |  6 ++
 clang/lib/CIR/CodeGen/CIRGenFunction.h        | 13 +++
 clang/lib/CIR/CodeGen/CIRGenModule.cpp        | 54 +++++++++-
 clang/lib/CIR/CodeGen/CIRGenModule.h          | 19 ++++
 clang/lib/CIR/CodeGen/CIRGenTypes.h           |  6 ++
 clang/test/CIR/CodeGen/ctor.cpp               | 19 ++++
 10 files changed, 336 insertions(+), 8 deletions(-)
 create mode 100644 clang/test/CIR/CodeGen/ctor.cpp

diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h
index 72d882beb2244..f89d386378e51 100644
--- a/clang/include/clang/CIR/MissingFeatures.h
+++ b/clang/include/clang/CIR/MissingFeatures.h
@@ -222,6 +222,9 @@ struct MissingFeatures {
   static bool instrumentation() { return false; }
   static bool cleanupAfterErrorDiags() { return false; }
   static bool cxxRecordStaticMembers() { return false; }
+  static bool isMemcpyEquivalentSpecialMember() { return false; }
+  static bool isTrivialCtorOrDtor() { return false; }
+  static bool implicitConstructorArgs() { return false; }
 
   // Missing types
   static bool dataMemberType() { return false; }
diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
index b194a8670bfb9..9d25eea9e413d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp
@@ -60,6 +60,13 @@ CIRGenCallee CIRGenCallee::prepareConcreteCallee(CIRGenFunction &cgf) const {
   return *this;
 }
 
+/// Returns the canonical formal type of the given C++ method.
+static CanQual<FunctionProtoType> getFormalType(const CXXMethodDecl *md) {
+  return md->getType()
+      ->getCanonicalTypeUnqualified()
+      .getAs<FunctionProtoType>();
+}
+
 /// Adds the formal parameters in FPT to the given prefix. If any parameter in
 /// FPT has pass_object_size_attrs, then we'll add parameters for those, too.
 /// TODO(cir): this should be shared with LLVM codegen
@@ -76,6 +83,48 @@ static void appendParameterTypes(const CIRGenTypes &cgt,
   cgt.getCGModule().errorNYI("appendParameterTypes: hasExtParameterInfos");
 }
 
+const CIRGenFunctionInfo &
+CIRGenTypes::arrangeCXXStructorDeclaration(GlobalDecl gd) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+
+  llvm::SmallVector<CanQualType, 16> argTypes;
+  argTypes.push_back(deriveThisType(md->getParent(), md));
+
+  bool passParams = true;
+
+  if (auto *cd = dyn_cast<CXXConstructorDecl>(md)) {
+    // A base class inheriting constructor doesn't get forwarded arguments
+    // needed to construct a virtual base (or base class thereof)
+    if (cd->getInheritedConstructor())
+      cgm.errorNYI(cd->getSourceRange(),
+                   "arrangeCXXStructorDeclaration: inheriting constructor");
+  }
+
+  CanQual<FunctionProtoType> fpt = getFormalType(md);
+
+  if (passParams)
+    appendParameterTypes(*this, argTypes, fpt);
+
+  assert(!cir::MissingFeatures::implicitConstructorArgs());
+
+  RequiredArgs required =
+      (passParams && md->isVariadic() ? RequiredArgs(argTypes.size())
+                                      : RequiredArgs::All);
+
+  CanQualType resultType = theCXXABI.hasThisReturn(gd) ? argTypes.front()
+                           : theCXXABI.hasMostDerivedReturn(gd)
+                               ? astContext.VoidPtrTy
+                               : astContext.VoidTy;
+
+  assert(!theCXXABI.hasThisReturn(gd) &&
+         "Please send PR with a test and remove this");
+
+  assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo());
+  assert(!cir::MissingFeatures::opCallFnInfoOpts());
+
+  return arrangeCIRFunctionInfo(resultType, argTypes, required);
+}
+
 /// Derives the 'this' type for CIRGen purposes, i.e. ignoring method CVR
 /// qualification. Either or both of `rd` and `md` may be null. A null `rd`
 /// indicates that there is no meaningful 'this' type, and a null `md` can occur
@@ -103,13 +152,13 @@ CanQualType CIRGenTypes::deriveThisType(const CXXRecordDecl *rd,
 /// top of any implicit parameters already stored.
 static const CIRGenFunctionInfo &
 arrangeCIRFunctionInfo(CIRGenTypes &cgt, SmallVectorImpl<CanQualType> &prefix,
-                       CanQual<FunctionProtoType> ftp) {
+                       CanQual<FunctionProtoType> fpt) {
   assert(!cir::MissingFeatures::opCallFnInfoOpts());
   RequiredArgs required =
-      RequiredArgs::getFromProtoWithExtraSlots(ftp, prefix.size());
+      RequiredArgs::getFromProtoWithExtraSlots(fpt, prefix.size());
   assert(!cir::MissingFeatures::opCallExtParameterInfo());
-  appendParameterTypes(cgt, prefix, ftp);
-  CanQualType resultType = ftp->getReturnType().getUnqualifiedType();
+  appendParameterTypes(cgt, prefix, fpt);
+  CanQualType resultType = fpt->getReturnType().getUnqualifiedType();
   return cgt.arrangeCIRFunctionInfo(resultType, prefix, required);
 }
 
@@ -141,6 +190,44 @@ arrangeFreeFunctionLikeCall(CIRGenTypes &cgt, CIRGenModule &cgm,
   return cgt.arrangeCIRFunctionInfo(retType, argTypes, required);
 }
 
+/// Arrange a call to a C++ method, passing the given arguments.
+///
+/// passProtoArgs indicates whether `args` has args for the parameters in the
+/// given CXXConstructorDecl.
+const CIRGenFunctionInfo &CIRGenTypes::arrangeCXXConstructorCall(
+    const CallArgList &args, const CXXConstructorDecl *d, CXXCtorType ctorKind,
+    bool passProtoArgs) {
+
+  // FIXME: Kill copy.
+  llvm::SmallVector<CanQualType, 16> argTypes;
+  for (const auto &arg : args)
+    argTypes.push_back(astContext.getCanonicalParamType(arg.ty));
+
+  assert(!cir::MissingFeatures::implicitConstructorArgs());
+  // +1 for implicit this, which should always be args[0]
+  unsigned totalPrefixArgs = 1;
+
+  CanQual<FunctionProtoType> fpt = getFormalType(d);
+  RequiredArgs required =
+      passProtoArgs
+          ? RequiredArgs::getFromProtoWithExtraSlots(fpt, totalPrefixArgs)
+          : RequiredArgs::All;
+
+  GlobalDecl gd(d, ctorKind);
+  if (theCXXABI.hasThisReturn(gd))
+    cgm.errorNYI(d->getSourceRange(),
+                 "arrangeCXXConstructorCall: hasThisReturn");
+  if (theCXXABI.hasMostDerivedReturn(gd))
+    cgm.errorNYI(d->getSourceRange(),
+                 "arrangeCXXConstructorCall: hasMostDerivedReturn");
+  CanQualType resultType = astContext.VoidTy;
+
+  assert(!cir::MissingFeatures::opCallFnInfoOpts());
+  assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo());
+
+  return arrangeCIRFunctionInfo(resultType, argTypes, required);
+}
+
 /// Arrange a call to a C++ method, passing the given arguments.
 ///
 /// numPrefixArgs is the number of the ABI-specific prefix arguments we have. It
@@ -198,7 +285,7 @@ CIRGenTypes::arrangeCXXMethodDeclaration(const CXXMethodDecl *md) {
 /// constructor or destructor.
 const CIRGenFunctionInfo &
 CIRGenTypes::arrangeCXXMethodType(const CXXRecordDecl *rd,
-                                  const FunctionProtoType *ftp,
+                                  const FunctionProtoType *fpt,
                                   const CXXMethodDecl *md) {
   llvm::SmallVector<CanQualType, 16> argTypes;
 
@@ -208,7 +295,7 @@ CIRGenTypes::arrangeCXXMethodType(const CXXRecordDecl *rd,
   assert(!cir::MissingFeatures::opCallFnInfoOpts());
   return ::arrangeCIRFunctionInfo(
       *this, argTypes,
-      ftp->getCanonicalTypeUnqualified().getAs<FunctionProtoType>());
+      fpt->getCanonicalTypeUnqualified().getAs<FunctionProtoType>());
 }
 
 /// Arrange the argument and result information for the declaration or
diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
index 4cdaa480121dd..8491a66ea6cb4 100644
--- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp
@@ -10,9 +10,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "CIRGenCXXABI.h"
 #include "CIRGenFunction.h"
 
+#include "clang/AST/ExprCXX.h"
 #include "clang/AST/RecordLayout.h"
+#include "clang/AST/Type.h"
 #include "clang/CIR/MissingFeatures.h"
 
 using namespace clang;
@@ -63,3 +66,74 @@ Address CIRGenFunction::getAddressOfBaseClass(
 
   return value;
 }
+
+void CIRGenFunction::emitCXXConstructorCall(const clang::CXXConstructorDecl *d,
+                                            clang::CXXCtorType type,
+                                            bool forVirtualBase,
+                                            bool delegating,
+                                            AggValueSlot thisAVS,
+                                            const clang::CXXConstructExpr *e) {
+  CallArgList args;
+  Address thisAddr = thisAVS.getAddress();
+  QualType thisType = d->getThisType();
+  mlir::Value thisPtr = thisAddr.getPointer();
+
+  assert(!cir::MissingFeatures::addressSpace());
+
+  args.add(RValue::get(thisPtr), thisType);
+
+  // In LLVM Codegen: If this is a trivial constructor, just emit what's needed.
+  // If this is a union copy constructor, we must emit a memcpy, because the AST
+  // does not model that copy.
+  assert(!cir::MissingFeatures::isMemcpyEquivalentSpecialMember());
+
+  const FunctionProtoType *fpt = d->getType()->castAs<FunctionProtoType>();
+
+  assert(!cir::MissingFeatures::opCallArgEvaluationOrder());
+
+  emitCallArgs(args, fpt, e->arguments(), e->getConstructor(),
+               /*ParamsToSkip=*/0);
+
+  assert(!cir::MissingFeatures::sanitizers());
+  emitCXXConstructorCall(d, type, forVirtualBase, delegating, thisAddr, args,
+                         e->getExprLoc());
+}
+
+void CIRGenFunction::emitCXXConstructorCall(
+    const CXXConstructorDecl *d, CXXCtorType type, bool forVirtualBase,
+    bool delegating, Address thisAddr, CallArgList &args, SourceLocation loc) {
+
+  const CXXRecordDecl *crd = d->getParent();
+
+  // If this is a call to a trivial default constructor:
+  // In LLVM: do nothing.
+  // In CIR: emit as a regular call, other later passes should lower the
+  // ctor call into trivial initialization.
+  assert(!cir::MissingFeatures::isTrivialCtorOrDtor());
+
+  assert(!cir::MissingFeatures::isMemcpyEquivalentSpecialMember());
+
+  bool passPrototypeArgs = true;
+
+  // Check whether we can actually emit the constructor before trying to do so.
+  if (d->getInheritedConstructor()) {
+    cgm.errorNYI(d->getSourceRange(),
+                 "emitCXXConstructorCall: inherited constructor");
+    return;
+  }
+
+  // Insert any ABI-specific implicit constructor arguments.
+  assert(!cir::MissingFeatures::implicitConstructorArgs());
+
+  // Emit the call.
+  auto calleePtr = cgm.getAddrOfCXXStructor(GlobalDecl(d, type));
+  const CIRGenFunctionInfo &info = cgm.getTypes().arrangeCXXConstructorCall(
+      args, d, type, passPrototypeArgs);
+  CIRGenCallee callee = CIRGenCallee::forDirect(calleePtr, GlobalDecl(d, type));
+  cir::CIRCallOpInterface c;
+  emitCall(info, callee, ReturnValueSlot(), args, &c, getLoc(loc));
+
+  if (cgm.getCodeGenOpts().OptimizationLevel != 0 && !crd->isDynamicClass() &&
+      type != Ctor_Base && cgm.getCodeGenOpts().StrictVTablePointers)
+    cgm.errorNYI(d->getSourceRange(), "vtable assumption loads");
+}
diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
index 8129fe0ad7db7..f2c2de7a4f59d 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp
@@ -1393,6 +1393,57 @@ RValue CIRGenFunction::emitCXXMemberCallExpr(const CXXMemberCallExpr *ce,
       ce, md, returnValue, hasQualifier, qualifier, isArrow, base);
 }
 
+void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e,
+                                          AggValueSlot dest) {
+  assert(!dest.isIgnored() && "Must have a destination!");
+  const CXXConstructorDecl *cd = e->getConstructor();
+
+  // If we require zero initialization before (or instead of) calling the
+  // constructor, as can be the case with a non-user-provided default
+  // constructor, emit the zero initialization now, unless destination is
+  // already zeroed.
+  if (e->requiresZeroInitialization() && !dest.isZeroed()) {
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitCXXConstructExpr: requires initialization");
+    return;
+  }
+
+  // If this is a call to a trivial default constructor:
+  // In LLVM: do nothing.
+  // In CIR: emit as a regular call, other later passes should lower the
+  // ctor call into trivial initialization.
+
+  // Elide the constructor if we're constructing from a temporary
+  if (getLangOpts().ElideConstructors && e->isElidable()) {
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitCXXConstructExpr: elidable constructor");
+    return;
+  }
+
+  if (getContext().getAsArrayType(e->getType())) {
+    cgm.errorNYI(e->getSourceRange(), "emitCXXConstructExpr: array type");
+    return;
+  }
+
+  clang::CXXCtorType type = Ctor_Complete;
+  bool forVirtualBase = false;
+  bool delegating = false;
+
+  switch (e->getConstructionKind()) {
+  case CXXConstructionKind::Complete:
+    type = Ctor_Complete;
+    break;
+  case CXXConstructionKind::Delegating:
+  case CXXConstructionKind::VirtualBase:
+  case CXXConstructionKind::NonVirtualBase:
+    cgm.errorNYI(e->getSourceRange(),
+                 "emitCXXConstructExpr: other construction kind");
+    return;
+  }
+
+  emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e);
+}
+
 RValue CIRGenFunction::emitReferenceBindingToExpr(const Expr *e) {
   // Emit the expression as an lvalue.
   LValue lv = emitLValue(e);
diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
index 56d7ea3884ba7..f1df1b79fc48e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp
@@ -51,6 +51,7 @@ class AggExprEmitter : public StmtVisitor<AggExprEmitter> {
   void Visit(Expr *e) { StmtVisitor<AggExprEmitter>::Visit(e); }
 
   void VisitInitListExpr(InitListExpr *e);
+  void VisitCXXConstructExpr(const CXXConstructExpr *e);
 
   void visitCXXParenListOrInitListExpr(Expr *e, ArrayRef<Expr *> args,
                                        FieldDecl *initializedFieldInUnion,
@@ -213,6 +214,11 @@ void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) {
   }
 }
 
+void AggExprEmitter::VisitCXXConstructExpr(const CXXConstructExpr *e) {
+  AggValueSlot slot = ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType());
+  cgf.emitCXXConstructExpr(e, slot);
+}
+
 void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc,
                                                     LValue lv) {
   const QualType type = lv.getType();
diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h
index d6002c3e4d4d9..7db7f6928fd8f 100644
--- a/clang/lib/CIR/CodeGen/CIRGenFunction.h
+++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h
@@ -744,6 +744,19 @@ class CIRGenFunction : public CIRGenTypeCache {
 
   mlir::LogicalResult emitContinueStmt(const clang::ContinueStmt &s);
 
+  void emitCXXConstructExpr(const clang::CXXConstructExpr *e,
+                            AggValueSlot dest);
+
+  void emitCXXConstructorCall(const clang::CXXConstructorDecl *d,
+                              clang::CXXCtorType type, bool forVirtualBase,
+                              bool delegating, AggValueSlot thisAVS,
+                              const clang::CXXConstructExpr *e);
+
+  void emitCXXConstructorCall(const clang::CXXConstructorDecl *d,
+                              clang::CXXCtorType type, bool forVirtualBase,
+                              bool delegating, Address thisAddr,
+                              CallArgList &args, clang::SourceLocation loc);
+
   mlir::LogicalResult emitCXXForRangeStmt(const CXXForRangeStmt &s,
                                           llvm::ArrayRef<const Attr *> attrs);
 
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
index 3d46c44b4f1ec..8407f8fad06ba 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp
@@ -103,6 +103,25 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext,
 
 CIRGenModule::~CIRGenModule() = default;
 
+/// FIXME: this could likely be a common helper and not necessarily related
+/// with codegen.
+/// Return the best known alignment for an unknown pointer to a
+/// particular class.
+CharUnits CIRGenModule::getClassPointerAlignment(const CXXRecordDecl *rd) {
+  if (!rd->hasDefinition())
+    return CharUnits::One(); // Hopefully won't be used anywhere.
+
+  auto &layout = astContext.getASTRecordLayout(rd);
+
+  // If the class is final, then we know that the pointer points to an
+  // object of that type and can use the full alignment.
+  if (rd->isEffectivelyFinal())
+    return layout.getAlignment();
+
+  // Otherwise, we have to assume it could be a subclass.
+  return layout.getNonVirtualAlignment();
+}
+
 CharUnits CIRGenModule::getNaturalTypeAlignment(QualType t,
                                                 LValueBaseInfo *baseInfo) {
   assert(!cir::MissingFeatures::opTBAA());
@@ -1174,6 +1193,34 @@ void CIRGenModule::setInitializer(cir::GlobalOp &op, mlir::Attribute value) {
   assert(!cir::MissingFeatures::opGlobalVisibility());
 }
 
+std::pair<cir::FuncType, cir::FuncOp> CIRGenModule::getAddrAndTypeOfCXXStructor(
+    GlobalDecl gd, const CIRGenFunctionInfo *fnInfo, cir::FuncType fnType,
+    bool dontDefer, ForDefinition_t isForDefinition) {
+  auto *md = cast<CXXMethodDecl>(gd.getDecl());
+
+  if (isa<CXXDestructorDecl>(md)) {
+    // Always alias equivalent complete destructors to base destructors in the
+    // MS ABI.
+    if (getTarget().getCXXABI().isMicrosoft() &&
+        gd.getDtorType() == Dtor_Complete &&
+        md->getParent()->getNumVBases() == 0)
+      errorNYI(md->getSourceRange(),
+               "getAddrAndTypeOfCXXStructor: MS ABI complete destructor");
+  }
+
+  if (!fnType) {
+    if (!fnInfo)
+      fnInfo = &getTypes().arrangeCXXStructorDeclaration(gd);
+    fnType = getTypes().getFunctionType(*fnInfo);
+  }
+
+  auto fn = getOrCreateCIRFunction(getMangledName(gd), fnType, gd,
+                                   /*ForVtable=*/false, dontDefer,
+                                   /*IsThunk=*/false, isForDefinition);
+
+  return {fnType, fn};
+}
+
 cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd,
                                             mlir::Type funcType, bool forVTable,
                                             bool dontDefer,
@@ -1248,8 +1295,11 @@ StringRef CIRGenModule::getMangledName(GlobalDecl gd) {
   // Some ABIs don't have constructor variants. Make sure that base and complete
   // constructors get mangled the same.
   if (const auto *cd = dyn_cast<CXXConstructorDecl>(canonicalGd.getDecl())) {
-    errorNYI(cd->getSourceRange(), "getMangledName: C++ constructor");
-    return cast<NamedDecl>(gd.getDecl())->getIdentifier()->getName();
+    if (!getTarget().getCXXABI().hasConstructorVariants()) {
+      errorNYI(cd->getSourceRange(),
+               "getMangledName: C++ constructor without variants");
+      return cast<NamedDecl>(gd.getDecl())->getIdentifier()->getName();
+    }
   }
 
   // Keep the first result in the case of a mangling collision.
diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h
index 24ec9ca6403bc..9748c0b3ed43a 100644
--- a/clang/lib/CIR/CodeGen/CIRGenModule.h
+++ b/clang/lib/CIR/CodeGen/CIRGenModule.h
@@ -166,11 +166,30 @@ class CIRGenModule : public CIRGenTypeCache {
   mlir::Location getLoc(clang::SourceLocation cLoc);
   mlir::Location getLoc(clang::SourceRange cRange);
 
+  /// Return the best known alignment for an unknown pointer to a
+  /// particular class.
+  clang::CharUnits getClassPointerAlignment(const clang::CXXRecordDecl *rd);
+
   /// FIXME: this could likely be a common helper and not necessarily related
   /// with codegen.
   clang::CharUnits getNaturalTypeAlignment(clang::QualType t,
                                            LValueBaseInfo *baseInfo);
 
+  cir::FuncOp
+  getAddrOfCXXStructor(clang::GlobalDecl gd,
+                       const CIRGenFunctionInfo *fnInfo = nullptr,
+                       cir::FuncType fnType = nullptr, bool dontDefer = false,
+                       ForDefinition_t isForDefinition = NotForDefinition) {
+    return getAddrAndTypeOfCXXStructor(gd, fnInfo, fnType, dontDefer,
+                                       isForDefinition)
+        .second;
+  }
+
+  std::pair<cir::FuncType, cir::FuncOp> getAddrAndTypeOfCXXStructor(
+      clang::GlobalDecl gd, const CIRGenFunctionInfo *fnInfo = nullptr,
+      cir::FuncType fnType = nullptr, bool dontDefer = false,
+      ForDefinition_t isForDefinition = NotForDefinition);
+
   /// This contains all the decls which have definitions but which are deferred
   /// for emission and therefore should only be output if they are actually
   /// used. If a decl is in this, then it is known to have not been referenced
diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h
index 48d474beeddec..c2813d79bf63b 100644
--- a/clang/lib/CIR/CodeGen/CIRGenTypes.h
+++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h
@@ -19,6 +19,7 @@
 
 #include "clang/AST/DeclCXX.h"
 #include "clang/AST/Type.h"
+#include "clang/Basic/ABI.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
 
 #include "llvm/ADT/SmallPtrSet.h"
@@ -165,6 +166,10 @@ class CIRGenTypes {
   bool isZeroInitializable(clang::QualType ty);
   bool isZeroInitializable(const RecordDecl *rd);
 
+  const CIRGenFunctionInfo &arrangeCXXConstructorCall(
+      const CallArgList &args, const clang::CXXConstructorDecl *d,
+      clang::CXXCtorType ctorKind, bool passProtoArgs = true);
+
   const CIRGenFunctionInfo &
   arrangeCXXMethodCall(const CallArgList &args,
                        const clang::FunctionProtoType *type,
@@ -173,6 +178,7 @@ class CIRGenTypes {
   /// C++ methods have some special rules and also have implicit parameters.
   const CIRGenFunctionInfo &
   arrangeCXXMethodDeclaration(const clang::CXXMethodDecl *md);
+  const CIRGenFunctionInfo &arrangeCXXStructorDeclaration(clang::GlobalDecl gd);
 
   const CIRGenFunctionInfo &
   arrangeCXXMethodType(const clang::CXXRecordDecl *rd,
diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp
new file mode 100644
index 0000000000000..3a1e82e338c1c
--- /dev/null
+++ b/clang/test/CIR/CodeGen/ctor.cpp
@@ -0,0 +1,19 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir
+// RUN: FileCheck --input-file=%t.cir %s
+
+struct Struk {
+  int a;
+  Struk();
+};
+
+void baz() {
+  Struk s;
+}
+
+// CHECK: !rec_Struk = !cir.record<struct "Struk" {!s32i}>
+
+// CHECK:   cir.func @_ZN5StrukC1Ev(!cir.ptr<!rec_Struk>)
+// CHECK:   cir.func @_Z3bazv()
+// CHECK-NEXT:     %[[S_ADDR:.*]] = cir.alloca !rec_Struk, !cir.ptr<!rec_Struk>, ["s", init] {alignment = 4 : i64}
+// CHECK-NEXT:     cir.call @_ZN5StrukC1Ev(%[[S_ADDR]]) : (!cir.ptr<!rec_Struk>) -> ()
+// CHECK-NEXT:     cir.return

>From 6f62979a5a5bcf70d65f23e0991a274e6df5955b Mon Sep 17 00:00:00 2001
From: George Burgess IV <george.burgess.iv at gmail.com>
Date: Tue, 10 Jun 2025 16:57:16 -0700
Subject: [PATCH 31/61] Revert "[CI] Migrate to runtimes build" (#143612)

Reverts llvm/llvm-project#142696

See https://github.com/llvm/llvm-project/issues/143610 for details; I
believe this PR causes CI builders to build LLVM in a way that has been
broken for a while. If this is the correct culprit, those tests should be
fixed or skipped to keep CI green.
---
 .ci/compute_projects.py         | 115 ++++++++++++--------------------
 .ci/compute_projects_test.py    |  55 ++-------------
 .ci/monolithic-linux.sh         |  13 +---
 .github/workflows/premerge.yaml |   3 +-
 4 files changed, 49 insertions(+), 137 deletions(-)

diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py
index e61b8dc5021f3..40dd0507a9eaf 100644
--- a/.ci/compute_projects.py
+++ b/.ci/compute_projects.py
@@ -49,7 +49,8 @@
     },
     "lld": {"bolt", "cross-project-tests"},
     # TODO(issues/132795): LLDB should be enabled on clang changes.
-    "clang": {"clang-tools-extra", "cross-project-tests"},
+    "clang": {"clang-tools-extra", "compiler-rt", "cross-project-tests"},
+    "clang-tools-extra": {"libc"},
     "mlir": {"flang"},
     # Test everything if ci scripts are changed.
     # FIXME: Figure out what is missing and add here.
@@ -63,15 +64,7 @@
 
 # This mapping describes runtimes that should be tested when the key project is
 # touched.
-DEPENDENT_RUNTIMES_TO_TEST = {
-    "clang": {"compiler-rt"},
-    "clang-tools-extra": {"libc"},
-}
-DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG = {
-    "llvm": {"libcxx", "libcxxabi", "libunwind"},
-    "clang": {"libcxx", "libcxxabi", "libunwind"},
-    ".ci": {"libcxx", "libcxxabi", "libunwind"},
-}
+DEPENDENT_RUNTIMES_TO_TEST = {"clang": {"libcxx", "libcxxabi", "libunwind"}}
 
 EXCLUDE_LINUX = {
     "cross-project-tests",  # TODO(issues/132796): Tests are failing.
@@ -100,6 +93,9 @@
     "cross-project-tests",
     "flang",
     "libc",
+    "libcxx",
+    "libcxxabi",
+    "libunwind",
     "lldb",
     "openmp",
     "polly",
@@ -126,10 +122,10 @@
     "polly": "check-polly",
 }
 
-RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc"}
+RUNTIMES = {"libcxx", "libcxxabi", "libunwind"}
 
 
-def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
+def _add_dependencies(projects: Set[str]) -> Set[str]:
     projects_with_dependents = set(projects)
     current_projects_count = 0
     while current_projects_count != len(projects_with_dependents):
@@ -138,25 +134,9 @@ def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]:
             if project not in PROJECT_DEPENDENCIES:
                 continue
             projects_with_dependents.update(PROJECT_DEPENDENCIES[project])
-    for runtime in runtimes:
-        if runtime not in PROJECT_DEPENDENCIES:
-            continue
-        projects_with_dependents.update(PROJECT_DEPENDENCIES[runtime])
     return projects_with_dependents
 
 
-def _exclude_projects(current_projects: Set[str], platform: str) -> Set[str]:
-    if platform == "Linux":
-        to_exclude = EXCLUDE_LINUX
-    elif platform == "Windows":
-        to_exclude = EXCLUDE_WINDOWS
-    elif platform == "Darwin":
-        to_exclude = EXCLUDE_MAC
-    else:
-        raise ValueError(f"Unexpected platform: {platform}")
-    return current_projects.difference(to_exclude)
-
-
 def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
     projects_to_test = set()
     for modified_project in modified_projects:
@@ -174,14 +154,25 @@ def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set
             ):
                 continue
             projects_to_test.add(dependent_project)
-    projects_to_test = _exclude_projects(projects_to_test, platform)
+    if platform == "Linux":
+        for to_exclude in EXCLUDE_LINUX:
+            if to_exclude in projects_to_test:
+                projects_to_test.remove(to_exclude)
+    elif platform == "Windows":
+        for to_exclude in EXCLUDE_WINDOWS:
+            if to_exclude in projects_to_test:
+                projects_to_test.remove(to_exclude)
+    elif platform == "Darwin":
+        for to_exclude in EXCLUDE_MAC:
+            if to_exclude in projects_to_test:
+                projects_to_test.remove(to_exclude)
+    else:
+        raise ValueError("Unexpected platform.")
     return projects_to_test
 
 
-def _compute_projects_to_build(
-    projects_to_test: Set[str], runtimes: Set[str]
-) -> Set[str]:
-    return _add_dependencies(projects_to_test, runtimes)
+def _compute_projects_to_build(projects_to_test: Set[str]) -> Set[str]:
+    return _add_dependencies(projects_to_test)
 
 
 def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
@@ -193,36 +184,24 @@ def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]:
     return check_targets
 
 
-def _compute_runtimes_to_test(modified_projects: Set[str], platform: str) -> Set[str]:
+def _compute_runtimes_to_test(projects_to_test: Set[str]) -> Set[str]:
     runtimes_to_test = set()
-    for modified_project in modified_projects:
-        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST:
-            continue
-        runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project])
-    return _exclude_projects(runtimes_to_test, platform)
+    for project_to_test in projects_to_test:
+        if project_to_test in DEPENDENT_RUNTIMES_TO_TEST:
+            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[project_to_test])
+        if project_to_test in DEPENDENT_RUNTIMES_TO_BUILD:
+            runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_BUILD[project_to_test])
+    return runtimes_to_test
 
 
-def _compute_runtimes_to_test_needs_reconfig(
-    modified_projects: Set[str], platform: str
-) -> Set[str]:
-    runtimes_to_test = set()
-    for modified_project in modified_projects:
-        if modified_project not in DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG:
+def _compute_runtime_check_targets(projects_to_test: Set[str]) -> Set[str]:
+    check_targets = set()
+    for project_to_test in projects_to_test:
+        if project_to_test not in DEPENDENT_RUNTIMES_TO_TEST:
             continue
-        runtimes_to_test.update(
-            DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG[modified_project]
-        )
-    return _exclude_projects(runtimes_to_test, platform)
-
-
-def _compute_runtimes_to_build(
-    runtimes_to_test: Set[str], modified_projects: Set[str], platform: str
-) -> Set[str]:
-    runtimes_to_build = set(runtimes_to_test)
-    for modified_project in modified_projects:
-        if modified_project in DEPENDENT_RUNTIMES_TO_BUILD:
-            runtimes_to_build.update(DEPENDENT_RUNTIMES_TO_BUILD[modified_project])
-    return _exclude_projects(runtimes_to_build, platform)
+        for runtime_to_test in DEPENDENT_RUNTIMES_TO_TEST[project_to_test]:
+            check_targets.add(PROJECT_CHECK_TARGETS[runtime_to_test])
+    return check_targets
 
 
 def _get_modified_projects(modified_files: list[str]) -> Set[str]:
@@ -246,19 +225,10 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]:
 def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
     modified_projects = _get_modified_projects(modified_files)
     projects_to_test = _compute_projects_to_test(modified_projects, platform)
-    runtimes_to_test = _compute_runtimes_to_test(modified_projects, platform)
-    runtimes_to_test_needs_reconfig = _compute_runtimes_to_test_needs_reconfig(
-        modified_projects, platform
-    )
-    runtimes_to_build = _compute_runtimes_to_build(
-        runtimes_to_test | runtimes_to_test_needs_reconfig, modified_projects, platform
-    )
-    projects_to_build = _compute_projects_to_build(projects_to_test, runtimes_to_build)
+    projects_to_build = _compute_projects_to_build(projects_to_test)
     projects_check_targets = _compute_project_check_targets(projects_to_test)
-    runtimes_check_targets = _compute_project_check_targets(runtimes_to_test)
-    runtimes_check_targets_needs_reconfig = _compute_project_check_targets(
-        runtimes_to_test_needs_reconfig
-    )
+    runtimes_to_build = _compute_runtimes_to_test(projects_to_test)
+    runtimes_check_targets = _compute_runtime_check_targets(projects_to_test)
     # We use a semicolon to separate the projects/runtimes as they get passed
     # to the CMake invocation and thus we need to use the CMake list separator
     # (;). We use spaces to separate the check targets as they end up getting
@@ -268,9 +238,6 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]:
         "project_check_targets": " ".join(sorted(projects_check_targets)),
         "runtimes_to_build": ";".join(sorted(runtimes_to_build)),
         "runtimes_check_targets": " ".join(sorted(runtimes_check_targets)),
-        "runtimes_check_targets_needs_reconfig": " ".join(
-            sorted(runtimes_check_targets_needs_reconfig)
-        ),
     }
 
 
diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py
index 6bc2e34a1cbe1..ae376ea6a43cd 100644
--- a/.ci/compute_projects_test.py
+++ b/.ci/compute_projects_test.py
@@ -26,10 +26,6 @@ def test_llvm(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -50,10 +46,6 @@ def test_llvm_windows(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -74,10 +66,6 @@ def test_llvm_mac(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -87,21 +75,17 @@ def test_clang(self):
         )
         self.assertEqual(
             env_variables["projects_to_build"],
-            "clang;clang-tools-extra;lld;llvm",
+            "clang;clang-tools-extra;compiler-rt;lld;llvm",
         )
         self.assertEqual(
             env_variables["project_check_targets"],
-            "check-clang check-clang-tools",
+            "check-clang check-clang-tools check-compiler-rt",
         )
         self.assertEqual(
-            env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind"
+            env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "check-compiler-rt",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -120,10 +104,6 @@ def test_clang_windows(self):
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -135,7 +115,6 @@ def test_bolt(self):
         self.assertEqual(env_variables["project_check_targets"], "check-bolt")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_lldb(self):
         env_variables = compute_projects.get_env_variables(
@@ -145,7 +124,6 @@ def test_lldb(self):
         self.assertEqual(env_variables["project_check_targets"], "check-lldb")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_mlir(self):
         env_variables = compute_projects.get_env_variables(
@@ -157,7 +135,6 @@ def test_mlir(self):
         )
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_flang(self):
         env_variables = compute_projects.get_env_variables(
@@ -167,7 +144,6 @@ def test_flang(self):
         self.assertEqual(env_variables["project_check_targets"], "check-flang")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_invalid_subproject(self):
         env_variables = compute_projects.get_env_variables(
@@ -177,7 +153,6 @@ def test_invalid_subproject(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_top_level_file(self):
         env_variables = compute_projects.get_env_variables(["README.md"], "Linux")
@@ -185,7 +160,6 @@ def test_top_level_file(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_runtiems_in_projects(self):
         env_variables = compute_projects.get_env_variables(
@@ -195,7 +169,6 @@ def test_exclude_runtiems_in_projects(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_docs(self):
         env_variables = compute_projects.get_env_variables(
@@ -205,7 +178,6 @@ def test_exclude_docs(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_exclude_gn(self):
         env_variables = compute_projects.get_env_variables(
@@ -215,7 +187,6 @@ def test_exclude_gn(self):
         self.assertEqual(env_variables["project_check_targets"], "")
         self.assertEqual(env_variables["runtimes_to_build"], "")
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
     def test_ci(self):
         env_variables = compute_projects.get_env_variables(
@@ -227,15 +198,10 @@ def test_ci(self):
             "check-clang check-lld check-lldb check-llvm",
         )
         self.assertEqual(
-            env_variables["runtimes_to_build"],
-            "libcxx;libcxxabi;libunwind",
+            env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(
             env_variables["runtimes_check_targets"],
-            "",
-        )
-        self.assertEqual(
-            env_variables["runtimes_check_targets_needs_reconfig"],
             "check-cxx check-cxxabi check-unwind",
         )
 
@@ -249,19 +215,6 @@ def test_lldb(self):
             env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind"
         )
         self.assertEqual(env_variables["runtimes_check_targets"], "")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
-
-    def test_clang_tools_extra(self):
-        env_variables = compute_projects.get_env_variables(
-            ["clang-tools-extra/CMakeLists.txt"], "Linux"
-        )
-        self.assertEqual(
-            env_variables["projects_to_build"], "clang;clang-tools-extra;lld;llvm"
-        )
-        self.assertEqual(env_variables["project_check_targets"], "check-clang-tools")
-        self.assertEqual(env_variables["runtimes_to_build"], "libc")
-        self.assertEqual(env_variables["runtimes_check_targets"], "check-libc")
-        self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "")
 
 
 if __name__ == "__main__":
diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh
index c350a58679140..7503ea4e6a992 100755
--- a/.ci/monolithic-linux.sh
+++ b/.ci/monolithic-linux.sh
@@ -57,7 +57,6 @@ projects="${1}"
 targets="${2}"
 runtimes="${3}"
 runtime_targets="${4}"
-runtime_targets_needs_reconfig="${5}"
 
 lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests"
 
@@ -94,15 +93,9 @@ echo "--- ninja"
 # Targets are not escaped as they are passed as separate arguments.
 ninja -C "${BUILD_DIR}" -k 0 ${targets}
 
-if [[ "${runtime_targets}" != "" ]]; then
-  echo "--- ninja runtimes"
-
-  ninja -C "${BUILD_DIR}" ${runtime_targets}
-fi
-
 # Compiling runtimes with just-built Clang and running their tests
 # as additional testing for Clang.
-if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
+if [[ "${runtimes_targets}" != "" ]]; then
   echo "--- cmake runtimes C++26"
 
   cmake \
@@ -112,7 +105,7 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
 
   echo "--- ninja runtimes C++26"
 
-  ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig}
+  ninja -C "${BUILD_DIR}" ${runtime_targets}
 
   echo "--- cmake runtimes clang modules"
 
@@ -123,5 +116,5 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then
 
   echo "--- ninja runtimes clang modules"
 
-  ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig}
+  ninja -C "${BUILD_DIR}" ${runtime_targets}
 fi
diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 4435a3e905768..709b6d03d94c3 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -56,12 +56,11 @@ jobs:
           echo "Running project checks targets: ${project_check_targets}"
           echo "Building runtimes: ${runtimes_to_build}"
           echo "Running runtimes checks targets: ${runtimes_check_targets}"
-          echo "Running runtimes checks requiring reconfiguring targets: ${runtimes_check_targets_needs_reconfig}"
 
           export CC=/opt/llvm/bin/clang
           export CXX=/opt/llvm/bin/clang++
 
-          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}"
+          ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}"
       - name: Upload Artifacts
         uses: actions/upload-artifact at 65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0
         with:

>From 3cef099ceddccefca8e11268624397cde9e04af6 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <aidengrossman at google.com>
Date: Wed, 11 Jun 2025 01:06:13 +0000
Subject: [PATCH 32/61] [TySan][CMake] Depend on tysan for check-tysan in
 runtimes build (#143597)

The runtimes build expects libclang_rt.tysan.a to be available, but the
check-tysan target does not actually depend on it when built using a
runtimes build with LLVM_ENABLE_RUNTIMES pointing at ./llvm. This means
we get test failures when running check-compiler-rt due to the missing
static archive.

This patch makes check-tysan depend on tysan when using the runtimes
build.

This is currently causing premerge failures, since we recently migrated
to the runtimes build.
---
 compiler-rt/test/tysan/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/compiler-rt/test/tysan/CMakeLists.txt b/compiler-rt/test/tysan/CMakeLists.txt
index 76f57501e854e..ce0afa8769f03 100644
--- a/compiler-rt/test/tysan/CMakeLists.txt
+++ b/compiler-rt/test/tysan/CMakeLists.txt
@@ -21,9 +21,7 @@ foreach(arch ${TYSAN_TEST_ARCH})
 endforeach()
 
 set(TYSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS})
-if(NOT COMPILER_RT_STANDALONE_BUILD)
-  list(APPEND TYSAN_TEST_DEPS tysan)
-endif()
+list(APPEND TYSAN_TEST_DEPS tysan)
 
 add_lit_testsuite(check-tysan "Running the TypeSanitizer tests"
   ${TYSAN_TESTSUITES}

>From 67ff66e67734c0b283ec676899e5b89b67fdafcb Mon Sep 17 00:00:00 2001
From: Ethan Luis McDonough <ethanluismcdonough at gmail.com>
Date: Tue, 10 Jun 2025 20:19:38 -0500
Subject: [PATCH 33/61] [PGO][Offload] Fix offload coverage mapping (#143490)

This pull request fixes coverage mapping on GPU targets.

- It adds an address space cast to the coverage mapping generation pass.
- It reads the profiled function names from the ELF directly. Reading
them from public globals was causing issues in cases where multiple
device-code object files are linked together.
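
For reference, the ELF-side lookup can be sketched with the llvm::object
API roughly as follows. This is a hedged sketch only, assuming an
already-loaded ELFObjectFileBase; the readNamesFromELF helper is
hypothetical and not part of this patch:

// Hypothetical sketch: locate a named symbol and return the raw bytes of
// the section it lives in, bypassing any public global.
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/Error.h"

using namespace llvm;
using namespace llvm::object;

static Expected<StringRef> readNamesFromELF(const ELFObjectFileBase &Obj,
                                            StringRef NamesSymbol) {
  for (const ELFSymbolRef &Sym : Obj.symbols()) {
    Expected<StringRef> NameOrErr = Sym.getName();
    if (!NameOrErr)
      return NameOrErr.takeError();
    if (*NameOrErr != NamesSymbol)
      continue;
    // Resolve the section holding the symbol and return its contents.
    Expected<section_iterator> SecOrErr = Sym.getSection();
    if (!SecOrErr)
      return SecOrErr.takeError();
    return (*SecOrErr)->getContents();
  }
  return createStringError(inconvertibleErrorCode(),
                           "profile names symbol not found");
}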
---
 clang/lib/CodeGen/CoverageMappingGen.cpp      |  5 +--
 .../Instrumentation/InstrProfiling.cpp        |  6 ----
 .../common/include/GlobalHandler.h            |  4 +--
 .../common/src/GlobalHandler.cpp              | 31 +++++++++----------
 .../common/src/PluginInterface.cpp            |  7 ++---
 5 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 1788bb4f28697..4aafac349e3e9 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -2622,8 +2622,9 @@ void CoverageMappingModuleGen::emit() {
   CGM.addUsedGlobal(CovData);
   // Create the deferred function records array
   if (!FunctionNames.empty()) {
-    auto NamesArrTy = llvm::ArrayType::get(llvm::PointerType::getUnqual(Ctx),
-                                           FunctionNames.size());
+    auto AddrSpace = FunctionNames.front()->getType()->getPointerAddressSpace();
+    auto NamesArrTy = llvm::ArrayType::get(
+        llvm::PointerType::get(Ctx, AddrSpace), FunctionNames.size());
     auto NamesArrVal = llvm::ConstantArray::get(NamesArrTy, FunctionNames);
     // This variable will *NOT* be emitted to the object file. It is used
     // to pass the list of names referenced to codegen.
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index fe3b0da33a009..5e7548b0a2fd1 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1955,12 +1955,6 @@ void InstrLowerer::emitNameData() {
                                 GlobalValue::PrivateLinkage, NamesVal,
                                 getInstrProfNamesVarName());
 
-  // Make names variable public if current target is a GPU
-  if (isGPUProfTarget(M)) {
-    NamesVar->setLinkage(GlobalValue::ExternalLinkage);
-    NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility);
-  }
-
   NamesSize = CompressedNameStr.size();
   setGlobalVariableLargeSection(TT, *NamesVar);
   NamesVar->setSection(
diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h
index 6def53430a7c0..5d6109df49da5 100644
--- a/offload/plugins-nextgen/common/include/GlobalHandler.h
+++ b/offload/plugins-nextgen/common/include/GlobalHandler.h
@@ -80,6 +80,7 @@ struct GPUProfGlobals {
 
   void dump() const;
   Error write() const;
+  bool empty() const;
 };
 
 /// Subclass of GlobalTy that holds the memory for a global of \p Ty.
@@ -192,9 +193,6 @@ class GenericGlobalHandlerTy {
                                           /*D2H=*/false);
   }
 
-  /// Checks whether a given image contains profiling globals.
-  bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image);
-
   /// Reads profiling data from a GPU image into the supplied profdata struct.
   /// Iterates through the image symbol table and stores global values
   /// with profiling prefixes.
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
index 27d7e8ee2fdf3..5464c197dba78 100644
--- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp
+++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp
@@ -173,16 +173,6 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device,
   return Plugin::success();
 }
 
-bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device,
-                                                 DeviceImageTy &Image) {
-  GlobalTy global(getInstrProfNamesVarName().str(), 0);
-  if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) {
-    consumeError(std::move(Err));
-    return false;
-  }
-  return true;
-}
-
 Expected<GPUProfGlobals>
 GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
                                              DeviceImageTy &Image) {
@@ -204,12 +194,17 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device,
     // Check if the current global is a profiling global based
     // on its name
     if (*NameOrErr == getInstrProfNamesVarName()) {
-      // Read in profiled function names
-      DeviceProfileData.NamesData = SmallVector<uint8_t>(Sym.getSize(), 0);
-      GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(),
-                           DeviceProfileData.NamesData.data());
-      if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal))
-        return Err;
+      // Read in profiled function names from ELF
+      auto SectionOrErr = Sym.getSection();
+      if (!SectionOrErr)
+        return SectionOrErr.takeError();
+
+      auto ContentsOrErr = (*SectionOrErr)->getContents();
+      if (!ContentsOrErr)
+        return ContentsOrErr.takeError();
+
+      SmallVector<uint8_t> NameBytes(ContentsOrErr->bytes());
+      DeviceProfileData.NamesData = NameBytes;
     } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) {
       // Read global variable profiling counts
       SmallVector<int64_t> Counts(Sym.getSize() / sizeof(int64_t), 0);
@@ -322,3 +317,7 @@ Error GPUProfGlobals::write() const {
 
   return Plugin::success();
 }
+
+bool GPUProfGlobals::empty() const {
+  return Counts.empty() && Data.empty() && NamesData.empty();
+}
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index f9e316adad8f4..f9a6b3c1f4324 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -858,14 +858,13 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
 
   for (auto *Image : LoadedImages) {
     GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
-    if (!Handler.hasProfilingGlobals(*this, *Image))
-      continue;
-
-    GPUProfGlobals profdata;
     auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
     if (!ProfOrErr)
       return ProfOrErr.takeError();
 
+    if (ProfOrErr->empty())
+      continue;
+
     // Dump out profdata
     if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) ==
         uint32_t(DeviceDebugKind::PGODump))

>From 841a7f0897272f6412bc2e42a7dd695bf1e8a8cf Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu <min.hsu at sifive.com>
Date: Tue, 10 Jun 2025 18:30:07 -0700
Subject: [PATCH 34/61] [RISCV][NFC] Factor out VLEN in the SiFive7 scheduling
 model (#143629)

In preparation for reusing SiFive7Model for sifive-x390, which has a
VLEN of 1024, it's better (and less chaotic) to factor out the VLEN
parameter from various places first: the plan is to do a major overhaul
of this file in which all the `WriteRes` are encapsulated in a big
`multiclass` with VLEN as one of its template arguments, so that we can
instantiate different scheduling models with different VLENs.

Until that happens, this patch uses a placeholder defvar `SiFive7VLEN`.

NFC.

Co-authored-by: Michael Maitland <michaeltmaitland at gmail.com>
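
As a quick sanity check of the formula being parameterized, here is a
hypothetical standalone recomputation (not part of the patch) of the
one-per-element cycle count c = (VLEN / SEW) * LMUL:

// Recompute the worst-case (M8, SEW=64) cycle count for two VLENs.
#include <cassert>

int main() {
  int SEW = 64, LMUL = 8;              // the M8 worst case
  int VLEN = 512;                      // today's SiFive7VLEN placeholder
  assert((VLEN / SEW) * LMUL == 64);   // 8 elements * LMUL 8
  VLEN = 1024;                         // sifive-x390
  assert((VLEN / SEW) * LMUL == 128);  // same code, doubled VLEN
  return 0;
}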
---
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 64 ++++++++++------------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index af64a871a9292..c1d7cd4a716e7 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -88,9 +88,8 @@ class SiFive7GetCyclesSegmentedSeg2<string mx> {
 
 // Cycles for segmented loads and stores are calculated using the
 // formula vl * ceil((SEW * nf) / DLEN), where SEW * nf is the segment size.
-class SiFive7GetCyclesSegmented<string mx, int sew, int nf> {
-  defvar VLEN = 512;
-  defvar DLEN = 256;
+class SiFive7GetCyclesSegmented<string mx, int sew, int nf, int VLEN> {
+  defvar DLEN = !div(VLEN, 2);
   // (VLEN * LMUL) / SEW
   defvar VLUpperBound  = !cond(
     !eq(mx, "M1") : !div(VLEN, sew),
@@ -107,23 +106,20 @@ class SiFive7GetCyclesSegmented<string mx, int sew, int nf> {
   int c = !mul(VLUpperBound, !div(!sub(!add(a, b), 1), b));
 }
 
-class SiFive7GetCyclesOnePerElement<string mx, int sew> {
-  // FIXME: On SiFive7, VLEN is 512. Although a user can request the compiler
-  // to use a different VLEN, this model will not make scheduling decisions
-  // based on the user specified VLEN.
+class SiFive7GetCyclesOnePerElement<string mx, int sew, int VLEN> {
   // c = ceil(VLEN / SEW) * LMUL
   // Note: c >= 1 since the smallest VLEN / SEW is 512 / 64 = 8, and the
   // largest division performed on that element count is in the MF8 case,
   // with division by 8. Therefore, there is no need to ceil the result.
-  int VLEN = !div(512, sew);
+  int numElements = !div(VLEN, sew);
   int c = !cond(
-    !eq(mx, "M1")  : VLEN,
-    !eq(mx, "M2")  : !mul(VLEN, 2),
-    !eq(mx, "M4")  : !mul(VLEN, 4),
-    !eq(mx, "M8")  : !mul(VLEN, 8),
-    !eq(mx, "MF2") : !div(VLEN, 2),
-    !eq(mx, "MF4") : !div(VLEN, 4),
-    !eq(mx, "MF8") : !div(VLEN, 8)
+    !eq(mx, "M1")  : numElements,
+    !eq(mx, "M2")  : !mul(numElements, 2),
+    !eq(mx, "M4")  : !mul(numElements, 4),
+    !eq(mx, "M8")  : !mul(numElements, 8),
+    !eq(mx, "MF2") : !div(numElements, 2),
+    !eq(mx, "MF4") : !div(numElements, 4),
+    !eq(mx, "MF8") : !div(numElements, 8)
   );
 }
 
@@ -139,10 +135,9 @@ class SiFive7GetDivOrSqrtFactor<int sew> {
 
 /// Cycles for reductions take approximately VL*SEW/DLEN + 5(4 + log(DLEN/SEW))
 /// cycles.
-class SiFive7GetReductionCycles<string mx, int sew> {
+class SiFive7GetReductionCycles<string mx, int sew, int VLEN> {
   // VLUpperBound*SEW/DLEN is equivalent to 2*LMUL since
   // VLUpperBound=(VLEN*LMUL)/SEW.
-  defvar VLEN = 512;
   defvar DLEN = !div(VLEN, 2);
   defvar TwoTimesLMUL = !cond(
     !eq(mx, "M1") : 2,
@@ -160,8 +155,7 @@ class SiFive7GetReductionCycles<string mx, int sew> {
 }
 
 /// Cycles for ordered reductions take approximately 6*VL cycles
-class SiFive7GetOrderedReductionCycles<string mx, int sew> {
-  defvar VLEN = 512;
+class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
   // (VLEN * LMUL) / SEW
   defvar VLUpperBound  = !cond(
     !eq(mx, "M1") : !div(VLEN, sew),
@@ -234,6 +228,8 @@ def SiFive7VCQ         : ProcResource<1>; // Vector Command Queue
 
 def SiFive7PipeAB : ProcResGroup<[SiFive7PipeA, SiFive7PipeB]>;
 
+defvar SiFive7VLEN = 512;
+
 // Branching
 let Latency = 3 in {
 def : WriteRes<WriteJmp, [SiFive7PipeB]>;
@@ -481,7 +477,7 @@ foreach mx = SchedMxList in {
 
 foreach mx = SchedMxList in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 8, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS8",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -501,7 +497,7 @@ foreach mx = SchedMxList in {
 // since LMUL >= 16/64.
 foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 16, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS16",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -518,7 +514,7 @@ foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in {
 }
 foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 32, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS32",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -535,7 +531,7 @@ foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in {
 }
 foreach mx = ["M1", "M2", "M4", "M8"] in {
   defvar VLDSX0Cycles = SiFive7GetCyclesDefault<mx>.c;
-  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64>.c;
+  defvar Cycles = SiFive7GetCyclesOnePerElement<mx, 64, SiFive7VLEN>.c;
   defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
   defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS64",  VLDSX0Pred, [SiFive7VCQ, SiFive7VL],
                                        4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles),
@@ -588,7 +584,7 @@ foreach mx = SchedMxList in {
     let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
     defm "" : LMULWriteResMX<"WriteVSSEG2e" # eew,   [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>;
     foreach nf=3-8 in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf>.c;
+      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, SiFive7VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
       // Does not chain so set latency high
       let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
@@ -603,7 +599,7 @@ foreach mx = SchedMxList in {
 foreach mx = SchedMxList in {
   foreach nf=2-8 in {
     foreach eew = [8, 16, 32, 64] in {
-      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf>.c;
+      defvar Cycles = SiFive7GetCyclesSegmented<mx, eew, nf, SiFive7VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMX<mx, SchedMxList>.c;
       // Does not chain so set latency high
       let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
@@ -669,7 +665,7 @@ foreach mx = SchedMxList in {
 foreach mx = SchedMxList in {
   foreach sew = SchedSEWSet<mx>.val in {
     defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
-                         !div(SiFive7GetCyclesOnePerElement<mx, sew>.c, 4));
+                         !div(SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c, 4));
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
@@ -774,7 +770,7 @@ foreach mx = SchedMxList in {
 foreach mx = SchedMxListF in {
   foreach sew = SchedSEWSet<mx, isF=1>.val in {
     defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor<sew>.c,
-                         !div(SiFive7GetCyclesOnePerElement<mx, sew>.c, 4));
+                         !div(SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c, 4));
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;
@@ -834,7 +830,7 @@ foreach mx = SchedMxListFW in {
 // 14. Vector Reduction Operations
 foreach mx = SchedMxList in {
   foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar Cycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFive7VCQ, SiFive7VA],
@@ -847,7 +843,7 @@ foreach mx = SchedMxList in {
 
 foreach mx = SchedMxListWRed in {
   foreach sew = SchedSEWSet<mx, 0, 1>.val in {
-    defvar Cycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar Cycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
     let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFive7VCQ, SiFive7VA],
@@ -857,7 +853,7 @@ foreach mx = SchedMxListWRed in {
 
 foreach mx = SchedMxListF in {
   foreach sew = SchedSEWSet<mx, 1>.val in {
-    defvar RedCycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar RedCycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
     let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFive7VCQ, SiFive7VA],
@@ -865,7 +861,7 @@ foreach mx = SchedMxListF in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SiFive7VCQ, SiFive7VA],
                                      mx, sew, IsWorstCase>;
     }
-    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew>.c;
+    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, SiFive7VLEN>.c;
     let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFive7VCQ, SiFive7VA],
                                    mx, sew, IsWorstCase>;
@@ -874,12 +870,12 @@ foreach mx = SchedMxListF in {
 
 foreach mx = SchedMxListFWRed in {
   foreach sew = SchedSEWSet<mx, 1, 1>.val in {
-    defvar RedCycles = SiFive7GetReductionCycles<mx, sew>.c;
+    defvar RedCycles = SiFive7GetReductionCycles<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
     let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SiFive7VCQ, SiFive7VA],
                                    mx, sew, IsWorstCase>;
-    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew>.c;
+    defvar OrdRedCycles = SiFive7GetOrderedReductionCycles<mx, sew, SiFive7VLEN>.c;
     let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in
     defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFive7VCQ, SiFive7VA],
                                    mx, sew, IsWorstCase>;
@@ -924,7 +920,7 @@ foreach mx = SchedMxList in {
 
 foreach mx = SchedMxList in {
   foreach sew = SchedSEWSet<mx>.val in {
-    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew>.c;
+    defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, SiFive7VLEN>.c;
     defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
     let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>;

>From 8c890eaa3f4cedb494dc2a8180d9c9219bf76900 Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Wed, 11 Jun 2025 10:19:12 +0800
Subject: [PATCH 35/61] Revert "[SelectionDAG] Make `(a & x) | (~a & y) -> (a &
 (x ^ y)) ^ y` available for all targets" (#143648)

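For context, the identity behind the reverted fold selects x's bit where
a mask bit is set and y's bit otherwise, and it can be checked directly.
A small standalone sketch, not part of this revert:

// Both forms of the masked merge compute the same value.
#include <cassert>
#include <cstdint>

static uint32_t orForm(uint32_t a, uint32_t x, uint32_t y) {
  return (a & x) | (~a & y);     // select x where a is set, else y
}

static uint32_t xorForm(uint32_t a, uint32_t x, uint32_t y) {
  return (a & (x ^ y)) ^ y;      // same selection without an and-not
}

int main() {
  for (uint32_t a : {0u, ~0u, 0x0F0F0F0Fu})
    for (uint32_t x : {0u, 0xDEADBEEFu})
      for (uint32_t y : {0u, 0x12345678u})
        assert(orForm(a, x, y) == xorForm(a, x, y));
  return 0;
}
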
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  57 --
 .../Target/SystemZ/SystemZISelLowering.cpp    |  14 -
 llvm/lib/Target/SystemZ/SystemZISelLowering.h |   1 -
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  58 ++
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |  30 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |  42 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 161 ++---
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  42 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  42 +-
 ...unfold-masked-merge-vector-variablemask.ll | 167 ++---
 llvm/test/CodeGen/RISCV/fold-masked-merge.ll  | 302 ---------
 ...unfold-masked-merge-scalar-variablemask.ll |  62 +-
 .../test/CodeGen/SystemZ/fold-masked-merge.ll | 277 --------
 llvm/test/CodeGen/WebAssembly/simd-arith.ll   | 600 +++++++++++-------
 llvm/test/CodeGen/X86/bitselect.ll            |  50 +-
 llvm/test/CodeGen/X86/fold-masked-merge.ll    |  30 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  26 +-
 ...unfold-masked-merge-vector-variablemask.ll | 598 ++++++++---------
 18 files changed, 1059 insertions(+), 1500 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/fold-masked-merge.ll
 delete mode 100644 llvm/test/CodeGen/SystemZ/fold-masked-merge.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b0da536a3b157..b65e8e06eae62 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8128,59 +8128,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
-static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
-                                   SDValue AndR1, const SDLoc &DL,
-                                   SelectionDAG &DAG) {
-  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
-    return SDValue();
-  SDValue NotOp = AndL0->getOperand(0);
-  if (NotOp == AndR1)
-    std::swap(AndR1, AndL1);
-  if (NotOp != AndL1)
-    return SDValue();
-
-  EVT VT = AndL1->getValueType(0);
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
-  return Xor1;
-}
-
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for targets without a fused
-/// "and-not" operation.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
-                               const TargetLowering &TLI, const SDLoc &DL) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
-
-  // If the target supports and-not, don't fold this.
-  if (TLI.hasAndNot(SDValue(Node, 0)))
-    return SDValue();
-
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
-  return SDValue();
-}
-
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8359,10 +8306,6 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
       return R;
 
-  if (VT.isScalarInteger() && VT != MVT::i1)
-    if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL))
-      return R;
-
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 1c59b1e63b7bc..f06246706aaa9 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1283,20 +1283,6 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
   return true;
 }
 
-bool SystemZTargetLowering::hasAndNot(SDValue Y) const {
-  EVT VT = Y.getValueType();
-
-  // We can use NC(G)RK for types in GPRs ...
-  if (VT == MVT::i32 || VT == MVT::i64)
-    return Subtarget.hasMiscellaneousExtensions3();
-
-  // ... or VNC for types in VRs.
-  if (VT.isVector() || VT == MVT::i128)
-    return Subtarget.hasVector();
-
-  return false;
-}
-
 // Information about the addressing mode for a memory access.
 struct AddressingMode {
   // True if a long displacement is supported.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f2f0bf6d8b410..f3536a840fda8 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -671,7 +671,6 @@ class SystemZTargetLowering : public TargetLowering {
   }
 
   unsigned getStackProbeSize(const MachineFunction &MF) const;
-  bool hasAndNot(SDValue Y) const override;
 
 private:
   const SystemZSubtarget &Subtarget;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 68da901c2f123..96be91256915d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52350,6 +52350,59 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
 }
 
+static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
+                                   SDValue And1_L, SDValue And1_R,
+                                   const SDLoc &DL, SelectionDAG &DAG) {
+  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
+    return SDValue();
+  SDValue NotOp = And0_L->getOperand(0);
+  if (NotOp == And1_R)
+    std::swap(And1_R, And1_L);
+  if (NotOp != And1_L)
+    return SDValue();
+
+  // (~(NotOp) & And0_R) | (NotOp & And1_R)
+  // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R
+  EVT VT = And1_L->getValueType(0);
+  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
+  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
+  return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation. This function is intended to be called from a
+/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+    return SDValue();
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+    return SDValue();
+
+  SDLoc DL(Node);
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue();
+}
+
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
@@ -52753,6 +52806,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
+  // We should fold "masked merge" patterns when `andn` is not available.
+  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG))
+      return R;
+
   if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
     return R;
 
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index b372dec383344..201b97d479c68 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b32 s1, s1, s2
+; GFX7-NEXT:    s_andn2_b32 s2, s2, s0
 ; GFX7-NEXT:    s_and_b32 s0, s1, s0
-; GFX7-NEXT:    s_xor_b32 s0, s0, s2
+; GFX7-NEXT:    s_or_b32 s0, s2, s0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
@@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b32 s1, s1, s2
+; GFX8-NEXT:    s_andn2_b32 s2, s2, s0
 ; GFX8-NEXT:    s_and_b32 s0, s1, s0
-; GFX8-NEXT:    s_xor_b32 s0, s0, s2
+; GFX8-NEXT:    s_or_b32 s0, s2, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
+; GFX10-NEXT:    s_andn2_b32 s2, s2, s0
 ; GFX10-NEXT:    s_and_b32 s0, s1, s0
-; GFX10-NEXT:    s_xor_b32 s0, s0, s2
+; GFX10-NEXT:    s_or_b32 s0, s2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX7-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX7-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX7-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
+; GFX7-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX7-NEXT:    s_add_u32 s0, s0, 10
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX8-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
+; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 10
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
-; GFX10-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX10-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
+; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
 ; GFX10-NEXT:    s_add_u32 s0, s0, 10
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index e1b4cad370f96..6925a98f643b9 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,16 +289,16 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x3c003c00
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_and_b32 s3, s4, s3
-; GCN-NEXT:    s_xor_b32 s2, s3, s2
+; GCN-NEXT:    s_andn2_b32 s2, s2, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
+; GCN-NEXT:    s_or_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x10001
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_and_b32 s3, s4, s3
-; GCN-NEXT:    s_xor_b32 s2, s3, s2
+; GCN-NEXT:    s_andn2_b32 s2, s2, s3
+; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
+; GCN-NEXT:    s_or_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -417,16 +417,16 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_mov_b32 s4, 0x10001
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -442,15 +442,15 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
-; GCN-NEXT:    s_lshl_b32 s6, s6, 3
-; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
-; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
-; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
-; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_lshl_b32 s4, s6, 3
+; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
+; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
+; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
+; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 44bd4090436ef..be16fac4c53f7 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b32 s1, s3, 4
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; SI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; SI-NEXT:    s_and_b32 s0, s0, s1
-; SI-NEXT:    s_xor_b32 s0, s0, s2
+; SI-NEXT:    s_lshl_b32 s0, s3, 4
+; SI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; SI-NEXT:    s_mov_b32 s5, s1
+; SI-NEXT:    s_andn2_b32 s1, s2, s0
+; SI-NEXT:    s_and_b32 s0, s0, 0x50005
+; SI-NEXT:    s_or_b32 s0, s0, s1
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b32 s1, s3, 4
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; VI-NEXT:    s_and_b32 s0, s0, s1
-; VI-NEXT:    s_xor_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s0, s3, 4
+; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; VI-NEXT:    s_mov_b32 s5, s1
+; VI-NEXT:    s_andn2_b32 s1, s2, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0x50005
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s8, s8, 4
+; SI-NEXT:    s_lshl_b32 s0, s8, 4
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_xor_b32 s1, s3, 0x50005
-; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
-; SI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; SI-NEXT:    s_and_b32 s9, s1, 0x50005
+; SI-NEXT:    s_and_b32 s8, s0, 0x50005
+; SI-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_mov_b32 s0, 0x50005
+; VI-NEXT:    s_lshl_b32 s0, s8, 4
+; VI-NEXT:    s_mov_b32 s8, 0x50005
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_mov_b32 s1, s0
-; VI-NEXT:    s_lshl_b32 s8, s8, 4
-; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
+; VI-NEXT:    s_mov_b32 s9, s8
+; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
 ; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1594,34 +1594,35 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x505
-; SI-NEXT:    s_lshl_b32 s5, s5, 3
-; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; SI-NEXT:    s_and_b32 s5, s6, s5
-; SI-NEXT:    s_xor_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; SI-NEXT:    s_andn2_b32 s5, s5, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x505
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x505
-; VI-NEXT:    s_lshl_b32 s5, s5, 3
-; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; VI-NEXT:    s_and_b32 s5, s6, s5
-; VI-NEXT:    s_xor_b32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_and_b32 s6, s4, 0x505
+; VI-NEXT:    s_xor_b32 s4, s4, 0xffff
+; VI-NEXT:    s_and_b32 s4, s4, s5
+; VI-NEXT:    s_or_b32 s4, s6, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1635,17 +1636,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; SI-NEXT:    s_lshl_b32 s5, s5, 3
-; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; SI-NEXT:    s_and_b32 s5, s6, s5
-; SI-NEXT:    s_xor_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; SI-NEXT:    s_andn2_b32 s5, s5, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    s_lshr_b32 s5, s4, 16
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1655,17 +1656,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 ;
 ; VI-LABEL: dynamic_insertelement_v3i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; VI-NEXT:    s_lshl_b32 s5, s5, 3
-; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; VI-NEXT:    s_and_b32 s5, s6, s5
-; VI-NEXT:    s_xor_b32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_andn2_b32 s5, s5, s4
+; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1680,34 +1681,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
-; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; SI-NEXT:    s_lshl_b32 s5, s5, 3
-; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; SI-NEXT:    s_and_b32 s5, s6, s5
-; SI-NEXT:    s_xor_b32 s4, s5, s4
+; SI-NEXT:    s_lshl_b32 s4, s4, 3
+; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; SI-NEXT:    s_andn2_b32 s5, s5, s4
+; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; SI-NEXT:    s_or_b32 s4, s4, s5
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
-; VI-NEXT:    s_lshl_b32 s5, s5, 3
-; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
-; VI-NEXT:    s_and_b32 s5, s6, s5
-; VI-NEXT:    s_xor_b32 s4, s5, s4
+; VI-NEXT:    s_lshl_b32 s4, s4, 3
+; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
+; VI-NEXT:    s_andn2_b32 s5, s5, s4
+; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
+; VI-NEXT:    s_or_b32 s4, s4, s5
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1720,20 +1721,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; SI-LABEL: s_dynamic_insertelement_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
-; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_lshl_b32 s0, s8, 3
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b32 s8, s8, 3
+; SI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; SI-NEXT:    s_and_b32 s9, s1, 0x5050505
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_xor_b32 s1, s3, 0x5050505
-; SI-NEXT:    s_xor_b32 s0, s2, 0x5050505
-; SI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
-; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; SI-NEXT:    s_and_b32 s8, s0, 0x5050505
+; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1742,20 +1743,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; VI-LABEL: s_dynamic_insertelement_v8i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
-; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_lshl_b32 s0, s8, 3
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b32 s8, s8, 3
+; VI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
+; VI-NEXT:    s_and_b32 s9, s1, 0x5050505
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_xor_b32 s1, s3, 0x5050505
-; VI-NEXT:    s_xor_b32 s0, s2, 0x5050505
-; VI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
-; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; VI-NEXT:    s_and_b32 s8, s0, 0x5050505
+; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index a0ad6328b0c01..e0dacb7a59a42 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s7, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s3, s6, 4
-; GFX9-NEXT:    s_xor_b32 s2, s7, 0x3e703e7
-; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GFX9-NEXT:    s_and_b32 s2, s2, s3
-; GFX9-NEXT:    s_xor_b32 s2, s2, s7
+; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
+; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
+; GFX9-NEXT:    s_andn2_b32 s3, s7, s2
+; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
+; GFX9-NEXT:    s_or_b32 s2, s2, s3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
+; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s1, s4, 4
-; VI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
-; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; VI-NEXT:    s_and_b32 s0, s0, s1
-; VI-NEXT:    s_xor_b32 s0, s0, s2
+; VI-NEXT:    s_lshl_b32 s0, s4, 4
+; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; VI-NEXT:    s_andn2_b32 s1, s2, s0
+; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
+; VI-NEXT:    s_or_b32 s0, s0, s1
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    v_mov_b32_e32 v0, s0
+; CI-NEXT:    v_mov_b32_e32 v1, s1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshl_b32 s1, s4, 4
-; CI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
-; CI-NEXT:    s_lshl_b32 s1, 0xffff, s1
-; CI-NEXT:    s_and_b32 s0, s0, s1
-; CI-NEXT:    s_xor_b32 s0, s0, s2
+; CI-NEXT:    s_lshl_b32 s0, s4, 4
+; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
+; CI-NEXT:    s_andn2_b32 s1, s2, s0
+; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
+; CI-NEXT:    s_or_b32 s0, s0, s1
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
-; GFX11-NEXT:    s_xor_b32 s4, s2, 0x3e703e7
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_and_not1_b32 s2, s2, s3
+; GFX11-NEXT:    s_and_b32 s3, s3, 0x3e703e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_and_b32 s3, s4, s3
-; GFX11-NEXT:    s_xor_b32 s2, s3, s2
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index 321b64510c35f..69724aa75af4f 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -5,11 +5,10 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
 ; GCN-LABEL: s_out32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i32 %x, %mask
@@ -23,11 +22,10 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
 ; GCN-LABEL: s_out64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
-; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i64 %x, %mask
@@ -429,11 +427,10 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_varx_42:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s0, 42
+; GCN-NEXT:    s_and_b32 s0, s2, s0
+; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, 42
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -465,11 +462,10 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_varx_42_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s1, s0, 42
+; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
+; GCN-NEXT:    s_and_b32 s1, s2, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s1, s1, s2
-; GCN-NEXT:    s_xor_b32 s0, s1, s0
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -564,11 +560,10 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_42_vary:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s1, 42
+; GCN-NEXT:    s_and_b32 s0, s2, 42
+; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, s1
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -600,11 +595,10 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_42_vary_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_xor_b32 s0, s1, 42
+; GCN-NEXT:    s_and_not1_b32 s0, 42, s2
+; GCN-NEXT:    s_and_b32 s1, s2, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_xor_b32 s0, s0, 42
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT:    s_or_b32 s0, s0, s1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index bac8bbbf0b4de..8e4c77e76029c 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -8,16 +8,17 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [out_v1i8_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_1];
-; CHECK-NEXT:    ld.param.b8 %rs3, [out_v1i8_param_2];
-; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
-; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
-; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
-; CHECK-NEXT:    st.param.b8 [func_retval0], %rs6;
+; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_2];
+; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b8 %rs4, [out_v1i8_param_1];
+; CHECK-NEXT:    not.b16 %rs5, %rs2;
+; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
+; CHECK-NEXT:    st.param.b8 [func_retval0], %rs7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i8> %x, %mask
   %notmask = xor <1 x i8> %mask, <i8 -1>
@@ -33,16 +34,17 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<7>;
+; CHECK-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [out_v1i16_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_1];
-; CHECK-NEXT:    ld.param.b16 %rs3, [out_v1i16_param_2];
-; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
-; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
-; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
+; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_2];
+; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    ld.param.b16 %rs4, [out_v1i16_param_1];
+; CHECK-NEXT:    not.b16 %rs5, %rs2;
+; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
+; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i16> %x, %mask
   %notmask = xor <1 x i16> %mask, <i16 -1>
@@ -124,16 +126,17 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<7>;
+; CHECK-NEXT:    .reg .b32 %r<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [out_v1i32_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_1];
-; CHECK-NEXT:    ld.param.b32 %r3, [out_v1i32_param_2];
-; CHECK-NEXT:    xor.b32 %r4, %r1, %r2;
-; CHECK-NEXT:    and.b32 %r5, %r4, %r3;
-; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_2];
+; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    ld.param.b32 %r4, [out_v1i32_param_1];
+; CHECK-NEXT:    not.b32 %r5, %r2;
+; CHECK-NEXT:    and.b32 %r6, %r4, %r5;
+; CHECK-NEXT:    or.b32 %r7, %r3, %r6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -227,19 +230,21 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v2i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1];
-; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2];
-; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
-; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
-; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
-; CHECK-NEXT:    xor.b32 %r10, %r1, %r3;
-; CHECK-NEXT:    and.b32 %r11, %r10, %r5;
-; CHECK-NEXT:    xor.b32 %r12, %r11, %r3;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r9};
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2];
+; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
+; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
+; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1];
+; CHECK-NEXT:    not.b32 %r9, %r4;
+; CHECK-NEXT:    not.b32 %r10, %r3;
+; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
+; CHECK-NEXT:    and.b32 %r12, %r8, %r9;
+; CHECK-NEXT:    or.b32 %r13, %r6, %r12;
+; CHECK-NEXT:    or.b32 %r14, %r5, %r11;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r13};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i32> %x, %mask
   %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
@@ -251,16 +256,17 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<7>;
+; CHECK-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [out_v1i64_param_0];
-; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_1];
-; CHECK-NEXT:    ld.param.b64 %rd3, [out_v1i64_param_2];
-; CHECK-NEXT:    xor.b64 %rd4, %rd1, %rd2;
-; CHECK-NEXT:    and.b64 %rd5, %rd4, %rd3;
-; CHECK-NEXT:    xor.b64 %rd6, %rd5, %rd2;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
+; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_2];
+; CHECK-NEXT:    and.b64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    ld.param.b64 %rd4, [out_v1i64_param_1];
+; CHECK-NEXT:    not.b64 %rd5, %rd2;
+; CHECK-NEXT:    and.b64 %rd6, %rd4, %rd5;
+; CHECK-NEXT:    or.b64 %rd7, %rd3, %rd6;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -344,25 +350,29 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<25>;
+; CHECK-NEXT:    .reg .b32 %r<29>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1];
-; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2];
-; CHECK-NEXT:    xor.b32 %r13, %r4, %r8;
-; CHECK-NEXT:    and.b32 %r14, %r13, %r12;
-; CHECK-NEXT:    xor.b32 %r15, %r14, %r8;
-; CHECK-NEXT:    xor.b32 %r16, %r3, %r7;
-; CHECK-NEXT:    and.b32 %r17, %r16, %r11;
-; CHECK-NEXT:    xor.b32 %r18, %r17, %r7;
-; CHECK-NEXT:    xor.b32 %r19, %r2, %r6;
-; CHECK-NEXT:    and.b32 %r20, %r19, %r10;
-; CHECK-NEXT:    xor.b32 %r21, %r20, %r6;
-; CHECK-NEXT:    xor.b32 %r22, %r1, %r5;
-; CHECK-NEXT:    and.b32 %r23, %r22, %r9;
-; CHECK-NEXT:    xor.b32 %r24, %r23, %r5;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r24, %r21, %r18, %r15};
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
+; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
+; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
+; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
+; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
+; CHECK-NEXT:    not.b32 %r17, %r8;
+; CHECK-NEXT:    not.b32 %r18, %r7;
+; CHECK-NEXT:    not.b32 %r19, %r6;
+; CHECK-NEXT:    not.b32 %r20, %r5;
+; CHECK-NEXT:    and.b32 %r21, %r13, %r20;
+; CHECK-NEXT:    and.b32 %r22, %r14, %r19;
+; CHECK-NEXT:    and.b32 %r23, %r15, %r18;
+; CHECK-NEXT:    and.b32 %r24, %r16, %r17;
+; CHECK-NEXT:    or.b32 %r25, %r12, %r24;
+; CHECK-NEXT:    or.b32 %r26, %r11, %r23;
+; CHECK-NEXT:    or.b32 %r27, %r10, %r22;
+; CHECK-NEXT:    or.b32 %r28, %r9, %r21;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -374,23 +384,26 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin
 define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32_undef(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<23>;
+; CHECK-NEXT:    .reg .b32 %r<26>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
 ; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r3, %r7;
-; CHECK-NEXT:    ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1];
-; CHECK-NEXT:    xor.b32 %r14, %r4, %r13;
-; CHECK-NEXT:    and.b32 %r15, %r14, %r8;
-; CHECK-NEXT:    xor.b32 %r16, %r15, %r13;
-; CHECK-NEXT:    xor.b32 %r17, %r2, %r11;
-; CHECK-NEXT:    and.b32 %r18, %r17, %r6;
-; CHECK-NEXT:    xor.b32 %r19, %r18, %r11;
-; CHECK-NEXT:    xor.b32 %r20, %r1, %r10;
-; CHECK-NEXT:    and.b32 %r21, %r20, %r5;
-; CHECK-NEXT:    xor.b32 %r22, %r21, %r10;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, %r16};
+; CHECK-NEXT:    and.b32 %r10, %r1, %r5;
+; CHECK-NEXT:    and.b32 %r11, %r2, %r6;
+; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
+; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
+; CHECK-NEXT:    not.b32 %r17, %r8;
+; CHECK-NEXT:    not.b32 %r18, %r6;
+; CHECK-NEXT:    not.b32 %r19, %r5;
+; CHECK-NEXT:    and.b32 %r20, %r13, %r19;
+; CHECK-NEXT:    and.b32 %r21, %r14, %r18;
+; CHECK-NEXT:    and.b32 %r22, %r16, %r17;
+; CHECK-NEXT:    or.b32 %r23, %r12, %r22;
+; CHECK-NEXT:    or.b32 %r24, %r11, %r21;
+; CHECK-NEXT:    or.b32 %r25, %r10, %r20;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
@@ -402,19 +415,21 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v2i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<13>;
+; CHECK-NEXT:    .reg .b64 %rd<15>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2];
-; CHECK-NEXT:    xor.b64 %rd7, %rd2, %rd4;
-; CHECK-NEXT:    and.b64 %rd8, %rd7, %rd6;
-; CHECK-NEXT:    xor.b64 %rd9, %rd8, %rd4;
-; CHECK-NEXT:    xor.b64 %rd10, %rd1, %rd3;
-; CHECK-NEXT:    and.b64 %rd11, %rd10, %rd5;
-; CHECK-NEXT:    xor.b64 %rd12, %rd11, %rd3;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd12, %rd9};
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2];
+; CHECK-NEXT:    and.b64 %rd5, %rd1, %rd3;
+; CHECK-NEXT:    and.b64 %rd6, %rd2, %rd4;
+; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1];
+; CHECK-NEXT:    not.b64 %rd9, %rd4;
+; CHECK-NEXT:    not.b64 %rd10, %rd3;
+; CHECK-NEXT:    and.b64 %rd11, %rd7, %rd10;
+; CHECK-NEXT:    and.b64 %rd12, %rd8, %rd9;
+; CHECK-NEXT:    or.b64 %rd13, %rd6, %rd12;
+; CHECK-NEXT:    or.b64 %rd14, %rd5, %rd11;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd14, %rd13};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i64> %x, %mask
   %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
deleted file mode 100644
index 631b7109281e5..0000000000000
--- a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
+++ /dev/null
@@ -1,302 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I
-; RUN: llc -mtriple=riscv64 < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I
-; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB
-; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
-; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
-;
-; test that masked-merge code is generated as "xor;and;xor" sequence or
-; "andn ; and; or" if and-not is available.
-
-define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-I-LABEL: masked_merge0:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a1, a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    xor a0, a0, a2
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge0:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
-; CHECK-I-LABEL: masked_merge1:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a1, a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    xor a0, a0, a2
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge1:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i16 %a0, %a1
-  %not = xor i16 %a0, -1
-  %and1 = and i16 %a2, %not
-  %or = or i16 %and0, %and1
-  ret i16 %or
-}
-
-define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; CHECK-I-LABEL: masked_merge2:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    mv a0, a1
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge2:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    andn a2, a1, a0
-; CHECK-ZBB-NEXT:    and a0, a1, a0
-; CHECK-ZBB-NEXT:    or a0, a2, a0
-; CHECK-ZBB-NEXT:    ret
-  %not = xor i8 %a0, -1
-  %and0 = and i8 %not, %a1
-  %and1 = and i8 %a1, %a0
-  %or = or i8 %and0, %and1
-  ret i8 %or
-}
-
-define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
-; RV32I-LABEL: masked_merge3:
-; RV32I:       # %bb.0:
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    xor a3, a3, a5
-; RV32I-NEXT:    xor a2, a2, a4
-; RV32I-NEXT:    not a2, a2
-; RV32I-NEXT:    not a3, a3
-; RV32I-NEXT:    and a0, a2, a0
-; RV32I-NEXT:    and a1, a3, a1
-; RV32I-NEXT:    xor a0, a0, a4
-; RV32I-NEXT:    xor a1, a1, a5
-; RV32I-NEXT:    ret
-;
-; RV64I-LABEL: masked_merge3:
-; RV64I:       # %bb.0:
-; RV64I-NEXT:    not a2, a2
-; RV64I-NEXT:    xor a1, a1, a2
-; RV64I-NEXT:    not a1, a1
-; RV64I-NEXT:    and a0, a1, a0
-; RV64I-NEXT:    xor a0, a0, a2
-; RV64I-NEXT:    ret
-;
-; RV32ZBB-LABEL: masked_merge3:
-; RV32ZBB:       # %bb.0:
-; RV32ZBB-NEXT:    not a6, a0
-; RV32ZBB-NEXT:    not a7, a1
-; RV32ZBB-NEXT:    andn a1, a1, a3
-; RV32ZBB-NEXT:    andn a0, a0, a2
-; RV32ZBB-NEXT:    andn a2, a7, a5
-; RV32ZBB-NEXT:    andn a3, a6, a4
-; RV32ZBB-NEXT:    or a0, a3, a0
-; RV32ZBB-NEXT:    or a1, a2, a1
-; RV32ZBB-NEXT:    ret
-;
-; RV64ZBB-LABEL: masked_merge3:
-; RV64ZBB:       # %bb.0:
-; RV64ZBB-NEXT:    not a3, a0
-; RV64ZBB-NEXT:    andn a2, a3, a2
-; RV64ZBB-NEXT:    andn a0, a0, a1
-; RV64ZBB-NEXT:    or a0, a2, a0
-; RV64ZBB-NEXT:    ret
-  %v0 = xor i64 %a1, -1
-  %v1 = xor i64 %a2, -1
-  %not = xor i64 %a0, -1
-  %and0 = and i64 %not, %v1
-  %and1 = and i64 %v0, %a0
-  %or = or i64 %and0, %and1
-  ret i64 %or
-}
-
-define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; RV32-LABEL: not_a_masked_merge0:
-; RV32:       # %bb.0:
-; RV32-NEXT:    and a1, a0, a1
-; RV32-NEXT:    neg a0, a0
-; RV32-NEXT:    and a0, a0, a2
-; RV32-NEXT:    or a0, a1, a0
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: not_a_masked_merge0:
-; RV64:       # %bb.0:
-; RV64-NEXT:    and a1, a0, a1
-; RV64-NEXT:    negw a0, a0
-; RV64-NEXT:    and a0, a0, a2
-; RV64-NEXT:    or a0, a1, a0
-; RV64-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not_a_not = sub i32 0, %a0
-  %and1 = and i32 %not_a_not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
-; CHECK-I-LABEL: not_a_masked_merge1:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a0, a0, a1
-; CHECK-I-NEXT:    not a1, a3
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: not_a_masked_merge1:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a0, a0, a1
-; CHECK-ZBB-NEXT:    andn a1, a2, a3
-; CHECK-ZBB-NEXT:    or a0, a0, a1
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a3, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-I-LABEL: not_a_masked_merge2:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    or a1, a0, a1
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: not_a_masked_merge2:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    or a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %not_an_and0 = or i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %not_an_and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-I-LABEL: not_a_masked_merge3:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    xor a0, a0, a2
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: not_a_masked_merge3:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    xor a0, a0, a2
-; CHECK-ZBB-NEXT:    orn a0, a1, a0
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %not_an_and1 = xor i32 %not, %a2
-  %or = or i32 %and0, %not_an_and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
-; CHECK-LABEL: not_a_masked_merge4:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    and a0, a0, a1
-; CHECK-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a2, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; CHECK-I-LABEL: masked_merge_no_transform0:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    sw a1, 0(a3)
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge_no_transform0:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    sw a1, 0(a3)
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and0, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; CHECK-I-LABEL: masked_merge_no_transform1:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    not a4, a0
-; CHECK-I-NEXT:    and a0, a4, a2
-; CHECK-I-NEXT:    or a0, a1, a0
-; CHECK-I-NEXT:    sw a4, 0(a3)
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge_no_transform1:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    not a4, a0
-; CHECK-ZBB-NEXT:    andn a0, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a0
-; CHECK-ZBB-NEXT:    sw a4, 0(a3)
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %not, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; CHECK-I-LABEL: masked_merge_no_transform2:
-; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    and a1, a0, a1
-; CHECK-I-NEXT:    not a0, a0
-; CHECK-I-NEXT:    and a2, a0, a2
-; CHECK-I-NEXT:    or a0, a1, a2
-; CHECK-I-NEXT:    sw a2, 0(a3)
-; CHECK-I-NEXT:    ret
-;
-; CHECK-ZBB-LABEL: masked_merge_no_transform2:
-; CHECK-ZBB:       # %bb.0:
-; CHECK-ZBB-NEXT:    and a1, a0, a1
-; CHECK-ZBB-NEXT:    andn a2, a2, a0
-; CHECK-ZBB-NEXT:    or a0, a1, a2
-; CHECK-ZBB-NEXT:    sw a2, 0(a3)
-; CHECK-ZBB-NEXT:    ret
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and1, ptr %p1
-  ret i32 %or
-}
diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index efc8243df71e0..1517e524a7f78 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -8,13 +8,16 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
 
+; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't
+; present?
 
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-I-LABEL: out8:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a2, a2
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out8:
@@ -33,9 +36,10 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-I-LABEL: out16:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a2, a2
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out16:
@@ -54,9 +58,10 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out32:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a2, a2
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out32:
@@ -75,19 +80,22 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; RV32I-LABEL: out64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    xor a1, a1, a3
-; RV32I-NEXT:    and a0, a0, a4
 ; RV32I-NEXT:    and a1, a1, a5
-; RV32I-NEXT:    xor a0, a0, a2
-; RV32I-NEXT:    xor a1, a1, a3
+; RV32I-NEXT:    and a0, a0, a4
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    and a3, a3, a5
+; RV32I-NEXT:    and a2, a2, a4
+; RV32I-NEXT:    or a0, a0, a2
+; RV32I-NEXT:    or a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: out64:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    xor a0, a0, a1
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    and a1, a1, a2
+; RV64I-NEXT:    or a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: out64:
@@ -652,9 +660,10 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a0, a0, 42
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    not a1, a2
+; CHECK-I-NEXT:    and a0, a2, a0
+; CHECK-I-NEXT:    andi a1, a1, 42
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42:
@@ -695,9 +704,10 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a1, a0, 42
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    xor a0, a1, a0
+; CHECK-I-NEXT:    not a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    andi a1, a2, 42
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask:
@@ -802,9 +812,10 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a0, a1, 42
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xor a0, a0, a1
+; CHECK-I-NEXT:    not a0, a2
+; CHECK-I-NEXT:    andi a2, a2, 42
+; CHECK-I-NEXT:    and a0, a0, a1
+; CHECK-I-NEXT:    or a0, a2, a0
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary:
@@ -844,9 +855,10 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    xori a0, a1, 42
-; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    not a0, a2
+; CHECK-I-NEXT:    andi a0, a0, 42
+; CHECK-I-NEXT:    and a1, a2, a1
+; CHECK-I-NEXT:    or a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
deleted file mode 100644
index c014345507f69..0000000000000
--- a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
+++ /dev/null
@@ -1,277 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3
-; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3
-
-; test that masked-merge code is generated as "xor;and;xor" sequence or
-; "andn ; and; or" if and-not is available.
-
-define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: masked_merge0:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    xr %r3, %r4
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    xr %r2, %r4
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge0:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    ncrk %r2, %r4, %r2
-; MISC3-NEXT:    or %r2, %r3
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
-; NO-MISC3-LABEL: masked_merge1:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    xr %r3, %r4
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    xr %r2, %r4
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge1:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    ncrk %r0, %r4, %r2
-; MISC3-NEXT:    nr %r2, %r3
-; MISC3-NEXT:    or %r2, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i16 %a0, %a1
-  %not = xor i16 %a0, -1
-  %and1 = and i16 %a2, %not
-  %or = or i16 %and0, %and1
-  ret i16 %or
-}
-
-define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; NO-MISC3-LABEL: masked_merge2:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    lr %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge2:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    lr %r2, %r3
-; MISC3-NEXT:    br %r14
-  %not = xor i8 %a0, -1
-  %and0 = and i8 %not, %a1
-  %and1 = and i8 %a1, %a0
-  %or = or i8 %and0, %and1
-  ret i8 %or
-}
-
-define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
-; NO-MISC3-LABEL: masked_merge3:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    lcgr %r0, %r4
-; NO-MISC3-NEXT:    aghi %r0, -1
-; NO-MISC3-NEXT:    xgr %r3, %r0
-; NO-MISC3-NEXT:    ngr %r3, %r2
-; NO-MISC3-NEXT:    xgr %r3, %r2
-; NO-MISC3-NEXT:    xgrk %r2, %r3, %r0
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge3:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    lcgr %r0, %r2
-; MISC3-NEXT:    aghi %r0, -1
-; MISC3-NEXT:    ncgrk %r0, %r0, %r4
-; MISC3-NEXT:    ncgrk %r2, %r2, %r3
-; MISC3-NEXT:    ogr %r2, %r0
-; MISC3-NEXT:    br %r14
-  %v0 = xor i64 %a1, -1
-  %v1 = xor i64 %a2, -1
-  %not = xor i64 %a0, -1
-  %and0 = and i64 %not, %v1
-  %and1 = and i64 %v0, %a0
-  %or = or i64 %and0, %and1
-  ret i64 %or
-}
-
-define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge0:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    lcr %r0, %r2
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    nr %r0, %r4
-; NO-MISC3-NEXT:    ork %r2, %r3, %r0
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge0:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    lcr %r0, %r2
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    nr %r0, %r4
-; MISC3-NEXT:    ork %r2, %r3, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not_a_not = sub i32 0, %a0
-  %and1 = and i32 %not_a_not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
-; NO-MISC3-LABEL: not_a_masked_merge1:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    xilf %r5, 4294967295
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    nr %r4, %r5
-; NO-MISC3-NEXT:    or %r2, %r4
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge1:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r2, %r3
-; MISC3-NEXT:    ncrk %r0, %r4, %r5
-; MISC3-NEXT:    or %r2, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a3, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge2:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    or %r3, %r2
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r2, %r4
-; NO-MISC3-NEXT:    or %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge2:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    or %r3, %r2
-; MISC3-NEXT:    ncrk %r2, %r4, %r2
-; MISC3-NEXT:    or %r2, %r3
-; MISC3-NEXT:    br %r14
-  %not_an_and0 = or i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %not_an_and0, %and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge3:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    xr %r2, %r4
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    or %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge3:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    xr %r2, %r4
-; MISC3-NEXT:    ocrk %r2, %r3, %r2
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %not_an_and1 = xor i32 %not, %a2
-  %or = or i32 %and0, %not_an_and1
-  ret i32 %or
-}
-
-define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
-; NO-MISC3-LABEL: not_a_masked_merge4:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r2, %r3
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: not_a_masked_merge4:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r2, %r3
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a2, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; NO-MISC3-LABEL: masked_merge_no_transform0:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r2, %r4
-; NO-MISC3-NEXT:    or %r2, %r3
-; NO-MISC3-NEXT:    st %r3, 0(%r5)
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge_no_transform0:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    ncrk %r2, %r4, %r2
-; MISC3-NEXT:    or %r2, %r3
-; MISC3-NEXT:    st %r3, 0(%r5)
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and0, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; NO-MISC3-LABEL: masked_merge_no_transform1:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nrk %r0, %r2, %r3
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r4, %r2
-; NO-MISC3-NEXT:    or %r0, %r4
-; NO-MISC3-NEXT:    st %r2, 0(%r5)
-; NO-MISC3-NEXT:    lr %r2, %r0
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge_no_transform1:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nrk %r0, %r2, %r3
-; MISC3-NEXT:    ncrk %r1, %r4, %r2
-; MISC3-NEXT:    xilf %r2, 4294967295
-; MISC3-NEXT:    or %r0, %r1
-; MISC3-NEXT:    st %r2, 0(%r5)
-; MISC3-NEXT:    lr %r2, %r0
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %not, ptr %p1
-  ret i32 %or
-}
-
-define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
-; NO-MISC3-LABEL: masked_merge_no_transform2:
-; NO-MISC3:       # %bb.0:
-; NO-MISC3-NEXT:    nr %r3, %r2
-; NO-MISC3-NEXT:    xilf %r2, 4294967295
-; NO-MISC3-NEXT:    nr %r4, %r2
-; NO-MISC3-NEXT:    ork %r2, %r3, %r4
-; NO-MISC3-NEXT:    st %r4, 0(%r5)
-; NO-MISC3-NEXT:    br %r14
-;
-; MISC3-LABEL: masked_merge_no_transform2:
-; MISC3:       # %bb.0:
-; MISC3-NEXT:    nr %r3, %r2
-; MISC3-NEXT:    ncrk %r0, %r4, %r2
-; MISC3-NEXT:    ork %r2, %r3, %r0
-; MISC3-NEXT:    st %r0, 0(%r5)
-; MISC3-NEXT:    br %r14
-  %and0 = and i32 %a0, %a1
-  %not = xor i32 %a0, -1
-  %and1 = and i32 %not, %a2
-  %or = or i32 %and0, %and1
-  store i32 %and1, ptr %p1
-  ret i32 %or
-}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index e3607e12bf530..185c46aa5681e 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -4465,139 +4465,203 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v16i8:
 ; NO-SIMD128:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.xor $push0=, $32, $48
-; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $16
-; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $48
-; NO-SIMD128-NEXT:    i32.store8 15($0), $pop2
-; NO-SIMD128-NEXT:    i32.xor $push3=, $31, $47
-; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $15
-; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $47
-; NO-SIMD128-NEXT:    i32.store8 14($0), $pop5
-; NO-SIMD128-NEXT:    i32.xor $push6=, $30, $46
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $14
-; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $46
-; NO-SIMD128-NEXT:    i32.store8 13($0), $pop8
-; NO-SIMD128-NEXT:    i32.xor $push9=, $29, $45
-; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $13
-; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $45
-; NO-SIMD128-NEXT:    i32.store8 12($0), $pop11
-; NO-SIMD128-NEXT:    i32.xor $push12=, $28, $44
-; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $12
-; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $44
-; NO-SIMD128-NEXT:    i32.store8 11($0), $pop14
-; NO-SIMD128-NEXT:    i32.xor $push15=, $27, $43
-; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $11
-; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $43
-; NO-SIMD128-NEXT:    i32.store8 10($0), $pop17
-; NO-SIMD128-NEXT:    i32.xor $push18=, $26, $42
-; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $10
-; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $42
-; NO-SIMD128-NEXT:    i32.store8 9($0), $pop20
-; NO-SIMD128-NEXT:    i32.xor $push21=, $25, $41
-; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $9
-; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $41
-; NO-SIMD128-NEXT:    i32.store8 8($0), $pop23
-; NO-SIMD128-NEXT:    i32.xor $push24=, $24, $40
-; NO-SIMD128-NEXT:    i32.and $push25=, $pop24, $8
-; NO-SIMD128-NEXT:    i32.xor $push26=, $pop25, $40
-; NO-SIMD128-NEXT:    i32.store8 7($0), $pop26
-; NO-SIMD128-NEXT:    i32.xor $push27=, $23, $39
-; NO-SIMD128-NEXT:    i32.and $push28=, $pop27, $7
-; NO-SIMD128-NEXT:    i32.xor $push29=, $pop28, $39
-; NO-SIMD128-NEXT:    i32.store8 6($0), $pop29
-; NO-SIMD128-NEXT:    i32.xor $push30=, $22, $38
-; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $6
-; NO-SIMD128-NEXT:    i32.xor $push32=, $pop31, $38
-; NO-SIMD128-NEXT:    i32.store8 5($0), $pop32
-; NO-SIMD128-NEXT:    i32.xor $push33=, $21, $37
-; NO-SIMD128-NEXT:    i32.and $push34=, $pop33, $5
-; NO-SIMD128-NEXT:    i32.xor $push35=, $pop34, $37
-; NO-SIMD128-NEXT:    i32.store8 4($0), $pop35
-; NO-SIMD128-NEXT:    i32.xor $push36=, $20, $36
-; NO-SIMD128-NEXT:    i32.and $push37=, $pop36, $4
-; NO-SIMD128-NEXT:    i32.xor $push38=, $pop37, $36
-; NO-SIMD128-NEXT:    i32.store8 3($0), $pop38
-; NO-SIMD128-NEXT:    i32.xor $push39=, $19, $35
-; NO-SIMD128-NEXT:    i32.and $push40=, $pop39, $3
-; NO-SIMD128-NEXT:    i32.xor $push41=, $pop40, $35
-; NO-SIMD128-NEXT:    i32.store8 2($0), $pop41
-; NO-SIMD128-NEXT:    i32.xor $push42=, $18, $34
-; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $2
-; NO-SIMD128-NEXT:    i32.xor $push44=, $pop43, $34
-; NO-SIMD128-NEXT:    i32.store8 1($0), $pop44
-; NO-SIMD128-NEXT:    i32.xor $push45=, $17, $33
-; NO-SIMD128-NEXT:    i32.and $push46=, $pop45, $1
-; NO-SIMD128-NEXT:    i32.xor $push47=, $pop46, $33
-; NO-SIMD128-NEXT:    i32.store8 0($0), $pop47
+; NO-SIMD128-NEXT:    i32.and $push0=, $16, $32
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $48
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-NEXT:    i32.store8 15($0), $pop4
+; NO-SIMD128-NEXT:    i32.and $push5=, $15, $31
+; NO-SIMD128-NEXT:    i32.const $push79=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $15, $pop79
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $47
+; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT:    i32.store8 14($0), $pop8
+; NO-SIMD128-NEXT:    i32.and $push9=, $14, $30
+; NO-SIMD128-NEXT:    i32.const $push78=, -1
+; NO-SIMD128-NEXT:    i32.xor $push10=, $14, $pop78
+; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $46
+; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT:    i32.store8 13($0), $pop12
+; NO-SIMD128-NEXT:    i32.and $push13=, $13, $29
+; NO-SIMD128-NEXT:    i32.const $push77=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $13, $pop77
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $45
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-NEXT:    i32.store8 12($0), $pop16
+; NO-SIMD128-NEXT:    i32.and $push17=, $12, $28
+; NO-SIMD128-NEXT:    i32.const $push76=, -1
+; NO-SIMD128-NEXT:    i32.xor $push18=, $12, $pop76
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $44
+; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT:    i32.store8 11($0), $pop20
+; NO-SIMD128-NEXT:    i32.and $push21=, $11, $27
+; NO-SIMD128-NEXT:    i32.const $push75=, -1
+; NO-SIMD128-NEXT:    i32.xor $push22=, $11, $pop75
+; NO-SIMD128-NEXT:    i32.and $push23=, $pop22, $43
+; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT:    i32.store8 10($0), $pop24
+; NO-SIMD128-NEXT:    i32.and $push25=, $10, $26
+; NO-SIMD128-NEXT:    i32.const $push74=, -1
+; NO-SIMD128-NEXT:    i32.xor $push26=, $10, $pop74
+; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $42
+; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT:    i32.store8 9($0), $pop28
+; NO-SIMD128-NEXT:    i32.and $push29=, $9, $25
+; NO-SIMD128-NEXT:    i32.const $push73=, -1
+; NO-SIMD128-NEXT:    i32.xor $push30=, $9, $pop73
+; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $41
+; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop32
+; NO-SIMD128-NEXT:    i32.and $push33=, $8, $24
+; NO-SIMD128-NEXT:    i32.const $push72=, -1
+; NO-SIMD128-NEXT:    i32.xor $push34=, $8, $pop72
+; NO-SIMD128-NEXT:    i32.and $push35=, $pop34, $40
+; NO-SIMD128-NEXT:    i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-NEXT:    i32.store8 7($0), $pop36
+; NO-SIMD128-NEXT:    i32.and $push37=, $7, $23
+; NO-SIMD128-NEXT:    i32.const $push71=, -1
+; NO-SIMD128-NEXT:    i32.xor $push38=, $7, $pop71
+; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $39
+; NO-SIMD128-NEXT:    i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-NEXT:    i32.store8 6($0), $pop40
+; NO-SIMD128-NEXT:    i32.and $push41=, $6, $22
+; NO-SIMD128-NEXT:    i32.const $push70=, -1
+; NO-SIMD128-NEXT:    i32.xor $push42=, $6, $pop70
+; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $38
+; NO-SIMD128-NEXT:    i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-NEXT:    i32.store8 5($0), $pop44
+; NO-SIMD128-NEXT:    i32.and $push45=, $5, $21
+; NO-SIMD128-NEXT:    i32.const $push69=, -1
+; NO-SIMD128-NEXT:    i32.xor $push46=, $5, $pop69
+; NO-SIMD128-NEXT:    i32.and $push47=, $pop46, $37
+; NO-SIMD128-NEXT:    i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop48
+; NO-SIMD128-NEXT:    i32.and $push49=, $4, $20
+; NO-SIMD128-NEXT:    i32.const $push68=, -1
+; NO-SIMD128-NEXT:    i32.xor $push50=, $4, $pop68
+; NO-SIMD128-NEXT:    i32.and $push51=, $pop50, $36
+; NO-SIMD128-NEXT:    i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-NEXT:    i32.store8 3($0), $pop52
+; NO-SIMD128-NEXT:    i32.and $push53=, $3, $19
+; NO-SIMD128-NEXT:    i32.const $push67=, -1
+; NO-SIMD128-NEXT:    i32.xor $push54=, $3, $pop67
+; NO-SIMD128-NEXT:    i32.and $push55=, $pop54, $35
+; NO-SIMD128-NEXT:    i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop56
+; NO-SIMD128-NEXT:    i32.and $push57=, $2, $18
+; NO-SIMD128-NEXT:    i32.const $push66=, -1
+; NO-SIMD128-NEXT:    i32.xor $push58=, $2, $pop66
+; NO-SIMD128-NEXT:    i32.and $push59=, $pop58, $34
+; NO-SIMD128-NEXT:    i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop60
+; NO-SIMD128-NEXT:    i32.and $push61=, $1, $17
+; NO-SIMD128-NEXT:    i32.const $push65=, -1
+; NO-SIMD128-NEXT:    i32.xor $push62=, $1, $pop65
+; NO-SIMD128-NEXT:    i32.and $push63=, $pop62, $33
+; NO-SIMD128-NEXT:    i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop64
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
 ; NO-SIMD128-FAST:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $17, $33
-; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $33
-; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $18, $34
-; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $34
-; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $19, $35
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
-; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $35
-; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $20, $36
-; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
-; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $36
-; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop11
-; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $21, $37
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $37
-; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop14
-; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $22, $38
-; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
-; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $38
-; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop17
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $23, $39
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
-; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $39
-; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $24, $40
-; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $40
-; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop23
-; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $25, $41
-; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $9
-; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $41
-; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
-; NO-SIMD128-FAST-NEXT:    i32.xor $push27=, $26, $42
-; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $10
-; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $pop28, $42
-; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop29
-; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $27, $43
-; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $11
-; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $pop31, $43
-; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop32
-; NO-SIMD128-FAST-NEXT:    i32.xor $push33=, $28, $44
-; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $12
-; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $pop34, $44
-; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop35
-; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $29, $45
-; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $13
-; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $pop37, $45
-; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop38
-; NO-SIMD128-FAST-NEXT:    i32.xor $push39=, $30, $46
-; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $14
-; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $pop40, $46
-; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop41
-; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $31, $47
-; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $15
-; NO-SIMD128-FAST-NEXT:    i32.xor $push44=, $pop43, $47
-; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop44
-; NO-SIMD128-FAST-NEXT:    i32.xor $push45=, $32, $48
-; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $pop45, $16
-; NO-SIMD128-FAST-NEXT:    i32.xor $push47=, $pop46, $48
-; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop47
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $17
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $33
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $18
+; NO-SIMD128-FAST-NEXT:    i32.const $push79=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop79
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $34
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $19
+; NO-SIMD128-FAST-NEXT:    i32.const $push78=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop78
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $35
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $20
+; NO-SIMD128-FAST-NEXT:    i32.const $push77=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop77
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $36
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $5, $21
+; NO-SIMD128-FAST-NEXT:    i32.const $push76=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop76
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $37
+; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $6, $22
+; NO-SIMD128-FAST-NEXT:    i32.const $push75=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop75
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $38
+; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $7, $23
+; NO-SIMD128-FAST-NEXT:    i32.const $push74=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop74
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $pop26, $39
+; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $24
+; NO-SIMD128-FAST-NEXT:    i32.const $push73=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop73
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $40
+; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $9, $25
+; NO-SIMD128-FAST-NEXT:    i32.const $push72=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push34=, $9, $pop72
+; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $pop34, $41
+; NO-SIMD128-FAST-NEXT:    i32.or $push36=, $pop33, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop36
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $10, $26
+; NO-SIMD128-FAST-NEXT:    i32.const $push71=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $10, $pop71
+; NO-SIMD128-FAST-NEXT:    i32.and $push39=, $pop38, $42
+; NO-SIMD128-FAST-NEXT:    i32.or $push40=, $pop37, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop40
+; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $11, $27
+; NO-SIMD128-FAST-NEXT:    i32.const $push70=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $11, $pop70
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $43
+; NO-SIMD128-FAST-NEXT:    i32.or $push44=, $pop41, $pop43
+; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $12, $28
+; NO-SIMD128-FAST-NEXT:    i32.const $push69=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $12, $pop69
+; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $pop46, $44
+; NO-SIMD128-FAST-NEXT:    i32.or $push48=, $pop45, $pop47
+; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop48
+; NO-SIMD128-FAST-NEXT:    i32.and $push49=, $13, $29
+; NO-SIMD128-FAST-NEXT:    i32.const $push68=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push50=, $13, $pop68
+; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $pop50, $45
+; NO-SIMD128-FAST-NEXT:    i32.or $push52=, $pop49, $pop51
+; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop52
+; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $14, $30
+; NO-SIMD128-FAST-NEXT:    i32.const $push67=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push54=, $14, $pop67
+; NO-SIMD128-FAST-NEXT:    i32.and $push55=, $pop54, $46
+; NO-SIMD128-FAST-NEXT:    i32.or $push56=, $pop53, $pop55
+; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop56
+; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $15, $31
+; NO-SIMD128-FAST-NEXT:    i32.const $push66=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push58=, $15, $pop66
+; NO-SIMD128-FAST-NEXT:    i32.and $push59=, $pop58, $47
+; NO-SIMD128-FAST-NEXT:    i32.or $push60=, $pop57, $pop59
+; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop60
+; NO-SIMD128-FAST-NEXT:    i32.and $push61=, $16, $32
+; NO-SIMD128-FAST-NEXT:    i32.const $push65=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push62=, $16, $pop65
+; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $48
+; NO-SIMD128-FAST-NEXT:    i32.or $push64=, $pop61, $pop63
+; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop64
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <16 x i8> %c, %v1
   %inv_mask = xor <16 x i8> %c,
@@ -7482,75 +7546,107 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v8i16:
 ; NO-SIMD128:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.xor $push0=, $16, $24
-; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $8
-; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $24
-; NO-SIMD128-NEXT:    i32.store16 14($0), $pop2
-; NO-SIMD128-NEXT:    i32.xor $push3=, $15, $23
-; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $7
-; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $23
-; NO-SIMD128-NEXT:    i32.store16 12($0), $pop5
-; NO-SIMD128-NEXT:    i32.xor $push6=, $14, $22
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $6
-; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $22
-; NO-SIMD128-NEXT:    i32.store16 10($0), $pop8
-; NO-SIMD128-NEXT:    i32.xor $push9=, $13, $21
-; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $5
-; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $21
-; NO-SIMD128-NEXT:    i32.store16 8($0), $pop11
-; NO-SIMD128-NEXT:    i32.xor $push12=, $12, $20
-; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $4
-; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $20
-; NO-SIMD128-NEXT:    i32.store16 6($0), $pop14
-; NO-SIMD128-NEXT:    i32.xor $push15=, $11, $19
-; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $3
-; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $19
-; NO-SIMD128-NEXT:    i32.store16 4($0), $pop17
-; NO-SIMD128-NEXT:    i32.xor $push18=, $10, $18
-; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $2
-; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $18
-; NO-SIMD128-NEXT:    i32.store16 2($0), $pop20
-; NO-SIMD128-NEXT:    i32.xor $push21=, $9, $17
-; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $1
-; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $17
-; NO-SIMD128-NEXT:    i32.store16 0($0), $pop23
+; NO-SIMD128-NEXT:    i32.and $push0=, $16, $8
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $24, $pop2
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-NEXT:    i32.store16 14($0), $pop4
+; NO-SIMD128-NEXT:    i32.and $push5=, $15, $7
+; NO-SIMD128-NEXT:    i32.const $push39=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $7, $pop39
+; NO-SIMD128-NEXT:    i32.and $push7=, $23, $pop6
+; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-NEXT:    i32.store16 12($0), $pop8
+; NO-SIMD128-NEXT:    i32.and $push9=, $14, $6
+; NO-SIMD128-NEXT:    i32.const $push38=, -1
+; NO-SIMD128-NEXT:    i32.xor $push10=, $6, $pop38
+; NO-SIMD128-NEXT:    i32.and $push11=, $22, $pop10
+; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-NEXT:    i32.store16 10($0), $pop12
+; NO-SIMD128-NEXT:    i32.and $push13=, $13, $5
+; NO-SIMD128-NEXT:    i32.const $push37=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $5, $pop37
+; NO-SIMD128-NEXT:    i32.and $push15=, $21, $pop14
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop16
+; NO-SIMD128-NEXT:    i32.and $push17=, $12, $4
+; NO-SIMD128-NEXT:    i32.const $push36=, -1
+; NO-SIMD128-NEXT:    i32.xor $push18=, $4, $pop36
+; NO-SIMD128-NEXT:    i32.and $push19=, $20, $pop18
+; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-NEXT:    i32.store16 6($0), $pop20
+; NO-SIMD128-NEXT:    i32.and $push21=, $11, $3
+; NO-SIMD128-NEXT:    i32.const $push35=, -1
+; NO-SIMD128-NEXT:    i32.xor $push22=, $3, $pop35
+; NO-SIMD128-NEXT:    i32.and $push23=, $19, $pop22
+; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop24
+; NO-SIMD128-NEXT:    i32.and $push25=, $10, $2
+; NO-SIMD128-NEXT:    i32.const $push34=, -1
+; NO-SIMD128-NEXT:    i32.xor $push26=, $2, $pop34
+; NO-SIMD128-NEXT:    i32.and $push27=, $18, $pop26
+; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop28
+; NO-SIMD128-NEXT:    i32.and $push29=, $9, $1
+; NO-SIMD128-NEXT:    i32.const $push33=, -1
+; NO-SIMD128-NEXT:    i32.xor $push30=, $1, $pop33
+; NO-SIMD128-NEXT:    i32.and $push31=, $17, $pop30
+; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop32
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
 ; NO-SIMD128-FAST:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $9, $17
-; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $17
-; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $10, $18
-; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $18
-; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $11, $19
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
-; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $19
-; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $12, $20
-; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
-; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $20
-; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop11
-; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $13, $21
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $21
-; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop14
-; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $14, $22
-; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
-; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $22
-; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop17
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $15, $23
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
-; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $23
-; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $16, $24
-; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $24
-; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $9, $1
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $17, $pop2
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $2
+; NO-SIMD128-FAST-NEXT:    i32.const $push39=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop39
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $18, $pop6
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $3
+; NO-SIMD128-FAST-NEXT:    i32.const $push38=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop38
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $19, $pop10
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $4
+; NO-SIMD128-FAST-NEXT:    i32.const $push37=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop37
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $20, $pop14
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
+; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $13, $5
+; NO-SIMD128-FAST-NEXT:    i32.const $push36=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop36
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $21, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $14, $6
+; NO-SIMD128-FAST-NEXT:    i32.const $push35=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop35
+; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $22, $pop22
+; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
+; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop24
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $15, $7
+; NO-SIMD128-FAST-NEXT:    i32.const $push34=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop34
+; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $23, $pop26
+; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
+; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop28
+; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $16, $8
+; NO-SIMD128-FAST-NEXT:    i32.const $push33=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop33
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $24, $pop30
+; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
+; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop32
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <8 x i16> %v1, %c
   %inv_mask = xor <8 x i16>
@@ -9357,43 +9453,59 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v4i32:
 ; NO-SIMD128:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.xor $push0=, $8, $12
-; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $4
-; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $12
-; NO-SIMD128-NEXT:    i32.store 12($0), $pop2
-; NO-SIMD128-NEXT:    i32.xor $push3=, $7, $11
-; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $3
-; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $11
-; NO-SIMD128-NEXT:    i32.store 8($0), $pop5
-; NO-SIMD128-NEXT:    i32.xor $push6=, $6, $10
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $2
-; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $10
-; NO-SIMD128-NEXT:    i32.store 4($0), $pop8
-; NO-SIMD128-NEXT:    i32.xor $push9=, $5, $9
-; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $1
-; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $9
-; NO-SIMD128-NEXT:    i32.store 0($0), $pop11
+; NO-SIMD128-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-NEXT:    i32.xor $push2=, $4, $pop1
+; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $12
+; NO-SIMD128-NEXT:    i32.and $push0=, $4, $8
+; NO-SIMD128-NEXT:    i32.or $push4=, $pop3, $pop0
+; NO-SIMD128-NEXT:    i32.store 12($0), $pop4
+; NO-SIMD128-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-NEXT:    i32.xor $push6=, $3, $pop19
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $11
+; NO-SIMD128-NEXT:    i32.and $push5=, $3, $7
+; NO-SIMD128-NEXT:    i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-NEXT:    i32.const $push18=, -1
+; NO-SIMD128-NEXT:    i32.xor $push10=, $2, $pop18
+; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $10
+; NO-SIMD128-NEXT:    i32.and $push9=, $2, $6
+; NO-SIMD128-NEXT:    i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop12
+; NO-SIMD128-NEXT:    i32.const $push17=, -1
+; NO-SIMD128-NEXT:    i32.xor $push14=, $1, $pop17
+; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $9
+; NO-SIMD128-NEXT:    i32.and $push13=, $1, $5
+; NO-SIMD128-NEXT:    i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop16
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
 ; NO-SIMD128-FAST:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $5, $9
-; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $9
-; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $6, $10
-; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $10
-; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $7, $11
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
-; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $11
-; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $8, $12
-; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
-; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $12
-; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $9
+; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $5
+; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop3, $pop0
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop19
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $10
+; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $6
+; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop7, $pop5
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.const $push18=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop18
+; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $11
+; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $7
+; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop11, $pop9
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop12
+; NO-SIMD128-FAST-NEXT:    i32.const $push17=, -1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop17
+; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $12
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $8
+; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop15, $pop13
+; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop16
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <4 x i32> %c, %v1
   %inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -10862,27 +10974,35 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v2i64:
 ; NO-SIMD128:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i64.xor $push0=, $4, $6
-; NO-SIMD128-NEXT:    i64.and $push1=, $pop0, $2
-; NO-SIMD128-NEXT:    i64.xor $push2=, $pop1, $6
-; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
-; NO-SIMD128-NEXT:    i64.xor $push3=, $3, $5
-; NO-SIMD128-NEXT:    i64.and $push4=, $pop3, $1
-; NO-SIMD128-NEXT:    i64.xor $push5=, $pop4, $5
-; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
+; NO-SIMD128-NEXT:    i64.const $push1=, -1
+; NO-SIMD128-NEXT:    i64.xor $push2=, $2, $pop1
+; NO-SIMD128-NEXT:    i64.and $push3=, $6, $pop2
+; NO-SIMD128-NEXT:    i64.and $push0=, $4, $2
+; NO-SIMD128-NEXT:    i64.or $push4=, $pop3, $pop0
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop4
+; NO-SIMD128-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-NEXT:    i64.xor $push6=, $1, $pop9
+; NO-SIMD128-NEXT:    i64.and $push7=, $5, $pop6
+; NO-SIMD128-NEXT:    i64.and $push5=, $3, $1
+; NO-SIMD128-NEXT:    i64.or $push8=, $pop7, $pop5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop8
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v2i64:
 ; NO-SIMD128-FAST:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i64.xor $push0=, $3, $5
-; NO-SIMD128-FAST-NEXT:    i64.and $push1=, $pop0, $1
-; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $pop1, $5
-; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
-; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $4, $6
-; NO-SIMD128-FAST-NEXT:    i64.and $push4=, $pop3, $2
-; NO-SIMD128-FAST-NEXT:    i64.xor $push5=, $pop4, $6
-; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i64.const $push1=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $1, $pop1
+; NO-SIMD128-FAST-NEXT:    i64.and $push3=, $5, $pop2
+; NO-SIMD128-FAST-NEXT:    i64.and $push0=, $3, $1
+; NO-SIMD128-FAST-NEXT:    i64.or $push4=, $pop3, $pop0
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop4
+; NO-SIMD128-FAST-NEXT:    i64.const $push9=, -1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push6=, $2, $pop9
+; NO-SIMD128-FAST-NEXT:    i64.and $push7=, $6, $pop6
+; NO-SIMD128-FAST-NEXT:    i64.and $push5=, $4, $2
+; NO-SIMD128-FAST-NEXT:    i64.or $push8=, $pop7, $pop5
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop8
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <2 x i64> %v1, %c
   %inv_mask = xor <2 x i64> <i64 -1, i64 -1>, %c
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 4fc0827ac4dd6..2922113b14ea9 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
 
 ; PR46472
 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
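; As an aside, a minimal self-contained sketch of this identity in scalar
; LLVM IR -- illustrative only, not part of the patch, and the function
; name is hypothetical:
;
;   define i32 @bitselect_sketch(i32 %a, i32 %b, i32 %m) {
;     %not.m = xor i32 %m, -1     ; not(m)
;     %ma = and i32 %a, %not.m    ; a & ~m: bits of %a where %m is clear
;     %mb = and i32 %b, %m        ; b & m:  bits of %b where %m is set
;     %r = or i32 %ma, %mb        ; merge the two disjoint halves
;     ret i32 %r
;   }
;
; The check updates below reflect the same value computed either in this
; or/and/not form or via the xor trick ((a ^ b) & m) ^ a, which picks %b
; where %m is set and %a where it is clear.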
@@ -17,22 +17,14 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    retl
 ;
-; X64-NOBMI-LABEL: bitselect_i8:
-; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %esi, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
-; X64-NOBMI-NEXT:    andl %edx, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
-; X64-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
-; X64-NOBMI-NEXT:    retq
-;
-; X64-BMI-LABEL: bitselect_i8:
-; X64-BMI:       # %bb.0:
-; X64-BMI-NEXT:    andnl %edi, %edx, %eax
-; X64-BMI-NEXT:    andl %edx, %esi
-; X64-BMI-NEXT:    orl %esi, %eax
-; X64-BMI-NEXT:    # kill: def $al killed $al killed $eax
-; X64-BMI-NEXT:    retq
+; X64-LABEL: bitselect_i8:
+; X64:       # %bb.0:
+; X64-NEXT:    andl %edx, %esi
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    notb %al
+; X64-NEXT:    andb %dil, %al
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    retq
   %not = xor i8 %m, -1
   %ma = and i8 %a, %not
   %mb = and i8 %b, %m
@@ -43,20 +35,21 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind {
 ; X86-LABEL: bitselect_i16:
 ; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    xorw %cx, %ax
-; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    xorw %ax, %cx
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %cx
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bitselect_i16:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %esi, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
-; X64-NOBMI-NEXT:    andl %edx, %eax
-; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    movl %edx, %eax
+; X64-NOBMI-NEXT:    andl %edx, %esi
+; X64-NOBMI-NEXT:    notl %eax
+; X64-NOBMI-NEXT:    andl %edi, %eax
+; X64-NOBMI-NEXT:    orl %esi, %eax
 ; X64-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI-NEXT:    retq
 ;
@@ -193,12 +186,13 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ;
 ; X64-BMI-LABEL: bitselect_i128:
 ; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    andnq %rsi, %r9, %rsi
 ; X64-BMI-NEXT:    andnq %rdi, %r8, %rax
+; X64-BMI-NEXT:    andq %r9, %rcx
+; X64-BMI-NEXT:    orq %rcx, %rsi
 ; X64-BMI-NEXT:    andq %r8, %rdx
 ; X64-BMI-NEXT:    orq %rdx, %rax
-; X64-BMI-NEXT:    andnq %rsi, %r9, %rdx
-; X64-BMI-NEXT:    andq %r9, %rcx
-; X64-BMI-NEXT:    orq %rcx, %rdx
+; X64-BMI-NEXT:    movq %rsi, %rdx
 ; X64-BMI-NEXT:    retq
   %not = xor i128 %m, -1
   %ma = and i128 %a, %not
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index 4a4eecbdfb3f3..b2614c5fe0493 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -30,17 +30,18 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
 define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 ; NOBMI-LABEL: masked_merge1:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %esi, %eax
-; NOBMI-NEXT:    xorl %edx, %eax
-; NOBMI-NEXT:    andl %edi, %eax
-; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    movl %edi, %eax
+; NOBMI-NEXT:    andl %edi, %esi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    andl %edx, %eax
+; NOBMI-NEXT:    orl %esi, %eax
 ; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge1:
 ; BMI:       # %bb.0:
-; BMI-NEXT:    andnl %edx, %edi, %eax
 ; BMI-NEXT:    andl %edi, %esi
+; BMI-NEXT:    andnl %edx, %edi, %eax
 ; BMI-NEXT:    orl %esi, %eax
 ; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; BMI-NEXT:    retq
@@ -52,11 +53,20 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 }
 
 define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; CHECK-LABEL: masked_merge2:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %esi, %eax
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-NEXT:    retq
+; NOBMI-LABEL: masked_merge2:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    # kill: def $al killed $al killed $eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: masked_merge2:
+; BMI:       # %bb.0:
+; BMI-NEXT:    movl %edi, %eax
+; BMI-NEXT:    notb %al
+; BMI-NEXT:    andb %sil, %al
+; BMI-NEXT:    andb %dil, %sil
+; BMI-NEXT:    orb %sil, %al
+; BMI-NEXT:    retq
   %not = xor i8 %a0, -1
   %and0 = and i8 %not, %a1
   %and1 = and i8 %a1, %a0
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 6a55d740fe421..9c9d06921096c 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,18 +6,21 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notb %al
+; CHECK-NOBMI-NEXT:    andb %sil, %al
+; CHECK-NOBMI-NEXT:    orb %dil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
+; CHECK-BMI-NEXT:    movl %edx, %eax
 ; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    orl %edi, %eax
+; CHECK-BMI-NEXT:    notb %al
+; CHECK-BMI-NEXT:    andb %sil, %al
+; CHECK-BMI-NEXT:    orb %dil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, %mask
@@ -30,17 +33,18 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: out16:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    movl %edx, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %edi
+; CHECK-NOBMI-NEXT:    notl %eax
+; CHECK-NOBMI-NEXT:    andl %esi, %eax
+; CHECK-NOBMI-NEXT:    orl %edi, %eax
 ; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out16:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
 ; CHECK-BMI-NEXT:    andl %edx, %edi
+; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
 ; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index 809c15881cc9b..b1194bedc4e1c 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,10 +16,11 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notb %al
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    orb %dil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
@@ -36,28 +37,32 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    notb %al
+; CHECK-BASELINE-NEXT:    notb %r9b
+; CHECK-BASELINE-NEXT:    andb %cl, %r9b
+; CHECK-BASELINE-NEXT:    andb %dl, %al
+; CHECK-BASELINE-NEXT:    orb %dil, %al
+; CHECK-BASELINE-NEXT:    orb %sil, %r9b
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    notb %al
+; CHECK-SSE1-NEXT:    notb %r9b
+; CHECK-SSE1-NEXT:    andb %cl, %r9b
+; CHECK-SSE1-NEXT:    andb %dl, %al
+; CHECK-SSE1-NEXT:    orb %dil, %al
+; CHECK-SSE1-NEXT:    orb %sil, %r9b
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i8:
@@ -81,10 +86,11 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    notl %eax
+; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    orl %edi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i16> %x, %mask
@@ -229,28 +235,32 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    movl %r8d, %eax
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    andl %r8d, %edi
+; CHECK-BASELINE-NEXT:    notl %eax
+; CHECK-BASELINE-NEXT:    notl %r9d
+; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
+; CHECK-BASELINE-NEXT:    orl %esi, %r9d
+; CHECK-BASELINE-NEXT:    andl %edx, %eax
+; CHECK-BASELINE-NEXT:    orl %edi, %eax
 ; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-BASELINE-NEXT:    movl %esi, %edx
+; CHECK-BASELINE-NEXT:    movl %r9d, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i16:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    movl %r8d, %eax
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    andl %r8d, %edi
+; CHECK-SSE1-NEXT:    notl %eax
+; CHECK-SSE1-NEXT:    notl %r9d
+; CHECK-SSE1-NEXT:    andl %ecx, %r9d
+; CHECK-SSE1-NEXT:    orl %esi, %r9d
+; CHECK-SSE1-NEXT:    andl %edx, %eax
+; CHECK-SSE1-NEXT:    orl %edi, %eax
 ; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-SSE1-NEXT:    movl %esi, %edx
+; CHECK-SSE1-NEXT:    movl %r9d, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i16:
@@ -429,12 +439,9 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-LABEL: out_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
@@ -444,21 +451,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
@@ -468,10 +475,13 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16:
@@ -496,43 +506,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-BASELINE-LABEL: out_v4i16_undef:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16_undef:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -873,14 +883,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
@@ -896,16 +906,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
+; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT:    movl %r10d, %r11d
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    xorw %di, %r10w
+; CHECK-BASELINE-NEXT:    movl %edi, %r10d
+; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
 ; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
@@ -931,14 +941,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
@@ -954,16 +964,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    xorw %r11w, %bx
+; CHECK-SSE1-NEXT:    movl %r11d, %ebx
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
+; CHECK-SSE1-NEXT:    movl %r10d, %r11d
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    xorw %di, %r10w
+; CHECK-SSE1-NEXT:    movl %edi, %r10d
+; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r10d
 ; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
@@ -1749,117 +1759,113 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movq %rcx, %r10
-; CHECK-BASELINE-NEXT:    movq %rdx, %r8
-; CHECK-BASELINE-NEXT:    movq %rsi, %r9
-; CHECK-BASELINE-NEXT:    movq %rdi, %r11
-; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r15d
-; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
-; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r14d
-; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
-; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
-; CHECK-BASELINE-NEXT:    movzwl 2(%r8), %esi
-; CHECK-BASELINE-NEXT:    movzwl (%r9), %edi
-; CHECK-BASELINE-NEXT:    xorw %cx, %di
-; CHECK-BASELINE-NEXT:    andw (%r10), %di
-; CHECK-BASELINE-NEXT:    xorl %ecx, %edi
-; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %si, %cx
-; CHECK-BASELINE-NEXT:    andw 2(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %esi, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 4(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %dx, %cx
-; CHECK-BASELINE-NEXT:    andw 4(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %edx, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 6(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %ax, %cx
-; CHECK-BASELINE-NEXT:    andw 6(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 8(%r9), %eax
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r11d
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzwl (%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
+; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
+; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
+; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
+; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
+; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
+; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
+; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
+; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
+; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
+; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
 ; CHECK-BASELINE-NEXT:    xorw %bx, %ax
-; CHECK-BASELINE-NEXT:    andw 8(%r10), %ax
-; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
-; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%r9), %ebx
-; CHECK-BASELINE-NEXT:    xorw %r14w, %bx
-; CHECK-BASELINE-NEXT:    andw 10(%r10), %bx
-; CHECK-BASELINE-NEXT:    xorl %r14d, %ebx
-; CHECK-BASELINE-NEXT:    movzwl 12(%r9), %r14d
-; CHECK-BASELINE-NEXT:    xorw %r12w, %r14w
-; CHECK-BASELINE-NEXT:    andw 12(%r10), %r14w
-; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
-; CHECK-BASELINE-NEXT:    movzwl 14(%r9), %r12d
-; CHECK-BASELINE-NEXT:    xorw %r13w, %r12w
-; CHECK-BASELINE-NEXT:    andw 14(%r10), %r12w
-; CHECK-BASELINE-NEXT:    xorl %r13d, %r12d
-; CHECK-BASELINE-NEXT:    movzwl 16(%r9), %r13d
-; CHECK-BASELINE-NEXT:    xorw %r15w, %r13w
-; CHECK-BASELINE-NEXT:    andw 16(%r10), %r13w
-; CHECK-BASELINE-NEXT:    xorl %r15d, %r13d
-; CHECK-BASELINE-NEXT:    movzwl 18(%r9), %r15d
-; CHECK-BASELINE-NEXT:    xorw %bp, %r15w
-; CHECK-BASELINE-NEXT:    andw 18(%r10), %r15w
-; CHECK-BASELINE-NEXT:    xorl %ebp, %r15d
-; CHECK-BASELINE-NEXT:    movl 20(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 20(%r9), %ebp
-; CHECK-BASELINE-NEXT:    xorw %ax, %bp
-; CHECK-BASELINE-NEXT:    andw 20(%r10), %bp
+; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
+; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %bp, %ax
+; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
 ; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
-; CHECK-BASELINE-NEXT:    movzwl 22(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 22(%r9), %esi
-; CHECK-BASELINE-NEXT:    xorw %ax, %si
-; CHECK-BASELINE-NEXT:    andw 22(%r10), %si
-; CHECK-BASELINE-NEXT:    xorl %eax, %esi
-; CHECK-BASELINE-NEXT:    movl 24(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 24(%r9), %edx
-; CHECK-BASELINE-NEXT:    xorw %ax, %dx
-; CHECK-BASELINE-NEXT:    andw 24(%r10), %dx
-; CHECK-BASELINE-NEXT:    xorl %eax, %edx
-; CHECK-BASELINE-NEXT:    movzwl 26(%r8), %eax
-; CHECK-BASELINE-NEXT:    movzwl 26(%r9), %ecx
-; CHECK-BASELINE-NEXT:    xorw %ax, %cx
-; CHECK-BASELINE-NEXT:    andw 26(%r10), %cx
-; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
-; CHECK-BASELINE-NEXT:    movl 28(%r8), %edi
-; CHECK-BASELINE-NEXT:    movzwl 28(%r9), %eax
-; CHECK-BASELINE-NEXT:    xorw %di, %ax
-; CHECK-BASELINE-NEXT:    andw 28(%r10), %ax
-; CHECK-BASELINE-NEXT:    xorl %edi, %eax
-; CHECK-BASELINE-NEXT:    movzwl 30(%r8), %edi
-; CHECK-BASELINE-NEXT:    movzwl 30(%r9), %r8d
-; CHECK-BASELINE-NEXT:    xorw %di, %r8w
-; CHECK-BASELINE-NEXT:    andw 30(%r10), %r8w
-; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
-; CHECK-BASELINE-NEXT:    movw %r8w, 30(%r11)
-; CHECK-BASELINE-NEXT:    movw %ax, 28(%r11)
-; CHECK-BASELINE-NEXT:    movw %cx, 26(%r11)
-; CHECK-BASELINE-NEXT:    movw %dx, 24(%r11)
-; CHECK-BASELINE-NEXT:    movw %si, 22(%r11)
-; CHECK-BASELINE-NEXT:    movw %bp, 20(%r11)
-; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r11)
-; CHECK-BASELINE-NEXT:    movw %r13w, 16(%r11)
-; CHECK-BASELINE-NEXT:    movw %r12w, 14(%r11)
-; CHECK-BASELINE-NEXT:    movw %r14w, 12(%r11)
-; CHECK-BASELINE-NEXT:    movw %bx, 10(%r11)
+; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
+; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
+; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
+; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
+; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
+; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
+; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
+; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
+; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
+; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
+; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
+; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
+; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
+; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
+; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
+; CHECK-BASELINE-NEXT:    xorw %ax, %r10w
+; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r10w
+; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
+; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r10d
+; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r11d
+; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
+; CHECK-BASELINE-NEXT:    andw 28(%rcx), %r11w
+; CHECK-BASELINE-NEXT:    xorl %r11d, %r10d
+; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-BASELINE-NEXT:    xorw %dx, %si
+; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
+; CHECK-BASELINE-NEXT:    xorl %esi, %edx
+; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r10w, 28(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r13w, 20(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bp, 14(%rdi)
+; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 8(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 6(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 4(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 2(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 4(%rdi)
+; CHECK-BASELINE-NEXT:    movw %r12w, 2(%rdi)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, (%r11)
-; CHECK-BASELINE-NEXT:    movq %r11, %rax
+; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
+; CHECK-BASELINE-NEXT:    movq %rdi, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -1876,117 +1882,113 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movq %rcx, %r10
-; CHECK-SSE1-NEXT:    movq %rdx, %r8
-; CHECK-SSE1-NEXT:    movq %rsi, %r9
-; CHECK-SSE1-NEXT:    movq %rdi, %r11
-; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %ebp
-; CHECK-SSE1-NEXT:    movl 16(%rdx), %r15d
-; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
-; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r14d
-; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebx
-; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %eax
-; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
-; CHECK-SSE1-NEXT:    movl 4(%rdx), %edx
-; CHECK-SSE1-NEXT:    movzwl 2(%r8), %esi
-; CHECK-SSE1-NEXT:    movzwl (%r9), %edi
-; CHECK-SSE1-NEXT:    xorw %cx, %di
-; CHECK-SSE1-NEXT:    andw (%r10), %di
-; CHECK-SSE1-NEXT:    xorl %ecx, %edi
-; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %si, %cx
-; CHECK-SSE1-NEXT:    andw 2(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %esi, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 4(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %dx, %cx
-; CHECK-SSE1-NEXT:    andw 4(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %edx, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 6(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %ax, %cx
-; CHECK-SSE1-NEXT:    andw 6(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 8(%r9), %eax
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r11d
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzwl (%rdx), %r8d
+; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r8w, %ax
+; CHECK-SSE1-NEXT:    andw (%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r8d
+; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r12w, %ax
+; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r12d
+; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r9w, %ax
+; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r9d
+; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r10w, %ax
+; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r10d
+; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r11w, %ax
+; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r11d
+; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r13w, %ax
+; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r13d
+; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
 ; CHECK-SSE1-NEXT:    xorw %bx, %ax
-; CHECK-SSE1-NEXT:    andw 8(%r10), %ax
-; CHECK-SSE1-NEXT:    xorl %ebx, %eax
-; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%r9), %ebx
-; CHECK-SSE1-NEXT:    xorw %r14w, %bx
-; CHECK-SSE1-NEXT:    andw 10(%r10), %bx
-; CHECK-SSE1-NEXT:    xorl %r14d, %ebx
-; CHECK-SSE1-NEXT:    movzwl 12(%r9), %r14d
-; CHECK-SSE1-NEXT:    xorw %r12w, %r14w
-; CHECK-SSE1-NEXT:    andw 12(%r10), %r14w
-; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
-; CHECK-SSE1-NEXT:    movzwl 14(%r9), %r12d
-; CHECK-SSE1-NEXT:    xorw %r13w, %r12w
-; CHECK-SSE1-NEXT:    andw 14(%r10), %r12w
-; CHECK-SSE1-NEXT:    xorl %r13d, %r12d
-; CHECK-SSE1-NEXT:    movzwl 16(%r9), %r13d
-; CHECK-SSE1-NEXT:    xorw %r15w, %r13w
-; CHECK-SSE1-NEXT:    andw 16(%r10), %r13w
-; CHECK-SSE1-NEXT:    xorl %r15d, %r13d
-; CHECK-SSE1-NEXT:    movzwl 18(%r9), %r15d
-; CHECK-SSE1-NEXT:    xorw %bp, %r15w
-; CHECK-SSE1-NEXT:    andw 18(%r10), %r15w
-; CHECK-SSE1-NEXT:    xorl %ebp, %r15d
-; CHECK-SSE1-NEXT:    movl 20(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 20(%r9), %ebp
-; CHECK-SSE1-NEXT:    xorw %ax, %bp
-; CHECK-SSE1-NEXT:    andw 20(%r10), %bp
+; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %ebx
+; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %bp, %ax
+; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
 ; CHECK-SSE1-NEXT:    xorl %eax, %ebp
-; CHECK-SSE1-NEXT:    movzwl 22(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 22(%r9), %esi
-; CHECK-SSE1-NEXT:    xorw %ax, %si
-; CHECK-SSE1-NEXT:    andw 22(%r10), %si
-; CHECK-SSE1-NEXT:    xorl %eax, %esi
-; CHECK-SSE1-NEXT:    movl 24(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 24(%r9), %edx
-; CHECK-SSE1-NEXT:    xorw %ax, %dx
-; CHECK-SSE1-NEXT:    andw 24(%r10), %dx
-; CHECK-SSE1-NEXT:    xorl %eax, %edx
-; CHECK-SSE1-NEXT:    movzwl 26(%r8), %eax
-; CHECK-SSE1-NEXT:    movzwl 26(%r9), %ecx
-; CHECK-SSE1-NEXT:    xorw %ax, %cx
-; CHECK-SSE1-NEXT:    andw 26(%r10), %cx
-; CHECK-SSE1-NEXT:    xorl %eax, %ecx
-; CHECK-SSE1-NEXT:    movl 28(%r8), %edi
-; CHECK-SSE1-NEXT:    movzwl 28(%r9), %eax
-; CHECK-SSE1-NEXT:    xorw %di, %ax
-; CHECK-SSE1-NEXT:    andw 28(%r10), %ax
-; CHECK-SSE1-NEXT:    xorl %edi, %eax
-; CHECK-SSE1-NEXT:    movzwl 30(%r8), %edi
-; CHECK-SSE1-NEXT:    movzwl 30(%r9), %r8d
-; CHECK-SSE1-NEXT:    xorw %di, %r8w
-; CHECK-SSE1-NEXT:    andw 30(%r10), %r8w
-; CHECK-SSE1-NEXT:    xorl %edi, %r8d
-; CHECK-SSE1-NEXT:    movw %r8w, 30(%r11)
-; CHECK-SSE1-NEXT:    movw %ax, 28(%r11)
-; CHECK-SSE1-NEXT:    movw %cx, 26(%r11)
-; CHECK-SSE1-NEXT:    movw %dx, 24(%r11)
-; CHECK-SSE1-NEXT:    movw %si, 22(%r11)
-; CHECK-SSE1-NEXT:    movw %bp, 20(%r11)
-; CHECK-SSE1-NEXT:    movw %r15w, 18(%r11)
-; CHECK-SSE1-NEXT:    movw %r13w, 16(%r11)
-; CHECK-SSE1-NEXT:    movw %r12w, 14(%r11)
-; CHECK-SSE1-NEXT:    movw %r14w, 12(%r11)
-; CHECK-SSE1-NEXT:    movw %bx, 10(%r11)
+; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r14w, %ax
+; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r14d
+; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r15w, %ax
+; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r15d
+; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r13w, %ax
+; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r13d
+; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
+; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r9w, %ax
+; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r9d
+; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
+; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
+; CHECK-SSE1-NEXT:    xorw %r8w, %ax
+; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
+; CHECK-SSE1-NEXT:    xorl %eax, %r8d
+; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
+; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
+; CHECK-SSE1-NEXT:    xorw %ax, %r10w
+; CHECK-SSE1-NEXT:    andw 26(%rcx), %r10w
+; CHECK-SSE1-NEXT:    xorl %r10d, %eax
+; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r10d
+; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r11d
+; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
+; CHECK-SSE1-NEXT:    andw 28(%rcx), %r11w
+; CHECK-SSE1-NEXT:    xorl %r11d, %r10d
+; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
+; CHECK-SSE1-NEXT:    xorw %dx, %si
+; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
+; CHECK-SSE1-NEXT:    xorl %esi, %edx
+; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
+; CHECK-SSE1-NEXT:    movw %r10w, 28(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
+; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
+; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
+; CHECK-SSE1-NEXT:    movw %r13w, 20(%rdi)
+; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
+; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
+; CHECK-SSE1-NEXT:    movw %bp, 14(%rdi)
+; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 8(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 6(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 4(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 2(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 4(%rdi)
+; CHECK-SSE1-NEXT:    movw %r12w, 2(%rdi)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, (%r11)
-; CHECK-SSE1-NEXT:    movq %r11, %rax
+; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
+; CHECK-SSE1-NEXT:    movq %rdi, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13

>From 0a6463039da89914c7a0f99622fb7a008abde2fd Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer at google.com>
Date: Tue, 10 Jun 2025 19:48:09 -0700
Subject: [PATCH 36/61] [NFC] get rid of `undef` in avx512vl-intrinsics.ll test
 (#143641)

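Replacing `undef` with `poison` makes these tests stricter: `undef` may
resolve to a different arbitrary value at each use, while `poison` is a
stronger "don't care" that propagates through most operations, so it is the
sharper choice for a passthru operand that an all-true mask can never select.
A minimal sketch of the difference, using a hypothetical function that is not
part of this test:

  define i32 @undef_vs_poison(i32 %a) {
    ; Each use of undef may pick a different value, so this xor is not
    ; guaranteed to fold to 0.
    %u = xor i32 undef, undef
    ; Most arithmetic on poison yields poison.
    %p = add i32 poison, 1
    ; A never-selected arm may safely be poison; the result is just %a.
    %r = select i1 true, i32 %a, i32 poison
    ret i32 %r
  }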
---
 llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 72 ++++++++++----------
 1 file changed, 36 insertions(+), 36 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
index 0973824fbb0ef..b408aac218108 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -46,7 +46,7 @@ define <2 x double> @test_compress_pd_128(<2 x double> %data) {
 ; CHECK-LABEL: test_compress_pd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x double> %1
 }
 
@@ -94,7 +94,7 @@ define <4 x float> @test_compress_ps_128(<4 x float> %data) {
 ; CHECK-LABEL: test_compress_ps_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x float> %1
 }
 
@@ -142,7 +142,7 @@ define <2 x i64> @test_compress_q_128(<2 x i64> %data) {
 ; CHECK-LABEL: test_compress_q_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x i64> %1
 }
 
@@ -190,7 +190,7 @@ define <4 x i32> @test_compress_d_128(<4 x i32> %data) {
 ; CHECK-LABEL: test_compress_d_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i32> %1
 }
 
@@ -198,7 +198,7 @@ define <2 x double> @test_expand_pd_128(<2 x double> %data) {
 ; CHECK-LABEL: test_expand_pd_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x double> %1
 }
 
@@ -246,7 +246,7 @@ define <4 x float> @test_expand_ps_128(<4 x float> %data) {
 ; CHECK-LABEL: test_expand_ps_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x float> %1
 }
 
@@ -294,7 +294,7 @@ define <2 x i64> @test_expand_q_128(<2 x i64> %data) {
 ; CHECK-LABEL: test_expand_q_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> <i1 true, i1 true>)
+  %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> poison, <2 x i1> <i1 true, i1 true>)
   ret <2 x i64> %1
 }
 
@@ -342,7 +342,7 @@ define <4 x i32> @test_expand_d_128(<4 x i32> %data) {
 ; CHECK-LABEL: test_expand_d_128:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i32> %1
 }
 
@@ -430,7 +430,7 @@ define <4 x double> @test_compress_pd_256(<4 x double> %data) {
 ; CHECK-LABEL: test_compress_pd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x double> %1
 }
 
@@ -476,7 +476,7 @@ define <8 x float> @test_compress_ps_256(<8 x float> %data) {
 ; CHECK-LABEL: test_compress_ps_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x float> %1
 }
 
@@ -524,7 +524,7 @@ define <4 x i64> @test_compress_q_256(<4 x i64> %data) {
 ; CHECK-LABEL: test_compress_q_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i64> %1
 }
 
@@ -570,7 +570,7 @@ define <8 x i32> @test_compress_d_256(<8 x i32> %data) {
 ; CHECK-LABEL: test_compress_d_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x i32> %1
 }
 
@@ -578,7 +578,7 @@ define <4 x double> @test_expand_pd_256(<4 x double> %data) {
 ; CHECK-LABEL: test_expand_pd_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x double> %1
 }
 
@@ -626,7 +626,7 @@ define <8 x float> @test_expand_ps_256(<8 x float> %data) {
 ; CHECK-LABEL: test_expand_ps_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x float> %1
 }
 
@@ -672,7 +672,7 @@ define <4 x i64> @test_expand_q_256(<4 x i64> %data) {
 ; CHECK-LABEL: test_expand_q_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> poison, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
   ret <4 x i64> %1
 }
 
@@ -720,7 +720,7 @@ define <8 x i32> @test_expand_d_256(<8 x i32> %data) {
 ; CHECK-LABEL: test_expand_d_256:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> poison, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
   ret <8 x i32> %1
 }
 
@@ -884,7 +884,7 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
   ret <4 x float> %3
 }
@@ -906,7 +906,7 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
   ret <4 x float> %3
 }
@@ -986,7 +986,7 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
   ret <4 x float> %3
 }
@@ -1008,7 +1008,7 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
   %2 = bitcast i8 %mask to <8 x i1>
-  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
   ret <4 x float> %3
 }
@@ -5223,7 +5223,7 @@ define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
   ret <2 x i64> %res2
 }
@@ -5242,7 +5242,7 @@ define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
   ret <2 x i64> %res2
 }
@@ -5274,7 +5274,7 @@ define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
   ret <4 x i64> %res2
 }
@@ -5293,7 +5293,7 @@ define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
   ret <4 x i64> %res2
 }
@@ -5325,7 +5325,7 @@ define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %pas
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru
   ret <2 x i64> %res2
 }
@@ -5344,7 +5344,7 @@ define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask) {
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
   ret <2 x i64> %res2
 }
@@ -5376,7 +5376,7 @@ define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %pas
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru
   ret <4 x i64> %res2
 }
@@ -5395,7 +5395,7 @@ define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask) {
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1]
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
   ret <4 x i64> %res2
 }
@@ -5427,7 +5427,7 @@ define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %a2
   ret <2 x i64> %res2
 }
@@ -5447,7 +5447,7 @@ define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> <i32 0, i32 1>
   %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer
   ret <2 x i64> %res2
 }
@@ -5480,7 +5480,7 @@ define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1,
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %a2
   ret <4 x i64> %res2
 }
@@ -5500,7 +5500,7 @@ define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1
 ; X64-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
   %mask.cast = bitcast i8 %mask to <8 x i1>
-  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer
   ret <4 x i64> %res2
 }
@@ -6861,7 +6861,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1,
 ; X64-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
@@ -6889,7 +6889,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    # xmm0 {%k1} = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2, align 4
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
@@ -6914,7 +6914,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1
 ; X64-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
@@ -6936,7 +6936,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a
 ; X64-NEXT:    # xmm0 = (xmm1 * xmm0) + mem
 ; X64-NEXT:    retq # encoding: [0xc3]
   %q = load float, ptr %ptr_a2, align 4
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit.i = insertelement <4 x float> poison, float %q, i32 0
   %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
   %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
   %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3

>From 28a4ed945dc101c9a7dbdc93d9461da67225f7dc Mon Sep 17 00:00:00 2001
From: Brox Chen <guochen2 at amd.com>
Date: Tue, 10 Jun 2025 22:49:09 -0400
Subject: [PATCH 37/61] [AMDGPU][True16] remove AsmVOP3OpSel (#143465)

This is NFC: clean up the AsmVOP3OpSel field and use AsmVOP3Base instead.
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      | 44 +---------------------
 llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 +----
 llvm/lib/Target/AMDGPU/VOPInstructions.td  |  6 +--
 3 files changed, 5 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2c20475726a48..e74ccbee975ab 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2242,41 +2242,6 @@ class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasNeg,
   string ret = dst#src0#src1#src2#opsel#mods#clamp;
 }
 
-// FIXME-TRUE16 AsmVOP3OpSel will be deprecated after all
-// VOP3 16 bit instructions are replaced to true16 format
-class getAsmVOP3OpSel <int NumSrcArgs,
-                       bit HasClamp,
-                       bit HasOMod,
-                       bit Src0HasMods,
-                       bit Src1HasMods,
-                       bit Src2HasMods,
-                       bit HasByteSel = 0,
-                       bit HasBitOp3 = 0> {
-  string dst = "$vdst";
-
-  string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
-  string isrc1 = !if(!eq(NumSrcArgs, 1), "",
-                     !if(!eq(NumSrcArgs, 2), " $src1",
-                                             " $src1,"));
-  string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
-
-  string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
-  string fsrc1 = !if(!eq(NumSrcArgs, 1), "",
-                     !if(!eq(NumSrcArgs, 2), " $src1_modifiers",
-                                             " $src1_modifiers,"));
-  string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", "");
-
-  string src0 = !if(Src0HasMods, fsrc0, isrc0);
-  string src1 = !if(Src1HasMods, fsrc1, isrc1);
-  string src2 = !if(Src2HasMods, fsrc2, isrc2);
-
-  string bytesel = !if(HasByteSel, "$byte_sel", "");
-  string clamp = !if(HasClamp, "$clamp", "");
-  string omod = !if(HasOMod, "$omod", "");
-  string bitop3 = !if(HasBitOp3, "$bitop3", "");
-  string ret = dst#", "#src0#src1#src2#bitop3#"$op_sel"#bytesel#clamp#omod;
-}
-
 class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
   string dst = !if(HasDst,
                    !if(!eq(DstVT.Size, 1),
@@ -2687,14 +2652,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
    HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret;
   field string Asm64 = AsmVOP3Base;
   field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasNeg, HasClamp, HasOpSel>.ret;
-  field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
-                                              HasClamp,
-                                              HasOMod,
-                                              HasSrc0FloatMods,
-                                              HasSrc1FloatMods,
-                                              HasSrc2FloatMods,
-                                              HasFP8ByteSel,
-                                              HasBitOp3>.ret;
+  field string AsmVOP3OpSel = AsmVOP3Base;
   field string AsmVOP3DPP = getAsmVOP3DPP<AsmVOP3Base>.ret;
   field string AsmVOP3DPP16 = getAsmVOP3DPP16<AsmVOP3Base>.ret;
   field string AsmVOP3DPP8 = getAsmVOP3DPP8<AsmVOP3Base>.ret;
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 9f66951372d19..a005e0245b8ff 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -626,10 +626,6 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
   let HasOpSel = 1;
   let HasFP8DstByteSel = 1;
   let HasFP8ByteSel = 0; // It works as a dst-bytesel, but does not have byte_sel operand.
-  let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
-                            getAsmVOP3OpSel<3, HasClamp, HasOMod,
-                                            HasSrc0FloatMods, HasSrc1FloatMods,
-                                            HasSrc2FloatMods>.ret);
   let AsmVOP3Base = !subst(", $src2_modifiers", "",
                     getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
                     HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, 0/*Src1Mods*/,
@@ -1066,12 +1062,10 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
   let HasSrc2 = 0;
   let HasSrc2Mods = 1;
   let HasOpSel = 1;
-  let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
-                            getAsmVOP3OpSel<3, HasClamp, HasOMod,
-                                            HasSrc0FloatMods, HasSrc1FloatMods,
-                                            HasSrc2FloatMods>.ret);
+  let Asm64 = !subst(", $src2_modifiers", "", AsmVOP3Base);
   let HasExtVOP3DPP = 0;
   let HasFP8DstByteSel = 1;
+  let HasFP8ByteSel = 0;
 }
 
 class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 4cd845aaa5497..6045f59d1f040 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -112,9 +112,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   bit HasFP8DstByteSel = P.HasFP8DstByteSel;
   bit HasFP4DstByteSel = P.HasFP4DstByteSel;
 
-  let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
-                        P.AsmVOP3OpSel,
-                        !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));
+  let AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
 
   let Size = 8;
   let mayLoad = 0;
@@ -1484,7 +1482,7 @@ class VOP3_Profile_Base<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VO
 
   let HasModifiers =
       !if (Features.IsMAI, 0,
-           !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers));
+           !or(Features.IsPacked, P.HasModifiers));
 }
 
 class VOP3_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR> : VOP3_Profile_Base<P, Features> {

>From d75e28477af0baa063a4d4cc7b3cf657cfadd758 Mon Sep 17 00:00:00 2001
From: Peter Klausler <pklausler at nvidia.com>
Date: Tue, 10 Jun 2025 20:36:52 -0700
Subject: [PATCH 38/61] [flang][runtime] Fix build bot flang-runtime-cuda-gcc
 errors (#143650)

Adjust parent class accessibility from private to protected to attempt to
work around what appears to be old GCC's stricter interpretation of
base-class access.
---
 flang-rt/include/flang-rt/runtime/work-queue.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h
index 878b18373e1d2..f7f4777839836 100644
--- a/flang-rt/include/flang-rt/runtime/work-queue.h
+++ b/flang-rt/include/flang-rt/runtime/work-queue.h
@@ -319,7 +319,7 @@ class AssignTicket : public ImmediateTicketRunner<AssignTicket> {
 template <bool IS_COMPONENTWISE>
 class DerivedAssignTicket
     : public ImmediateTicketRunner<DerivedAssignTicket<IS_COMPONENTWISE>>,
-      private std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
+      protected std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
           ElementsOverComponents> {
 public:
   using Base = std::conditional_t<IS_COMPONENTWISE, ComponentsOverElements,
@@ -348,7 +348,7 @@ namespace io::descr {
 template <io::Direction DIR>
 class DescriptorIoTicket
     : public ImmediateTicketRunner<DescriptorIoTicket<DIR>>,
-      private Elementwise {
+      protected Elementwise {
 public:
   RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io,
       const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table,
@@ -372,7 +372,7 @@ class DescriptorIoTicket
 
 template <io::Direction DIR>
 class DerivedIoTicket : public ImmediateTicketRunner<DerivedIoTicket<DIR>>,
-                        private ElementsOverComponents {
+                        protected ElementsOverComponents {
 public:
   RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io,
       const Descriptor &descriptor, const typeInfo::DerivedType &derived,

>From 3ece9b06a2d299d5a108efa856e662587543b2f3 Mon Sep 17 00:00:00 2001
From: quic_hchandel <quic_hchandel at quicinc.com>
Date: Wed, 11 Jun 2025 09:56:12 +0530
Subject: [PATCH 39/61] [RISCV][NFC] Improve test coverage for xtheadcondmov
 and xmipscmov (#143567)

Co-authored-by: Harsh Chandel <hchandel at qti.qualcomm.com>
---
 llvm/test/CodeGen/RISCV/select-cond.ll | 1018 ++++++++++++++++++++++++
 1 file changed, 1018 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/select-cond.ll

diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll
new file mode 100644
index 0000000000000..a5f4677f73f13
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/select-cond.ll
@@ -0,0 +1,1018 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32
+; RUN: llc -mtriple=riscv32 -mattr=+xtheadcondmov -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV32-THEAD
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64
+; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \
+; RUN:   | FileCheck %s --check-prefixes=RV64-MIPS
+
+define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_trunc:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a3, a0, 1
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    bnez a3, .LBB0_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:  .LBB0_2:
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_trunc:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a1, a2, a0
+; RV32-THEAD-NEXT:    mv a0, a1
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_trunc:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB0_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB0_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_trunc:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %cond_trunc = trunc i32 %cond to i1
+  %res = select i1 %cond_trunc, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_param:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a3, a0, 1
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    bnez a3, .LBB1_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:  .LBB1_2:
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_param:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a1, a2, a0
+; RV32-THEAD-NEXT:    mv a0, a1
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_param:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB1_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB1_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_param:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_eq:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a0, a1, .LBB2_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB2_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_eq:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_eq:
+; RV64:       # %bb.0:
+; RV64-NEXT:    beq a0, a1, .LBB2_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB2_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_eq:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp eq i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ne:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bne a0, a1, .LBB3_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB3_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ne:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ne:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bne a0, a1, .LBB3_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB3_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ne:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ne i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ugt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bltu a1, a0, .LBB4_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB4_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ugt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a1, a0
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ugt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a1, a0, .LBB4_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB4_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ugt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ugt i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_uge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bgeu a0, a1, .LBB5_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB5_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_uge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_uge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a0, a1, .LBB5_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB5_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_uge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp uge i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ult:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bltu a0, a1, .LBB6_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB6_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ult:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ult:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a0, a1, .LBB6_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB6_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ult:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ult i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_ule:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bgeu a1, a0, .LBB7_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB7_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_ule:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    sltu a0, a1, a0
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_ule:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a1, a0, .LBB7_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB7_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_ule:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ule i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_sgt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    blt a1, a0, .LBB8_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB8_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_sgt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a1, a0
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_sgt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a1, a0, .LBB8_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB8_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_sgt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sgt i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_sge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bge a0, a1, .LBB9_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB9_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_sge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_sge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a0, a1, .LBB9_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB9_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_sge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sge i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_slt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    blt a0, a1, .LBB10_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB10_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_slt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_slt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a0, a1, .LBB10_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB10_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_slt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp slt i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind {
+; RV32-LABEL: select_i32_sle:
+; RV32:       # %bb.0:
+; RV32-NEXT:    bge a1, a0, .LBB11_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a2, a3
+; RV32-NEXT:  .LBB11_2:
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i32_sle:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    slt a0, a1, a0
+; RV32-THEAD-NEXT:    th.mvnez a2, a3, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i32_sle:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a1, a0, .LBB11_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB11_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i32_sle:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sle i32 %a, %b
+  %res = select i1 %cond, i32 %x, i32 %y
+  ret i32 %res
+}
+
+define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_trunc:
+; RV32:       # %bb.0:
+; RV32-NEXT:    mv a1, a3
+; RV32-NEXT:    andi a3, a0, 1
+; RV32-NEXT:    mv a0, a2
+; RV32-NEXT:    bnez a3, .LBB12_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:  .LBB12_2:
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_trunc:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    mv a1, a3
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a2, a4, a0
+; RV32-THEAD-NEXT:    th.mveqz a1, a5, a0
+; RV32-THEAD-NEXT:    mv a0, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_trunc:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB12_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB12_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_trunc:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %cond_trunc = trunc i64 %cond to i1
+  %res = select i1 %cond_trunc, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_param:
+; RV32:       # %bb.0:
+; RV32-NEXT:    andi a5, a0, 1
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:    bnez a5, .LBB13_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a3
+; RV32-NEXT:    mv a2, a4
+; RV32-NEXT:  .LBB13_2:
+; RV32-NEXT:    mv a1, a2
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_param:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    andi a0, a0, 1
+; RV32-THEAD-NEXT:    th.mveqz a1, a3, a0
+; RV32-THEAD-NEXT:    th.mveqz a2, a4, a0
+; RV32-THEAD-NEXT:    mv a0, a1
+; RV32-THEAD-NEXT:    mv a1, a2
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_param:
+; RV64:       # %bb.0:
+; RV64-NEXT:    andi a3, a0, 1
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:    bnez a3, .LBB13_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:  .LBB13_2:
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_param:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    andi a0, a0, 1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a1, a2
+; RV64-MIPS-NEXT:    ret
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_eq:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    or a1, a0, a1
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    beqz a1, .LBB14_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB14_2:
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_eq:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a1, a1, a3
+; RV32-THEAD-NEXT:    xor a0, a0, a2
+; RV32-THEAD-NEXT:    or a0, a0, a1
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_eq:
+; RV64:       # %bb.0:
+; RV64-NEXT:    beq a0, a1, .LBB14_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB14_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_eq:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp eq i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ne:
+; RV32:       # %bb.0:
+; RV32-NEXT:    xor a1, a1, a3
+; RV32-NEXT:    xor a0, a0, a2
+; RV32-NEXT:    or a1, a0, a1
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    bnez a1, .LBB15_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB15_2:
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ne:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor a1, a1, a3
+; RV32-THEAD-NEXT:    xor a0, a0, a2
+; RV32-THEAD-NEXT:    or a0, a0, a1
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ne:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bne a0, a1, .LBB15_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB15_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ne:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    xor a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ne i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ugt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB16_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    beqz a0, .LBB16_3
+; RV32-NEXT:    j .LBB16_4
+; RV32-NEXT:  .LBB16_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    bnez a0, .LBB16_4
+; RV32-NEXT:  .LBB16_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB16_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ugt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ugt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a1, a0, .LBB16_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB16_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ugt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ugt i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_uge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB17_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a1, a3
+; RV32-NEXT:    bnez a0, .LBB17_3
+; RV32-NEXT:    j .LBB17_4
+; RV32-NEXT:  .LBB17_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    beqz a0, .LBB17_4
+; RV32-NEXT:  .LBB17_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB17_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_uge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_uge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a0, a1, .LBB17_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB17_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_uge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp uge i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ult:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB18_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a1, a3
+; RV32-NEXT:    beqz a0, .LBB18_3
+; RV32-NEXT:    j .LBB18_4
+; RV32-NEXT:  .LBB18_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    bnez a0, .LBB18_4
+; RV32-NEXT:  .LBB18_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB18_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ult:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ult:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bltu a0, a1, .LBB18_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB18_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ult:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ult i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_ule:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB19_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    sltu a0, a3, a1
+; RV32-NEXT:    bnez a0, .LBB19_3
+; RV32-NEXT:    j .LBB19_4
+; RV32-NEXT:  .LBB19_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    beqz a0, .LBB19_4
+; RV32-NEXT:  .LBB19_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB19_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_ule:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    sltu a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_ule:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bgeu a1, a0, .LBB19_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB19_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_ule:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    sltu a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp ule i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_sgt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB20_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a3, a1
+; RV32-NEXT:    beqz a0, .LBB20_3
+; RV32-NEXT:    j .LBB20_4
+; RV32-NEXT:  .LBB20_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    bnez a0, .LBB20_4
+; RV32-NEXT:  .LBB20_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB20_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_sgt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_sgt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a1, a0, .LBB20_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB20_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_sgt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sgt i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_sge:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB21_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a1, a3
+; RV32-NEXT:    bnez a0, .LBB21_3
+; RV32-NEXT:    j .LBB21_4
+; RV32-NEXT:  .LBB21_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    beqz a0, .LBB21_4
+; RV32-NEXT:  .LBB21_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB21_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_sge:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_sge:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a0, a1, .LBB21_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB21_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_sge:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sge i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_slt:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB22_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a1, a3
+; RV32-NEXT:    beqz a0, .LBB22_3
+; RV32-NEXT:    j .LBB22_4
+; RV32-NEXT:  .LBB22_2:
+; RV32-NEXT:    sltu a0, a0, a2
+; RV32-NEXT:    bnez a0, .LBB22_4
+; RV32-NEXT:  .LBB22_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB22_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_slt:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a1, a3
+; RV32-THEAD-NEXT:    sltu a0, a0, a2
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mveqz a4, a6, a0
+; RV32-THEAD-NEXT:    th.mveqz a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_slt:
+; RV64:       # %bb.0:
+; RV64-NEXT:    blt a0, a1, .LBB22_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB22_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_slt:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a0, a1
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a2, a3
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp slt i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+
+define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind {
+; RV32-LABEL: select_i64_sle:
+; RV32:       # %bb.0:
+; RV32-NEXT:    beq a1, a3, .LBB23_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    slt a0, a3, a1
+; RV32-NEXT:    bnez a0, .LBB23_3
+; RV32-NEXT:    j .LBB23_4
+; RV32-NEXT:  .LBB23_2:
+; RV32-NEXT:    sltu a0, a2, a0
+; RV32-NEXT:    beqz a0, .LBB23_4
+; RV32-NEXT:  .LBB23_3:
+; RV32-NEXT:    mv a4, a6
+; RV32-NEXT:    mv a5, a7
+; RV32-NEXT:  .LBB23_4:
+; RV32-NEXT:    mv a0, a4
+; RV32-NEXT:    mv a1, a5
+; RV32-NEXT:    ret
+;
+; RV32-THEAD-LABEL: select_i64_sle:
+; RV32-THEAD:       # %bb.0:
+; RV32-THEAD-NEXT:    xor t0, a1, a3
+; RV32-THEAD-NEXT:    slt a1, a3, a1
+; RV32-THEAD-NEXT:    sltu a0, a2, a0
+; RV32-THEAD-NEXT:    th.mvnez a0, a1, t0
+; RV32-THEAD-NEXT:    th.mvnez a4, a6, a0
+; RV32-THEAD-NEXT:    th.mvnez a5, a7, a0
+; RV32-THEAD-NEXT:    mv a0, a4
+; RV32-THEAD-NEXT:    mv a1, a5
+; RV32-THEAD-NEXT:    ret
+;
+; RV64-LABEL: select_i64_sle:
+; RV64:       # %bb.0:
+; RV64-NEXT:    bge a1, a0, .LBB23_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a2, a3
+; RV64-NEXT:  .LBB23_2:
+; RV64-NEXT:    mv a0, a2
+; RV64-NEXT:    ret
+;
+; RV64-MIPS-LABEL: select_i64_sle:
+; RV64-MIPS:       # %bb.0:
+; RV64-MIPS-NEXT:    slt a0, a1, a0
+; RV64-MIPS-NEXT:    mips.ccmov a0, a0, a3, a2
+; RV64-MIPS-NEXT:    ret
+  %cond = icmp sle i64 %a, %b
+  %res = select i1 %cond, i64 %x, i64 %y
+  ret i64 %res
+}
+

>From a3201ce9e114aa2ecd66e525607093e4dff2f574 Mon Sep 17 00:00:00 2001
From: Valentin Clement (バレンタイン クレメン) <clementval at gmail.com>
Date: Tue, 10 Jun 2025 22:10:26 -0700
Subject: [PATCH 40/61] [flang][cuda] Add option to disable warp function in
 semantic (#143640)

These functions are not available on lower compute capabilities. Add a
language-feature option so the semantic check can be enforced on them.
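
As a minimal sketch, a driver could gate the check like this (the enum and
the `LanguageFeatureControl::Enable` API are the ones touched below; the
helper function wrapping them is hypothetical):

```
#include "flang/Support/Fortran-features.h"

// Hypothetical helper: turn the warp match functions off so the device
// code semantic check rejects calls to match_all_sync / match_any_sync.
void disableWarpMatchFunctions(
    Fortran::common::LanguageFeatureControl &features) {
  features.Enable(
      Fortran::common::LanguageFeature::CudaWarpMatchFunction, /*yes=*/false);
}
```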
---
 .../include/flang/Support/Fortran-features.h  |   2 +-
 flang/lib/Semantics/check-cuda.cpp            | 125 ++++++++++++------
 flang/test/Semantics/cuf22.cuf                |   8 ++
 flang/tools/bbc/bbc.cpp                       |  10 ++
 4 files changed, 101 insertions(+), 44 deletions(-)
 create mode 100644 flang/test/Semantics/cuf22.cuf

diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h
index 3f6d825e2b66c..ea0845b7d605f 100644
--- a/flang/include/flang/Support/Fortran-features.h
+++ b/flang/include/flang/Support/Fortran-features.h
@@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines,
     SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank,
     IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor,
     ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy,
-    InaccessibleDeferredOverride)
+    InaccessibleDeferredOverride, CudaWarpMatchFunction)
 
 // Portability and suspicious usage warnings
 ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable,
diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp
index c024640af1220..8decfb0149829 100644
--- a/flang/lib/Semantics/check-cuda.cpp
+++ b/flang/lib/Semantics/check-cuda.cpp
@@ -17,6 +17,7 @@
 #include "flang/Semantics/expression.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/tools.h"
+#include "llvm/ADT/StringSet.h"
 
 // Once labeled DO constructs have been canonicalized and their parse subtrees
 // transformed into parser::DoConstructs, scan the parser::Blocks of the program
@@ -61,6 +62,11 @@ bool CanonicalizeCUDA(parser::Program &program) {
 
 using MaybeMsg = std::optional<parser::MessageFormattedText>;
 
+static const llvm::StringSet<> warpFunctions_ = {"match_all_syncjj",
+    "match_all_syncjx", "match_all_syncjf", "match_all_syncjd",
+    "match_any_syncjj", "match_any_syncjx", "match_any_syncjf",
+    "match_any_syncjd"};
+
 // Traverses an evaluate::Expr<> in search of unsupported operations
 // on the device.
 
@@ -68,7 +74,7 @@ struct DeviceExprChecker
     : public evaluate::AnyTraverse<DeviceExprChecker, MaybeMsg> {
   using Result = MaybeMsg;
   using Base = evaluate::AnyTraverse<DeviceExprChecker, Result>;
-  DeviceExprChecker() : Base(*this) {}
+  explicit DeviceExprChecker(SemanticsContext &c) : Base(*this), context_{c} {}
   using Base::operator();
   Result operator()(const evaluate::ProcedureDesignator &x) const {
     if (const Symbol * sym{x.GetInterfaceSymbol()}) {
@@ -78,10 +84,17 @@ struct DeviceExprChecker
         if (auto attrs{subp->cudaSubprogramAttrs()}) {
           if (*attrs == common::CUDASubprogramAttrs::HostDevice ||
               *attrs == common::CUDASubprogramAttrs::Device) {
+            if (warpFunctions_.contains(sym->name().ToString()) &&
+                !context_.languageFeatures().IsEnabled(
+                    Fortran::common::LanguageFeature::CudaWarpMatchFunction)) {
+              return parser::MessageFormattedText(
+                  "warp match function disabled"_err_en_US);
+            }
             return {};
           }
         }
       }
+
       const Symbol &ultimate{sym->GetUltimate()};
       const Scope &scope{ultimate.owner()};
       const Symbol *mod{scope.IsModule() ? scope.symbol() : nullptr};
@@ -94,9 +107,12 @@ struct DeviceExprChecker
       // TODO(CUDA): Check for unsupported intrinsics here
       return {};
     }
+
     return parser::MessageFormattedText(
         "'%s' may not be called in device code"_err_en_US, x.GetName());
   }
+
+  SemanticsContext &context_;
 };
 
 struct FindHostArray
@@ -133,9 +149,10 @@ struct FindHostArray
   }
 };
 
-template <typename A> static MaybeMsg CheckUnwrappedExpr(const A &x) {
+template <typename A>
+static MaybeMsg CheckUnwrappedExpr(SemanticsContext &context, const A &x) {
   if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
-    return DeviceExprChecker{}(expr->typedExpr);
+    return DeviceExprChecker{context}(expr->typedExpr);
   }
   return {};
 }
@@ -144,104 +161,124 @@ template <typename A>
 static void CheckUnwrappedExpr(
     SemanticsContext &context, SourceName at, const A &x) {
   if (const auto *expr{parser::Unwrap<parser::Expr>(x)}) {
-    if (auto msg{DeviceExprChecker{}(expr->typedExpr)}) {
+    if (auto msg{DeviceExprChecker{context}(expr->typedExpr)}) {
       context.Say(at, std::move(*msg));
     }
   }
 }
 
 template <bool CUF_KERNEL> struct ActionStmtChecker {
-  template <typename A> static MaybeMsg WhyNotOk(const A &x) {
+  template <typename A>
+  static MaybeMsg WhyNotOk(SemanticsContext &context, const A &x) {
     if constexpr (ConstraintTrait<A>) {
-      return WhyNotOk(x.thing);
+      return WhyNotOk(context, x.thing);
     } else if constexpr (WrapperTrait<A>) {
-      return WhyNotOk(x.v);
+      return WhyNotOk(context, x.v);
     } else if constexpr (UnionTrait<A>) {
-      return WhyNotOk(x.u);
+      return WhyNotOk(context, x.u);
     } else if constexpr (TupleTrait<A>) {
-      return WhyNotOk(x.t);
+      return WhyNotOk(context, x.t);
     } else {
       return parser::MessageFormattedText{
           "Statement may not appear in device code"_err_en_US};
     }
   }
   template <typename A>
-  static MaybeMsg WhyNotOk(const common::Indirection<A> &x) {
-    return WhyNotOk(x.value());
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const common::Indirection<A> &x) {
+    return WhyNotOk(context, x.value());
   }
   template <typename... As>
-  static MaybeMsg WhyNotOk(const std::variant<As...> &x) {
-    return common::visit([](const auto &x) { return WhyNotOk(x); }, x);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const std::variant<As...> &x) {
+    return common::visit(
+        [&context](const auto &x) { return WhyNotOk(context, x); }, x);
   }
   template <std::size_t J = 0, typename... As>
-  static MaybeMsg WhyNotOk(const std::tuple<As...> &x) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const std::tuple<As...> &x) {
     if constexpr (J == sizeof...(As)) {
       return {};
-    } else if (auto msg{WhyNotOk(std::get<J>(x))}) {
+    } else if (auto msg{WhyNotOk(context, std::get<J>(x))}) {
       return msg;
     } else {
-      return WhyNotOk<(J + 1)>(x);
+      return WhyNotOk<(J + 1)>(context, x);
     }
   }
-  template <typename A> static MaybeMsg WhyNotOk(const std::list<A> &x) {
+  template <typename A>
+  static MaybeMsg WhyNotOk(SemanticsContext &context, const std::list<A> &x) {
     for (const auto &y : x) {
-      if (MaybeMsg result{WhyNotOk(y)}) {
+      if (MaybeMsg result{WhyNotOk(context, y)}) {
         return result;
       }
     }
     return {};
   }
-  template <typename A> static MaybeMsg WhyNotOk(const std::optional<A> &x) {
+  template <typename A>
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const std::optional<A> &x) {
     if (x) {
-      return WhyNotOk(*x);
+      return WhyNotOk(context, *x);
     } else {
       return {};
     }
   }
   template <typename A>
-  static MaybeMsg WhyNotOk(const parser::UnlabeledStatement<A> &x) {
-    return WhyNotOk(x.statement);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::UnlabeledStatement<A> &x) {
+    return WhyNotOk(context, x.statement);
   }
   template <typename A>
-  static MaybeMsg WhyNotOk(const parser::Statement<A> &x) {
-    return WhyNotOk(x.statement);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::Statement<A> &x) {
+    return WhyNotOk(context, x.statement);
   }
-  static MaybeMsg WhyNotOk(const parser::AllocateStmt &) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::AllocateStmt &) {
     return {}; // AllocateObjects are checked elsewhere
   }
-  static MaybeMsg WhyNotOk(const parser::AllocateCoarraySpec &) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::AllocateCoarraySpec &) {
     return parser::MessageFormattedText(
         "A coarray may not be allocated on the device"_err_en_US);
   }
-  static MaybeMsg WhyNotOk(const parser::DeallocateStmt &) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::DeallocateStmt &) {
     return {}; // AllocateObjects are checked elsewhere
   }
-  static MaybeMsg WhyNotOk(const parser::AssignmentStmt &x) {
-    return DeviceExprChecker{}(x.typedAssignment);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::AssignmentStmt &x) {
+    return DeviceExprChecker{context}(x.typedAssignment);
   }
-  static MaybeMsg WhyNotOk(const parser::CallStmt &x) {
-    return DeviceExprChecker{}(x.typedCall);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::CallStmt &x) {
+    return DeviceExprChecker{context}(x.typedCall);
+  }
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::ContinueStmt &) {
+    return {};
   }
-  static MaybeMsg WhyNotOk(const parser::ContinueStmt &) { return {}; }
-  static MaybeMsg WhyNotOk(const parser::IfStmt &x) {
-    if (auto result{
-            CheckUnwrappedExpr(std::get<parser::ScalarLogicalExpr>(x.t))}) {
+  static MaybeMsg WhyNotOk(SemanticsContext &context, const parser::IfStmt &x) {
+    if (auto result{CheckUnwrappedExpr(
+            context, std::get<parser::ScalarLogicalExpr>(x.t))}) {
       return result;
     }
-    return WhyNotOk(
+    return WhyNotOk(context,
         std::get<parser::UnlabeledStatement<parser::ActionStmt>>(x.t)
             .statement);
   }
-  static MaybeMsg WhyNotOk(const parser::NullifyStmt &x) {
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::NullifyStmt &x) {
     for (const auto &y : x.v) {
-      if (MaybeMsg result{DeviceExprChecker{}(y.typedExpr)}) {
+      if (MaybeMsg result{DeviceExprChecker{context}(y.typedExpr)}) {
         return result;
       }
     }
     return {};
   }
-  static MaybeMsg WhyNotOk(const parser::PointerAssignmentStmt &x) {
-    return DeviceExprChecker{}(x.typedAssignment);
+  static MaybeMsg WhyNotOk(
+      SemanticsContext &context, const parser::PointerAssignmentStmt &x) {
+    return DeviceExprChecker{context}(x.typedAssignment);
   }
 };
 
@@ -435,12 +472,14 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
                 ErrorIfHostSymbol(assign->lhs, source);
                 ErrorIfHostSymbol(assign->rhs, source);
               }
-              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(
+                      context_, x)}) {
                 context_.Say(source, std::move(*msg));
               }
             },
             [&](const auto &x) {
-              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(x)}) {
+              if (auto msg{ActionStmtChecker<IsCUFKernelDo>::WhyNotOk(
+                      context_, x)}) {
                 context_.Say(source, std::move(*msg));
               }
             },
@@ -504,7 +543,7 @@ template <bool IsCUFKernelDo> class DeviceContextChecker {
     Check(DEREF(parser::Unwrap<parser::Expr>(x)));
   }
   void Check(const parser::Expr &expr) {
-    if (MaybeMsg msg{DeviceExprChecker{}(expr.typedExpr)}) {
+    if (MaybeMsg msg{DeviceExprChecker{context_}(expr.typedExpr)}) {
       context_.Say(expr.source, std::move(*msg));
     }
   }
diff --git a/flang/test/Semantics/cuf22.cuf b/flang/test/Semantics/cuf22.cuf
new file mode 100644
index 0000000000000..36e0f0b2502df
--- /dev/null
+++ b/flang/test/Semantics/cuf22.cuf
@@ -0,0 +1,8 @@
+! RUN: not bbc -fcuda -fcuda-disable-warp-function %s -o - 2>&1 | FileCheck %s
+
+attributes(device) subroutine testMatch()
+  integer :: a, ipred, mask, v32
+  a = match_all_sync(mask, v32, ipred)
+end subroutine
+
+! CHECK:  warp match function disabled
diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp
index c544008a24d56..c80872108ac8f 100644
--- a/flang/tools/bbc/bbc.cpp
+++ b/flang/tools/bbc/bbc.cpp
@@ -223,6 +223,11 @@ static llvm::cl::opt<bool> enableCUDA("fcuda",
                                       llvm::cl::desc("enable CUDA Fortran"),
                                       llvm::cl::init(false));
 
+static llvm::cl::opt<bool>
+    disableCUDAWarpFunction("fcuda-disable-warp-function",
+                            llvm::cl::desc("Disable CUDA Warp Function"),
+                            llvm::cl::init(false));
+
 static llvm::cl::opt<std::string>
     enableGPUMode("gpu", llvm::cl::desc("Enable GPU Mode managed|unified"),
                   llvm::cl::init(""));
@@ -600,6 +605,11 @@ int main(int argc, char **argv) {
     options.features.Enable(Fortran::common::LanguageFeature::CUDA);
   }
 
+  if (disableCUDAWarpFunction) {
+    options.features.Enable(
+        Fortran::common::LanguageFeature::CudaWarpMatchFunction, false);
+  }
+
   if (enableGPUMode == "managed") {
     options.features.Enable(Fortran::common::LanguageFeature::CudaManaged);
   } else if (enableGPUMode == "unified") {

>From 842377882a3f52e345668751fa6d46ba4f7268d2 Mon Sep 17 00:00:00 2001
From: Jim Lin <jim at andestech.com>
Date: Wed, 11 Jun 2025 13:32:49 +0800
Subject: [PATCH 41/61] [RISCV] Select signed bitfield insert for XAndesPerf
 (#143356)

This patch is similar to #142737.

The XAndesPerf extension includes the signed bitfield instruction
`NDS.BFOS`, which can extract bits 0 to Len - 1, place them starting at
bit Lsb, zero-fill bits 0 to Lsb - 1, and sign-extend the result.

When Lsb == Msb, there is a special case where Lsb is set to 0 instead
of being equal to Msb.
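
For reference, the scalar semantics of the pattern being selected, as a
sketch in plain C++ rather than backend code (RV32, XLen = 32):

```
#include <cstdint>

// (sra (shl x, c1), c2) with c1 > c2: the low (32 - c1) bits of x are
// placed at Lsb = c1 - c2 and sign-extended from Msb = 32 - c2 - 1.
// Shifting left in unsigned avoids UB; the right shift is arithmetic
// for signed operands on mainstream compilers.
int32_t sraOfShl(int32_t x, unsigned c1, unsigned c2) {
  uint32_t shifted = static_cast<uint32_t>(x) << c1;
  return static_cast<int32_t>(shifted) >> c2;
}
```

With c1 = 29 and c2 = 11 this gives Lsb = 18 and Msb = 20, matching the
`nds.bfos a0, a0, 18, 20` expected in the new tests.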
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 56 +++++++++++++++++++++
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h   |  1 +
 llvm/test/CodeGen/RISCV/rv32xandesperf.ll   | 26 ++++++++++
 llvm/test/CodeGen/RISCV/rv64xandesperf.ll   | 46 +++++++++++++++++
 4 files changed, 129 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index d298965595b47..4539efd591c8b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -683,6 +683,59 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) {
   return false;
 }
 
+bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) {
+  // Only supported with XAndesPerf at the moment.
+  if (!Subtarget->hasVendorXAndesPerf())
+    return false;
+
+  auto *N1C = dyn_cast<ConstantSDNode>(Node->getOperand(1));
+  if (!N1C)
+    return false;
+
+  SDValue N0 = Node->getOperand(0);
+  if (!N0.hasOneUse())
+    return false;
+
+  auto BitfieldInsert = [&](SDValue N0, unsigned Msb, unsigned Lsb,
+                            const SDLoc &DL, MVT VT) {
+    unsigned Opc = RISCV::NDS_BFOS;
+    // If the Lsb is equal to the Msb, then the Lsb should be 0.
+    if (Lsb == Msb)
+      Lsb = 0;
+    return CurDAG->getMachineNode(Opc, DL, VT, N0.getOperand(0),
+                                  CurDAG->getTargetConstant(Lsb, DL, VT),
+                                  CurDAG->getTargetConstant(Msb, DL, VT));
+  };
+
+  SDLoc DL(Node);
+  MVT VT = Node->getSimpleValueType(0);
+  const unsigned RightShAmt = N1C->getZExtValue();
+
+  // Transform (sra (shl X, C1) C2) with C1 > C2
+  //        -> (NDS.BFOS X, lsb, msb)
+  if (N0.getOpcode() == ISD::SHL) {
+    auto *N01C = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!N01C)
+      return false;
+
+    const unsigned LeftShAmt = N01C->getZExtValue();
+    // Make sure that this is a bitfield insertion (i.e., the shift-right
+    // amount should be less than the left-shift).
+    if (LeftShAmt <= RightShAmt)
+      return false;
+
+    const unsigned MsbPlusOne = VT.getSizeInBits() - RightShAmt;
+    const unsigned Msb = MsbPlusOne - 1;
+    const unsigned Lsb = LeftShAmt - RightShAmt;
+
+    SDNode *Sbi = BitfieldInsert(N0, Msb, Lsb, DL, VT);
+    ReplaceNode(Node, Sbi);
+    return true;
+  }
+
+  return false;
+}
+
 bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node,
                                                    const SDLoc &DL, MVT VT,
                                                    SDValue X, unsigned Msb,
@@ -1214,6 +1267,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     if (trySignedBitfieldExtract(Node))
       return;
 
+    if (trySignedBitfieldInsertInSign(Node))
+      return;
+
     // Optimize (sra (sext_inreg X, i16), C) ->
     //          (srai (slli X, (XLen-16), (XLen-16) + C)
     // And      (sra (sext_inreg X, i8), C) ->
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
index abc0372d15c4f..cb63c21fd8fc9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -77,6 +77,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel {
 
   bool tryShrinkShlLogicImm(SDNode *Node);
   bool trySignedBitfieldExtract(SDNode *Node);
+  bool trySignedBitfieldInsertInSign(SDNode *Node);
   bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT,
                                   SDValue X, unsigned Msb, unsigned Lsb);
   bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT,
diff --git a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll
index 3996420d477b2..3e7f09f3d6c22 100644
--- a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll
+++ b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll
@@ -154,6 +154,32 @@ define i32 @bfos_from_ashr_sexti16_i32(i16 %x) {
   ret i32 %ashr
 }
 
+; MSB = 0
+
+define i32 @bfos_from_ashr_shl_with_msb_zero_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 0, 14
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 31
+  %lshr = ashr i32 %shl, 17
+  ret i32 %lshr
+}
+
+; MSB < LSB
+
+define i32 @bfos_from_ashr_shl_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 18, 20
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 29
+  %lshr = ashr i32 %shl, 11
+  ret i32 %lshr
+}
+
+; sext
+
 define i32 @sexti1_i32(i32 %a) {
 ; CHECK-LABEL: sexti1_i32:
 ; CHECK:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll
index af7c300a92d1f..98cda42665169 100644
--- a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll
+++ b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll
@@ -212,6 +212,52 @@ define i64 @bfos_from_ashr_sexti16_i64(i16 %x) {
   ret i64 %ashr
 }
 
+; MSB = 0
+
+define i32 @bfos_from_ashr_shl_with_msb_zero_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 0, 14
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 31
+  %lshr = ashr i32 %shl, 17
+  ret i32 %lshr
+}
+
+define i64 @bfos_from_ashr_shl_with_msb_zero_insert_i64(i64 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 0, 46
+; CHECK-NEXT:    ret
+  %shl = shl i64 %x, 63
+  %lshr = ashr i64 %shl, 17
+  ret i64 %lshr
+}
+
+; MSB < LSB
+
+define i32 @bfos_from_ashr_shl_insert_i32(i32 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_insert_i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 18, 20
+; CHECK-NEXT:    ret
+  %shl = shl i32 %x, 29
+  %lshr = ashr i32 %shl, 11
+  ret i32 %lshr
+}
+
+define i64 @bfos_from_ashr_shl_insert_i64(i64 %x) {
+; CHECK-LABEL: bfos_from_ashr_shl_insert_i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    nds.bfos a0, a0, 18, 52
+; CHECK-NEXT:    ret
+  %shl = shl i64 %x, 29
+  %lshr = ashr i64 %shl, 11
+  ret i64 %lshr
+}
+
+; sext
+
 define signext i32 @sexti1_i32(i32 signext %a) {
 ; CHECK-LABEL: sexti1_i32:
 ; CHECK:       # %bb.0:

>From c2cb571c6cbcec75ab401974348f9f0d9b2190db Mon Sep 17 00:00:00 2001
From: Shafik Yaghmour <shafik.yaghmour at intel.com>
Date: Tue, 10 Jun 2025 23:41:41 -0700
Subject: [PATCH 42/61] [Clang][NFC] Move UntypedParameters instead of copy
 (#143646)

Static analysis flagged that UntypedParameters could be moved instead of
copied. This would avoid copying a large object.
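
As a self-contained illustration of why the move matters (generic
containers here, not the actual Sema argument types):

```
#include <utility>
#include <vector>

int main() {
  std::vector<int> untypedParameters(1000, 42);
  std::vector<int> args;
  // A copy would allocate and duplicate all 1000 elements; the move
  // just transfers the buffer, which is safe because untypedParameters
  // is not used again afterwards.
  args = std::move(untypedParameters);
  return args.size() == 1000 ? 0 : 1;
}
```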
---
 clang/lib/Sema/SemaExprCXX.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 2546ab5c0a342..c106ea749170f 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -2888,7 +2888,7 @@ static bool resolveAllocationOverload(
     // type-identity-less argument list.
     IAP.PassTypeIdentity = TypeAwareAllocationMode::No;
     IAP.PassAlignment = InitialAlignmentMode;
-    Args = UntypedParameters;
+    Args = std::move(UntypedParameters);
   }
   assert(!S.isStdTypeIdentity(Args[0]->getType(), nullptr));
   return resolveAllocationOverloadInterior(

>From a17e97e6778b2cd4114052faf6ee25db330ef405 Mon Sep 17 00:00:00 2001
From: maflcko <6399679+maflcko at users.noreply.github.com>
Date: Wed, 11 Jun 2025 08:43:23 +0200
Subject: [PATCH 43/61] [libc++] Add missing C++20 [time.point.arithmetic]
 (#143165)

This was part of https://wg21.link/p0355r7, but apparently never
implemented.

---------

Co-authored-by: MarcoFalke <*~=`'#}+{/-|&$^_ at 721217.xyz>
Co-authored-by: Hristo Hristov <zingam at outlook.com>
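
A short usage example of the new operators, mirroring the tests added
below:

```
#include <cassert>
#include <chrono>

int main() {
  using Clock    = std::chrono::system_clock;
  using Duration = std::chrono::milliseconds;
  std::chrono::time_point<Clock, Duration> t{Duration{5}};
  ++t;            // 6 ms since epoch (C++20)
  auto old = t--; // old is 6 ms, t is back to 5 ms
  assert(old.time_since_epoch() == Duration{6});
  assert(t.time_since_epoch() == Duration{5});
  return 0;
}
```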
---
 libcxx/include/__chrono/time_point.h          | 13 +++++++
 libcxx/include/chrono                         |  5 +++
 .../time.point.arithmetic/op_++.pass.cpp      | 35 +++++++++++++++++++
 .../time.point.arithmetic/op_++int.pass.cpp   | 35 +++++++++++++++++++
 .../time.point.arithmetic/op_--.pass.cpp      | 35 +++++++++++++++++++
 .../time.point.arithmetic/op_--int.pass.cpp   | 35 +++++++++++++++++++
 6 files changed, 158 insertions(+)
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp
 create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp

diff --git a/libcxx/include/__chrono/time_point.h b/libcxx/include/__chrono/time_point.h
index 6b866b882f89a..fc4408d23dbf1 100644
--- a/libcxx/include/__chrono/time_point.h
+++ b/libcxx/include/__chrono/time_point.h
@@ -58,6 +58,19 @@ class time_point {
 
   // arithmetic
 
+#if _LIBCPP_STD_VER >= 20
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point& operator++() {
+    ++__d_;
+    return *this;
+  }
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point operator++(int) { return time_point{__d_++}; }
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point& operator--() {
+    --__d_;
+    return *this;
+  }
+  _LIBCPP_HIDE_FROM_ABI constexpr time_point operator--(int) { return time_point{__d_--}; }
+#endif // _LIBCPP_STD_VER >= 20
+
   _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 time_point& operator+=(const duration& __d) {
     __d_ += __d;
     return *this;
diff --git a/libcxx/include/chrono b/libcxx/include/chrono
index cd9b98872083e..82e99a31bcc9f 100644
--- a/libcxx/include/chrono
+++ b/libcxx/include/chrono
@@ -132,6 +132,11 @@ public:
 
     // arithmetic
 
+    constexpr time_point& operator++();    // C++20
+    constexpr time_point  operator++(int); // C++20
+    constexpr time_point& operator--();    // C++20
+    constexpr time_point  operator--(int); // C++20
+
     time_point& operator+=(const duration& d); // constexpr in C++17
     time_point& operator-=(const duration& d); // constexpr in C++17
 
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp
new file mode 100644
index 0000000000000..e035d7ef4fa0e
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point& operator++();
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t{Duration{5}};
+  std::chrono::time_point<Clock, Duration>& tref{++t};
+  assert(&tref == &t);
+  assert(tref.time_since_epoch() == Duration{6});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp
new file mode 100644
index 0000000000000..5304d37d5c361
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point operator++(int);
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t1{Duration{3}};
+  std::chrono::time_point<Clock, Duration> t2{t1++};
+  assert(t1.time_since_epoch() == Duration{4});
+  assert(t2.time_since_epoch() == Duration{3});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp
new file mode 100644
index 0000000000000..915156fcc6b8c
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point& operator--();
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t{Duration{5}};
+  std::chrono::time_point<Clock, Duration>& tref{--t};
+  assert(&tref == &t);
+  assert(tref.time_since_epoch() == Duration{4});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}
diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp
new file mode 100644
index 0000000000000..cc5f462106bbf
--- /dev/null
+++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp
@@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: std-at-least-c++20
+
+// <chrono>
+
+// time_point
+
+// constexpr time_point operator--(int);
+
+#include <cassert>
+#include <chrono>
+
+#include "test_macros.h"
+
+constexpr bool test() {
+  using Clock    = std::chrono::system_clock;
+  using Duration = std::chrono::milliseconds;
+  std::chrono::time_point<Clock, Duration> t1{Duration{3}};
+  std::chrono::time_point<Clock, Duration> t2{t1--};
+  assert(t1.time_since_epoch() == Duration{2});
+  assert(t2.time_since_epoch() == Duration{3});
+  return true;
+}
+
+int main(int, char**) {
+  test();
+  static_assert(test());
+  return 0;
+}

>From 0f3c54a3b3289b6375a1d32684e831cb407af003 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 10 Jun 2025 21:33:53 +0100
Subject: [PATCH 44/61] [X86] Add test coverage showing failure to merge "zero
 input passthrough" behaviour for BSR instructions on x86_64 targets
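
The C++ equivalent of the pattern these tests exercise is roughly the
following (a sketch; `__builtin_clz` is a GCC/Clang builtin, and the BSR
destination-passthrough on zero input is CPU behaviour that is not
architecturally guaranteed):

```
#include <cstdint>

// x != 0 ? 31 - clz(x) : y -- in principle a single BSR with y
// preloaded in the destination could cover this, instead of the
// current BSR + TEST + CMOV sequence.
uint32_t cmov_bsr32(uint32_t x, uint32_t y) {
  return x != 0 ? 31u - static_cast<uint32_t>(__builtin_clz(x)) : y;
}
```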

---
 llvm/test/CodeGen/X86/bsr.ll | 492 +++++++++++++++++++++++++++++++++++
 1 file changed, 492 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/bsr.ll

diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll
new file mode 100644
index 0000000000000..1247b3ec59324
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bsr.ll
@@ -0,0 +1,492 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i8 @cmov_bsr8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsr8:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    movzbl %cl, %eax
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    je .LBB0_4
+; X86-NEXT:  .LBB0_5: # %cond.end
+; X86-NEXT:    xorb $7, %al
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movb $8, %al
+; X86-NEXT:    testb %cl, %cl
+; X86-NEXT:    jne .LBB0_5
+; X86-NEXT:  .LBB0_4:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr8:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    movl $15, %eax
+; X64-NEXT:    bsrl %ecx, %eax
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.ctlz.i8(i8 %x, i1 false)
+  %2 = xor i8 %1, 7
+  %3 = icmp eq i8 %x, 0
+  %4 = select i1 %3, i8 %y, i8 %2
+  ret i8 %4
+}
+
+define i8 @cmov_bsr8_undef(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsr8_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    jne .LBB1_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr8_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    movzbl %dil, %ecx
+; X64-NEXT:    bsrl %ecx, %eax
+; X64-NEXT:    testb %cl, %cl
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.ctlz.i8(i8 %x, i1 true)
+  %2 = xor i8 %1, 7
+  %3 = icmp ne i8 %x, 0
+  %4 = select i1 %3, i8 %2, i8 %y
+  ret i8 %4
+}
+
+define i16 @cmov_bsr16(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsr16:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB2_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    bsrw %ax, %cx
+; X86-NEXT:    xorl $15, %ecx
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    jne .LBB2_4
+; X86-NEXT:  .LBB2_5: # %cond.end
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    movw $16, %cx
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB2_5
+; X86-NEXT:  .LBB2_4:
+; X86-NEXT:    movzwl %cx, %eax
+; X86-NEXT:    xorl $15, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr16:
+; X64:       # %bb.0:
+; X64-NEXT:    movw $31, %ax
+; X64-NEXT:    bsrw %di, %ax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false)
+  %2 = xor i16 %1, 15
+  %3 = icmp ne i16 %x, 0
+  %4 = select i1 %3, i16 %2, i16 %y
+  ret i16 %4
+}
+
+define i16 @cmov_bsr16_undef(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsr16_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB3_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    bsrw %ax, %ax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr16_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrw %di, %ax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true)
+  %2 = xor i16 %1, 15
+  %3 = icmp eq i16 %x, 0
+  %4 = select i1 %3, i16 %y, i16 %2
+  ret i16 %4
+}
+
+define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsr32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_4
+; X86-NEXT:  .LBB4_5: # %cond.end
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB4_5
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $63, %eax
+; X64-NEXT:    bsrl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false)
+  %2 = xor i32 %1, 31
+  %3 = icmp eq i32 %x, 0
+  %4 = select i1 %3, i32 %y, i32 %2
+  ret i32 %4
+}
+
+define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsr32_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    jne .LBB5_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    bsrl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr32_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true)
+  %2 = xor i32 %1, 31
+  %3 = icmp ne i32 %x, 0
+  %4 = select i1 %3, i32 %2, i32 %y
+  ret i32 %4
+}
+
+define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsr64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je .LBB6_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB6_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    bsrl %esi, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_7
+; X86-NEXT:    jmp .LBB6_6
+; X86-NEXT:  .LBB6_1:
+; X86-NEXT:    movl $64, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    jne .LBB6_6
+; X86-NEXT:  .LBB6_7: # %cond.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB6_3:
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_7
+; X86-NEXT:  .LBB6_6:
+; X86-NEXT:    xorl $63, %eax
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr64:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false)
+  %2 = xor i64 %1, 63
+  %3 = icmp ne i64 %x, 0
+  %4 = select i1 %3, i64 %2, i64 %y
+  ret i64 %4
+}
+
+define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsr64_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB7_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    bsrl %ecx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl $32, %eax
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    jne .LBB7_5
+; X86-NEXT:  .LBB7_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_1:
+; X86-NEXT:    bsrl %edx, %eax
+; X86-NEXT:    xorl $31, %eax
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    je .LBB7_4
+; X86-NEXT:  .LBB7_5:
+; X86-NEXT:    xorl $63, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsr64_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true)
+  %2 = xor i64 %1, 63
+  %3 = icmp eq i64 %x, 0
+  %4 = select i1 %3, i64 %y, i64 %2
+  ret i64 %4
+}
+
+define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsr128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB8_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ebp, %ebp
+; X86-NEXT:    jne .LBB8_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    bsrl %ebx, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    orl $32, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    je .LBB8_7
+; X86-NEXT:  .LBB8_6:
+; X86-NEXT:    bsrl %edi, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:  .LBB8_1:
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:    movl $128, %edx
+; X86-NEXT:    jmp .LBB8_11
+; X86-NEXT:  .LBB8_3:
+; X86-NEXT:    bsrl %ebp, %edx
+; X86-NEXT:    xorl $31, %edx
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB8_6
+; X86-NEXT:  .LBB8_7: # %cond.false
+; X86-NEXT:    bsrl %ecx, %esi
+; X86-NEXT:    xorl $31, %esi
+; X86-NEXT:    orl $32, %esi
+; X86-NEXT:  .LBB8_8: # %cond.false
+; X86-NEXT:    movl %ebx, %eax
+; X86-NEXT:    orl %ebp, %ebx
+; X86-NEXT:    jne .LBB8_10
+; X86-NEXT:  # %bb.9: # %cond.false
+; X86-NEXT:    orl $64, %esi
+; X86-NEXT:    movl %esi, %edx
+; X86-NEXT:  .LBB8_10: # %cond.false
+; X86-NEXT:    movl $0, (%esp) # 4-byte Folded Spill
+; X86-NEXT:  .LBB8_11: # %cond.end
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    orl %ebp, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    je .LBB8_12
+; X86-NEXT:  # %bb.13: # %cond.end
+; X86-NEXT:    xorl $127, %edx
+; X86-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT:    jmp .LBB8_14
+; X86-NEXT:  .LBB8_12:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:  .LBB8_14: # %cond.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %ebx, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %ecx, 4(%eax)
+; X86-NEXT:    movl %edx, (%eax)
+; X86-NEXT:    addl $4, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsr128:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrq %rsi, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    movl $127, %eax
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    xorq $63, %rax
+; X64-NEXT:    addq $64, %rax
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    cmovneq %r8, %rax
+; X64-NEXT:    xorq $127, %rax
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    cmoveq %rdx, %rax
+; X64-NEXT:    cmoveq %rcx, %r8
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 false)
+  %2 = xor i128 %1, 127
+  %3 = icmp eq i128 %x, 0
+  %4 = select i1 %3, i128 %y, i128 %2
+  ret i128 %4
+}
+
+define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsr128_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    testl %edi, %edi
+; X86-NEXT:    jne .LBB9_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    bsrl %esi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:    orl $32, %ecx
+; X86-NEXT:    jmp .LBB9_3
+; X86-NEXT:  .LBB9_1:
+; X86-NEXT:    bsrl %edi, %ecx
+; X86-NEXT:    xorl $31, %ecx
+; X86-NEXT:  .LBB9_3:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB9_4
+; X86-NEXT:  # %bb.5:
+; X86-NEXT:    bsrl %ebx, %ebp
+; X86-NEXT:    xorl $31, %ebp
+; X86-NEXT:    orl $32, %ebp
+; X86-NEXT:    jmp .LBB9_6
+; X86-NEXT:  .LBB9_4:
+; X86-NEXT:    bsrl %edx, %ebp
+; X86-NEXT:    xorl $31, %ebp
+; X86-NEXT:  .LBB9_6:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl %edi, %esi
+; X86-NEXT:    jne .LBB9_8
+; X86-NEXT:  # %bb.7:
+; X86-NEXT:    orl $64, %ebp
+; X86-NEXT:    movl %ebp, %ecx
+; X86-NEXT:  .LBB9_8:
+; X86-NEXT:    orl %edi, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    orl %edx, %ebx
+; X86-NEXT:    jne .LBB9_9
+; X86-NEXT:  # %bb.10:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    jmp .LBB9_11
+; X86-NEXT:  .LBB9_9:
+; X86-NEXT:    xorl $127, %ecx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    xorl %esi, %esi
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:  .LBB9_11:
+; X86-NEXT:    movl %edi, 12(%eax)
+; X86-NEXT:    movl %esi, 8(%eax)
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsr128_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsrq %rsi, %r8
+; X64-NEXT:    xorq $63, %r8
+; X64-NEXT:    bsrq %rdi, %rax
+; X64-NEXT:    xorq $63, %rax
+; X64-NEXT:    orq $64, %rax
+; X64-NEXT:    testq %rsi, %rsi
+; X64-NEXT:    cmovneq %r8, %rax
+; X64-NEXT:    xorq $127, %rax
+; X64-NEXT:    xorl %r8d, %r8d
+; X64-NEXT:    orq %rsi, %rdi
+; X64-NEXT:    cmoveq %rdx, %rax
+; X64-NEXT:    cmoveq %rcx, %r8
+; X64-NEXT:    movq %r8, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 true)
+  %2 = xor i128 %1, 127
+  %3 = icmp ne i128 %x, 0
+  %4 = select i1 %3, i128 %2, i128 %y
+  ret i128 %4
+}
+
+declare i8 @llvm.ctlz.i8(i8, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i128 @llvm.ctlz.i128(i128, i1)

>From a6ace2801e8900a6fe8c3b8295938f3b3c1e4466 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:07:30 +0100
Subject: [PATCH 45/61] [X86] combineConcatVectorOps - ensure we're only
 concatenating v2f64 generic shuffles into vXf64 vshufpd

Identified while triaging #143606 - we can't concatenate v4f64 lhs/rhs
subvectors and then expect the v2f64 operands to be in the correct place
for VSHUFPD.

Test coverage will follow.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 96be91256915d..8bcd8670879a9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -59383,7 +59383,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
 
   // We can always convert per-lane vXf64 shuffles into VSHUFPD.
   if (!IsSplat &&
-      (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
+      ((NumOps == 2 && VT == MVT::v4f64) ||
+       (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) &&
       all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) {
     // Collect the individual per-lane v2f64/v4f64 shuffles.
     MVT OpVT = Ops[0].getSimpleValueType();

>From 32ac7dc2d21843091116b636777c174830cd2dd0 Mon Sep 17 00:00:00 2001
From: Björn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Wed, 11 Jun 2025 09:24:03 +0200
Subject: [PATCH 46/61] [test][AArch64] Adjust vector insertion lit tests
 (#143101)

The test cases test_insert_v16i8_insert_2_undef_base and
test_insert_v16i8_insert_2_undef_base_different_valeus in
CodeGen/AArch64/arm64-vector-insertion.ll were leaving element 8 in the
vector as "undef" without any real explanation. It looked like a typo,
as the input IR was
   %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
   %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
leaving %v.8 unused.

This patch cleans up the tests a bit by adding separate test cases that
validate what happens when the insert at index 8 is skipped, while
amending the original test cases to use %v.8 instead of %v.7 when
creating %v.10.
---
 .../CodeGen/AArch64/arm64-vector-insertion.ll | 69 ++++++++++++++++++-
 1 file changed, 67 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
index 94074d1689f6a..ff28c7817d143 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll
@@ -66,6 +66,35 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
   %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
   %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
   %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+  %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+  %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+  %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
+  %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+  %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+  %v.15 = insertelement <16 x i8> %v.14, i8 %a, i32 15
+  ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a may be poison, so simply inserting %a also at
+; index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_skip8(i32 %a0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_skip8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w0, #5
+; CHECK-NEXT:    dup.16b v0, w8
+; CHECK-NEXT:    mov.b v0[5], wzr
+; CHECK-NEXT:    mov.b v0[9], wzr
+; CHECK-NEXT:    ret
+  %a1 = lshr exact i32 %a0, 5
+  %a = trunc i32 %a1 to i8
+  %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>  , i8 %a, i32 0
+  %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+  %v.2 = insertelement <16 x i8> %v.1, i8 %a, i32 2
+  %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+  %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+  %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+  %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7
   %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
   %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
   %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12
@@ -75,8 +104,8 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) {
   ret <16 x i8> %v.15
 }
 
-define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) {
-; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus:
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values(i8 %a, i8 %b) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    dup.16b v0, w0
 ; CHECK-NEXT:    mov.b v0[2], w1
@@ -94,6 +123,42 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a,
   %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
   %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
   %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8
+  %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10
+  %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
+  %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12
+  %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13
+  %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14
+  %v.15 = insertelement <16 x i8> %v.14, i8 %b, i32 15
+  ret <16 x i8> %v.15
+}
+
+; Similar to above, but we leave element 8 as undef. One interesting part with
+; this test case is that %a and %b may be poison, so simply inserting %a or %b
+; at index 8 would make the result vector more poisonous.
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values_skip8(i32 %a0, i32 %b0) {
+; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values_skip8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsr w8, w0, #5
+; CHECK-NEXT:    dup.16b v0, w8
+; CHECK-NEXT:    lsr w8, w1, #5
+; CHECK-NEXT:    mov.b v0[2], w8
+; CHECK-NEXT:    mov.b v0[5], wzr
+; CHECK-NEXT:    mov.b v0[7], w8
+; CHECK-NEXT:    mov.b v0[9], wzr
+; CHECK-NEXT:    mov.b v0[12], w8
+; CHECK-NEXT:    mov.b v0[15], w8
+; CHECK-NEXT:    ret
+  %a1 = lshr exact i32 %a0, 5
+  %a = trunc i32 %a1 to i8
+  %b1 = lshr exact i32 %b0, 5
+  %b = trunc i32 %b1 to i8
+  %v.0 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>  , i8 %a, i32 0
+  %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
+  %v.2 = insertelement <16 x i8> %v.1, i8 %b, i32 2
+  %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3
+  %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4
+  %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6
+  %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7
   %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10
   %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11
   %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12

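The skip8 variants above lean on LLVM's poison-propagation rules: a transform
may replace undef with a concrete value, but not with a potentially-poison
one. A minimal sketch of the constraint in plain IR (hypothetical function
name, not part of the patch):

```
; %a may be poison. Lane 8 of the base vector is undef and is never written,
; so a fold that splats %a into all 16 lanes -- including lane 8 -- would
; replace an undef lane with a potentially-poison value, making the result
; more poisonous than the original. That refinement is not allowed.
define <16 x i8> @sketch_skip_lane8(i8 %a) {
  %v.0 = insertelement <16 x i8> undef, i8 %a, i32 0
  %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1
  ; ...lanes 2-7 and 9-15 would be filled the same way; lane 8 stays undef.
  ret <16 x i8> %v.1
}
```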
>From 686ec6cfe86367c43dccd83d7e6e2bac7e6a73a0 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis <paschalis.mpeis at arm.com>
Date: Wed, 11 Jun 2025 08:24:10 +0100
Subject: [PATCH 47/61] [BOLT][AArch64] Fix adr-relaxation.s test (#143151)

On some AArch64 machines the function splitting was inconsistent, which
caused the cold fragment of `foo` to contain a `mov` instruction before the
`adrp`:

```
<foo.cold.0>:
  mov     x0, #0x0                // =0
  adrp    x1, 0x600000 <_start>
  add     x1, x1, #0x14
  ret
```

This patch removes the `mov` instruction right above `.L2`, making the
splitting deterministic.
---
 bolt/test/AArch64/adr-relaxation.s | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bolt/test/AArch64/adr-relaxation.s b/bolt/test/AArch64/adr-relaxation.s
index a643a62339ba3..864650c3287d8 100644
--- a/bolt/test/AArch64/adr-relaxation.s
+++ b/bolt/test/AArch64/adr-relaxation.s
@@ -34,7 +34,6 @@ foo:
   .cfi_startproc
   cmp  x1, x11
   b.hi  .L2
-  mov  x0, #0x0
 .L2:
 # CHECK-FOO: <foo.cold.0>:
 # CHECK-FOO-NEXT: adrp

>From 521e6ce5c8fdfc72cccc1accd78a59f1a5e2805a Mon Sep 17 00:00:00 2001
From: Baranov Victor <bar.victor.2002 at gmail.com>
Date: Wed, 11 Jun 2025 10:25:29 +0300
Subject: [PATCH 48/61] [CI] Add mention of LLVM Developer Policy in
 email-check message (NFC) (#143300)

As it stands, it can be hard for people to extract the relevant guidance
from a long Discourse discussion, so a link to the official document may be
enough to convince contributors to change their email from private to public.
---
 .github/workflows/email-check.yaml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml
index f4481d5cf5583..904ad718f97dd 100644
--- a/.github/workflows/email-check.yaml
+++ b/.github/workflows/email-check.yaml
@@ -32,7 +32,8 @@ jobs:
           COMMENT: >-
             ⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.<br/>
             Please turn off [Keep my email addresses private](https://github.com/settings/emails) setting in your account.<br/>
-            See [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
+            See [LLVM Developer Policy](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) and
+            [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information.
         run: |
           cat << EOF > comments
           [{"body" : "$COMMENT"}]

>From 17f1dac805d388596be5e8c316c0f14b3222da4e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:12:42 +0100
Subject: [PATCH 49/61] [X86] Add test coverage showing failure to merge "zero
 input passthrough" behaviour for BSF instructions on x86_64 targets

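"Zero input passthrough" refers to the BSF behaviour of leaving its
destination register unchanged when the source is zero (the ISA manuals leave
the zero-input result undefined, but processors in practice preserve the
destination). In principle, a select that falls back to another value when
the input is zero could therefore be folded into the BSF by seeding its
destination with the fallback. A minimal sketch of the pattern these tests
exercise, mirroring the i32 case in the diff (illustrative only, not
additional test content):

```
; cttz plus a select against zero: with BSF's unchanged-on-zero behaviour,
; the cmov that materialises the select could merge into the bsf itself.
define i32 @bsf_passthrough_sketch(i32 %x, i32 %y) {
  %ctz = call i32 @llvm.cttz.i32(i32 %x, i1 false)
  %zero = icmp eq i32 %x, 0
  %res = select i1 %zero, i32 %y, i32 %ctz
  ret i32 %res
}
declare i32 @llvm.cttz.i32(i32, i1)
```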
---
 llvm/test/CodeGen/X86/bsf.ll | 452 +++++++++++++++++++++++++++++++++++
 1 file changed, 452 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/bsf.ll

diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll
new file mode 100644
index 0000000000000..58929115baf54
--- /dev/null
+++ b/llvm/test/CodeGen/X86/bsf.ll
@@ -0,0 +1,452 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64
+
+define i8 @cmov_bsf8(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsf8:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    je .LBB0_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    orl $256, %eax # imm = 0x100
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB0_1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf8:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl $256, %eax # imm = 0x100
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    testb %dil, %dil
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 false)
+  %2 = icmp eq i8 %x, 0
+  %3 = select i1 %2, i8 %y, i8 %1
+  ret i8 %3
+}
+
+define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind {
+; X86-LABEL: cmov_bsf8_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    je .LBB1_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB1_1:
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf8_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    testb %dil, %dil
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true)
+  %2 = icmp eq i8 %x, 0
+  %3 = select i1 %2, i8 %y, i8 %1
+  ret i8 %3
+}
+
+define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsf16:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB2_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    orl $65536, %eax # imm = 0x10000
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB2_1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf16:
+; X64:       # %bb.0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    orl $65536, %eax # imm = 0x10000
+; X64-NEXT:    rep bsfl %eax, %eax
+; X64-NEXT:    testw %di, %di
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false)
+  %2 = icmp eq i16 %x, 0
+  %3 = select i1 %2, i16 %y, i16 %1
+  ret i16 %3
+}
+
+define i16 @cmov_bsf16_undef(i16 %x, i16 %y) nounwind {
+; X86-LABEL: cmov_bsf16_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testw %ax, %ax
+; X86-NEXT:    je .LBB3_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB3_1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $ax killed $ax killed $eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf16_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    rep bsfl %edi, %eax
+; X64-NEXT:    testw %di, %di
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    retq
+  %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true)
+  %2 = icmp eq i16 %x, 0
+  %3 = select i1 %2, i16 %y, i16 %1
+  ret i16 %3
+}
+
+define i32 @cmov_bsf32(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsf32:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB4_5
+; X86-NEXT:  .LBB4_4:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB4_5: # %cond.end
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB4_1:
+; X86-NEXT:    movl $32, %eax
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    je .LBB4_4
+; X86-NEXT:    jmp .LBB4_5
+;
+; X64-LABEL: cmov_bsf32:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $32, %eax
+; X64-NEXT:    bsfl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+  %2 = icmp eq i32 %x, 0
+  %3 = select i1 %2, i32 %y, i32 %1
+  ret i32 %3
+}
+
+define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind {
+; X86-LABEL: cmov_bsf32_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB5_1
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB5_1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf32_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsfl %edi, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    retq
+  %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+  %2 = icmp eq i32 %x, 0
+  %3 = select i1 %2, i32 %y, i32 %1
+  ret i32 %3
+}
+
+define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsf64:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    je .LBB6_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %esi, %esi
+; X86-NEXT:    jne .LBB6_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_6
+; X86-NEXT:    jmp .LBB6_7
+; X86-NEXT:  .LBB6_1:
+; X86-NEXT:    movl $64, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    jne .LBB6_7
+; X86-NEXT:  .LBB6_6:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB6_7: # %cond.end
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB6_3:
+; X86-NEXT:    rep bsfl %esi, %eax
+; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    je .LBB6_6
+; X86-NEXT:    jmp .LBB6_7
+;
+; X64-LABEL: cmov_bsf64:
+; X64:       # %bb.0:
+; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    bsfq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false)
+  %2 = icmp eq i64 %x, 0
+  %3 = select i1 %2, i64 %y, i64 %1
+  ret i64 %3
+}
+
+define i64 @cmov_bsf64_undef(i64 %x, i64 %y) nounwind {
+; X86-LABEL: cmov_bsf64_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    orl %eax, %edx
+; X86-NEXT:    je .LBB7_5
+; X86-NEXT:  # %bb.1: # %select.false.sink
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB7_2
+; X86-NEXT:  # %bb.3: # %select.false.sink
+; X86-NEXT:    rep bsfl %eax, %eax
+; X86-NEXT:    addl $32, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_5: # %select.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+; X86-NEXT:  .LBB7_2:
+; X86-NEXT:    rep bsfl %ecx, %eax
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: cmov_bsf64_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    bsfq %rdi, %rax
+; X64-NEXT:    cmoveq %rsi, %rax
+; X64-NEXT:    retq
+  %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true)
+  %2 = icmp eq i64 %x, 0
+  %3 = select i1 %2, i64 %y, i64 %1
+  ret i64 %3
+}
+
+define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsf128:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %edx
+; X86-NEXT:    orl %ebp, %edx
+; X86-NEXT:    movl %ecx, %esi
+; X86-NEXT:    orl %eax, %esi
+; X86-NEXT:    orl %edx, %esi
+; X86-NEXT:    je .LBB8_1
+; X86-NEXT:  # %bb.2: # %cond.false
+; X86-NEXT:    testl %ecx, %ecx
+; X86-NEXT:    jne .LBB8_3
+; X86-NEXT:  # %bb.4: # %cond.false
+; X86-NEXT:    rep bsfl %edi, %esi
+; X86-NEXT:    addl $32, %esi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB8_7
+; X86-NEXT:  .LBB8_6:
+; X86-NEXT:    rep bsfl %eax, %edx
+; X86-NEXT:    jmp .LBB8_8
+; X86-NEXT:  .LBB8_1:
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    movl $128, %esi
+; X86-NEXT:    jmp .LBB8_11
+; X86-NEXT:  .LBB8_3:
+; X86-NEXT:    rep bsfl %ecx, %esi
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    jne .LBB8_6
+; X86-NEXT:  .LBB8_7: # %cond.false
+; X86-NEXT:    rep bsfl %ebp, %edx
+; X86-NEXT:    addl $32, %edx
+; X86-NEXT:  .LBB8_8: # %cond.false
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    movl %ecx, %ebx
+; X86-NEXT:    orl %edi, %ebx
+; X86-NEXT:    jne .LBB8_10
+; X86-NEXT:  # %bb.9: # %cond.false
+; X86-NEXT:    addl $64, %edx
+; X86-NEXT:    movl %edx, %esi
+; X86-NEXT:  .LBB8_10: # %cond.false
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:  .LBB8_11: # %cond.end
+; X86-NEXT:    xorl %ebx, %ebx
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    orl %eax, %edi
+; X86-NEXT:    orl %ecx, %edi
+; X86-NEXT:    jne .LBB8_13
+; X86-NEXT:  # %bb.12:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:  .LBB8_13: # %cond.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %edx, 12(%eax)
+; X86-NEXT:    movl %ebx, 8(%eax)
+; X86-NEXT:    movl %ebp, 4(%eax)
+; X86-NEXT:    movl %esi, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsf128:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    je .LBB8_2
+; X64-NEXT:  # %bb.1: # %select.false.sink
+; X64-NEXT:    rep bsfq %rdi, %rcx
+; X64-NEXT:    movl $64, %eax
+; X64-NEXT:    rep bsfq %rsi, %rax
+; X64-NEXT:    addq $64, %rax
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB8_2: # %select.end
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 false)
+  %2 = icmp eq i128 %x, 0
+  %3 = select i1 %2, i128 %y, i128 %1
+  ret i128 %3
+}
+
+define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind {
+; X86-LABEL: cmov_bsf128_undef:
+; X86:       # %bb.0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ecx, %edi
+; X86-NEXT:    orl %esi, %edi
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    orl %ebx, %ebp
+; X86-NEXT:    orl %edi, %ebp
+; X86-NEXT:    je .LBB9_11
+; X86-NEXT:  # %bb.1: # %select.false.sink
+; X86-NEXT:    testl %edx, %edx
+; X86-NEXT:    jne .LBB9_2
+; X86-NEXT:  # %bb.3: # %select.false.sink
+; X86-NEXT:    rep bsfl %ecx, %edi
+; X86-NEXT:    addl $32, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    je .LBB9_6
+; X86-NEXT:  .LBB9_5:
+; X86-NEXT:    rep bsfl %ebx, %esi
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    je .LBB9_8
+; X86-NEXT:    jmp .LBB9_9
+; X86-NEXT:  .LBB9_11: # %select.end
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl %esi, 4(%eax)
+; X86-NEXT:    movl %edx, 8(%eax)
+; X86-NEXT:    movl %ecx, 12(%eax)
+; X86-NEXT:    jmp .LBB9_10
+; X86-NEXT:  .LBB9_2:
+; X86-NEXT:    rep bsfl %edx, %edi
+; X86-NEXT:    testl %ebx, %ebx
+; X86-NEXT:    jne .LBB9_5
+; X86-NEXT:  .LBB9_6: # %select.false.sink
+; X86-NEXT:    rep bsfl %esi, %esi
+; X86-NEXT:    addl $32, %esi
+; X86-NEXT:    orl %ecx, %edx
+; X86-NEXT:    jne .LBB9_9
+; X86-NEXT:  .LBB9_8: # %select.false.sink
+; X86-NEXT:    addl $64, %esi
+; X86-NEXT:    movl %esi, %edi
+; X86-NEXT:  .LBB9_9: # %select.false.sink
+; X86-NEXT:    movl %edi, (%eax)
+; X86-NEXT:    movl $0, 12(%eax)
+; X86-NEXT:    movl $0, 8(%eax)
+; X86-NEXT:    movl $0, 4(%eax)
+; X86-NEXT:  .LBB9_10: # %select.false.sink
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: cmov_bsf128_undef:
+; X64:       # %bb.0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    je .LBB9_2
+; X64-NEXT:  # %bb.1: # %select.false.sink
+; X64-NEXT:    rep bsfq %rdi, %rcx
+; X64-NEXT:    rep bsfq %rsi, %rax
+; X64-NEXT:    addq $64, %rax
+; X64-NEXT:    testq %rdi, %rdi
+; X64-NEXT:    cmovneq %rcx, %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    retq
+; X64-NEXT:  .LBB9_2: # %select.end
+; X64-NEXT:    movq %rdx, %rax
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    retq
+  %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true)
+  %2 = icmp eq i128 %x, 0
+  %3 = select i1 %2, i128 %y, i128 %1
+  ret i128 %3
+}
+
+declare i8 @llvm.cttz.i8(i8, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i128 @llvm.cttz.i128(i128, i1)

>From a72bcda1434c72f9db6687565a361479e0dde572 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:24:10 +0100
Subject: [PATCH 50/61] [X86] Add test coverage for #143606

---
 .../X86/vector-shuffle-combining-avx512vl.ll  | 28 +++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
index 15c82f169c86e..d5aa7588925d8 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll
@@ -137,3 +137,31 @@ define void @PR142995(ptr %p0, ptr %p1, ptr %p2) nounwind #0 {
 }
 declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr captures(none), i32 immarg, <5 x i1>, <5 x i32>)
 declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr captures(none), i32 immarg, <64 x i1>, <64 x i32>)
+
+define <8 x double> @PR143606(ptr %px, ptr %py) {
+; X86-LABEL: PR143606:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    vmovapd (%ecx), %ymm0
+; X86-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
+; X86-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X86-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: PR143606:
+; X64:       # %bb.0:
+; X64-NEXT:    vmovapd (%rdi), %ymm0
+; X64-NEXT:    vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3]
+; X64-NEXT:    vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3]
+; X64-NEXT:    vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; X64-NEXT:    retq
+  %x = load <4 x double>, ptr %px, align 32
+  %y.lo = load <4 x double>, ptr %py, align 32
+  %py.hi = getelementptr inbounds nuw i8, ptr %py, i64 32
+  %y.hi = load <4 x double>, ptr %py.hi, align 32
+  %lo = shufflevector <4 x double> %x, <4 x double> %y.lo, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+  %hi = call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x, <4 x i64> <i64 1, i64 4, i64 2, i64 7>, <4 x double> %y.hi)
+  %res = shufflevector <4 x double> %lo, <4 x double> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %res
+}

>From e9bd1aee6537508970614fd79a4f076ba4ed93d0 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 11 Jun 2025 08:30:09 +0100
Subject: [PATCH 51/61] [X86] bmi-select-distrib.ll - remove unused check
 prefixes and pull out PR comments above tests. NFC

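The BMI-specific prefixes went unused because the +bmi and +bmi,+bmi2 RUN
lines produce identical output for every function in this file, so
update_llc_test_checks.py only ever emits the shared X86/X64 check lines and
appends the "unused prefixes" note removed at the end of this diff.
Illustrative RUN lines showing the effect (not the file's actual contents):

```
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi       | FileCheck %s --check-prefixes=X64,X64-BMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64,X64-BMI2
; If both commands print the same assembly for every function, the update
; script generates only X64: lines; X64-BMI:/X64-BMI2: never appear and are
; reported as unused.
```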
---
 llvm/test/CodeGen/X86/bmi-select-distrib.ll | 31 +++++++++------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
index 49beda516d508..e5696ded4fbf1 100644
--- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll
+++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86,X86-BMI
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86,X86-BMI2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64,X64-BMI2
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64
 
-define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
 ; PR131587
+define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil %eax, %ecx
@@ -25,8 +25,8 @@ define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
 ; PR131587
+define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil %eax, %ecx
@@ -46,8 +46,8 @@ define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
 ; PR131587
+define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsil %eax, %ecx
@@ -67,8 +67,8 @@ define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
 ; PR131587
+define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: and_select_neg_to_blsi_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
@@ -283,8 +283,8 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -304,8 +304,8 @@ define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr2:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -325,8 +325,8 @@ define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr3:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -346,8 +346,8 @@ define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
 ; PR133848
+define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr4:
 ; X86:       # %bb.0:
 ; X86-NEXT:    blsrl %eax, %ecx
@@ -392,8 +392,8 @@ define i32 @and_sub_1_select_orig(i1 %a0, i32 inreg %a1) nounwind {
   ret i32 %ret
 }
 
-define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
 ; PR133848
+define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind {
 ; X86-LABEL: and_select_sub_1_to_blsr_i64:
 ; X86:       # %bb.0:
 ; X86-NEXT:    pushl %esi
@@ -863,8 +863,3 @@ define i32 @xor_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2)
   %ret = xor i32 %a1, %bls
   ret i32 %ret
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; X64-BMI: {{.*}}
-; X64-BMI2: {{.*}}
-; X86-BMI: {{.*}}
-; X86-BMI2: {{.*}}

>From 13115276d0d12b0d9bf952abdc19f04866db16a8 Mon Sep 17 00:00:00 2001
From: David Green <david.green at arm.com>
Date: Wed, 11 Jun 2025 08:32:55 +0100
Subject: [PATCH 52/61] Revert "[AArch64][GlobalISel] Expand 64bit extracts to
 128bit to allow more patterns (#142904)"

This reverts commit 61cdba602abe67761ab2bbf12bf85710dfa963f4 due to verifier
issues.
---
 .../AArch64/GISel/AArch64RegisterBankInfo.cpp |  32 +--
 .../GlobalISel/regbank-extract-vector-elt.mir |   4 +-
 llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll  |   3 -
 llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll  |   3 -
 llvm/test/CodeGen/AArch64/abs.ll              |   1 -
 llvm/test/CodeGen/AArch64/arm64-neon-copy.ll  |  13 +-
 .../AArch64/arm64-neon-simd-ldst-one.ll       |  45 ++--
 llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll |  55 +++--
 llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll     |   1 -
 llvm/test/CodeGen/AArch64/bswap.ll            |   1 -
 llvm/test/CodeGen/AArch64/concat-vector.ll    |   7 +-
 llvm/test/CodeGen/AArch64/double_reduct.ll    |  18 +-
 llvm/test/CodeGen/AArch64/f16-instructions.ll |  12 +-
 llvm/test/CodeGen/AArch64/faddsub.ll          |   4 +-
 llvm/test/CodeGen/AArch64/fcopysign.ll        |   4 +-
 llvm/test/CodeGen/AArch64/fcvt.ll             |  14 +-
 llvm/test/CodeGen/AArch64/fdiv.ll             |   2 +-
 llvm/test/CodeGen/AArch64/fminimummaximum.ll  |   4 +-
 llvm/test/CodeGen/AArch64/fminmax.ll          |   4 +-
 llvm/test/CodeGen/AArch64/fmla.ll             |   6 +-
 llvm/test/CodeGen/AArch64/fmul.ll             |   2 +-
 .../test/CodeGen/AArch64/fptosi-sat-vector.ll |   1 -
 .../test/CodeGen/AArch64/fptoui-sat-vector.ll |   1 -
 llvm/test/CodeGen/AArch64/fptrunc.ll          |   4 +-
 llvm/test/CodeGen/AArch64/fsqrt.ll            |   2 +-
 llvm/test/CodeGen/AArch64/insertextract.ll    |  45 ++--
 llvm/test/CodeGen/AArch64/itofp.ll            |  20 +-
 llvm/test/CodeGen/AArch64/llvm.exp10.ll       |  33 ++-
 llvm/test/CodeGen/AArch64/popcount.ll         |   8 +-
 llvm/test/CodeGen/AArch64/ptradd.ll           |   1 -
 llvm/test/CodeGen/AArch64/shift.ll            |   6 -
 llvm/test/CodeGen/AArch64/store.ll            |  15 +-
 .../AArch64/vec-combine-compare-to-bitmask.ll | 228 +++++++++++++-----
 .../CodeGen/AArch64/vecreduce-fadd-strict.ll  |   7 +-
 .../vecreduce-fmax-legalization-nan.ll        |  26 +-
 .../AArch64/vecreduce-fmax-legalization.ll    |  26 +-
 .../CodeGen/AArch64/vecreduce-fmaximum.ll     |  26 +-
 .../AArch64/vecreduce-fmin-legalization.ll    |  26 +-
 .../CodeGen/AArch64/vecreduce-fminimum.ll     |  26 +-
 .../CodeGen/AArch64/vecreduce-fmul-strict.ll  |  29 ++-
 llvm/test/CodeGen/AArch64/vecreduce-fmul.ll   | 121 ++++++----
 .../AArch64/vecreduce-umax-legalization.ll    |  15 +-
 llvm/test/CodeGen/AArch64/vector-lrint.ll     |  25 +-
 43 files changed, 592 insertions(+), 334 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 53c7a00a7f9f0..31954e7954c03 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -399,26 +399,6 @@ void AArch64RegisterBankInfo::applyMappingImpl(
     MI.getOperand(1).setReg(ConstReg);
     return applyDefaultMapping(OpdMapper);
   }
-  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
-    // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
-    // number of duplicate lane-extract patterns needed. Do the same here so
-    // that selection will operate on the larger vectors.
-    Register Src = MI.getOperand(1).getReg();
-    LLT SrcTy = MRI.getType(Src);
-    assert(SrcTy.getSizeInBits() == 64 && "Expected 64-bit source vector");
-    LLT DstTy = SrcTy.multiplyElements(2);
-    Builder.setInsertPt(*MI.getParent(), MI.getIterator());
-    auto Undef = Builder.buildUndef(SrcTy);
-    auto Concat = Builder.buildConcatVectors(DstTy, {Src, Undef.getReg(0)});
-    MRI.setRegBank(Undef.getReg(0), getRegBank(AArch64::FPRRegBankID));
-    MRI.setRegBank(Concat.getReg(0), getRegBank(AArch64::FPRRegBankID));
-    for (MachineInstr &Ext :
-         make_early_inc_range(MRI.use_nodbg_instructions(Src))) {
-      if (Ext.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT)
-        Ext.getOperand(1).setReg(Concat.getReg(0));
-    }
-    return applyDefaultMapping(OpdMapper);
-  }
   default:
     llvm_unreachable("Don't know how to handle that operation");
   }
@@ -1034,20 +1014,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     }
     break;
   }
-  case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
+  case TargetOpcode::G_EXTRACT_VECTOR_ELT:
     // Destination and source need to be FPRs.
     OpRegBankIdx[0] = PMI_FirstFPR;
     OpRegBankIdx[1] = PMI_FirstFPR;
-    // Index needs to be a GPR constant.
+
+    // Index needs to be a GPR.
     OpRegBankIdx[2] = PMI_FirstGPR;
-    // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the
-    // number of duplicate lane-extract patterns needed. Do the same here so
-    // that selection will operate on the larger vectors.
-    LLT Ty = MRI.getType(MI.getOperand(1).getReg());
-    if (!Ty.isScalable() && Ty.getSizeInBits() == 64)
-      MappingID = CustomMappingID;
     break;
-  }
   case TargetOpcode::G_INSERT_VECTOR_ELT:
     OpRegBankIdx[0] = PMI_FirstFPR;
     OpRegBankIdx[1] = PMI_FirstFPR;
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
index 4e569e0bc7e5f..35bc36d472b1a 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir
@@ -94,9 +94,7 @@ body:             |
     ; CHECK-NEXT: {{  $}}
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(<4 x s16>) = COPY $d0
     ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1
-    ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr(<4 x s16>) = G_IMPLICIT_DEF
-    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:fpr(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>)
-    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<8 x s16>), [[C]](s64)
+    ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[C]](s64)
     ; CHECK-NEXT: $h0 = COPY [[EVEC]](s16)
     ; CHECK-NEXT: RET_ReallyLR implicit $h0
     %0:_(<4 x s16>) = COPY $d0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
index 287344bdbd29f..7f922c0047553 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll
@@ -70,9 +70,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ;
 ; CHECK-GI-LABEL: test_bitf_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
index 73fcee56506f9..b8eb8269d605c 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll
@@ -70,9 +70,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) {
 ;
 ; CHECK-GI-LABEL: test_bit_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
 ; CHECK-GI-NEXT:    fmov w8, s2
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    fmov w10, s0
diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll
index 470d68a805718..0f56d25a47b2a 100644
--- a/llvm/test/CodeGen/AArch64/abs.ll
+++ b/llvm/test/CodeGen/AArch64/abs.ll
@@ -243,7 +243,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){
 ;
 ; CHECK-GI-LABEL: abs_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s0
 ; CHECK-GI-NEXT:    cmp w8, #0
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index 60af49d867be7..367105f783817 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -1215,7 +1215,6 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: testDUP.v1i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    dup v0.8b, w8
 ; CHECK-GI-NEXT:    ret
@@ -1711,7 +1710,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI127_0
-; CHECK-GI-NEXT:    mov b1, v0.b[0]
+; CHECK-GI-NEXT:    mov v1.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    mov v1.b[1], v0.b[1]
 ; CHECK-GI-NEXT:    mov v1.b[2], v0.b[2]
 ; CHECK-GI-NEXT:    mov v1.b[3], v0.b[3]
@@ -1818,7 +1817,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 {
 ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov b2, v0.b[0]
+; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    mov v2.b[1], v0.b[1]
 ; CHECK-GI-NEXT:    mov v2.b[2], v0.b[2]
@@ -1904,7 +1903,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI131_0
-; CHECK-GI-NEXT:    mov h1, v0.h[0]
+; CHECK-GI-NEXT:    mov v1.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    mov v1.h[1], v0.h[1]
 ; CHECK-GI-NEXT:    mov v1.h[2], v0.h[2]
 ; CHECK-GI-NEXT:    mov v1.h[3], v0.h[3]
@@ -1975,7 +1974,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 {
 ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov h2, v0.h[0]
+; CHECK-GI-NEXT:    mov v2.h[0], v0.h[0]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    mov v2.h[1], v0.h[1]
 ; CHECK-GI-NEXT:    mov v2.h[2], v0.h[2]
@@ -2037,7 +2036,7 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 {
 ; CHECK-GI-NEXT:    mov v2.16b, v1.16b
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    adrp x8, .LCPI135_0
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    ldr q0, [x8, :lo12:.LCPI135_0]
 ; CHECK-GI-NEXT:    tbl v0.16b, { v1.16b, v2.16b }, v0.16b
@@ -2243,7 +2242,6 @@ define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: concat_vector_v8i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    dup v0.8b, w8
 ; CHECK-GI-NEXT:    ret
@@ -2270,7 +2268,6 @@ define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) {
 ;
 ; CHECK-GI-LABEL: concat_vector_v16i8:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    dup v0.16b, w8
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
index ac6f041ccd70d..f47c06e1ba4cb 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll
@@ -614,11 +614,16 @@ entry:
 }
 
 define void @test_vst1_lane0_s16(ptr %a, <4 x i16> %b) {
-; CHECK-LABEL: test_vst1_lane0_s16:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str h0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-GI-LABEL: test_vst1_lane0_s16:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str h0, [x0]
+; CHECK-GI-NEXT:    ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_s16:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str h0, [x0]
+; CHECK-SD-NEXT:    ret
 entry:
   %0 = extractelement <4 x i16> %b, i32 0
   store i16 %0, ptr %a, align 2
@@ -638,11 +643,16 @@ entry:
 }
 
 define void @test_vst1_lane0_s32(ptr %a, <2 x i32> %b) {
-; CHECK-LABEL: test_vst1_lane0_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-GI-LABEL: test_vst1_lane0_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
 entry:
   %0 = extractelement <2 x i32> %b, i32 0
   store i32 %0, ptr %a, align 4
@@ -673,11 +683,16 @@ entry:
 }
 
 define void @test_vst1_lane0_f32(ptr %a, <2 x float> %b) {
-; CHECK-LABEL: test_vst1_lane0_f32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-GI-LABEL: test_vst1_lane0_f32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
+;
+; CHECK-SD-LABEL: test_vst1_lane0_f32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
 entry:
   %0 = extractelement <2 x float> %b, i32 0
   store float %0, ptr %a, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
index 1f8ac792d75f5..cb14adc00df00 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -663,14 +663,24 @@ entry:
 }
 
 define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-LABEL: test_vqrdmlahs_lane_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    fmov s2, w1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlah s1, s2, v0.s[1]
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, w0
+; CHECK-SD-NEXT:    fmov s2, w1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    sqrdmlah s1, s2, v0.s[1]
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vqrdmlahs_lane_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    sqrdmlah s1, s2, s0
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    ret
 entry:
   %vget_lane = extractelement <2 x i32> %c, i64 1
   %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -803,14 +813,24 @@ entry:
 }
 
 define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) {
-; CHECK-LABEL: test_vqrdmlshs_lane_s32:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    fmov s1, w0
-; CHECK-NEXT:    fmov s2, w1
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    sqrdmlsh s1, s2, v0.s[1]
-; CHECK-NEXT:    fmov w0, s1
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    fmov s1, w0
+; CHECK-SD-NEXT:    fmov s2, w1
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    sqrdmlsh s1, s2, v0.s[1]
+; CHECK-SD-NEXT:    fmov w0, s1
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_vqrdmlshs_lane_s32:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    fmov s1, w0
+; CHECK-GI-NEXT:    fmov s2, w1
+; CHECK-GI-NEXT:    mov s0, v0.s[1]
+; CHECK-GI-NEXT:    sqrdmlsh s1, s2, s0
+; CHECK-GI-NEXT:    fmov w0, s1
+; CHECK-GI-NEXT:    ret
 entry:
   %vget_lane = extractelement <2 x i32> %c, i64 1
   %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4
@@ -847,6 +867,3 @@ entry:
   %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4
   ret i32 %vqrdmlshs_s32.i
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index eccf918f74312..d4cc154ac6afc 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -271,7 +271,6 @@ define half @test_vcvt_f16_f32(<1 x float> %x) {
 ;
 ; GISEL-LABEL: test_vcvt_f16_f32:
 ; GISEL:       // %bb.0:
-; GISEL-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; GISEL-NEXT:    fcvt h0, s0
 ; GISEL-NEXT:    ret
   %tmp = fptrunc <1 x float> %x to <1 x half>
diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll
index 9ae4782b52bd9..898958fb4993f 100644
--- a/llvm/test/CodeGen/AArch64/bswap.ll
+++ b/llvm/test/CodeGen/AArch64/bswap.ll
@@ -207,7 +207,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){
 ;
 ; CHECK-GI-LABEL: bswap_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    rev w8, w8
 ; CHECK-GI-NEXT:    fmov s0, w8
diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll
index 1e8dd0c78043a..acf15f1bd1178 100644
--- a/llvm/test/CodeGen/AArch64/concat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/concat-vector.ll
@@ -13,10 +13,11 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) {
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    mov w8, v0.s[1]
 ; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    mov w9, v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.h[1], w8
-; CHECK-GI-NEXT:    mov w8, v1.s[1]
-; CHECK-GI-NEXT:    mov v0.h[2], v1.h[0]
-; CHECK-GI-NEXT:    mov v0.h[3], w8
+; CHECK-GI-NEXT:    fmov w8, s1
+; CHECK-GI-NEXT:    mov v0.h[2], w8
+; CHECK-GI-NEXT:    mov v0.h[3], w9
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-GI-NEXT:    ret
    %v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll b/llvm/test/CodeGen/AArch64/double_reduct.ll
index 2d146bf9aae89..f30895db2c098 100644
--- a/llvm/test/CodeGen/AArch64/double_reduct.ll
+++ b/llvm/test/CodeGen/AArch64/double_reduct.ll
@@ -65,8 +65,10 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) {
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v2.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -90,8 +92,10 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -918,8 +922,10 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d)
 ; CHECK-GI-NEXT:    mov d5, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v4.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v5.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s4, v0.s[1]
+; CHECK-GI-NEXT:    mov s5, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s4
+; CHECK-GI-NEXT:    fmul s1, s1, s5
 ; CHECK-GI-NEXT:    fmul s0, s0, s2
 ; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index aa120f2643950..adc536da26f26 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -1496,7 +1496,7 @@ define half @test_copysign(half %a, half %b) #0 {
 ; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-CVT-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-CVT-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: test_copysign:
@@ -1505,7 +1505,7 @@ define half @test_copysign(half %a, half %b) #0 {
 ; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-FP16-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-FP16-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-FP16-GI-NEXT:    ret
   %r = call half @llvm.copysign.f16(half %a, half %b)
   ret half %r
@@ -1536,7 +1536,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 ; CHECK-CVT-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-CVT-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: test_copysign_f32:
@@ -1545,7 +1545,7 @@ define half @test_copysign_f32(half %a, float %b) #0 {
 ; CHECK-FP16-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-FP16-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-FP16-GI-NEXT:    ret
   %tb = fptrunc float %b to half
   %r = call half @llvm.copysign.f16(half %a, half %tb)
@@ -1577,7 +1577,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK-CVT-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-CVT-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-CVT-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-CVT-GI-NEXT:    ret
 ;
 ; CHECK-FP16-GI-LABEL: test_copysign_f64:
@@ -1586,7 +1586,7 @@ define half @test_copysign_f64(half %a, double %b) #0 {
 ; CHECK-FP16-GI-NEXT:    mvni v2.4h, #128, lsl #8
 ; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-FP16-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-FP16-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-FP16-GI-NEXT:    ret
   %tb = fptrunc double %b to half
   %r = call half @llvm.copysign.f16(half %a, half %tb)
diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll
index 943073e2a603e..b15579199a059 100644
--- a/llvm/test/CodeGen/AArch64/faddsub.ll
+++ b/llvm/test/CodeGen/AArch64/faddsub.ll
@@ -196,7 +196,7 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
@@ -537,7 +537,7 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll
index 7ac1f37af2e0b..3a5f7e2cd6b29 100644
--- a/llvm/test/CodeGen/AArch64/fcopysign.ll
+++ b/llvm/test/CodeGen/AArch64/fcopysign.ll
@@ -33,7 +33,7 @@ define float @copysign_f32(float %a, float %b) {
 ; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 def $d0
 ; CHECK-GI-NEXT:    // kill: def $s1 killed $s1 def $d1
 ; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call float @llvm.copysign.f32(float %a, float %b)
@@ -56,7 +56,7 @@ define half @copysign_f16(half %a, half %b) {
 ; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 def $d0
 ; CHECK-GI-NEXT:    // kill: def $h1 killed $h1 def $d1
 ; CHECK-GI-NEXT:    bif v0.8b, v1.8b, v2.8b
-; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
 ; CHECK-GI-NEXT:    ret
 entry:
   %c = call half @llvm.copysign.f16(half %a, half %b)
diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll
index 2c512de413aeb..b408e9c1bd4e6 100644
--- a/llvm/test/CodeGen/AArch64/fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt.ll
@@ -169,7 +169,7 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintp v2.4s, v2.4s
@@ -468,7 +468,7 @@ define <7 x half> @floor_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintm v2.4s, v2.4s
@@ -767,7 +767,7 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frinti v2.4s, v2.4s
@@ -1066,7 +1066,7 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintn v2.4s, v2.4s
@@ -1365,7 +1365,7 @@ define <7 x half> @rint_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintx v2.4s, v2.4s
@@ -1664,7 +1664,7 @@ define <7 x half> @round_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frinta v2.4s, v2.4s
@@ -1963,7 +1963,7 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[1], v0.h[5]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v2.h[2], v0.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    frintz v2.4s, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll
index d232ca4d9c131..5bdccccc62b99 100644
--- a/llvm/test/CodeGen/AArch64/fdiv.ll
+++ b/llvm/test/CodeGen/AArch64/fdiv.ll
@@ -199,7 +199,7 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    fdiv v1.4s, v0.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v2.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
index 1c7c55d12a864..fb12f8acf1745 100644
--- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll
+++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll
index da9b57223cff7..64f0da8b4cd0f 100644
--- a/llvm/test/CodeGen/AArch64/fminmax.ll
+++ b/llvm/test/CodeGen/AArch64/fminmax.ll
@@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
@@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-NOFP16-GI-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-NOFP16-GI-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-NOFP16-GI-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-NOFP16-GI-NEXT:    mov h0, v2.h[0]
+; CHECK-NOFP16-GI-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-NOFP16-GI-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-NOFP16-GI-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll
index ef59209b69921..a37aabb0b5384 100644
--- a/llvm/test/CodeGen/AArch64/fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fmla.ll
@@ -268,7 +268,7 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    mov v6.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v2.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v5.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v5.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v6.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v4.4h
@@ -873,7 +873,7 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
@@ -1358,7 +1358,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v3.4h, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v5.h[2], v2.h[6]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v3.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v3.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v2.4s, v5.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v1.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v3.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll
index 51eba5666f681..bd3d1353e643e 100644
--- a/llvm/test/CodeGen/AArch64/fmul.ll
+++ b/llvm/test/CodeGen/AArch64/fmul.ll
@@ -196,7 +196,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
 ; CHECK-GI-NOFP16-NEXT:    mov v4.h[2], v0.h[6]
 ; CHECK-GI-NOFP16-NEXT:    mov v3.h[2], v1.h[6]
-; CHECK-GI-NOFP16-NEXT:    mov h0, v2.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v2.h[0]
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v1.4s, v4.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v3.4s, v3.4h
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v2.h[1]
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index bcebbf4982eaa..9c21d2bf083a2 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,7 +31,6 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
 ;
 ; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fcvtzs w8, s0
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index 38895eb7bd761..44847a41287d6 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,7 +31,6 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
 ;
 ; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-GI-NEXT:    fcvtzu w8, s0
 ; CHECK-GI-NEXT:    fmov s0, w8
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll
index a428c95c90387..1f84c944d7c16 100644
--- a/llvm/test/CodeGen/AArch64/fptrunc.ll
+++ b/llvm/test/CodeGen/AArch64/fptrunc.ll
@@ -263,7 +263,7 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) {
 ; CHECK-GI-NEXT:    fcvt s2, d2
 ; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov s0, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
@@ -354,7 +354,7 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) {
 ; CHECK-GI-LABEL: fptrunc_v2f32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll
index 1e888a4c0e193..6c5fd8e52b017 100644
--- a/llvm/test/CodeGen/AArch64/fsqrt.ll
+++ b/llvm/test/CodeGen/AArch64/fsqrt.ll
@@ -203,7 +203,7 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v2.4h
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v1.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    fsqrt v2.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT:    mov h0, v1.h[0]
+; CHECK-GI-NOFP16-NEXT:    mov v0.h[0], v1.h[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[1], v1.h[1]
 ; CHECK-GI-NOFP16-NEXT:    mov v0.h[2], v1.h[2]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v2.4h, v2.4s
diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll
index 1af36ccaefa30..5c89316e5f570 100644
--- a/llvm/test/CodeGen/AArch64/insertextract.ll
+++ b/llvm/test/CodeGen/AArch64/insertextract.ll
@@ -1478,11 +1478,16 @@ entry:
 }
 
 define float @extract_v2f32_0(<2 x float> %a, i32 %c) {
-; CHECK-LABEL: extract_v2f32_0:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extract_v2f32_0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2f32_0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-GI-NEXT:    ret
 entry:
   %d = extractelement <2 x float> %a, i32 0
   ret float %d
@@ -1681,11 +1686,16 @@ entry:
 }
 
 define half @extract_v4f16_0(<4 x half> %a, i32 %c) {
-; CHECK-LABEL: extract_v4f16_0:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $h0 killed $h0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extract_v4f16_0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $h0 killed $h0 killed $q0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v4f16_0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    // kill: def $h0 killed $h0 killed $d0
+; CHECK-GI-NEXT:    ret
 entry:
   %d = extractelement <4 x half> %a, i32 0
   ret half %d
@@ -2149,11 +2159,16 @@ entry:
 }
 
 define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) {
-; CHECK-LABEL: extract_v2i32_0:
-; CHECK:       // %bb.0: // %entry
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: extract_v2i32_0:
+; CHECK-SD:       // %bb.0: // %entry
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: extract_v2i32_0:
+; CHECK-GI:       // %bb.0: // %entry
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
 entry:
   %d = extractelement <2 x i32> %a, i32 0
   ret i32 %d
diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll
index 5ec30b6e8a667..e8194b9bd9b27 100644
--- a/llvm/test/CodeGen/AArch64/itofp.ll
+++ b/llvm/test/CodeGen/AArch64/itofp.ll
@@ -4378,7 +4378,7 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
 ; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov s0, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
@@ -4415,7 +4415,7 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) {
 ; CHECK-GI-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-GI-NEXT:    fcvtn v2.2s, v2.2d
 ; CHECK-GI-NEXT:    fcvtn v1.2s, v0.2d
-; CHECK-GI-NEXT:    mov s0, v1.s[0]
+; CHECK-GI-NEXT:    mov v0.s[0], v1.s[0]
 ; CHECK-GI-NEXT:    mov v0.s[1], v1.s[1]
 ; CHECK-GI-NEXT:    mov v0.s[2], v2.s[0]
 ; CHECK-GI-NEXT:    ret
@@ -6393,7 +6393,7 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -6439,7 +6439,7 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) {
 ; CHECK-GI-NOFP16:       // %bb.0: // %entry
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2d, v0.2d
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.2s, v0.2d
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -7375,7 +7375,7 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: stofp_v2i32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    ret
@@ -7395,7 +7395,7 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) {
 ; CHECK-GI-LABEL: utofp_v2i32_v2f16:
 ; CHECK-GI:       // %bb.0: // %entry
 ; CHECK-GI-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NEXT:    ret
@@ -7602,7 +7602,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    shl v0.2s, v0.2s, #16
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.2s, v0.2s, #16
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -7637,7 +7637,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) {
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0x00ffff0000ffff
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -8124,7 +8124,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    shl v0.2s, v0.2s, #24
 ; CHECK-GI-NOFP16-NEXT:    sshr v0.2s, v0.2s, #24
 ; CHECK-GI-NOFP16-NEXT:    scvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
@@ -8175,7 +8175,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) {
 ; CHECK-GI-NOFP16-NEXT:    movi d1, #0x0000ff000000ff
 ; CHECK-GI-NOFP16-NEXT:    and v0.8b, v0.8b, v1.8b
 ; CHECK-GI-NOFP16-NEXT:    ucvtf v0.2s, v0.2s
-; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[0]
+; CHECK-GI-NOFP16-NEXT:    mov v1.s[0], v0.s[0]
 ; CHECK-GI-NOFP16-NEXT:    mov v1.s[1], v0.s[1]
 ; CHECK-GI-NOFP16-NEXT:    fcvtn v0.4h, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
index 9d165556f1c73..c1ea891bc86e7 100644
--- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll
@@ -301,17 +301,28 @@ define float @exp10_f32(float %x) {
 }
 
 define <1 x float> @exp10_v1f32(<1 x float> %x) {
-; CHECK-LABEL: exp10_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    .cfi_offset w30, -16
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    bl exp10f
-; CHECK-NEXT:    // kill: def $s0 killed $s0 def $d0
-; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
-; CHECK-NEXT:    ret
+; SDAG-LABEL: exp10_v1f32:
+; SDAG:       // %bb.0:
+; SDAG-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; SDAG-NEXT:    .cfi_def_cfa_offset 16
+; SDAG-NEXT:    .cfi_offset w30, -16
+; SDAG-NEXT:    // kill: def $d0 killed $d0 def $q0
+; SDAG-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; SDAG-NEXT:    bl exp10f
+; SDAG-NEXT:    // kill: def $s0 killed $s0 def $d0
+; SDAG-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; SDAG-NEXT:    ret
+;
+; GISEL-LABEL: exp10_v1f32:
+; GISEL:       // %bb.0:
+; GISEL-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; GISEL-NEXT:    .cfi_def_cfa_offset 16
+; GISEL-NEXT:    .cfi_offset w30, -16
+; GISEL-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; GISEL-NEXT:    bl exp10f
+; GISEL-NEXT:    // kill: def $s0 killed $s0 def $d0
+; GISEL-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; GISEL-NEXT:    ret
   %r = call <1 x float> @llvm.exp10.v1f32(<1 x float> %x)
   ret <1 x float> %r
 }
diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll
index eded13a6b3669..c158d8ad93b05 100644
--- a/llvm/test/CodeGen/AArch64/popcount.ll
+++ b/llvm/test/CodeGen/AArch64/popcount.ll
@@ -655,9 +655,7 @@ define i32 @ctpop_into_extract(ptr %p) {
 ; CHECKO0-NEXT:    // implicit-def: $d2
 ; CHECKO0-NEXT:    fmov s2, w8
 ; CHECKO0-NEXT:    ldr d0, [x0]
-; CHECKO0-NEXT:    // implicit-def: $q1
-; CHECKO0-NEXT:    fmov d1, d0
-; CHECKO0-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; CHECKO0-NEXT:    fmov s1, s0
 ; CHECKO0-NEXT:    fmov w8, s1
 ; CHECKO0-NEXT:    fmov s1, w8
 ; CHECKO0-NEXT:    // kill: def $d1 killed $s1
@@ -727,9 +725,7 @@ define i32 @ctpop_into_extract(ptr %p) {
 ; GISELO0-NEXT:    // implicit-def: $d2
 ; GISELO0-NEXT:    fmov s2, w8
 ; GISELO0-NEXT:    ldr d0, [x0]
-; GISELO0-NEXT:    // implicit-def: $q1
-; GISELO0-NEXT:    fmov d1, d0
-; GISELO0-NEXT:    // kill: def $s1 killed $s1 killed $q1
+; GISELO0-NEXT:    fmov s1, s0
 ; GISELO0-NEXT:    fmov w8, s1
 ; GISELO0-NEXT:    fmov s1, w8
 ; GISELO0-NEXT:    // kill: def $d1 killed $s1
diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll
index 4a1c50b67ed7b..28a8f4303765b 100644
--- a/llvm/test/CodeGen/AArch64/ptradd.ll
+++ b/llvm/test/CodeGen/AArch64/ptradd.ll
@@ -51,7 +51,6 @@ define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) {
 ;
 ; CHECK-GI-LABEL: vector_gep_v1i32:
 ; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s1
 ; CHECK-GI-NEXT:    fmov x9, d0
 ; CHECK-GI-NEXT:    add x8, x9, w8, sxtw
diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll
index 1652eb70b0625..9827cb3526f99 100644
--- a/llvm/test/CodeGen/AArch64/shift.ll
+++ b/llvm/test/CodeGen/AArch64/shift.ll
@@ -595,8 +595,6 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){
 ;
 ; CHECK-GI-LABEL: shl_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsl w8, w8, w9
@@ -773,8 +771,6 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ;
 ; CHECK-GI-LABEL: ashr_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    asr w8, w8, w9
@@ -947,8 +943,6 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){
 ;
 ; CHECK-GI-LABEL: lshr_v1i32:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
 ; CHECK-GI-NEXT:    fmov w8, s0
 ; CHECK-GI-NEXT:    fmov w9, s1
 ; CHECK-GI-NEXT:    lsr w8, w8, w9
diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll
index 0fe1ef5039929..3a9f12b838702 100644
--- a/llvm/test/CodeGen/AArch64/store.ll
+++ b/llvm/test/CodeGen/AArch64/store.ll
@@ -167,11 +167,16 @@ define void @store_v16i16(<16 x i16> %a, ptr %ptr){
 }
 
 define void @store_v1i32(<1 x i32> %a, ptr %ptr){
-; CHECK-LABEL: store_v1i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    str s0, [x0]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: store_v1i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    str s0, [x0]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: store_v1i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    str s0, [x0]
+; CHECK-GI-NEXT:    ret
     store <1 x i32> %a, ptr %ptr
     ret void
 }
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
index 63e26a25f4e27..77483ebb2235c 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll
@@ -930,85 +930,195 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind {
 ; CHECK-GI-LABEL: vector_to_vector_cast:
 ; CHECK-GI:       ; %bb.0:
 ; CHECK-GI-NEXT:    sub sp, sp, #16
-; CHECK-GI-NEXT:    umov.b w10, v0[1]
-; CHECK-GI-NEXT:    umov.b w9, v0[1]
+; CHECK-GI-NEXT:    umov.b w8, v0[1]
 ; CHECK-GI-NEXT:    mov d1, v0[1]
-; CHECK-GI-NEXT:    umov.b w8, v0[0]
-; CHECK-GI-NEXT:    umov.b w11, v0[0]
-; CHECK-GI-NEXT:    umov.b w12, v0[2]
-; CHECK-GI-NEXT:    umov.b w13, v0[2]
+; CHECK-GI-NEXT:    umov.b w10, v0[1]
+; CHECK-GI-NEXT:    umov.b w9, v0[0]
+; CHECK-GI-NEXT:    umov.b w13, v0[0]
+; CHECK-GI-NEXT:    umov.b w14, v0[2]
 ; CHECK-GI-NEXT:    umov.b w15, v0[3]
+; CHECK-GI-NEXT:    umov.b w11, v0[2]
 ; CHECK-GI-NEXT:    umov.b w16, v0[4]
-; CHECK-GI-NEXT:    umov.b w14, v0[3]
+; CHECK-GI-NEXT:    umov.b w17, v0[5]
+; CHECK-GI-NEXT:    umov.b w12, v0[3]
+; CHECK-GI-NEXT:    and w8, w8, #0x1
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    and w9, w9, #0x1
-; CHECK-GI-NEXT:    bfi w8, w10, #1, #31
-; CHECK-GI-NEXT:    umov.b w10, v1[1]
-; CHECK-GI-NEXT:    and w12, w12, #0x1
-; CHECK-GI-NEXT:    bfi w11, w9, #1, #31
-; CHECK-GI-NEXT:    umov.b w9, v1[0]
-; CHECK-GI-NEXT:    and w13, w13, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w12, lsl #2
-; CHECK-GI-NEXT:    umov.b w12, v1[2]
+; CHECK-GI-NEXT:    umov.b w0, v1[1]
+; CHECK-GI-NEXT:    bfi w9, w8, #1, #31
+; CHECK-GI-NEXT:    bfi w13, w10, #1, #31
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w8, v1[0]
+; CHECK-GI-NEXT:    umov.b w10, v1[2]
 ; CHECK-GI-NEXT:    and w15, w15, #0x1
-; CHECK-GI-NEXT:    orr w11, w11, w13, lsl #2
-; CHECK-GI-NEXT:    umov.b w13, v0[5]
+; CHECK-GI-NEXT:    orr w13, w13, w14, lsl #2
+; CHECK-GI-NEXT:    umov.b w14, v1[3]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    and w0, w0, #0x1
 ; CHECK-GI-NEXT:    and w16, w16, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w15, lsl #3
-; CHECK-GI-NEXT:    umov.b w15, v1[3]
+; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #2
+; CHECK-GI-NEXT:    orr w13, w13, w15, lsl #3
+; CHECK-GI-NEXT:    umov.b w15, v1[4]
+; CHECK-GI-NEXT:    umov.b w11, v0[6]
+; CHECK-GI-NEXT:    bfi w8, w0, #1, #31
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    bfi w9, w10, #1, #31
-; CHECK-GI-NEXT:    umov.b w10, v0[6]
+; CHECK-GI-NEXT:    and w17, w17, #0x1
+; CHECK-GI-NEXT:    orr w13, w13, w16, lsl #4
 ; CHECK-GI-NEXT:    and w14, w14, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w16, lsl #4
-; CHECK-GI-NEXT:    umov.b w16, v1[4]
+; CHECK-GI-NEXT:    umov.b w0, v0[7]
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #2
+; CHECK-GI-NEXT:    umov.b w10, v1[5]
+; CHECK-GI-NEXT:    umov.b w16, v1[6]
+; CHECK-GI-NEXT:    orr w13, w13, w17, lsl #5
+; CHECK-GI-NEXT:    umov.b w17, v0[4]
+; CHECK-GI-NEXT:    and w15, w15, #0x1
+; CHECK-GI-NEXT:    orr w8, w8, w14, lsl #3
 ; CHECK-GI-NEXT:    and w12, w12, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #2
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v1[7]
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #3
+; CHECK-GI-NEXT:    orr w11, w13, w11, lsl #6
+; CHECK-GI-NEXT:    orr w8, w8, w15, lsl #4
+; CHECK-GI-NEXT:    umov.b w15, v0[5]
+; CHECK-GI-NEXT:    and w10, w10, #0x1
+; CHECK-GI-NEXT:    and w0, w0, #0x1
+; CHECK-GI-NEXT:    and w12, w17, #0x1
+; CHECK-GI-NEXT:    umov.b w13, v0[1]
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #5
+; CHECK-GI-NEXT:    and w16, w16, #0x1
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #4
+; CHECK-GI-NEXT:    umov.b w10, v0[0]
+; CHECK-GI-NEXT:    orr w11, w11, w0, lsl #7
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    and w12, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w15, v0[2]
+; CHECK-GI-NEXT:    orr w8, w8, w16, lsl #6
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #5
+; CHECK-GI-NEXT:    umov.b w12, v0[6]
+; CHECK-GI-NEXT:    strb w11, [sp, #8]
+; CHECK-GI-NEXT:    and w11, w13, #0x1
+; CHECK-GI-NEXT:    umov.b w13, v0[3]
+; CHECK-GI-NEXT:    orr w8, w8, w14, lsl #7
+; CHECK-GI-NEXT:    umov.b w14, v0[7]
+; CHECK-GI-NEXT:    ldr b0, [sp, #8]
+; CHECK-GI-NEXT:    bfi w10, w11, #1, #31
+; CHECK-GI-NEXT:    and w11, w15, #0x1
+; CHECK-GI-NEXT:    strb w8, [sp, #9]
+; CHECK-GI-NEXT:    umov.b w15, v0[4]
+; CHECK-GI-NEXT:    and w8, w12, #0x1
+; CHECK-GI-NEXT:    orr w10, w10, w11, lsl #2
+; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #6
+; CHECK-GI-NEXT:    and w9, w13, #0x1
+; CHECK-GI-NEXT:    umov.b w11, v0[1]
+; CHECK-GI-NEXT:    orr w9, w10, w9, lsl #3
+; CHECK-GI-NEXT:    umov.b w10, v0[5]
+; CHECK-GI-NEXT:    umov.b w12, v0[0]
+; CHECK-GI-NEXT:    and w13, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w16, v0[2]
+; CHECK-GI-NEXT:    umov.b w17, v0[3]
+; CHECK-GI-NEXT:    and w14, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w15, v0[2]
+; CHECK-GI-NEXT:    orr w8, w8, w13, lsl #7
+; CHECK-GI-NEXT:    orr w9, w9, w14, lsl #4
+; CHECK-GI-NEXT:    umov.b w13, v0[6]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[3]
+; CHECK-GI-NEXT:    strb w8, [sp, #10]
+; CHECK-GI-NEXT:    and w8, w10, #0x1
+; CHECK-GI-NEXT:    bfi w12, w11, #1, #31
+; CHECK-GI-NEXT:    orr w8, w9, w8, lsl #5
+; CHECK-GI-NEXT:    umov.b w10, v0[4]
+; CHECK-GI-NEXT:    and w9, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w11, v0[7]
+; CHECK-GI-NEXT:    umov.b w15, v0[1]
+; CHECK-GI-NEXT:    orr w9, w12, w9, lsl #2
+; CHECK-GI-NEXT:    umov.b w12, v0[5]
 ; CHECK-GI-NEXT:    and w13, w13, #0x1
-; CHECK-GI-NEXT:    umov.b w12, v0[4]
-; CHECK-GI-NEXT:    orr w8, w8, w13, lsl #5
-; CHECK-GI-NEXT:    umov.b w13, v1[5]
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    orr w8, w8, w13, lsl #6
+; CHECK-GI-NEXT:    umov.b w13, v0[0]
+; CHECK-GI-NEXT:    orr w9, w9, w14, lsl #3
+; CHECK-GI-NEXT:    and w10, w10, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[6]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    and w15, w15, #0x1
+; CHECK-GI-NEXT:    umov.b w0, v0[3]
+; CHECK-GI-NEXT:    orr w9, w9, w10, lsl #4
+; CHECK-GI-NEXT:    and w10, w12, #0x1
+; CHECK-GI-NEXT:    umov.b w12, v0[7]
+; CHECK-GI-NEXT:    orr w8, w8, w11, lsl #7
+; CHECK-GI-NEXT:    bfi w13, w15, #1, #31
+; CHECK-GI-NEXT:    and w11, w16, #0x1
+; CHECK-GI-NEXT:    orr w9, w9, w10, lsl #5
+; CHECK-GI-NEXT:    and w10, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[4]
+; CHECK-GI-NEXT:    strb w8, [sp, #11]
+; CHECK-GI-NEXT:    umov.b w15, v0[1]
+; CHECK-GI-NEXT:    umov.b w16, v0[3]
+; CHECK-GI-NEXT:    orr w8, w9, w10, lsl #6
+; CHECK-GI-NEXT:    orr w9, w13, w11, lsl #2
+; CHECK-GI-NEXT:    and w10, w12, #0x1
+; CHECK-GI-NEXT:    and w11, w17, #0x1
+; CHECK-GI-NEXT:    umov.b w12, v0[5]
+; CHECK-GI-NEXT:    umov.b w17, v0[0]
+; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #7
+; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #3
+; CHECK-GI-NEXT:    umov.b w10, v0[1]
+; CHECK-GI-NEXT:    and w11, w14, #0x1
+; CHECK-GI-NEXT:    umov.b w14, v0[0]
 ; CHECK-GI-NEXT:    and w15, w15, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w15, lsl #3
+; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #4
+; CHECK-GI-NEXT:    umov.b w11, v0[2]
+; CHECK-GI-NEXT:    umov.b w13, v0[6]
+; CHECK-GI-NEXT:    and w12, w12, #0x1
+; CHECK-GI-NEXT:    bfi w17, w15, #1, #31
+; CHECK-GI-NEXT:    umov.b w15, v0[5]
+; CHECK-GI-NEXT:    orr w9, w9, w12, lsl #5
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    umov.b w15, v0[7]
-; CHECK-GI-NEXT:    orr w8, w8, w10, lsl #6
-; CHECK-GI-NEXT:    umov.b w10, v1[6]
-; CHECK-GI-NEXT:    and w16, w16, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w16, lsl #4
-; CHECK-GI-NEXT:    umov.b w16, v0[5]
-; CHECK-GI-NEXT:    orr w11, w11, w14, lsl #3
+; CHECK-GI-NEXT:    umov.b w12, v0[2]
+; CHECK-GI-NEXT:    bfi w14, w10, #1, #31
+; CHECK-GI-NEXT:    umov.b w10, v0[4]
+; CHECK-GI-NEXT:    ldr b1, [sp, #9]
+; CHECK-GI-NEXT:    and w11, w11, #0x1
 ; CHECK-GI-NEXT:    and w13, w13, #0x1
-; CHECK-GI-NEXT:    umov.b w14, v1[7]
+; CHECK-GI-NEXT:    strb w8, [sp, #12]
+; CHECK-GI-NEXT:    orr w11, w14, w11, lsl #2
+; CHECK-GI-NEXT:    and w14, w16, #0x1
+; CHECK-GI-NEXT:    umov.b w16, v0[4]
 ; CHECK-GI-NEXT:    and w12, w12, #0x1
-; CHECK-GI-NEXT:    orr w9, w9, w13, lsl #5
-; CHECK-GI-NEXT:    umov.b w13, v0[6]
-; CHECK-GI-NEXT:    orr w11, w11, w12, lsl #4
+; CHECK-GI-NEXT:    and w15, w15, #0x1
+; CHECK-GI-NEXT:    orr w9, w9, w13, lsl #6
+; CHECK-GI-NEXT:    orr w11, w11, w14, lsl #3
+; CHECK-GI-NEXT:    orr w12, w17, w12, lsl #2
 ; CHECK-GI-NEXT:    and w10, w10, #0x1
-; CHECK-GI-NEXT:    and w12, w15, #0x1
+; CHECK-GI-NEXT:    and w17, w0, #0x1
+; CHECK-GI-NEXT:    umov.b w0, v0[5]
+; CHECK-GI-NEXT:    umov.b w14, v0[6]
+; CHECK-GI-NEXT:    orr w10, w11, w10, lsl #4
+; CHECK-GI-NEXT:    orr w12, w12, w17, lsl #3
+; CHECK-GI-NEXT:    umov.b w11, v0[7]
+; CHECK-GI-NEXT:    and w16, w16, #0x1
+; CHECK-GI-NEXT:    umov.b w17, v0[6]
+; CHECK-GI-NEXT:    orr w10, w10, w15, lsl #5
 ; CHECK-GI-NEXT:    umov.b w15, v0[7]
-; CHECK-GI-NEXT:    orr w9, w9, w10, lsl #6
-; CHECK-GI-NEXT:    and w10, w16, #0x1
-; CHECK-GI-NEXT:    orr w8, w8, w12, lsl #7
-; CHECK-GI-NEXT:    orr w10, w11, w10, lsl #5
-; CHECK-GI-NEXT:    and w11, w14, #0x1
+; CHECK-GI-NEXT:    orr w12, w12, w16, lsl #4
+; CHECK-GI-NEXT:    and w16, w0, #0x1
+; CHECK-GI-NEXT:    umov.b w0, v0[7]
+; CHECK-GI-NEXT:    and w14, w14, #0x1
+; CHECK-GI-NEXT:    orr w12, w12, w16, lsl #5
+; CHECK-GI-NEXT:    orr w10, w10, w14, lsl #6
+; CHECK-GI-NEXT:    and w11, w11, #0x1
+; CHECK-GI-NEXT:    and w13, w17, #0x1
 ; CHECK-GI-NEXT:    orr w9, w9, w11, lsl #7
-; CHECK-GI-NEXT:    and w11, w13, #0x1
-; CHECK-GI-NEXT:    strb w8, [sp, #8]
-; CHECK-GI-NEXT:    orr w8, w10, w11, lsl #6
-; CHECK-GI-NEXT:    ldr b0, [sp, #8]
-; CHECK-GI-NEXT:    strb w9, [sp, #9]
-; CHECK-GI-NEXT:    and w9, w15, #0x1
-; CHECK-GI-NEXT:    ldr b1, [sp, #9]
-; CHECK-GI-NEXT:    orr w8, w8, w9, lsl #7
 ; CHECK-GI-NEXT:    mov.s v0[1], v1[0]
-; CHECK-GI-NEXT:    strb w8, [sp, #10]
-; CHECK-GI-NEXT:    strb w8, [sp, #11]
+; CHECK-GI-NEXT:    orr w11, w12, w13, lsl #6
+; CHECK-GI-NEXT:    and w12, w15, #0x1
 ; CHECK-GI-NEXT:    ; kill: def $d0 killed $d0 killed $q0
-; CHECK-GI-NEXT:    strb w8, [sp, #12]
-; CHECK-GI-NEXT:    strb w8, [sp, #13]
+; CHECK-GI-NEXT:    orr w8, w10, w12, lsl #7
+; CHECK-GI-NEXT:    and w10, w0, #0x1
+; CHECK-GI-NEXT:    strb w9, [sp, #13]
+; CHECK-GI-NEXT:    orr w9, w11, w10, lsl #7
 ; CHECK-GI-NEXT:    strb w8, [sp, #14]
-; CHECK-GI-NEXT:    strb w8, [sp, #15]
+; CHECK-GI-NEXT:    strb w9, [sp, #15]
 ; CHECK-GI-NEXT:    add sp, sp, #16
 ; CHECK-GI-NEXT:    ret
   %bc = bitcast <16 x i1> %arg to <2 x i8>
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index bd68b213ec988..1164e02a16c9e 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -79,10 +79,11 @@ define half @add_HalfH(<4 x half> %bin.rdx)  {
 ; CHECK-GI-FP16-LABEL: add_HalfH:
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    faddp h2, v0.2h
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    fadd h1, h0, h1
 ; CHECK-GI-FP16-NEXT:    mov h0, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fadd h1, h2, h1
+; CHECK-GI-FP16-NEXT:    fadd h1, h1, h2
 ; CHECK-GI-FP16-NEXT:    fadd h0, h1, h0
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx)
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
index 1906ca9defa40..1d295a30a994b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
index 152eb66ebcdfe..ee2af110c84cd 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
index a1b7118d8080d..be61f9b521795 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll
@@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
index d5f999add22c2..300081dc3ec40 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll
@@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
index 719cac8f33028..e735f670ced0c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll
@@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind {
 }
 
 define float @test_v1f32(<1 x float> %a) nounwind {
-; CHECK-LABEL: test_v1f32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT:    ret
+; CHECK-NOFP-SD-LABEL: test_v1f32:
+; CHECK-NOFP-SD:       // %bb.0:
+; CHECK-NOFP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NOFP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-NOFP-SD-NEXT:    ret
+;
+; CHECK-FP-SD-LABEL: test_v1f32:
+; CHECK-FP-SD:       // %bb.0:
+; CHECK-FP-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-FP-SD-NEXT:    // kill: def $s0 killed $s0 killed $q0
+; CHECK-FP-SD-NEXT:    ret
+;
+; CHECK-NOFP-GI-LABEL: test_v1f32:
+; CHECK-NOFP-GI:       // %bb.0:
+; CHECK-NOFP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-NOFP-GI-NEXT:    ret
+;
+; CHECK-FP-GI-LABEL: test_v1f32:
+; CHECK-FP-GI:       // %bb.0:
+; CHECK-FP-GI-NEXT:    // kill: def $s0 killed $s0 killed $d0
+; CHECK-FP-GI-NEXT:    ret
   %b = call float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a)
   ret float %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index e22a5a4af4fae..e1b21705c95f3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -5,11 +5,18 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define float @mul_HalfS(<2 x float> %bin.rdx)  {
-; CHECK-LABEL: mul_HalfS:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_HalfS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_HalfS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
+; CHECK-GI-NEXT:    ret
   %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
   ret float %r
 }
@@ -72,9 +79,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx)  {
 ; CHECK-GI-FP16-LABEL: mul_HalfH:
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    fmul h1, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h1, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    fmul h1, h0, h1
+; CHECK-GI-FP16-NEXT:    mov h0, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h1, h1, h2
+; CHECK-GI-FP16-NEXT:    fmul h0, h1, h0
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
   ret half %r
@@ -465,6 +475,3 @@ declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
 declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>)
 declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>)
 declare double @llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-GI: {{.*}}
-; CHECK-SD: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
index 5fd705b07ca3b..2429cf4b4597a 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll
@@ -5,11 +5,18 @@
 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16
 
 define float @mul_HalfS(<2 x float> %bin.rdx)  {
-; CHECK-LABEL: mul_HalfS:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: mul_HalfS:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: mul_HalfS:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
+; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx)
   ret float %r
 }
@@ -44,17 +51,20 @@ define half @mul_HalfH(<4 x half> %bin.rdx)  {
 ; CHECK-GI-NOFP16-NEXT:    fcvtl v0.4s, v0.4h
 ; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
 ; CHECK-GI-FP16-LABEL: mul_HalfH:
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx)
   ret half %r
@@ -105,7 +115,8 @@ define half @mul_H(<8 x half> %bin.rdx)  {
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v1.4s, v0.4s
 ; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -113,10 +124,12 @@ define half @mul_H(<8 x half> %bin.rdx)  {
 ; CHECK-GI-FP16:       // %bb.0:
 ; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx)
   ret half %r
@@ -134,7 +147,8 @@ define float @mul_S(<4 x float> %bin.rdx)  {
 ; CHECK-GI:       // %bb.0:
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx)
   ret float %r
@@ -206,7 +220,8 @@ define half @mul_2H(<16 x half> %bin.rdx)  {
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NOFP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    ret
 ;
@@ -215,10 +230,12 @@ define half @mul_2H(<16 x half> %bin.rdx)  {
 ; CHECK-GI-FP16-NEXT:    fmul v0.8h, v0.8h, v1.8h
 ; CHECK-GI-FP16-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v1.4h
-; CHECK-GI-FP16-NEXT:    mov h1, v0.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h2, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h1, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, h0
+; CHECK-GI-FP16-NEXT:    mov h1, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
+; CHECK-GI-FP16-NEXT:    fmul h1, h2, h3
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx)
   ret half %r
@@ -238,7 +255,8 @@ define float @mul_2S(<8 x float> %bin.rdx)  {
 ; CHECK-GI-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx)
   ret float %r
@@ -271,8 +289,9 @@ define float @mul_S_init_42(<4 x float> %bin.rdx)  {
 ; CHECK-GI-NEXT:    mov d1, v0.d[1]
 ; CHECK-GI-NEXT:    mov w8, #1109917696 // =0x42280000
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v1.2s
+; CHECK-GI-NEXT:    mov s1, v0.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    fmov s1, w8
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx)
@@ -338,8 +357,10 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-GI-NOFP16-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NOFP16-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NOFP16-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NOFP16-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NOFP16-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NOFP16-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NOFP16-NEXT:    fcvt h0, s0
 ; CHECK-GI-NOFP16-NEXT:    fcvt h1, s1
 ; CHECK-GI-NOFP16-NEXT:    fcvt s0, h0
@@ -354,14 +375,18 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) {
 ; CHECK-GI-FP16-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-FP16-NEXT:    fmul v0.4h, v0.4h, v2.4h
 ; CHECK-GI-FP16-NEXT:    fmul v1.4h, v1.4h, v3.4h
-; CHECK-GI-FP16-NEXT:    mov h2, v0.h[3]
-; CHECK-GI-FP16-NEXT:    mov h3, v1.h[3]
-; CHECK-GI-FP16-NEXT:    fmul h4, h0, v0.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h0, h2, v0.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h2, h1, v1.h[1]
-; CHECK-GI-FP16-NEXT:    fmul h1, h3, v1.h[2]
-; CHECK-GI-FP16-NEXT:    fmul h0, h4, h0
-; CHECK-GI-FP16-NEXT:    fmul h1, h2, h1
+; CHECK-GI-FP16-NEXT:    mov h2, v0.h[1]
+; CHECK-GI-FP16-NEXT:    mov h3, v0.h[2]
+; CHECK-GI-FP16-NEXT:    mov h4, v0.h[3]
+; CHECK-GI-FP16-NEXT:    mov h5, v1.h[1]
+; CHECK-GI-FP16-NEXT:    mov h6, v1.h[2]
+; CHECK-GI-FP16-NEXT:    mov h7, v1.h[3]
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
+; CHECK-GI-FP16-NEXT:    fmul h2, h3, h4
+; CHECK-GI-FP16-NEXT:    fmul h1, h1, h5
+; CHECK-GI-FP16-NEXT:    fmul h3, h6, h7
+; CHECK-GI-FP16-NEXT:    fmul h0, h0, h2
+; CHECK-GI-FP16-NEXT:    fmul h1, h1, h3
 ; CHECK-GI-FP16-NEXT:    fmul h0, h0, h1
 ; CHECK-GI-FP16-NEXT:    ret
   %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a)
@@ -389,8 +414,10 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a)
@@ -414,8 +441,10 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -442,10 +471,12 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
 ; CHECK-GI-NEXT:    mov d3, v2.d[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s4, v1.s[1]
 ; CHECK-GI-NEXT:    fmul v2.2s, v2.2s, v3.2s
+; CHECK-GI-NEXT:    fmul s1, s1, s4
+; CHECK-GI-NEXT:    mov s3, v2.s[1]
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
-; CHECK-GI-NEXT:    fmul s1, s2, v2.s[1]
+; CHECK-GI-NEXT:    fmul s1, s2, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a)
@@ -471,8 +502,10 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) {
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s0, s0, s1
 ; CHECK-GI-NEXT:    ret
   %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
@@ -523,8 +556,10 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, <4 x float> %b)
 ; CHECK-GI-NEXT:    mov d3, v1.d[1]
 ; CHECK-GI-NEXT:    fmul v0.2s, v0.2s, v2.2s
 ; CHECK-GI-NEXT:    fmul v1.2s, v1.2s, v3.2s
-; CHECK-GI-NEXT:    fmul s0, s0, v0.s[1]
-; CHECK-GI-NEXT:    fmul s1, s1, v1.s[1]
+; CHECK-GI-NEXT:    mov s2, v0.s[1]
+; CHECK-GI-NEXT:    mov s3, v1.s[1]
+; CHECK-GI-NEXT:    fmul s0, s0, s2
+; CHECK-GI-NEXT:    fmul s1, s1, s3
 ; CHECK-GI-NEXT:    fmul s1, s0, s1
 ; CHECK-GI-NEXT:    fmul s0, s1, s0
 ; CHECK-GI-NEXT:    ret
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
index d5c040e09945b..0806f7da5c89c 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll
@@ -57,11 +57,16 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind {
 }
 
 define i32 @test_v1i32(<1 x i32> %a) nounwind {
-; CHECK-LABEL: test_v1i32:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    fmov w0, s0
-; CHECK-NEXT:    ret
+; CHECK-SD-LABEL: test_v1i32:
+; CHECK-SD:       // %bb.0:
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    fmov w0, s0
+; CHECK-SD-NEXT:    ret
+;
+; CHECK-GI-LABEL: test_v1i32:
+; CHECK-GI:       // %bb.0:
+; CHECK-GI-NEXT:    fmov w0, s0
+; CHECK-GI-NEXT:    ret
   %b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a)
   ret i32 %b
 }
diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll
index 53456c4c81ccc..602643264e7be 100644
--- a/llvm/test/CodeGen/AArch64/vector-lrint.ll
+++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll
@@ -755,13 +755,20 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) {
 ; CHECK-i32-NEXT:    fcvtzs v0.2s, v0.2s
 ; CHECK-i32-NEXT:    ret
 ;
-; CHECK-i64-LABEL: lrint_v1f32:
-; CHECK-i64:       // %bb.0:
-; CHECK-i64-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-i64-NEXT:    frintx s0, s0
-; CHECK-i64-NEXT:    fcvtzs x8, s0
-; CHECK-i64-NEXT:    fmov d0, x8
-; CHECK-i64-NEXT:    ret
+; CHECK-i64-SD-LABEL: lrint_v1f32:
+; CHECK-i64-SD:       // %bb.0:
+; CHECK-i64-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-i64-SD-NEXT:    frintx s0, s0
+; CHECK-i64-SD-NEXT:    fcvtzs x8, s0
+; CHECK-i64-SD-NEXT:    fmov d0, x8
+; CHECK-i64-SD-NEXT:    ret
+;
+; CHECK-i64-GI-LABEL: lrint_v1f32:
+; CHECK-i64-GI:       // %bb.0:
+; CHECK-i64-GI-NEXT:    frintx s0, s0
+; CHECK-i64-GI-NEXT:    fcvtzs x8, s0
+; CHECK-i64-GI-NEXT:    fmov d0, x8
+; CHECK-i64-GI-NEXT:    ret
   %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x)
   ret <1 x iXLen> %a
 }
@@ -1328,7 +1335,3 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) {
   ret <32 x iXLen> %a
 }
 declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>)
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-i32-GI: {{.*}}
-; CHECK-i64-GI: {{.*}}
-; CHECK-i64-SD: {{.*}}

>From 14c11e4bcb262496981a2948af11a3f9e9de23ef Mon Sep 17 00:00:00 2001
From: Adrian Vogelsgesang <avogelsgesang at salesforce.com>
Date: Wed, 11 Jun 2025 09:39:31 +0200
Subject: [PATCH 53/61] [coro][NFC] Move switch basic block to beginning of
 coroutine (#143626)

This makes the control flow easier to follow when reading the LLVM IR of
a split coroutine. It does not change anything from an end-user
perspective, but it makes debugging the CoroSplit pass slightly easier.
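
As a hand-written sketch of the resulting layout (the block names and
the frame offset of the resume index are made up for illustration, not
CoroSplit's actual output), the switch block now sits directly after
the entry block:

define void @f.resume(ptr %hdl) {
entry:
  ; assumed frame layout: resume index stored at offset 16
  %index.addr = getelementptr inbounds i8, ptr %hdl, i64 16
  %index = load i2, ptr %index.addr, align 1
  br label %resume.entry

resume.entry:                       ; moved to follow the entry block
  switch i2 %index, label %suspend [
    i2 0, label %await.ready
    i2 1, label %cleanup
  ]

await.ready:
  br label %suspend

cleanup:
  br label %suspend

suspend:
  ret void
}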
---
 llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index f9a6c70fedc2d..cebe44581b061 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -703,6 +703,7 @@ void coro::BaseCloner::replaceEntryBlock() {
     auto *SwitchBB =
         cast<BasicBlock>(VMap[Shape.SwitchLowering.ResumeEntryBlock]);
     Builder.CreateBr(SwitchBB);
+    SwitchBB->moveAfter(Entry);
     break;
   }
   case coro::ABI::Async:

>From 24d730b3808a562507f3f1f5fc125acf4b6e03aa Mon Sep 17 00:00:00 2001
From: Iris Shi <0.0 at owo.li>
Date: Wed, 11 Jun 2025 15:56:37 +0800
Subject: [PATCH 54/61] Reland "[SelectionDAG] Make `(a & x) | (~a & y) -> (a &
 (x ^ y)) ^ y` available for all targets" (#143651)
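
The fold relies on the bitwise identity
(m & x) | (~m & y) == ((x ^ y) & m) ^ y: wherever a mask bit is set the
result takes the bit from x, otherwise from y. A minimal hand-written
sketch of the rewrite (shown as IR for readability; the combine itself
runs on the SelectionDAG, and these functions are not taken from the
tests below):

; Before: masked merge written as an OR of two ANDs, which needs an
; and-not of the mask.
define i32 @masked_merge(i32 %m, i32 %x, i32 %y) {
  %notm = xor i32 %m, -1
  %mx = and i32 %m, %x
  %nmy = and i32 %notm, %y
  %r = or i32 %mx, %nmy
  ret i32 %r
}

; After the fold: one AND and two XORs, no and-not required.
define i32 @masked_merge_folded(i32 %m, i32 %x, i32 %y) {
  %xy = xor i32 %x, %y
  %and = and i32 %xy, %m
  %r = xor i32 %and, %y
  ret i32 %r
}

Targets that do have a fused and-not (e.g. NC(G)RK/VNC on SystemZ, per
the new hasAndNot hook below) opt out and keep the original form.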

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  57 ++
 .../Target/SystemZ/SystemZISelLowering.cpp    |  14 +
 llvm/lib/Target/SystemZ/SystemZISelLowering.h |   1 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  58 --
 llvm/test/CodeGen/AMDGPU/bfi_int.ll           |  30 +-
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |  42 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 161 +++--
 .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll |  42 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  42 +-
 ...unfold-masked-merge-vector-variablemask.ll | 167 +++--
 llvm/test/CodeGen/RISCV/fold-masked-merge.ll  | 302 +++++++++
 ...unfold-masked-merge-scalar-variablemask.ll |  62 +-
 .../test/CodeGen/SystemZ/fold-masked-merge.ll | 277 ++++++++
 llvm/test/CodeGen/WebAssembly/simd-arith.ll   | 600 +++++++-----------
 llvm/test/CodeGen/X86/bitselect.ll            |  50 +-
 llvm/test/CodeGen/X86/fold-masked-merge.ll    |  54 +-
 ...unfold-masked-merge-scalar-variablemask.ll |  26 +-
 ...unfold-masked-merge-vector-variablemask.ll | 598 +++++++++--------
 18 files changed, 1524 insertions(+), 1059 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/fold-masked-merge.ll
 create mode 100644 llvm/test/CodeGen/SystemZ/fold-masked-merge.ll

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b65e8e06eae62..e79a17e86bc87 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8128,6 +8128,59 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1,
   return SDValue();
 }
 
+static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1,
+                                   SDValue AndR1, const SDLoc &DL,
+                                   SelectionDAG &DAG) {
+  if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse())
+    return SDValue();
+  SDValue NotOp = AndL0->getOperand(0);
+  if (NotOp == AndR1)
+    std::swap(AndR1, AndL1);
+  if (NotOp != AndL1)
+    return SDValue();
+
+  EVT VT = AndL1.getValueType();
+  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0);
+  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
+  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0);
+  return Xor1;
+}
+
+/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
+/// equivalent `((x ^ y) & m) ^ y` pattern.
+/// This is typically a better representation for targets without a fused
+/// "and-not" operation.
+static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG,
+                               const TargetLowering &TLI, const SDLoc &DL) {
+  // Note that masked-merge variants using XOR or ADD expressions are
+  // normalized to OR by InstCombine so we only check for OR.
+  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
+  SDValue N0 = Node->getOperand(0);
+  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
+    return SDValue();
+  SDValue N1 = Node->getOperand(1);
+  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
+    return SDValue();
+
+  // If the target supports and-not, don't fold this.
+  if (TLI.hasAndNot(SDValue(Node, 0)))
+    return SDValue();
+
+  SDValue N00 = N0->getOperand(0);
+  SDValue N01 = N0->getOperand(1);
+  SDValue N10 = N1->getOperand(0);
+  SDValue N11 = N1->getOperand(1);
+  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
+    return Result;
+  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
+    return Result;
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -8306,6 +8359,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG))
       return R;
 
+  if (VT.isScalarInteger() && VT != MVT::i1)
+    if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL))
+      return R;
+
   return SDValue();
 }
 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index f06246706aaa9..1c59b1e63b7bc 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1283,6 +1283,20 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses(
   return true;
 }
 
+bool SystemZTargetLowering::hasAndNot(SDValue Y) const {
+  EVT VT = Y.getValueType();
+
+  // We can use NC(G)RK for types in GPRs ...
+  if (VT == MVT::i32 || VT == MVT::i64)
+    return Subtarget.hasMiscellaneousExtensions3();
+
+  // ... or VNC for types in VRs.
+  if (VT.isVector() || VT == MVT::i128)
+    return Subtarget.hasVector();
+
+  return false;
+}
+
 // Information about the addressing mode for a memory access.
 struct AddressingMode {
   // True if a long displacement is supported.
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index f3536a840fda8..f2f0bf6d8b410 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -671,6 +671,7 @@ class SystemZTargetLowering : public TargetLowering {
   }
 
   unsigned getStackProbeSize(const MachineFunction &MF) const;
+  bool hasAndNot(SDValue Y) const override;
 
 private:
   const SystemZSubtarget &Subtarget;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8bcd8670879a9..96714adf78e43 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -52350,59 +52350,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
   return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
 }
 
-static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R,
-                                   SDValue And1_L, SDValue And1_R,
-                                   const SDLoc &DL, SelectionDAG &DAG) {
-  if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse())
-    return SDValue();
-  SDValue NotOp = And0_L->getOperand(0);
-  if (NotOp == And1_R)
-    std::swap(And1_R, And1_L);
-  if (NotOp != And1_L)
-    return SDValue();
-
-  // (~(NotOp) & And0_R) | (NotOp & And1_R)
-  // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R
-  EVT VT = And1_L->getValueType(0);
-  SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R);
-  SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R);
-  SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp);
-  SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R);
-  return Xor1;
-}
-
-/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the
-/// equivalent `((x ^ y) & m) ^ y)` pattern.
-/// This is typically a better representation for  targets without a fused
-/// "and-not" operation. This function is intended to be called from a
-/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes.
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) {
-  // Note that masked-merge variants using XOR or ADD expressions are
-  // normalized to OR by InstCombine so we only check for OR.
-  assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node");
-  SDValue N0 = Node->getOperand(0);
-  if (N0->getOpcode() != ISD::AND || !N0->hasOneUse())
-    return SDValue();
-  SDValue N1 = Node->getOperand(1);
-  if (N1->getOpcode() != ISD::AND || !N1->hasOneUse())
-    return SDValue();
-
-  SDLoc DL(Node);
-  SDValue N00 = N0->getOperand(0);
-  SDValue N01 = N0->getOperand(1);
-  SDValue N10 = N1->getOperand(0);
-  SDValue N11 = N1->getOperand(1);
-  if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG))
-    return Result;
-  if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG))
-    return Result;
-  return SDValue();
-}
-
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
@@ -52806,11 +52753,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // We should fold "masked merge" patterns when `andn` is not available.
-  if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1)
-    if (SDValue R = foldMaskedMerge(N, DAG))
-      return R;
-
   if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG))
     return R;
 
diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
index 201b97d479c68..b372dec383344 100644
--- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll
@@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX7-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX7-NEXT:    s_and_b32 s0, s1, s0
-; GFX7-NEXT:    s_or_b32 s0, s2, s0
+; GFX7-NEXT:    s_xor_b32 s0, s0, s2
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX7-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX7-NEXT:    s_endpgm
@@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX8-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX8-NEXT:    s_and_b32 s0, s1, s0
-; GFX8-NEXT:    s_or_b32 s0, s2, s0
+; GFX8-NEXT:    s_xor_b32 s0, s0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s0
@@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y,
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_andn2_b32 s2, s2, s0
+; GFX10-NEXT:    s_xor_b32 s1, s1, s2
 ; GFX10-NEXT:    s_and_b32 s0, s1, s0
-; GFX10-NEXT:    s_or_b32 s0, s2, s0
+; GFX10-NEXT:    s_xor_b32 s0, s0, s2
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX10-NEXT:    global_store_dword v0, v1, s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX7-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX7-NEXT:    s_mov_b32 s6, -1
 ; GFX7-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX7-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX7-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX7-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX7-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX7-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX7-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX7-NEXT:    s_add_u32 s0, s0, 10
 ; GFX7-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v0, s0
@@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX8-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX8-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX8-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX8-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 10
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
@@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
-; GFX10-NEXT:    s_andn2_b64 s[0:1], s[4:5], s[0:1]
-; GFX10-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX10-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[4:5]
 ; GFX10-NEXT:    s_add_u32 s0, s0, 10
 ; GFX10-NEXT:    s_addc_u32 s1, s1, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 6925a98f643b9..e1b4cad370f96 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -289,16 +289,16 @@ entry:
 define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) {
 ; GCN-LABEL: half4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x3c003c00
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_xor_b32 s4, s2, 0x3c003c00
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_andn2_b32 s2, s2, s3
-; GCN-NEXT:    s_and_b32 s3, s3, 0x3c003c00
-; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s3, s4, s3
+; GCN-NEXT:    s_xor_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec,
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_lshl_b32 s3, s3, 4
+; GCN-NEXT:    s_xor_b32 s4, s2, 0x10001
 ; GCN-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GCN-NEXT:    s_andn2_b32 s2, s2, s3
-; GCN-NEXT:    s_and_b32 s3, s3, 0x10001
-; GCN-NEXT:    s_or_b32 s2, s3, s2
+; GCN-NEXT:    s_and_b32 s3, s4, s3
+; GCN-NEXT:    s_xor_b32 s2, s3, s2
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
@@ -417,16 +417,16 @@ entry:
 define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) {
 ; GCN-LABEL: short4_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_mov_b32 s4, 0x10001
 ; GCN-NEXT:    s_mov_b32 s5, s4
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_xor_b64 s[4:5], s[2:3], s[4:5]
 ; GCN-NEXT:    s_lshl_b32 s6, s6, 4
 ; GCN-NEXT:    s_lshl_b64 s[6:7], 0xffff, s6
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
-; GCN-NEXT:    s_and_b64 s[4:5], s[6:7], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
@@ -442,15 +442,15 @@ entry:
 define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) {
 ; GCN-LABEL: byte8_inselt:
 ; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GCN-NEXT:    s_load_dword s6, s[4:5], 0x34
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s4, s6, 3
-; GCN-NEXT:    s_lshl_b64 s[4:5], 0xff, s4
-; GCN-NEXT:    s_and_b32 s7, s5, 0x1010101
-; GCN-NEXT:    s_and_b32 s6, s4, 0x1010101
-; GCN-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
-; GCN-NEXT:    s_or_b64 s[2:3], s[6:7], s[2:3]
+; GCN-NEXT:    s_xor_b32 s5, s3, 0x1010101
+; GCN-NEXT:    s_lshl_b32 s6, s6, 3
+; GCN-NEXT:    s_xor_b32 s4, s2, 0x1010101
+; GCN-NEXT:    s_lshl_b64 s[6:7], 0xff, s6
+; GCN-NEXT:    s_and_b64 s[4:5], s[4:5], s[6:7]
+; GCN-NEXT:    s_xor_b64 s[2:3], s[4:5], s[2:3]
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index be16fac4c53f7..44bd4090436ef 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s0, s3, 4
-; SI-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_andn2_b32 s1, s2, s0
-; SI-NEXT:    s_and_b32 s0, s0, 0x50005
-; SI-NEXT:    s_or_b32 s0, s0, s1
+; SI-NEXT:    s_lshl_b32 s1, s3, 4
+; SI-NEXT:    s_mov_b32 s4, s0
+; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
+; SI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; SI-NEXT:    s_and_b32 s0, s0, s1
+; SI-NEXT:    s_xor_b32 s0, s0, s2
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s3, 4
-; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_andn2_b32 s1, s2, s0
-; VI-NEXT:    s_and_b32 s0, s0, 0x50005
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s3, 4
+; VI-NEXT:    s_mov_b32 s4, s0
+; VI-NEXT:    s_xor_b32 s0, s2, 0x50005
+; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; VI-NEXT:    s_and_b32 s0, s0, s1
+; VI-NEXT:    s_xor_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
@@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s0, s8, 4
+; SI-NEXT:    s_lshl_b32 s8, s8, 4
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
-; SI-NEXT:    s_and_b32 s9, s1, 0x50005
-; SI-NEXT:    s_and_b32 s8, s0, 0x50005
-; SI-NEXT:    s_andn2_b64 s[0:1], s[2:3], s[0:1]
-; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[0:1]
+; SI-NEXT:    s_xor_b32 s1, s3, 0x50005
+; SI-NEXT:    s_xor_b32 s0, s2, 0x50005
+; SI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 ; VI-NEXT:    s_mov_b32 s6, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s8, 4
-; VI-NEXT:    s_mov_b32 s8, 0x50005
+; VI-NEXT:    s_mov_b32 s0, 0x50005
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b64 s[0:1], 0xffff, s0
-; VI-NEXT:    s_mov_b32 s9, s8
-; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
+; VI-NEXT:    s_mov_b32 s1, s0
+; VI-NEXT:    s_lshl_b32 s8, s8, 4
+; VI-NEXT:    s_xor_b64 s[0:1], s[2:3], s[0:1]
+; VI-NEXT:    s_lshl_b64 s[8:9], 0xffff, s8
 ; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
-; VI-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s1
 ; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
@@ -1594,35 +1594,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3
 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v2i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
+; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; SI-NEXT:    s_andn2_b32 s5, s5, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x505
-; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_xor_b32 s6, s4, 0x505
+; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; SI-NEXT:    s_and_b32 s5, s6, s5
+; SI-NEXT:    s_xor_b32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v2i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; VI-NEXT:    s_and_b32 s6, s4, 0x505
-; VI-NEXT:    s_xor_b32 s4, s4, 0xffff
-; VI-NEXT:    s_and_b32 s4, s4, s5
-; VI-NEXT:    s_or_b32 s4, s6, s4
+; VI-NEXT:    s_xor_b32 s6, s4, 0x505
+; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; VI-NEXT:    s_and_b32 s5, s6, s5
+; VI-NEXT:    s_xor_b32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1636,17 +1635,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v3i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
+; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; SI-NEXT:    s_andn2_b32 s5, s5, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; SI-NEXT:    s_and_b32 s5, s6, s5
+; SI-NEXT:    s_xor_b32 s4, s5, s4
 ; SI-NEXT:    s_lshr_b32 s5, s4, 16
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1656,17 +1655,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 ;
 ; VI-LABEL: dynamic_insertelement_v3i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; VI-NEXT:    s_andn2_b32 s5, s5, s4
-; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; VI-NEXT:    s_and_b32 s5, s6, s5
+; VI-NEXT:    s_xor_b32 s4, s5, s4
 ; VI-NEXT:    s_lshr_b32 s5, s4, 16
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -1681,34 +1680,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8
 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
 ; SI-LABEL: dynamic_insertelement_v4i8:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_load_dword s4, s[8:9], 0x13
+; SI-NEXT:    s_load_dword s4, s[8:9], 0xa
+; SI-NEXT:    s_load_dword s5, s[8:9], 0x13
 ; SI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s5, s[8:9], 0xa
 ; SI-NEXT:    s_mov_b32 s3, 0x100f000
 ; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_lshl_b32 s4, s4, 3
-; SI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; SI-NEXT:    s_andn2_b32 s5, s5, s4
-; SI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; SI-NEXT:    s_or_b32 s4, s4, s5
+; SI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; SI-NEXT:    s_lshl_b32 s5, s5, 3
+; SI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; SI-NEXT:    s_and_b32 s5, s6, s5
+; SI-NEXT:    s_xor_b32 s4, s5, s4
 ; SI-NEXT:    v_mov_b32_e32 v0, s4
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
 ; VI-LABEL: dynamic_insertelement_v4i8:
 ; VI:       ; %bb.0:
-; VI-NEXT:    s_load_dword s4, s[8:9], 0x4c
+; VI-NEXT:    s_load_dword s4, s[8:9], 0x28
+; VI-NEXT:    s_load_dword s5, s[8:9], 0x4c
 ; VI-NEXT:    s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s5, s[8:9], 0x28
 ; VI-NEXT:    s_mov_b32 s3, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s2, -1
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s4, s4, 3
-; VI-NEXT:    s_lshl_b32 s4, 0xff, s4
-; VI-NEXT:    s_andn2_b32 s5, s5, s4
-; VI-NEXT:    s_and_b32 s4, s4, 0x5050505
-; VI-NEXT:    s_or_b32 s4, s4, s5
+; VI-NEXT:    s_xor_b32 s6, s4, 0x5050505
+; VI-NEXT:    s_lshl_b32 s5, s5, 3
+; VI-NEXT:    s_lshl_b32 s5, 0xff, s5
+; VI-NEXT:    s_and_b32 s5, s6, s5
+; VI-NEXT:    s_xor_b32 s4, s5, s4
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
@@ -1721,20 +1720,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; SI-LABEL: s_dynamic_insertelement_v8i8:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_mov_b32 s7, 0x100f000
 ; SI-NEXT:    s_mov_b32 s6, -1
+; SI-NEXT:    s_load_dword s8, s[8:9], 0x4
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; SI-NEXT:    s_mov_b32 s4, s0
-; SI-NEXT:    s_lshl_b32 s0, s8, 3
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
-; SI-NEXT:    s_and_b32 s9, s1, 0x5050505
+; SI-NEXT:    s_lshl_b32 s8, s8, 3
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; SI-NEXT:    s_and_b32 s8, s0, 0x5050505
-; SI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
+; SI-NEXT:    s_xor_b32 s1, s3, 0x5050505
+; SI-NEXT:    s_xor_b32 s0, s2, 0x5050505
+; SI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
+; SI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, s0
 ; SI-NEXT:    v_mov_b32_e32 v1, s1
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1743,20 +1742,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p
 ; VI-LABEL: s_dynamic_insertelement_v8i8:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_mov_b32 s7, 0x1100f000
 ; VI-NEXT:    s_mov_b32 s6, -1
+; VI-NEXT:    s_load_dword s8, s[8:9], 0x10
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; VI-NEXT:    s_mov_b32 s4, s0
-; VI-NEXT:    s_lshl_b32 s0, s8, 3
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    s_lshl_b64 s[0:1], 0xff, s0
-; VI-NEXT:    s_and_b32 s9, s1, 0x5050505
+; VI-NEXT:    s_lshl_b32 s8, s8, 3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
-; VI-NEXT:    s_and_b32 s8, s0, 0x5050505
-; VI-NEXT:    s_or_b64 s[0:1], s[8:9], s[2:3]
+; VI-NEXT:    s_xor_b32 s1, s3, 0x5050505
+; VI-NEXT:    s_xor_b32 s0, s2, 0x5050505
+; VI-NEXT:    s_lshl_b64 s[8:9], 0xff, s8
+; VI-NEXT:    s_and_b64 s[0:1], s[0:1], s[8:9]
+; VI-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index e0dacb7a59a42..a0ad6328b0c01 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX9-NEXT:    s_load_dword s6, s[4:5], 0x0
 ; GFX9-NEXT:    s_load_dword s7, s[2:3], 0x0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b32 s2, s6, 4
-; GFX9-NEXT:    s_lshl_b32 s2, 0xffff, s2
-; GFX9-NEXT:    s_andn2_b32 s3, s7, s2
-; GFX9-NEXT:    s_and_b32 s2, s2, 0x3e703e7
-; GFX9-NEXT:    s_or_b32 s2, s2, s3
+; GFX9-NEXT:    s_lshl_b32 s3, s6, 4
+; GFX9-NEXT:    s_xor_b32 s2, s7, 0x3e703e7
+; GFX9-NEXT:    s_lshl_b32 s3, 0xffff, s3
+; GFX9-NEXT:    s_and_b32 s2, s2, s3
+; GFX9-NEXT:    s_xor_b32 s2, s2, s7
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; VI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    v_mov_b32_e32 v1, s1
+; VI-NEXT:    v_mov_b32_e32 v0, s0
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-NEXT:    s_lshl_b32 s0, s4, 4
-; VI-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; VI-NEXT:    s_andn2_b32 s1, s2, s0
-; VI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
-; VI-NEXT:    s_or_b32 s0, s0, s1
+; VI-NEXT:    s_lshl_b32 s1, s4, 4
+; VI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
+; VI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; VI-NEXT:    s_and_b32 s0, s0, s1
+; VI-NEXT:    s_xor_b32 s0, s0, s2
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
@@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    s_load_dword s4, s[4:5], 0x0
 ; CI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    v_mov_b32_e32 v1, s1
+; CI-NEXT:    v_mov_b32_e32 v0, s0
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
-; CI-NEXT:    s_lshl_b32 s0, s4, 4
-; CI-NEXT:    s_lshl_b32 s0, 0xffff, s0
-; CI-NEXT:    s_andn2_b32 s1, s2, s0
-; CI-NEXT:    s_and_b32 s0, s0, 0x3e703e7
-; CI-NEXT:    s_or_b32 s0, s0, s1
+; CI-NEXT:    s_lshl_b32 s1, s4, 4
+; CI-NEXT:    s_xor_b32 s0, s2, 0x3e703e7
+; CI-NEXT:    s_lshl_b32 s1, 0xffff, s1
+; CI-NEXT:    s_and_b32 s0, s0, s1
+; CI-NEXT:    s_xor_b32 s0, s0, s2
 ; CI-NEXT:    v_mov_b32_e32 v2, s0
 ; CI-NEXT:    flat_store_dword v[0:1], v2
 ; CI-NEXT:    s_endpgm
@@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
 ; GFX11-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_lshl_b32 s3, s4, 4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_xor_b32 s4, s2, 0x3e703e7
 ; GFX11-NEXT:    s_lshl_b32 s3, 0xffff, s3
-; GFX11-NEXT:    s_and_not1_b32 s2, s2, s3
-; GFX11-NEXT:    s_and_b32 s3, s3, 0x3e703e7
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_and_b32 s3, s4, s3
+; GFX11-NEXT:    s_xor_b32 s2, s3, s2
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
 ; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
index 69724aa75af4f..321b64510c35f 100644
--- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll
@@ -5,10 +5,11 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) {
 ; GCN-LABEL: s_out32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s0, s2
-; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i32 %x, %mask
@@ -22,10 +23,11 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) {
 ; GCN-LABEL: s_out64:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
-; GCN-NEXT:    s_and_not1_b64 s[2:3], s[2:3], s[16:17]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], s[16:17]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %mx = and i64 %x, %mask
@@ -427,10 +429,11 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_varx_42:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s2, s0
-; GCN-NEXT:    s_and_not1_b32 s1, 42, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, 42
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -462,10 +465,11 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_varx_42_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_not1_b32 s0, s0, s2
-; GCN-NEXT:    s_and_b32 s1, s2, 42
+; GCN-NEXT:    s_xor_b32 s1, s0, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s0, s1, s0
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -560,10 +564,11 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask)
 ; GCN-LABEL: s_out_constant_42_vary:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_b32 s0, s2, 42
-; GCN-NEXT:    s_and_not1_b32 s1, s1, s2
+; GCN-NEXT:    s_xor_b32 s0, s1, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, s1
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
@@ -595,10 +600,11 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg
 ; GCN-LABEL: s_out_constant_42_vary_invmask:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_and_not1_b32 s0, 42, s2
-; GCN-NEXT:    s_and_b32 s1, s2, s1
+; GCN-NEXT:    s_xor_b32 s0, s1, 42
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
-; GCN-NEXT:    s_or_b32 s0, s0, s1
+; GCN-NEXT:    s_and_b32 s0, s0, s2
+; GCN-NEXT:    s_xor_b32 s0, s0, 42
+; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
   %notmask = xor i32 %mask, -1
diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
index 8e4c77e76029c..bac8bbbf0b4de 100644
--- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll
@@ -8,17 +8,16 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b8 %rs1, [out_v1i8_param_0];
-; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_2];
-; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.b8 %rs4, [out_v1i8_param_1];
-; CHECK-NEXT:    not.b16 %rs5, %rs2;
-; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
-; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT:    st.param.b8 [func_retval0], %rs7;
+; CHECK-NEXT:    ld.param.b8 %rs2, [out_v1i8_param_1];
+; CHECK-NEXT:    ld.param.b8 %rs3, [out_v1i8_param_2];
+; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
+; CHECK-NEXT:    st.param.b8 [func_retval0], %rs6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i8> %x, %mask
   %notmask = xor <1 x i8> %mask, <i8 -1>
@@ -34,17 +33,16 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b16 %rs<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b16 %rs1, [out_v1i16_param_0];
-; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_2];
-; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    ld.param.b16 %rs4, [out_v1i16_param_1];
-; CHECK-NEXT:    not.b16 %rs5, %rs2;
-; CHECK-NEXT:    and.b16 %rs6, %rs4, %rs5;
-; CHECK-NEXT:    or.b16 %rs7, %rs3, %rs6;
-; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ld.param.b16 %rs2, [out_v1i16_param_1];
+; CHECK-NEXT:    ld.param.b16 %rs3, [out_v1i16_param_2];
+; CHECK-NEXT:    xor.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    and.b16 %rs5, %rs4, %rs3;
+; CHECK-NEXT:    xor.b16 %rs6, %rs5, %rs2;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i16> %x, %mask
   %notmask = xor <1 x i16> %mask, <i16 -1>
@@ -126,17 +124,16 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin
 define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v1i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-NEXT:    .reg .b32 %r<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b32 %r1, [out_v1i32_param_0];
-; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_2];
-; CHECK-NEXT:    and.b32 %r3, %r1, %r2;
-; CHECK-NEXT:    ld.param.b32 %r4, [out_v1i32_param_1];
-; CHECK-NEXT:    not.b32 %r5, %r2;
-; CHECK-NEXT:    and.b32 %r6, %r4, %r5;
-; CHECK-NEXT:    or.b32 %r7, %r3, %r6;
-; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ld.param.b32 %r2, [out_v1i32_param_1];
+; CHECK-NEXT:    ld.param.b32 %r3, [out_v1i32_param_2];
+; CHECK-NEXT:    xor.b32 %r4, %r1, %r2;
+; CHECK-NEXT:    and.b32 %r5, %r4, %r3;
+; CHECK-NEXT:    xor.b32 %r6, %r5, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i32> %x, %mask
   %notmask = xor <1 x i32> %mask, <i32 -1>
@@ -230,21 +227,19 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v2i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0];
-; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2];
-; CHECK-NEXT:    and.b32 %r5, %r1, %r3;
-; CHECK-NEXT:    and.b32 %r6, %r2, %r4;
-; CHECK-NEXT:    ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1];
-; CHECK-NEXT:    not.b32 %r9, %r4;
-; CHECK-NEXT:    not.b32 %r10, %r3;
-; CHECK-NEXT:    and.b32 %r11, %r7, %r10;
-; CHECK-NEXT:    and.b32 %r12, %r8, %r9;
-; CHECK-NEXT:    or.b32 %r13, %r6, %r12;
-; CHECK-NEXT:    or.b32 %r14, %r5, %r11;
-; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r14, %r13};
+; CHECK-NEXT:    ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1];
+; CHECK-NEXT:    ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2];
+; CHECK-NEXT:    xor.b32 %r7, %r2, %r4;
+; CHECK-NEXT:    and.b32 %r8, %r7, %r6;
+; CHECK-NEXT:    xor.b32 %r9, %r8, %r4;
+; CHECK-NEXT:    xor.b32 %r10, %r1, %r3;
+; CHECK-NEXT:    and.b32 %r11, %r10, %r5;
+; CHECK-NEXT:    xor.b32 %r12, %r11, %r3;
+; CHECK-NEXT:    st.param.v2.b32 [func_retval0], {%r12, %r9};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i32> %x, %mask
   %notmask = xor <2 x i32> %mask, <i32 -1, i32 -1>
@@ -256,17 +251,16 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin
 define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v1i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<8>;
+; CHECK-NEXT:    .reg .b64 %rd<7>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.b64 %rd1, [out_v1i64_param_0];
-; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_2];
-; CHECK-NEXT:    and.b64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    ld.param.b64 %rd4, [out_v1i64_param_1];
-; CHECK-NEXT:    not.b64 %rd5, %rd2;
-; CHECK-NEXT:    and.b64 %rd6, %rd4, %rd5;
-; CHECK-NEXT:    or.b64 %rd7, %rd3, %rd6;
-; CHECK-NEXT:    st.param.b64 [func_retval0], %rd7;
+; CHECK-NEXT:    ld.param.b64 %rd2, [out_v1i64_param_1];
+; CHECK-NEXT:    ld.param.b64 %rd3, [out_v1i64_param_2];
+; CHECK-NEXT:    xor.b64 %rd4, %rd1, %rd2;
+; CHECK-NEXT:    and.b64 %rd5, %rd4, %rd3;
+; CHECK-NEXT:    xor.b64 %rd6, %rd5, %rd2;
+; CHECK-NEXT:    st.param.b64 [func_retval0], %rd6;
 ; CHECK-NEXT:    ret;
   %mx = and <1 x i64> %x, %mask
   %notmask = xor <1 x i64> %mask, <i64 -1>
@@ -350,29 +344,25 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<29>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0];
-; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2];
-; CHECK-NEXT:    and.b32 %r9, %r1, %r5;
-; CHECK-NEXT:    and.b32 %r10, %r2, %r6;
-; CHECK-NEXT:    and.b32 %r11, %r3, %r7;
-; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1];
-; CHECK-NEXT:    not.b32 %r17, %r8;
-; CHECK-NEXT:    not.b32 %r18, %r7;
-; CHECK-NEXT:    not.b32 %r19, %r6;
-; CHECK-NEXT:    not.b32 %r20, %r5;
-; CHECK-NEXT:    and.b32 %r21, %r13, %r20;
-; CHECK-NEXT:    and.b32 %r22, %r14, %r19;
-; CHECK-NEXT:    and.b32 %r23, %r15, %r18;
-; CHECK-NEXT:    and.b32 %r24, %r16, %r17;
-; CHECK-NEXT:    or.b32 %r25, %r12, %r24;
-; CHECK-NEXT:    or.b32 %r26, %r11, %r23;
-; CHECK-NEXT:    or.b32 %r27, %r10, %r22;
-; CHECK-NEXT:    or.b32 %r28, %r9, %r21;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25};
+; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1];
+; CHECK-NEXT:    ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2];
+; CHECK-NEXT:    xor.b32 %r13, %r4, %r8;
+; CHECK-NEXT:    and.b32 %r14, %r13, %r12;
+; CHECK-NEXT:    xor.b32 %r15, %r14, %r8;
+; CHECK-NEXT:    xor.b32 %r16, %r3, %r7;
+; CHECK-NEXT:    and.b32 %r17, %r16, %r11;
+; CHECK-NEXT:    xor.b32 %r18, %r17, %r7;
+; CHECK-NEXT:    xor.b32 %r19, %r2, %r6;
+; CHECK-NEXT:    and.b32 %r20, %r19, %r10;
+; CHECK-NEXT:    xor.b32 %r21, %r20, %r6;
+; CHECK-NEXT:    xor.b32 %r22, %r1, %r5;
+; CHECK-NEXT:    and.b32 %r23, %r22, %r9;
+; CHECK-NEXT:    xor.b32 %r24, %r23, %r5;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r24, %r21, %r18, %r15};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 -1, i32 -1>
@@ -384,26 +374,23 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin
 define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind {
 ; CHECK-LABEL: out_v4i32_undef(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<26>;
+; CHECK-NEXT:    .reg .b32 %r<23>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0];
 ; CHECK-NEXT:    ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2];
 ; CHECK-NEXT:    and.b32 %r9, %r3, %r7;
-; CHECK-NEXT:    and.b32 %r10, %r1, %r5;
-; CHECK-NEXT:    and.b32 %r11, %r2, %r6;
-; CHECK-NEXT:    and.b32 %r12, %r4, %r8;
-; CHECK-NEXT:    ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1];
-; CHECK-NEXT:    not.b32 %r17, %r8;
-; CHECK-NEXT:    not.b32 %r18, %r6;
-; CHECK-NEXT:    not.b32 %r19, %r5;
-; CHECK-NEXT:    and.b32 %r20, %r13, %r19;
-; CHECK-NEXT:    and.b32 %r21, %r14, %r18;
-; CHECK-NEXT:    and.b32 %r22, %r16, %r17;
-; CHECK-NEXT:    or.b32 %r23, %r12, %r22;
-; CHECK-NEXT:    or.b32 %r24, %r11, %r21;
-; CHECK-NEXT:    or.b32 %r25, %r10, %r20;
-; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23};
+; CHECK-NEXT:    ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1];
+; CHECK-NEXT:    xor.b32 %r14, %r4, %r13;
+; CHECK-NEXT:    and.b32 %r15, %r14, %r8;
+; CHECK-NEXT:    xor.b32 %r16, %r15, %r13;
+; CHECK-NEXT:    xor.b32 %r17, %r2, %r11;
+; CHECK-NEXT:    and.b32 %r18, %r17, %r6;
+; CHECK-NEXT:    xor.b32 %r19, %r18, %r11;
+; CHECK-NEXT:    xor.b32 %r20, %r1, %r10;
+; CHECK-NEXT:    and.b32 %r21, %r20, %r5;
+; CHECK-NEXT:    xor.b32 %r22, %r21, %r10;
+; CHECK-NEXT:    st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, %r16};
 ; CHECK-NEXT:    ret;
   %mx = and <4 x i32> %x, %mask
   %notmask = xor <4 x i32> %mask, <i32 -1, i32 -1, i32 undef, i32 -1>
@@ -415,21 +402,19 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n
 define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind {
 ; CHECK-LABEL: out_v2i64(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<15>;
+; CHECK-NEXT:    .reg .b64 %rd<13>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0];
-; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2];
-; CHECK-NEXT:    and.b64 %rd5, %rd1, %rd3;
-; CHECK-NEXT:    and.b64 %rd6, %rd2, %rd4;
-; CHECK-NEXT:    ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1];
-; CHECK-NEXT:    not.b64 %rd9, %rd4;
-; CHECK-NEXT:    not.b64 %rd10, %rd3;
-; CHECK-NEXT:    and.b64 %rd11, %rd7, %rd10;
-; CHECK-NEXT:    and.b64 %rd12, %rd8, %rd9;
-; CHECK-NEXT:    or.b64 %rd13, %rd6, %rd12;
-; CHECK-NEXT:    or.b64 %rd14, %rd5, %rd11;
-; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd14, %rd13};
+; CHECK-NEXT:    ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1];
+; CHECK-NEXT:    ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2];
+; CHECK-NEXT:    xor.b64 %rd7, %rd2, %rd4;
+; CHECK-NEXT:    and.b64 %rd8, %rd7, %rd6;
+; CHECK-NEXT:    xor.b64 %rd9, %rd8, %rd4;
+; CHECK-NEXT:    xor.b64 %rd10, %rd1, %rd3;
+; CHECK-NEXT:    and.b64 %rd11, %rd10, %rd5;
+; CHECK-NEXT:    xor.b64 %rd12, %rd11, %rd3;
+; CHECK-NEXT:    st.param.v2.b64 [func_retval0], {%rd12, %rd9};
 ; CHECK-NEXT:    ret;
   %mx = and <2 x i64> %x, %mask
   %notmask = xor <2 x i64> %mask, <i64 -1, i64 -1>
diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
new file mode 100644
index 0000000000000..631b7109281e5
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll
@@ -0,0 +1,302 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I
+; RUN: llc -mtriple=riscv64 < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I
+; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB
+; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
+; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
+;
+; Test that masked-merge code is generated as an "xor; and; xor" sequence,
+; or as an "andn; and; or" sequence when an and-not instruction is available.
+
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: masked_merge0:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a1, a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge0:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; CHECK-I-LABEL: masked_merge1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a1, a1, a2
+; CHECK-I-NEXT:    and a0, a1, a0
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i16 %a0, %a1
+  %not = xor i16 %a0, -1
+  %and1 = and i16 %a2, %not
+  %or = or i16 %and0, %and1
+  ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; CHECK-I-LABEL: masked_merge2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    mv a0, a1
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    andn a2, a1, a0
+; CHECK-ZBB-NEXT:    and a0, a1, a0
+; CHECK-ZBB-NEXT:    or a0, a2, a0
+; CHECK-ZBB-NEXT:    ret
+  %not = xor i8 %a0, -1
+  %and0 = and i8 %not, %a1
+  %and1 = and i8 %a1, %a0
+  %or = or i8 %and0, %and1
+  ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; RV32I-LABEL: masked_merge3:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    not a5, a5
+; RV32I-NEXT:    not a4, a4
+; RV32I-NEXT:    xor a3, a3, a5
+; RV32I-NEXT:    xor a2, a2, a4
+; RV32I-NEXT:    not a2, a2
+; RV32I-NEXT:    not a3, a3
+; RV32I-NEXT:    and a0, a2, a0
+; RV32I-NEXT:    and a1, a3, a1
+; RV32I-NEXT:    xor a0, a0, a4
+; RV32I-NEXT:    xor a1, a1, a5
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: masked_merge3:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    not a2, a2
+; RV64I-NEXT:    xor a1, a1, a2
+; RV64I-NEXT:    not a1, a1
+; RV64I-NEXT:    and a0, a1, a0
+; RV64I-NEXT:    xor a0, a0, a2
+; RV64I-NEXT:    ret
+;
+; RV32ZBB-LABEL: masked_merge3:
+; RV32ZBB:       # %bb.0:
+; RV32ZBB-NEXT:    not a6, a0
+; RV32ZBB-NEXT:    not a7, a1
+; RV32ZBB-NEXT:    andn a1, a1, a3
+; RV32ZBB-NEXT:    andn a0, a0, a2
+; RV32ZBB-NEXT:    andn a2, a7, a5
+; RV32ZBB-NEXT:    andn a3, a6, a4
+; RV32ZBB-NEXT:    or a0, a3, a0
+; RV32ZBB-NEXT:    or a1, a2, a1
+; RV32ZBB-NEXT:    ret
+;
+; RV64ZBB-LABEL: masked_merge3:
+; RV64ZBB:       # %bb.0:
+; RV64ZBB-NEXT:    not a3, a0
+; RV64ZBB-NEXT:    andn a2, a3, a2
+; RV64ZBB-NEXT:    andn a0, a0, a1
+; RV64ZBB-NEXT:    or a0, a2, a0
+; RV64ZBB-NEXT:    ret
+  %v0 = xor i64 %a1, -1
+  %v1 = xor i64 %a2, -1
+  %not = xor i64 %a0, -1
+  %and0 = and i64 %not, %v1
+  %and1 = and i64 %v0, %a0
+  %or = or i64 %and0, %and1
+  ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; RV32-LABEL: not_a_masked_merge0:
+; RV32:       # %bb.0:
+; RV32-NEXT:    and a1, a0, a1
+; RV32-NEXT:    neg a0, a0
+; RV32-NEXT:    and a0, a0, a2
+; RV32-NEXT:    or a0, a1, a0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: not_a_masked_merge0:
+; RV64:       # %bb.0:
+; RV64-NEXT:    and a1, a0, a1
+; RV64-NEXT:    negw a0, a0
+; RV64-NEXT:    and a0, a0, a2
+; RV64-NEXT:    or a0, a1, a0
+; RV64-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not_a_not = sub i32 0, %a0
+  %and1 = and i32 %not_a_not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-I-LABEL: not_a_masked_merge1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a0, a0, a1
+; CHECK-I-NEXT:    not a1, a3
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a0, a0, a1
+; CHECK-ZBB-NEXT:    andn a1, a2, a3
+; CHECK-ZBB-NEXT:    or a0, a0, a1
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a3, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    or a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    or a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %not_an_and0 = or i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %not_an_and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-I-LABEL: not_a_masked_merge3:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a2
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: not_a_masked_merge3:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    xor a0, a0, a2
+; CHECK-ZBB-NEXT:    orn a0, a1, a0
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %not_an_and1 = xor i32 %not, %a2
+  %or = or i32 %and0, %not_an_and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; CHECK-LABEL: not_a_masked_merge4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    and a0, a0, a1
+; CHECK-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a2, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform0:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    sw a1, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform0:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    sw a1, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and0, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform1:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a4, a0
+; CHECK-I-NEXT:    and a0, a4, a2
+; CHECK-I-NEXT:    or a0, a1, a0
+; CHECK-I-NEXT:    sw a4, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform1:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    not a4, a0
+; CHECK-ZBB-NEXT:    andn a0, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a0
+; CHECK-ZBB-NEXT:    sw a4, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %not, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; CHECK-I-LABEL: masked_merge_no_transform2:
+; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    and a1, a0, a1
+; CHECK-I-NEXT:    not a0, a0
+; CHECK-I-NEXT:    and a2, a0, a2
+; CHECK-I-NEXT:    or a0, a1, a2
+; CHECK-I-NEXT:    sw a2, 0(a3)
+; CHECK-I-NEXT:    ret
+;
+; CHECK-ZBB-LABEL: masked_merge_no_transform2:
+; CHECK-ZBB:       # %bb.0:
+; CHECK-ZBB-NEXT:    and a1, a0, a1
+; CHECK-ZBB-NEXT:    andn a2, a2, a0
+; CHECK-ZBB-NEXT:    or a0, a1, a2
+; CHECK-ZBB-NEXT:    sw a2, 0(a3)
+; CHECK-ZBB-NEXT:    ret
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and1, ptr %p1
+  ret i32 %or
+}
diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
index 1517e524a7f78..efc8243df71e0 100644
--- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll
@@ -8,16 +8,13 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \
 ; RUN:   | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB
 
-; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't
-; present?
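+; Without Zbb, (x & m) | (y & ~m) is now lowered to the equivalent
+; xor/and/xor form, y ^ ((x ^ y) & m): m-set bits select x, m-clear bits
+; select y, and no explicit not is needed. With Zbb, andn covers the ~m
+; term directly, so those checks are unchanged.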
 
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-I-LABEL: out8:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out8:
@@ -36,10 +33,9 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-I-LABEL: out16:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out16:
@@ -58,10 +54,9 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out32:
 ; CHECK-I:       # %bb.0:
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    and a0, a0, a2
-; CHECK-I-NEXT:    not a2, a2
-; CHECK-I-NEXT:    and a1, a1, a2
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out32:
@@ -80,22 +75,19 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) {
 define i64 @out64(i64 %x, i64 %y, i64 %mask) {
 ; RV32I-LABEL: out64:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    and a0, a0, a4
-; RV32I-NEXT:    not a4, a4
-; RV32I-NEXT:    not a5, a5
-; RV32I-NEXT:    and a3, a3, a5
-; RV32I-NEXT:    and a2, a2, a4
-; RV32I-NEXT:    or a0, a0, a2
-; RV32I-NEXT:    or a1, a1, a3
+; RV32I-NEXT:    and a1, a1, a5
+; RV32I-NEXT:    xor a0, a0, a2
+; RV32I-NEXT:    xor a1, a1, a3
 ; RV32I-NEXT:    ret
 ;
 ; RV64I-LABEL: out64:
 ; RV64I:       # %bb.0:
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    and a0, a0, a2
-; RV64I-NEXT:    not a2, a2
-; RV64I-NEXT:    and a1, a1, a2
-; RV64I-NEXT:    or a0, a0, a1
+; RV64I-NEXT:    xor a0, a0, a1
 ; RV64I-NEXT:    ret
 ;
 ; RV32ZBB-LABEL: out64:
@@ -660,10 +652,9 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a1, a2
-; CHECK-I-NEXT:    and a0, a2, a0
-; CHECK-I-NEXT:    andi a1, a1, 42
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a0, a0, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xori a0, a0, 42
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42:
@@ -704,10 +695,9 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_varx_42_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a1, a2
-; CHECK-I-NEXT:    and a0, a1, a0
-; CHECK-I-NEXT:    andi a1, a2, 42
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a1, a0, 42
+; CHECK-I-NEXT:    and a1, a1, a2
+; CHECK-I-NEXT:    xor a0, a1, a0
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask:
@@ -812,10 +802,9 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a0, a2
-; CHECK-I-NEXT:    andi a2, a2, 42
-; CHECK-I-NEXT:    and a0, a0, a1
-; CHECK-I-NEXT:    or a0, a2, a0
+; CHECK-I-NEXT:    xori a0, a1, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xor a0, a0, a1
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary:
@@ -855,10 +844,9 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) {
 define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) {
 ; CHECK-I-LABEL: out_constant_42_vary_invmask:
 ; CHECK-I:       # %bb.0:
-; CHECK-I-NEXT:    not a0, a2
-; CHECK-I-NEXT:    andi a0, a0, 42
-; CHECK-I-NEXT:    and a1, a2, a1
-; CHECK-I-NEXT:    or a0, a0, a1
+; CHECK-I-NEXT:    xori a0, a1, 42
+; CHECK-I-NEXT:    and a0, a0, a2
+; CHECK-I-NEXT:    xori a0, a0, 42
 ; CHECK-I-NEXT:    ret
 ;
 ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask:
diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
new file mode 100644
index 0000000000000..c014345507f69
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll
@@ -0,0 +1,277 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3
+
+; Test that masked-merge code is generated as an "xor; and; xor" sequence,
+; or as an "andn; and; or" sequence when an and-not instruction is
+; available.
+
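+; As a rough illustration of the transform (example constants, not taken
+; from the checks below): with %a0 = 0xF0, %a1 = 0xAA, %a2 = 0x55,
+;   (%a0 & %a1) | (~%a0 & %a2)  = 0xA0 | 0x05          = 0xA5
+;   ((%a1 ^ %a2) & %a0) ^ %a2   = (0xFF & 0xF0) ^ 0x55 = 0xA5
+; Both forms agree; the xor form needs no explicit not, and on z15 the
+; and-with-complement instruction (ncrk) covers the ~%a0 term directly.
+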
+define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: masked_merge0:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    xr %r3, %r4
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    xr %r2, %r4
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge0:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    ncrk %r2, %r4, %r2
+; MISC3-NEXT:    or %r2, %r3
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
+; NO-MISC3-LABEL: masked_merge1:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    xr %r3, %r4
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    xr %r2, %r4
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge1:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    ncrk %r0, %r4, %r2
+; MISC3-NEXT:    nr %r2, %r3
+; MISC3-NEXT:    or %r2, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i16 %a0, %a1
+  %not = xor i16 %a0, -1
+  %and1 = and i16 %a2, %not
+  %or = or i16 %and0, %and1
+  ret i16 %or
+}
+
+define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
+; NO-MISC3-LABEL: masked_merge2:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    lr %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge2:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    lr %r2, %r3
+; MISC3-NEXT:    br %r14
+  %not = xor i8 %a0, -1
+  %and0 = and i8 %not, %a1
+  %and1 = and i8 %a1, %a0
+  %or = or i8 %and0, %and1
+  ret i8 %or
+}
+
+define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) {
+; NO-MISC3-LABEL: masked_merge3:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    lcgr %r0, %r4
+; NO-MISC3-NEXT:    aghi %r0, -1
+; NO-MISC3-NEXT:    xgr %r3, %r0
+; NO-MISC3-NEXT:    ngr %r3, %r2
+; NO-MISC3-NEXT:    xgr %r3, %r2
+; NO-MISC3-NEXT:    xgrk %r2, %r3, %r0
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge3:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    lcgr %r0, %r2
+; MISC3-NEXT:    aghi %r0, -1
+; MISC3-NEXT:    ncgrk %r0, %r0, %r4
+; MISC3-NEXT:    ncgrk %r2, %r2, %r3
+; MISC3-NEXT:    ogr %r2, %r0
+; MISC3-NEXT:    br %r14
+  %v0 = xor i64 %a1, -1
+  %v1 = xor i64 %a2, -1
+  %not = xor i64 %a0, -1
+  %and0 = and i64 %not, %v1
+  %and1 = and i64 %v0, %a0
+  %or = or i64 %and0, %and1
+  ret i64 %or
+}
+
+define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge0:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    lcr %r0, %r2
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    nr %r0, %r4
+; NO-MISC3-NEXT:    ork %r2, %r3, %r0
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge0:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    lcr %r0, %r2
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    nr %r0, %r4
+; MISC3-NEXT:    ork %r2, %r3, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not_a_not = sub i32 0, %a0
+  %and1 = and i32 %not_a_not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; NO-MISC3-LABEL: not_a_masked_merge1:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    xilf %r5, 4294967295
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    nr %r4, %r5
+; NO-MISC3-NEXT:    or %r2, %r4
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge1:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r2, %r3
+; MISC3-NEXT:    ncrk %r0, %r4, %r5
+; MISC3-NEXT:    or %r2, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a3, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge2:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    or %r3, %r2
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r2, %r4
+; NO-MISC3-NEXT:    or %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge2:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    or %r3, %r2
+; MISC3-NEXT:    ncrk %r2, %r4, %r2
+; MISC3-NEXT:    or %r2, %r3
+; MISC3-NEXT:    br %r14
+  %not_an_and0 = or i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %not_an_and0, %and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge3:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    xr %r2, %r4
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    or %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge3:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    xr %r2, %r4
+; MISC3-NEXT:    ocrk %r2, %r3, %r2
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %not_an_and1 = xor i32 %not, %a2
+  %or = or i32 %and0, %not_an_and1
+  ret i32 %or
+}
+
+define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) {
+; NO-MISC3-LABEL: not_a_masked_merge4:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r2, %r3
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: not_a_masked_merge4:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r2, %r3
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a2, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform0:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r2, %r4
+; NO-MISC3-NEXT:    or %r2, %r3
+; NO-MISC3-NEXT:    st %r3, 0(%r5)
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform0:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    ncrk %r2, %r4, %r2
+; MISC3-NEXT:    or %r2, %r3
+; MISC3-NEXT:    st %r3, 0(%r5)
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and0, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform1:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nrk %r0, %r2, %r3
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r4, %r2
+; NO-MISC3-NEXT:    or %r0, %r4
+; NO-MISC3-NEXT:    st %r2, 0(%r5)
+; NO-MISC3-NEXT:    lr %r2, %r0
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform1:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nrk %r0, %r2, %r3
+; MISC3-NEXT:    ncrk %r1, %r4, %r2
+; MISC3-NEXT:    xilf %r2, 4294967295
+; MISC3-NEXT:    or %r0, %r1
+; MISC3-NEXT:    st %r2, 0(%r5)
+; MISC3-NEXT:    lr %r2, %r0
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %not, ptr %p1
+  ret i32 %or
+}
+
+define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
+; NO-MISC3-LABEL: masked_merge_no_transform2:
+; NO-MISC3:       # %bb.0:
+; NO-MISC3-NEXT:    nr %r3, %r2
+; NO-MISC3-NEXT:    xilf %r2, 4294967295
+; NO-MISC3-NEXT:    nr %r4, %r2
+; NO-MISC3-NEXT:    ork %r2, %r3, %r4
+; NO-MISC3-NEXT:    st %r4, 0(%r5)
+; NO-MISC3-NEXT:    br %r14
+;
+; MISC3-LABEL: masked_merge_no_transform2:
+; MISC3:       # %bb.0:
+; MISC3-NEXT:    nr %r3, %r2
+; MISC3-NEXT:    ncrk %r0, %r4, %r2
+; MISC3-NEXT:    ork %r2, %r3, %r0
+; MISC3-NEXT:    st %r0, 0(%r5)
+; MISC3-NEXT:    br %r14
+  %and0 = and i32 %a0, %a1
+  %not = xor i32 %a0, -1
+  %and1 = and i32 %not, %a2
+  %or = or i32 %and0, %and1
+  store i32 %and1, ptr %p1
+  ret i32 %or
+}
diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
index 185c46aa5681e..e3607e12bf530 100644
--- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll
@@ -4465,203 +4465,139 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v16i8:
 ; NO-SIMD128:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.and $push0=, $16, $32
-; NO-SIMD128-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-NEXT:    i32.xor $push2=, $16, $pop1
-; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $48
-; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT:    i32.store8 15($0), $pop4
-; NO-SIMD128-NEXT:    i32.and $push5=, $15, $31
-; NO-SIMD128-NEXT:    i32.const $push79=, -1
-; NO-SIMD128-NEXT:    i32.xor $push6=, $15, $pop79
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $47
-; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-NEXT:    i32.store8 14($0), $pop8
-; NO-SIMD128-NEXT:    i32.and $push9=, $14, $30
-; NO-SIMD128-NEXT:    i32.const $push78=, -1
-; NO-SIMD128-NEXT:    i32.xor $push10=, $14, $pop78
-; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $46
-; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-NEXT:    i32.store8 13($0), $pop12
-; NO-SIMD128-NEXT:    i32.and $push13=, $13, $29
-; NO-SIMD128-NEXT:    i32.const $push77=, -1
-; NO-SIMD128-NEXT:    i32.xor $push14=, $13, $pop77
-; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $45
-; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT:    i32.store8 12($0), $pop16
-; NO-SIMD128-NEXT:    i32.and $push17=, $12, $28
-; NO-SIMD128-NEXT:    i32.const $push76=, -1
-; NO-SIMD128-NEXT:    i32.xor $push18=, $12, $pop76
-; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $44
-; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-NEXT:    i32.store8 11($0), $pop20
-; NO-SIMD128-NEXT:    i32.and $push21=, $11, $27
-; NO-SIMD128-NEXT:    i32.const $push75=, -1
-; NO-SIMD128-NEXT:    i32.xor $push22=, $11, $pop75
-; NO-SIMD128-NEXT:    i32.and $push23=, $pop22, $43
-; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-NEXT:    i32.store8 10($0), $pop24
-; NO-SIMD128-NEXT:    i32.and $push25=, $10, $26
-; NO-SIMD128-NEXT:    i32.const $push74=, -1
-; NO-SIMD128-NEXT:    i32.xor $push26=, $10, $pop74
-; NO-SIMD128-NEXT:    i32.and $push27=, $pop26, $42
-; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT:    i32.store8 9($0), $pop28
-; NO-SIMD128-NEXT:    i32.and $push29=, $9, $25
-; NO-SIMD128-NEXT:    i32.const $push73=, -1
-; NO-SIMD128-NEXT:    i32.xor $push30=, $9, $pop73
-; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $41
-; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT:    i32.store8 8($0), $pop32
-; NO-SIMD128-NEXT:    i32.and $push33=, $8, $24
-; NO-SIMD128-NEXT:    i32.const $push72=, -1
-; NO-SIMD128-NEXT:    i32.xor $push34=, $8, $pop72
-; NO-SIMD128-NEXT:    i32.and $push35=, $pop34, $40
-; NO-SIMD128-NEXT:    i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-NEXT:    i32.store8 7($0), $pop36
-; NO-SIMD128-NEXT:    i32.and $push37=, $7, $23
-; NO-SIMD128-NEXT:    i32.const $push71=, -1
-; NO-SIMD128-NEXT:    i32.xor $push38=, $7, $pop71
-; NO-SIMD128-NEXT:    i32.and $push39=, $pop38, $39
-; NO-SIMD128-NEXT:    i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-NEXT:    i32.store8 6($0), $pop40
-; NO-SIMD128-NEXT:    i32.and $push41=, $6, $22
-; NO-SIMD128-NEXT:    i32.const $push70=, -1
-; NO-SIMD128-NEXT:    i32.xor $push42=, $6, $pop70
-; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $38
-; NO-SIMD128-NEXT:    i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-NEXT:    i32.store8 5($0), $pop44
-; NO-SIMD128-NEXT:    i32.and $push45=, $5, $21
-; NO-SIMD128-NEXT:    i32.const $push69=, -1
-; NO-SIMD128-NEXT:    i32.xor $push46=, $5, $pop69
-; NO-SIMD128-NEXT:    i32.and $push47=, $pop46, $37
-; NO-SIMD128-NEXT:    i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-NEXT:    i32.store8 4($0), $pop48
-; NO-SIMD128-NEXT:    i32.and $push49=, $4, $20
-; NO-SIMD128-NEXT:    i32.const $push68=, -1
-; NO-SIMD128-NEXT:    i32.xor $push50=, $4, $pop68
-; NO-SIMD128-NEXT:    i32.and $push51=, $pop50, $36
-; NO-SIMD128-NEXT:    i32.or $push52=, $pop49, $pop51
-; NO-SIMD128-NEXT:    i32.store8 3($0), $pop52
-; NO-SIMD128-NEXT:    i32.and $push53=, $3, $19
-; NO-SIMD128-NEXT:    i32.const $push67=, -1
-; NO-SIMD128-NEXT:    i32.xor $push54=, $3, $pop67
-; NO-SIMD128-NEXT:    i32.and $push55=, $pop54, $35
-; NO-SIMD128-NEXT:    i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-NEXT:    i32.store8 2($0), $pop56
-; NO-SIMD128-NEXT:    i32.and $push57=, $2, $18
-; NO-SIMD128-NEXT:    i32.const $push66=, -1
-; NO-SIMD128-NEXT:    i32.xor $push58=, $2, $pop66
-; NO-SIMD128-NEXT:    i32.and $push59=, $pop58, $34
-; NO-SIMD128-NEXT:    i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-NEXT:    i32.store8 1($0), $pop60
-; NO-SIMD128-NEXT:    i32.and $push61=, $1, $17
-; NO-SIMD128-NEXT:    i32.const $push65=, -1
-; NO-SIMD128-NEXT:    i32.xor $push62=, $1, $pop65
-; NO-SIMD128-NEXT:    i32.and $push63=, $pop62, $33
-; NO-SIMD128-NEXT:    i32.or $push64=, $pop61, $pop63
-; NO-SIMD128-NEXT:    i32.store8 0($0), $pop64
+; NO-SIMD128-NEXT:    i32.xor $push0=, $32, $48
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $16
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $48
+; NO-SIMD128-NEXT:    i32.store8 15($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $31, $47
+; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $15
+; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $47
+; NO-SIMD128-NEXT:    i32.store8 14($0), $pop5
+; NO-SIMD128-NEXT:    i32.xor $push6=, $30, $46
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $14
+; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $46
+; NO-SIMD128-NEXT:    i32.store8 13($0), $pop8
+; NO-SIMD128-NEXT:    i32.xor $push9=, $29, $45
+; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $13
+; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $45
+; NO-SIMD128-NEXT:    i32.store8 12($0), $pop11
+; NO-SIMD128-NEXT:    i32.xor $push12=, $28, $44
+; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $12
+; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $44
+; NO-SIMD128-NEXT:    i32.store8 11($0), $pop14
+; NO-SIMD128-NEXT:    i32.xor $push15=, $27, $43
+; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $11
+; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $43
+; NO-SIMD128-NEXT:    i32.store8 10($0), $pop17
+; NO-SIMD128-NEXT:    i32.xor $push18=, $26, $42
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $10
+; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $42
+; NO-SIMD128-NEXT:    i32.store8 9($0), $pop20
+; NO-SIMD128-NEXT:    i32.xor $push21=, $25, $41
+; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $9
+; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $41
+; NO-SIMD128-NEXT:    i32.store8 8($0), $pop23
+; NO-SIMD128-NEXT:    i32.xor $push24=, $24, $40
+; NO-SIMD128-NEXT:    i32.and $push25=, $pop24, $8
+; NO-SIMD128-NEXT:    i32.xor $push26=, $pop25, $40
+; NO-SIMD128-NEXT:    i32.store8 7($0), $pop26
+; NO-SIMD128-NEXT:    i32.xor $push27=, $23, $39
+; NO-SIMD128-NEXT:    i32.and $push28=, $pop27, $7
+; NO-SIMD128-NEXT:    i32.xor $push29=, $pop28, $39
+; NO-SIMD128-NEXT:    i32.store8 6($0), $pop29
+; NO-SIMD128-NEXT:    i32.xor $push30=, $22, $38
+; NO-SIMD128-NEXT:    i32.and $push31=, $pop30, $6
+; NO-SIMD128-NEXT:    i32.xor $push32=, $pop31, $38
+; NO-SIMD128-NEXT:    i32.store8 5($0), $pop32
+; NO-SIMD128-NEXT:    i32.xor $push33=, $21, $37
+; NO-SIMD128-NEXT:    i32.and $push34=, $pop33, $5
+; NO-SIMD128-NEXT:    i32.xor $push35=, $pop34, $37
+; NO-SIMD128-NEXT:    i32.store8 4($0), $pop35
+; NO-SIMD128-NEXT:    i32.xor $push36=, $20, $36
+; NO-SIMD128-NEXT:    i32.and $push37=, $pop36, $4
+; NO-SIMD128-NEXT:    i32.xor $push38=, $pop37, $36
+; NO-SIMD128-NEXT:    i32.store8 3($0), $pop38
+; NO-SIMD128-NEXT:    i32.xor $push39=, $19, $35
+; NO-SIMD128-NEXT:    i32.and $push40=, $pop39, $3
+; NO-SIMD128-NEXT:    i32.xor $push41=, $pop40, $35
+; NO-SIMD128-NEXT:    i32.store8 2($0), $pop41
+; NO-SIMD128-NEXT:    i32.xor $push42=, $18, $34
+; NO-SIMD128-NEXT:    i32.and $push43=, $pop42, $2
+; NO-SIMD128-NEXT:    i32.xor $push44=, $pop43, $34
+; NO-SIMD128-NEXT:    i32.store8 1($0), $pop44
+; NO-SIMD128-NEXT:    i32.xor $push45=, $17, $33
+; NO-SIMD128-NEXT:    i32.and $push46=, $pop45, $1
+; NO-SIMD128-NEXT:    i32.xor $push47=, $pop46, $33
+; NO-SIMD128-NEXT:    i32.store8 0($0), $pop47
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v16i8:
 ; NO-SIMD128-FAST:         .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $17
-; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $33
-; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $18
-; NO-SIMD128-FAST-NEXT:    i32.const $push79=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop79
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $34
-; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $19
-; NO-SIMD128-FAST-NEXT:    i32.const $push78=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop78
-; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $35
-; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop12
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $20
-; NO-SIMD128-FAST-NEXT:    i32.const $push77=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop77
-; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $36
-; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop16
-; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $5, $21
-; NO-SIMD128-FAST-NEXT:    i32.const $push76=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop76
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $37
-; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $6, $22
-; NO-SIMD128-FAST-NEXT:    i32.const $push75=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop75
-; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $pop22, $38
-; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop24
-; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $7, $23
-; NO-SIMD128-FAST-NEXT:    i32.const $push74=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop74
-; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $pop26, $39
-; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop28
-; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $8, $24
-; NO-SIMD128-FAST-NEXT:    i32.const $push73=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop73
-; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $40
-; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop32
-; NO-SIMD128-FAST-NEXT:    i32.and $push33=, $9, $25
-; NO-SIMD128-FAST-NEXT:    i32.const $push72=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push34=, $9, $pop72
-; NO-SIMD128-FAST-NEXT:    i32.and $push35=, $pop34, $41
-; NO-SIMD128-FAST-NEXT:    i32.or $push36=, $pop33, $pop35
-; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop36
-; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $10, $26
-; NO-SIMD128-FAST-NEXT:    i32.const $push71=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $10, $pop71
-; NO-SIMD128-FAST-NEXT:    i32.and $push39=, $pop38, $42
-; NO-SIMD128-FAST-NEXT:    i32.or $push40=, $pop37, $pop39
-; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop40
-; NO-SIMD128-FAST-NEXT:    i32.and $push41=, $11, $27
-; NO-SIMD128-FAST-NEXT:    i32.const $push70=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $11, $pop70
-; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $43
-; NO-SIMD128-FAST-NEXT:    i32.or $push44=, $pop41, $pop43
-; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop44
-; NO-SIMD128-FAST-NEXT:    i32.and $push45=, $12, $28
-; NO-SIMD128-FAST-NEXT:    i32.const $push69=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push46=, $12, $pop69
-; NO-SIMD128-FAST-NEXT:    i32.and $push47=, $pop46, $44
-; NO-SIMD128-FAST-NEXT:    i32.or $push48=, $pop45, $pop47
-; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop48
-; NO-SIMD128-FAST-NEXT:    i32.and $push49=, $13, $29
-; NO-SIMD128-FAST-NEXT:    i32.const $push68=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push50=, $13, $pop68
-; NO-SIMD128-FAST-NEXT:    i32.and $push51=, $pop50, $45
-; NO-SIMD128-FAST-NEXT:    i32.or $push52=, $pop49, $pop51
-; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop52
-; NO-SIMD128-FAST-NEXT:    i32.and $push53=, $14, $30
-; NO-SIMD128-FAST-NEXT:    i32.const $push67=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push54=, $14, $pop67
-; NO-SIMD128-FAST-NEXT:    i32.and $push55=, $pop54, $46
-; NO-SIMD128-FAST-NEXT:    i32.or $push56=, $pop53, $pop55
-; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop56
-; NO-SIMD128-FAST-NEXT:    i32.and $push57=, $15, $31
-; NO-SIMD128-FAST-NEXT:    i32.const $push66=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push58=, $15, $pop66
-; NO-SIMD128-FAST-NEXT:    i32.and $push59=, $pop58, $47
-; NO-SIMD128-FAST-NEXT:    i32.or $push60=, $pop57, $pop59
-; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop60
-; NO-SIMD128-FAST-NEXT:    i32.and $push61=, $16, $32
-; NO-SIMD128-FAST-NEXT:    i32.const $push65=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push62=, $16, $pop65
-; NO-SIMD128-FAST-NEXT:    i32.and $push63=, $pop62, $48
-; NO-SIMD128-FAST-NEXT:    i32.or $push64=, $pop61, $pop63
-; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop64
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $17, $33
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $33
+; NO-SIMD128-FAST-NEXT:    i32.store8 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $18, $34
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $34
+; NO-SIMD128-FAST-NEXT:    i32.store8 1($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $19, $35
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $35
+; NO-SIMD128-FAST-NEXT:    i32.store8 2($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $20, $36
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $36
+; NO-SIMD128-FAST-NEXT:    i32.store8 3($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $21, $37
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $37
+; NO-SIMD128-FAST-NEXT:    i32.store8 4($0), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $22, $38
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $38
+; NO-SIMD128-FAST-NEXT:    i32.store8 5($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $23, $39
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $39
+; NO-SIMD128-FAST-NEXT:    i32.store8 6($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $24, $40
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $40
+; NO-SIMD128-FAST-NEXT:    i32.store8 7($0), $pop23
+; NO-SIMD128-FAST-NEXT:    i32.xor $push24=, $25, $41
+; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $pop24, $9
+; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $pop25, $41
+; NO-SIMD128-FAST-NEXT:    i32.store8 8($0), $pop26
+; NO-SIMD128-FAST-NEXT:    i32.xor $push27=, $26, $42
+; NO-SIMD128-FAST-NEXT:    i32.and $push28=, $pop27, $10
+; NO-SIMD128-FAST-NEXT:    i32.xor $push29=, $pop28, $42
+; NO-SIMD128-FAST-NEXT:    i32.store8 9($0), $pop29
+; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $27, $43
+; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $pop30, $11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push32=, $pop31, $43
+; NO-SIMD128-FAST-NEXT:    i32.store8 10($0), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.xor $push33=, $28, $44
+; NO-SIMD128-FAST-NEXT:    i32.and $push34=, $pop33, $12
+; NO-SIMD128-FAST-NEXT:    i32.xor $push35=, $pop34, $44
+; NO-SIMD128-FAST-NEXT:    i32.store8 11($0), $pop35
+; NO-SIMD128-FAST-NEXT:    i32.xor $push36=, $29, $45
+; NO-SIMD128-FAST-NEXT:    i32.and $push37=, $pop36, $13
+; NO-SIMD128-FAST-NEXT:    i32.xor $push38=, $pop37, $45
+; NO-SIMD128-FAST-NEXT:    i32.store8 12($0), $pop38
+; NO-SIMD128-FAST-NEXT:    i32.xor $push39=, $30, $46
+; NO-SIMD128-FAST-NEXT:    i32.and $push40=, $pop39, $14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push41=, $pop40, $46
+; NO-SIMD128-FAST-NEXT:    i32.store8 13($0), $pop41
+; NO-SIMD128-FAST-NEXT:    i32.xor $push42=, $31, $47
+; NO-SIMD128-FAST-NEXT:    i32.and $push43=, $pop42, $15
+; NO-SIMD128-FAST-NEXT:    i32.xor $push44=, $pop43, $47
+; NO-SIMD128-FAST-NEXT:    i32.store8 14($0), $pop44
+; NO-SIMD128-FAST-NEXT:    i32.xor $push45=, $32, $48
+; NO-SIMD128-FAST-NEXT:    i32.and $push46=, $pop45, $16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push47=, $pop46, $48
+; NO-SIMD128-FAST-NEXT:    i32.store8 15($0), $pop47
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <16 x i8> %c, %v1
   %inv_mask = xor <16 x i8> %c,
@@ -7546,107 +7482,75 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v8i16:
 ; NO-SIMD128:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.and $push0=, $16, $8
-; NO-SIMD128-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-NEXT:    i32.xor $push2=, $8, $pop1
-; NO-SIMD128-NEXT:    i32.and $push3=, $24, $pop2
-; NO-SIMD128-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-NEXT:    i32.store16 14($0), $pop4
-; NO-SIMD128-NEXT:    i32.and $push5=, $15, $7
-; NO-SIMD128-NEXT:    i32.const $push39=, -1
-; NO-SIMD128-NEXT:    i32.xor $push6=, $7, $pop39
-; NO-SIMD128-NEXT:    i32.and $push7=, $23, $pop6
-; NO-SIMD128-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-NEXT:    i32.store16 12($0), $pop8
-; NO-SIMD128-NEXT:    i32.and $push9=, $14, $6
-; NO-SIMD128-NEXT:    i32.const $push38=, -1
-; NO-SIMD128-NEXT:    i32.xor $push10=, $6, $pop38
-; NO-SIMD128-NEXT:    i32.and $push11=, $22, $pop10
-; NO-SIMD128-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-NEXT:    i32.store16 10($0), $pop12
-; NO-SIMD128-NEXT:    i32.and $push13=, $13, $5
-; NO-SIMD128-NEXT:    i32.const $push37=, -1
-; NO-SIMD128-NEXT:    i32.xor $push14=, $5, $pop37
-; NO-SIMD128-NEXT:    i32.and $push15=, $21, $pop14
-; NO-SIMD128-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-NEXT:    i32.store16 8($0), $pop16
-; NO-SIMD128-NEXT:    i32.and $push17=, $12, $4
-; NO-SIMD128-NEXT:    i32.const $push36=, -1
-; NO-SIMD128-NEXT:    i32.xor $push18=, $4, $pop36
-; NO-SIMD128-NEXT:    i32.and $push19=, $20, $pop18
-; NO-SIMD128-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-NEXT:    i32.store16 6($0), $pop20
-; NO-SIMD128-NEXT:    i32.and $push21=, $11, $3
-; NO-SIMD128-NEXT:    i32.const $push35=, -1
-; NO-SIMD128-NEXT:    i32.xor $push22=, $3, $pop35
-; NO-SIMD128-NEXT:    i32.and $push23=, $19, $pop22
-; NO-SIMD128-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-NEXT:    i32.store16 4($0), $pop24
-; NO-SIMD128-NEXT:    i32.and $push25=, $10, $2
-; NO-SIMD128-NEXT:    i32.const $push34=, -1
-; NO-SIMD128-NEXT:    i32.xor $push26=, $2, $pop34
-; NO-SIMD128-NEXT:    i32.and $push27=, $18, $pop26
-; NO-SIMD128-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-NEXT:    i32.store16 2($0), $pop28
-; NO-SIMD128-NEXT:    i32.and $push29=, $9, $1
-; NO-SIMD128-NEXT:    i32.const $push33=, -1
-; NO-SIMD128-NEXT:    i32.xor $push30=, $1, $pop33
-; NO-SIMD128-NEXT:    i32.and $push31=, $17, $pop30
-; NO-SIMD128-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-NEXT:    i32.store16 0($0), $pop32
+; NO-SIMD128-NEXT:    i32.xor $push0=, $16, $24
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $8
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $24
+; NO-SIMD128-NEXT:    i32.store16 14($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $15, $23
+; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $7
+; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $23
+; NO-SIMD128-NEXT:    i32.store16 12($0), $pop5
+; NO-SIMD128-NEXT:    i32.xor $push6=, $14, $22
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $6
+; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $22
+; NO-SIMD128-NEXT:    i32.store16 10($0), $pop8
+; NO-SIMD128-NEXT:    i32.xor $push9=, $13, $21
+; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $5
+; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $21
+; NO-SIMD128-NEXT:    i32.store16 8($0), $pop11
+; NO-SIMD128-NEXT:    i32.xor $push12=, $12, $20
+; NO-SIMD128-NEXT:    i32.and $push13=, $pop12, $4
+; NO-SIMD128-NEXT:    i32.xor $push14=, $pop13, $20
+; NO-SIMD128-NEXT:    i32.store16 6($0), $pop14
+; NO-SIMD128-NEXT:    i32.xor $push15=, $11, $19
+; NO-SIMD128-NEXT:    i32.and $push16=, $pop15, $3
+; NO-SIMD128-NEXT:    i32.xor $push17=, $pop16, $19
+; NO-SIMD128-NEXT:    i32.store16 4($0), $pop17
+; NO-SIMD128-NEXT:    i32.xor $push18=, $10, $18
+; NO-SIMD128-NEXT:    i32.and $push19=, $pop18, $2
+; NO-SIMD128-NEXT:    i32.xor $push20=, $pop19, $18
+; NO-SIMD128-NEXT:    i32.store16 2($0), $pop20
+; NO-SIMD128-NEXT:    i32.xor $push21=, $9, $17
+; NO-SIMD128-NEXT:    i32.and $push22=, $pop21, $1
+; NO-SIMD128-NEXT:    i32.xor $push23=, $pop22, $17
+; NO-SIMD128-NEXT:    i32.store16 0($0), $pop23
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v8i16:
 ; NO-SIMD128-FAST:         .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $9, $1
-; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $17, $pop2
-; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop0, $pop3
-; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $10, $2
-; NO-SIMD128-FAST-NEXT:    i32.const $push39=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop39
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $18, $pop6
-; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop5, $pop7
-; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $11, $3
-; NO-SIMD128-FAST-NEXT:    i32.const $push38=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop38
-; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $19, $pop10
-; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop9, $pop11
-; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop12
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $12, $4
-; NO-SIMD128-FAST-NEXT:    i32.const $push37=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop37
-; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $20, $pop14
-; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop13, $pop15
-; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop16
-; NO-SIMD128-FAST-NEXT:    i32.and $push17=, $13, $5
-; NO-SIMD128-FAST-NEXT:    i32.const $push36=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $5, $pop36
-; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $21, $pop18
-; NO-SIMD128-FAST-NEXT:    i32.or $push20=, $pop17, $pop19
-; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop20
-; NO-SIMD128-FAST-NEXT:    i32.and $push21=, $14, $6
-; NO-SIMD128-FAST-NEXT:    i32.const $push35=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push22=, $6, $pop35
-; NO-SIMD128-FAST-NEXT:    i32.and $push23=, $22, $pop22
-; NO-SIMD128-FAST-NEXT:    i32.or $push24=, $pop21, $pop23
-; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop24
-; NO-SIMD128-FAST-NEXT:    i32.and $push25=, $15, $7
-; NO-SIMD128-FAST-NEXT:    i32.const $push34=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push26=, $7, $pop34
-; NO-SIMD128-FAST-NEXT:    i32.and $push27=, $23, $pop26
-; NO-SIMD128-FAST-NEXT:    i32.or $push28=, $pop25, $pop27
-; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop28
-; NO-SIMD128-FAST-NEXT:    i32.and $push29=, $16, $8
-; NO-SIMD128-FAST-NEXT:    i32.const $push33=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push30=, $8, $pop33
-; NO-SIMD128-FAST-NEXT:    i32.and $push31=, $24, $pop30
-; NO-SIMD128-FAST-NEXT:    i32.or $push32=, $pop29, $pop31
-; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop32
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $9, $17
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $17
+; NO-SIMD128-FAST-NEXT:    i32.store16 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $10, $18
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $18
+; NO-SIMD128-FAST-NEXT:    i32.store16 2($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $11, $19
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $19
+; NO-SIMD128-FAST-NEXT:    i32.store16 4($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $12, $20
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $20
+; NO-SIMD128-FAST-NEXT:    i32.store16 6($0), $pop11
+; NO-SIMD128-FAST-NEXT:    i32.xor $push12=, $13, $21
+; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $pop12, $5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $pop13, $21
+; NO-SIMD128-FAST-NEXT:    i32.store16 8($0), $pop14
+; NO-SIMD128-FAST-NEXT:    i32.xor $push15=, $14, $22
+; NO-SIMD128-FAST-NEXT:    i32.and $push16=, $pop15, $6
+; NO-SIMD128-FAST-NEXT:    i32.xor $push17=, $pop16, $22
+; NO-SIMD128-FAST-NEXT:    i32.store16 10($0), $pop17
+; NO-SIMD128-FAST-NEXT:    i32.xor $push18=, $15, $23
+; NO-SIMD128-FAST-NEXT:    i32.and $push19=, $pop18, $7
+; NO-SIMD128-FAST-NEXT:    i32.xor $push20=, $pop19, $23
+; NO-SIMD128-FAST-NEXT:    i32.store16 12($0), $pop20
+; NO-SIMD128-FAST-NEXT:    i32.xor $push21=, $16, $24
+; NO-SIMD128-FAST-NEXT:    i32.and $push22=, $pop21, $8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push23=, $pop22, $24
+; NO-SIMD128-FAST-NEXT:    i32.store16 14($0), $pop23
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <8 x i16> %v1, %c
   %inv_mask = xor <8 x i16>
@@ -9453,59 +9357,43 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v4i32:
 ; NO-SIMD128:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-NEXT:    i32.xor $push2=, $4, $pop1
-; NO-SIMD128-NEXT:    i32.and $push3=, $pop2, $12
-; NO-SIMD128-NEXT:    i32.and $push0=, $4, $8
-; NO-SIMD128-NEXT:    i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT:    i32.store 12($0), $pop4
-; NO-SIMD128-NEXT:    i32.const $push19=, -1
-; NO-SIMD128-NEXT:    i32.xor $push6=, $3, $pop19
-; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $11
-; NO-SIMD128-NEXT:    i32.and $push5=, $3, $7
-; NO-SIMD128-NEXT:    i32.or $push8=, $pop7, $pop5
-; NO-SIMD128-NEXT:    i32.store 8($0), $pop8
-; NO-SIMD128-NEXT:    i32.const $push18=, -1
-; NO-SIMD128-NEXT:    i32.xor $push10=, $2, $pop18
-; NO-SIMD128-NEXT:    i32.and $push11=, $pop10, $10
-; NO-SIMD128-NEXT:    i32.and $push9=, $2, $6
-; NO-SIMD128-NEXT:    i32.or $push12=, $pop11, $pop9
-; NO-SIMD128-NEXT:    i32.store 4($0), $pop12
-; NO-SIMD128-NEXT:    i32.const $push17=, -1
-; NO-SIMD128-NEXT:    i32.xor $push14=, $1, $pop17
-; NO-SIMD128-NEXT:    i32.and $push15=, $pop14, $9
-; NO-SIMD128-NEXT:    i32.and $push13=, $1, $5
-; NO-SIMD128-NEXT:    i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-NEXT:    i32.store 0($0), $pop16
+; NO-SIMD128-NEXT:    i32.xor $push0=, $8, $12
+; NO-SIMD128-NEXT:    i32.and $push1=, $pop0, $4
+; NO-SIMD128-NEXT:    i32.xor $push2=, $pop1, $12
+; NO-SIMD128-NEXT:    i32.store 12($0), $pop2
+; NO-SIMD128-NEXT:    i32.xor $push3=, $7, $11
+; NO-SIMD128-NEXT:    i32.and $push4=, $pop3, $3
+; NO-SIMD128-NEXT:    i32.xor $push5=, $pop4, $11
+; NO-SIMD128-NEXT:    i32.store 8($0), $pop5
+; NO-SIMD128-NEXT:    i32.xor $push6=, $6, $10
+; NO-SIMD128-NEXT:    i32.and $push7=, $pop6, $2
+; NO-SIMD128-NEXT:    i32.xor $push8=, $pop7, $10
+; NO-SIMD128-NEXT:    i32.store 4($0), $pop8
+; NO-SIMD128-NEXT:    i32.xor $push9=, $5, $9
+; NO-SIMD128-NEXT:    i32.and $push10=, $pop9, $1
+; NO-SIMD128-NEXT:    i32.xor $push11=, $pop10, $9
+; NO-SIMD128-NEXT:    i32.store 0($0), $pop11
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v4i32:
 ; NO-SIMD128-FAST:         .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i32.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i32.and $push3=, $pop2, $9
-; NO-SIMD128-FAST-NEXT:    i32.and $push0=, $1, $5
-; NO-SIMD128-FAST-NEXT:    i32.or $push4=, $pop3, $pop0
-; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i32.const $push19=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $2, $pop19
-; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $10
-; NO-SIMD128-FAST-NEXT:    i32.and $push5=, $2, $6
-; NO-SIMD128-FAST-NEXT:    i32.or $push8=, $pop7, $pop5
-; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop8
-; NO-SIMD128-FAST-NEXT:    i32.const $push18=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push10=, $3, $pop18
-; NO-SIMD128-FAST-NEXT:    i32.and $push11=, $pop10, $11
-; NO-SIMD128-FAST-NEXT:    i32.and $push9=, $3, $7
-; NO-SIMD128-FAST-NEXT:    i32.or $push12=, $pop11, $pop9
-; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop12
-; NO-SIMD128-FAST-NEXT:    i32.const $push17=, -1
-; NO-SIMD128-FAST-NEXT:    i32.xor $push14=, $4, $pop17
-; NO-SIMD128-FAST-NEXT:    i32.and $push15=, $pop14, $12
-; NO-SIMD128-FAST-NEXT:    i32.and $push13=, $4, $8
-; NO-SIMD128-FAST-NEXT:    i32.or $push16=, $pop15, $pop13
-; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop16
+; NO-SIMD128-FAST-NEXT:    i32.xor $push0=, $5, $9
+; NO-SIMD128-FAST-NEXT:    i32.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i32.xor $push2=, $pop1, $9
+; NO-SIMD128-FAST-NEXT:    i32.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push3=, $6, $10
+; NO-SIMD128-FAST-NEXT:    i32.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i32.xor $push5=, $pop4, $10
+; NO-SIMD128-FAST-NEXT:    i32.store 4($0), $pop5
+; NO-SIMD128-FAST-NEXT:    i32.xor $push6=, $7, $11
+; NO-SIMD128-FAST-NEXT:    i32.and $push7=, $pop6, $3
+; NO-SIMD128-FAST-NEXT:    i32.xor $push8=, $pop7, $11
+; NO-SIMD128-FAST-NEXT:    i32.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i32.xor $push9=, $8, $12
+; NO-SIMD128-FAST-NEXT:    i32.and $push10=, $pop9, $4
+; NO-SIMD128-FAST-NEXT:    i32.xor $push11=, $pop10, $12
+; NO-SIMD128-FAST-NEXT:    i32.store 12($0), $pop11
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <4 x i32> %c, %v1
   %inv_mask = xor <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, %c
@@ -10974,35 +10862,27 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) {
 ; NO-SIMD128-LABEL: bitselect_v2i64:
 ; NO-SIMD128:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-NEXT:  # %bb.0:
-; NO-SIMD128-NEXT:    i64.const $push1=, -1
-; NO-SIMD128-NEXT:    i64.xor $push2=, $2, $pop1
-; NO-SIMD128-NEXT:    i64.and $push3=, $6, $pop2
-; NO-SIMD128-NEXT:    i64.and $push0=, $4, $2
-; NO-SIMD128-NEXT:    i64.or $push4=, $pop3, $pop0
-; NO-SIMD128-NEXT:    i64.store 8($0), $pop4
-; NO-SIMD128-NEXT:    i64.const $push9=, -1
-; NO-SIMD128-NEXT:    i64.xor $push6=, $1, $pop9
-; NO-SIMD128-NEXT:    i64.and $push7=, $5, $pop6
-; NO-SIMD128-NEXT:    i64.and $push5=, $3, $1
-; NO-SIMD128-NEXT:    i64.or $push8=, $pop7, $pop5
-; NO-SIMD128-NEXT:    i64.store 0($0), $pop8
+; NO-SIMD128-NEXT:    i64.xor $push0=, $4, $6
+; NO-SIMD128-NEXT:    i64.and $push1=, $pop0, $2
+; NO-SIMD128-NEXT:    i64.xor $push2=, $pop1, $6
+; NO-SIMD128-NEXT:    i64.store 8($0), $pop2
+; NO-SIMD128-NEXT:    i64.xor $push3=, $3, $5
+; NO-SIMD128-NEXT:    i64.and $push4=, $pop3, $1
+; NO-SIMD128-NEXT:    i64.xor $push5=, $pop4, $5
+; NO-SIMD128-NEXT:    i64.store 0($0), $pop5
 ; NO-SIMD128-NEXT:    return
 ;
 ; NO-SIMD128-FAST-LABEL: bitselect_v2i64:
 ; NO-SIMD128-FAST:         .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> ()
 ; NO-SIMD128-FAST-NEXT:  # %bb.0:
-; NO-SIMD128-FAST-NEXT:    i64.const $push1=, -1
-; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $1, $pop1
-; NO-SIMD128-FAST-NEXT:    i64.and $push3=, $5, $pop2
-; NO-SIMD128-FAST-NEXT:    i64.and $push0=, $3, $1
-; NO-SIMD128-FAST-NEXT:    i64.or $push4=, $pop3, $pop0
-; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop4
-; NO-SIMD128-FAST-NEXT:    i64.const $push9=, -1
-; NO-SIMD128-FAST-NEXT:    i64.xor $push6=, $2, $pop9
-; NO-SIMD128-FAST-NEXT:    i64.and $push7=, $6, $pop6
-; NO-SIMD128-FAST-NEXT:    i64.and $push5=, $4, $2
-; NO-SIMD128-FAST-NEXT:    i64.or $push8=, $pop7, $pop5
-; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop8
+; NO-SIMD128-FAST-NEXT:    i64.xor $push0=, $3, $5
+; NO-SIMD128-FAST-NEXT:    i64.and $push1=, $pop0, $1
+; NO-SIMD128-FAST-NEXT:    i64.xor $push2=, $pop1, $5
+; NO-SIMD128-FAST-NEXT:    i64.store 0($0), $pop2
+; NO-SIMD128-FAST-NEXT:    i64.xor $push3=, $4, $6
+; NO-SIMD128-FAST-NEXT:    i64.and $push4=, $pop3, $2
+; NO-SIMD128-FAST-NEXT:    i64.xor $push5=, $pop4, $6
+; NO-SIMD128-FAST-NEXT:    i64.store 8($0), $pop5
 ; NO-SIMD128-FAST-NEXT:    return
   %masked_v1 = and <2 x i64> %v1, %c
   %inv_mask = xor <2 x i64> <i64 -1, i64 -1>, %c
diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 2922113b14ea9..4fc0827ac4dd6 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
 
 ; PR46472
 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
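 ;
 ; A rough way to read the xor-based checks below: the same select can be
 ; written without not(m) as
 ;   bitselect(a,b,m) == xor(and(xor(b,a),m),a)
 ; since m-set bits give b^a^a = b and m-clear bits give a.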
@@ -17,14 +17,22 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: bitselect_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:    notb %al
-; X64-NEXT:    andb %dil, %al
-; X64-NEXT:    orb %sil, %al
-; X64-NEXT:    retq
+; X64-NOBMI-LABEL: bitselect_i8:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    andl %edx, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI-LABEL: bitselect_i8:
+; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    andnl %edi, %edx, %eax
+; X64-BMI-NEXT:    andl %edx, %esi
+; X64-BMI-NEXT:    orl %esi, %eax
+; X64-BMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-BMI-NEXT:    retq
   %not = xor i8 %m, -1
   %ma = and i8 %a, %not
   %mb = and i8 %b, %m
@@ -35,21 +43,20 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind {
 ; X86-LABEL: bitselect_i16:
 ; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    xorw %ax, %cx
-; X86-NEXT:    andw {{[0-9]+}}(%esp), %cx
+; X86-NEXT:    xorw %cx, %ax
+; X86-NEXT:    andw {{[0-9]+}}(%esp), %ax
 ; X86-NEXT:    xorl %ecx, %eax
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
 ;
 ; X64-NOBMI-LABEL: bitselect_i16:
 ; X64-NOBMI:       # %bb.0:
-; X64-NOBMI-NEXT:    movl %edx, %eax
-; X64-NOBMI-NEXT:    andl %edx, %esi
-; X64-NOBMI-NEXT:    notl %eax
-; X64-NOBMI-NEXT:    andl %edi, %eax
-; X64-NOBMI-NEXT:    orl %esi, %eax
+; X64-NOBMI-NEXT:    movl %esi, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
+; X64-NOBMI-NEXT:    andl %edx, %eax
+; X64-NOBMI-NEXT:    xorl %edi, %eax
 ; X64-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NOBMI-NEXT:    retq
 ;
@@ -186,13 +193,12 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind {
 ;
 ; X64-BMI-LABEL: bitselect_i128:
 ; X64-BMI:       # %bb.0:
-; X64-BMI-NEXT:    andnq %rsi, %r9, %rsi
 ; X64-BMI-NEXT:    andnq %rdi, %r8, %rax
-; X64-BMI-NEXT:    andq %r9, %rcx
-; X64-BMI-NEXT:    orq %rcx, %rsi
 ; X64-BMI-NEXT:    andq %r8, %rdx
 ; X64-BMI-NEXT:    orq %rdx, %rax
-; X64-BMI-NEXT:    movq %rsi, %rdx
+; X64-BMI-NEXT:    andnq %rsi, %r9, %rdx
+; X64-BMI-NEXT:    andq %r9, %rcx
+; X64-BMI-NEXT:    orq %rcx, %rdx
 ; X64-BMI-NEXT:    retq
   %not = xor i128 %m, -1
   %ma = and i128 %a, %not
diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll
index b2614c5fe0493..05e7b2a2de372 100644
--- a/llvm/test/CodeGen/X86/fold-masked-merge.ll
+++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll
@@ -30,18 +30,17 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) {
 define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 ; NOBMI-LABEL: masked_merge1:
 ; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %edi, %eax
-; NOBMI-NEXT:    andl %edi, %esi
-; NOBMI-NEXT:    notl %eax
-; NOBMI-NEXT:    andl %edx, %eax
-; NOBMI-NEXT:    orl %esi, %eax
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
+; NOBMI-NEXT:    andl %edi, %eax
+; NOBMI-NEXT:    xorl %edx, %eax
 ; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; NOBMI-NEXT:    retq
 ;
 ; BMI-LABEL: masked_merge1:
 ; BMI:       # %bb.0:
-; BMI-NEXT:    andl %edi, %esi
 ; BMI-NEXT:    andnl %edx, %edi, %eax
+; BMI-NEXT:    andl %edi, %esi
 ; BMI-NEXT:    orl %esi, %eax
 ; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; BMI-NEXT:    retq
@@ -53,20 +52,11 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) {
 }
 
 define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) {
-; NOBMI-LABEL: masked_merge2:
-; NOBMI:       # %bb.0:
-; NOBMI-NEXT:    movl %esi, %eax
-; NOBMI-NEXT:    # kill: def $al killed $al killed $eax
-; NOBMI-NEXT:    retq
-;
-; BMI-LABEL: masked_merge2:
-; BMI:       # %bb.0:
-; BMI-NEXT:    movl %edi, %eax
-; BMI-NEXT:    notb %al
-; BMI-NEXT:    andb %sil, %al
-; BMI-NEXT:    andb %dil, %sil
-; BMI-NEXT:    orb %sil, %al
-; BMI-NEXT:    retq
+; CHECK-LABEL: masked_merge2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
   %not = xor i8 %a0, -1
   %and0 = and i8 %not, %a1
   %and1 = and i8 %a1, %a0
@@ -279,3 +269,27 @@ define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) {
   store i32 %and1, ptr %p1
   ret i32 %or
 }
+
+define i32 @pr137641_crash({ i8, i32 } %0) {
+; NOBMI-LABEL: pr137641_crash:
+; NOBMI:       # %bb.0:
+; NOBMI-NEXT:    movl %esi, %eax
+; NOBMI-NEXT:    andl $201, %eax
+; NOBMI-NEXT:    xorl $1, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: pr137641_crash:
+; BMI:       # %bb.0:
+; BMI-NEXT:    movl %esi, %eax
+; BMI-NEXT:    notl %eax
+; BMI-NEXT:    andl $1, %eax
+; BMI-NEXT:    andl $200, %esi
+; BMI-NEXT:    orl %esi, %eax
+; BMI-NEXT:    retq
+  %asmresult1.i = extractvalue { i8, i32 } %0, 1
+  %not = xor i32 %asmresult1.i, 1
+  %and = and i32 1, %not
+  %and1 = and i32 %asmresult1.i, 200
+  %2 = or i32 %and, %and1
+  ret i32 %2
+}
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 9c9d06921096c..6a55d740fe421 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,21 +6,18 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notb %al
-; CHECK-NOBMI-NEXT:    andb %sil, %al
-; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %edx, %eax
+; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
 ; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    notb %al
-; CHECK-BMI-NEXT:    andb %sil, %al
-; CHECK-BMI-NEXT:    orb %dil, %al
+; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %mx = and i8 %x, %mask
@@ -33,18 +30,17 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 define i16 @out16(i16 %x, i16 %y, i16 %mask) {
 ; CHECK-NOBMI-LABEL: out16:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notl %eax
-; CHECK-NOBMI-NEXT:    andl %esi, %eax
-; CHECK-NOBMI-NEXT:    orl %edi, %eax
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    andl %edx, %eax
+; CHECK-NOBMI-NEXT:    xorl %esi, %eax
 ; CHECK-NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out16:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andl %edx, %edi
 ; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
+; CHECK-BMI-NEXT:    andl %edx, %edi
 ; CHECK-BMI-NEXT:    orl %edi, %eax
 ; CHECK-BMI-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-BMI-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1c..809c15881cc9b 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,11 +16,10 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notb %al
-; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    orb %dil, %al
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
@@ -37,32 +36,28 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    notb %r9b
-; CHECK-BASELINE-NEXT:    andb %cl, %r9b
-; CHECK-BASELINE-NEXT:    andb %dl, %al
-; CHECK-BASELINE-NEXT:    orb %dil, %al
-; CHECK-BASELINE-NEXT:    orb %sil, %r9b
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    notb %r9b
-; CHECK-SSE1-NEXT:    andb %cl, %r9b
-; CHECK-SSE1-NEXT:    andb %dl, %al
-; CHECK-SSE1-NEXT:    orb %dil, %al
-; CHECK-SSE1-NEXT:    orb %sil, %r9b
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i8:
@@ -86,11 +81,10 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind {
 ; CHECK-LABEL: out_v1i16:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notl %eax
-; CHECK-NEXT:    andl %esi, %eax
-; CHECK-NEXT:    orl %edi, %eax
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    andl %edx, %eax
+; CHECK-NEXT:    xorl %esi, %eax
 ; CHECK-NEXT:    # kill: def $ax killed $ax killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i16> %x, %mask
@@ -235,32 +229,28 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i16:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    andl %r8d, %eax
+; CHECK-BASELINE-NEXT:    xorl %edx, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notl %eax
-; CHECK-BASELINE-NEXT:    notl %r9d
-; CHECK-BASELINE-NEXT:    andl %ecx, %r9d
-; CHECK-BASELINE-NEXT:    orl %esi, %r9d
-; CHECK-BASELINE-NEXT:    andl %edx, %eax
-; CHECK-BASELINE-NEXT:    orl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
 ; CHECK-BASELINE-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i16:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    andl %r8d, %eax
+; CHECK-SSE1-NEXT:    xorl %edx, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notl %eax
-; CHECK-SSE1-NEXT:    notl %r9d
-; CHECK-SSE1-NEXT:    andl %ecx, %r9d
-; CHECK-SSE1-NEXT:    orl %esi, %r9d
-; CHECK-SSE1-NEXT:    andl %edx, %eax
-; CHECK-SSE1-NEXT:    orl %edi, %eax
+; CHECK-SSE1-NEXT:    xorl %ecx, %esi
 ; CHECK-SSE1-NEXT:    # kill: def $ax killed $ax killed $eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i16:
@@ -439,9 +429,12 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-LABEL: out_v4i16:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %edx
@@ -451,21 +444,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %edx
@@ -475,13 +468,10 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16:
@@ -506,43 +496,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n
 ; CHECK-BASELINE-LABEL: out_v4i16_undef:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %edx
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
 ; CHECK-BASELINE-NEXT:    movw %cx, 4(%rax)
-; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-BASELINE-NEXT:    movw %dx, 2(%rax)
+; CHECK-BASELINE-NEXT:    movw %si, (%rax)
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v4i16_undef:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %cx
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
+; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %dx
 ; CHECK-SSE1-NEXT:    xorl %r10d, %edx
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r8w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r8d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
-; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
 ; CHECK-SSE1-NEXT:    movw %cx, 4(%rax)
-; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    movw %r8w, 6(%rax)
 ; CHECK-SSE1-NEXT:    movw %dx, 2(%rax)
+; CHECK-SSE1-NEXT:    movw %si, (%rax)
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v4i16_undef:
@@ -883,14 +873,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
 ; CHECK-BASELINE-NEXT:    movq %rdi, %rax
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
-; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-BASELINE-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-BASELINE-NEXT:    xorl %r12d, %esi
@@ -906,16 +896,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-BASELINE-NEXT:    xorl %ebx, %r9d
-; CHECK-BASELINE-NEXT:    movl %r11d, %ebx
-; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-BASELINE-NEXT:    xorw %r11w, %bx
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-BASELINE-NEXT:    xorl %r11d, %ebx
-; CHECK-BASELINE-NEXT:    movl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-BASELINE-NEXT:    xorl %r10d, %r11d
-; CHECK-BASELINE-NEXT:    movl %edi, %r10d
-; CHECK-BASELINE-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-BASELINE-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-BASELINE-NEXT:    xorw %di, %r10w
 ; CHECK-BASELINE-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-BASELINE-NEXT:    xorl %edi, %r10d
 ; CHECK-BASELINE-NEXT:    movw %r10w, 14(%rax)
@@ -941,14 +931,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
 ; CHECK-SSE1-NEXT:    movq %rdi, %rax
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %edi
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebp
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r14d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r15d
-; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r12d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %edi
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %ebp
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r14d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r15d
+; CHECK-SSE1-NEXT:    movl {{[0-9]+}}(%rsp), %r12d
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %si
 ; CHECK-SSE1-NEXT:    xorl %r12d, %esi
@@ -964,16 +954,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r9w
 ; CHECK-SSE1-NEXT:    xorl %ebx, %r9d
-; CHECK-SSE1-NEXT:    movl %r11d, %ebx
-; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %bx
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %ebx
+; CHECK-SSE1-NEXT:    xorw %r11w, %bx
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %bx
 ; CHECK-SSE1-NEXT:    xorl %r11d, %ebx
-; CHECK-SSE1-NEXT:    movl %r10d, %r11d
-; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r11w
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r11d
+; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r11w
 ; CHECK-SSE1-NEXT:    xorl %r10d, %r11d
-; CHECK-SSE1-NEXT:    movl %edi, %r10d
-; CHECK-SSE1-NEXT:    xorw {{[0-9]+}}(%rsp), %r10w
+; CHECK-SSE1-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
+; CHECK-SSE1-NEXT:    xorw %di, %r10w
 ; CHECK-SSE1-NEXT:    andw {{[0-9]+}}(%rsp), %r10w
 ; CHECK-SSE1-NEXT:    xorl %edi, %r10d
 ; CHECK-SSE1-NEXT:    movw %r10w, 14(%rax)
@@ -1759,113 +1749,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-BASELINE-NEXT:    pushq %r13
 ; CHECK-BASELINE-NEXT:    pushq %r12
 ; CHECK-BASELINE-NEXT:    pushq %rbx
-; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %r15d
-; CHECK-BASELINE-NEXT:    movzwl 16(%rdx), %r14d
-; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %ebp
-; CHECK-BASELINE-NEXT:    movzwl 12(%rdx), %ebx
-; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movzwl 8(%rdx), %r11d
-; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movzwl 4(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    movzwl (%rdx), %r8d
-; CHECK-BASELINE-NEXT:    movzwl 2(%rdx), %r12d
-; CHECK-BASELINE-NEXT:    movzwl (%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
-; CHECK-BASELINE-NEXT:    andw (%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
-; CHECK-BASELINE-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 2(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r12w, %ax
-; CHECK-BASELINE-NEXT:    andw 2(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r12d
-; CHECK-BASELINE-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
-; CHECK-BASELINE-NEXT:    andw 4(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
-; CHECK-BASELINE-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 6(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r10w, %ax
-; CHECK-BASELINE-NEXT:    andw 6(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r10d
-; CHECK-BASELINE-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 8(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r11w, %ax
-; CHECK-BASELINE-NEXT:    andw 8(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r11d
-; CHECK-BASELINE-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 10(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
-; CHECK-BASELINE-NEXT:    andw 10(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
-; CHECK-BASELINE-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-BASELINE-NEXT:    movzwl 12(%rsi), %eax
+; CHECK-BASELINE-NEXT:    movq %rcx, %r10
+; CHECK-BASELINE-NEXT:    movq %rdx, %r8
+; CHECK-BASELINE-NEXT:    movq %rsi, %r9
+; CHECK-BASELINE-NEXT:    movq %rdi, %r11
+; CHECK-BASELINE-NEXT:    movzwl 18(%rdx), %ebp
+; CHECK-BASELINE-NEXT:    movl 16(%rdx), %r15d
+; CHECK-BASELINE-NEXT:    movzwl 14(%rdx), %r13d
+; CHECK-BASELINE-NEXT:    movl 12(%rdx), %r12d
+; CHECK-BASELINE-NEXT:    movzwl 10(%rdx), %r14d
+; CHECK-BASELINE-NEXT:    movl 8(%rdx), %ebx
+; CHECK-BASELINE-NEXT:    movzwl 6(%rdx), %eax
+; CHECK-BASELINE-NEXT:    movl (%rdx), %ecx
+; CHECK-BASELINE-NEXT:    movl 4(%rdx), %edx
+; CHECK-BASELINE-NEXT:    movzwl 2(%r8), %esi
+; CHECK-BASELINE-NEXT:    movzwl (%r9), %edi
+; CHECK-BASELINE-NEXT:    xorw %cx, %di
+; CHECK-BASELINE-NEXT:    andw (%r10), %di
+; CHECK-BASELINE-NEXT:    xorl %ecx, %edi
+; CHECK-BASELINE-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 2(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %si, %cx
+; CHECK-BASELINE-NEXT:    andw 2(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %esi, %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 4(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %dx, %cx
+; CHECK-BASELINE-NEXT:    andw 4(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %edx, %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 6(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %ax, %cx
+; CHECK-BASELINE-NEXT:    andw 6(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
+; CHECK-BASELINE-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 8(%r9), %eax
 ; CHECK-BASELINE-NEXT:    xorw %bx, %ax
-; CHECK-BASELINE-NEXT:    andw 12(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %ebx
-; CHECK-BASELINE-NEXT:    movzwl 14(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %bp, %ax
-; CHECK-BASELINE-NEXT:    andw 14(%rcx), %ax
+; CHECK-BASELINE-NEXT:    andw 8(%r10), %ax
+; CHECK-BASELINE-NEXT:    xorl %ebx, %eax
+; CHECK-BASELINE-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-BASELINE-NEXT:    movzwl 10(%r9), %ebx
+; CHECK-BASELINE-NEXT:    xorw %r14w, %bx
+; CHECK-BASELINE-NEXT:    andw 10(%r10), %bx
+; CHECK-BASELINE-NEXT:    xorl %r14d, %ebx
+; CHECK-BASELINE-NEXT:    movzwl 12(%r9), %r14d
+; CHECK-BASELINE-NEXT:    xorw %r12w, %r14w
+; CHECK-BASELINE-NEXT:    andw 12(%r10), %r14w
+; CHECK-BASELINE-NEXT:    xorl %r12d, %r14d
+; CHECK-BASELINE-NEXT:    movzwl 14(%r9), %r12d
+; CHECK-BASELINE-NEXT:    xorw %r13w, %r12w
+; CHECK-BASELINE-NEXT:    andw 14(%r10), %r12w
+; CHECK-BASELINE-NEXT:    xorl %r13d, %r12d
+; CHECK-BASELINE-NEXT:    movzwl 16(%r9), %r13d
+; CHECK-BASELINE-NEXT:    xorw %r15w, %r13w
+; CHECK-BASELINE-NEXT:    andw 16(%r10), %r13w
+; CHECK-BASELINE-NEXT:    xorl %r15d, %r13d
+; CHECK-BASELINE-NEXT:    movzwl 18(%r9), %r15d
+; CHECK-BASELINE-NEXT:    xorw %bp, %r15w
+; CHECK-BASELINE-NEXT:    andw 18(%r10), %r15w
+; CHECK-BASELINE-NEXT:    xorl %ebp, %r15d
+; CHECK-BASELINE-NEXT:    movl 20(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 20(%r9), %ebp
+; CHECK-BASELINE-NEXT:    xorw %ax, %bp
+; CHECK-BASELINE-NEXT:    andw 20(%r10), %bp
 ; CHECK-BASELINE-NEXT:    xorl %eax, %ebp
-; CHECK-BASELINE-NEXT:    movzwl 16(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r14w, %ax
-; CHECK-BASELINE-NEXT:    andw 16(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r14d
-; CHECK-BASELINE-NEXT:    movzwl 18(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r15w, %ax
-; CHECK-BASELINE-NEXT:    andw 18(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r15d
-; CHECK-BASELINE-NEXT:    movzwl 20(%rdx), %r13d
-; CHECK-BASELINE-NEXT:    movzwl 20(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r13w, %ax
-; CHECK-BASELINE-NEXT:    andw 20(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r13d
-; CHECK-BASELINE-NEXT:    movzwl 22(%rdx), %r9d
-; CHECK-BASELINE-NEXT:    movzwl 22(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r9w, %ax
-; CHECK-BASELINE-NEXT:    andw 22(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r9d
-; CHECK-BASELINE-NEXT:    movzwl 24(%rdx), %r8d
-; CHECK-BASELINE-NEXT:    movzwl 24(%rsi), %eax
-; CHECK-BASELINE-NEXT:    xorw %r8w, %ax
-; CHECK-BASELINE-NEXT:    andw 24(%rcx), %ax
-; CHECK-BASELINE-NEXT:    xorl %eax, %r8d
-; CHECK-BASELINE-NEXT:    movzwl 26(%rdx), %eax
-; CHECK-BASELINE-NEXT:    movzwl 26(%rsi), %r10d
-; CHECK-BASELINE-NEXT:    xorw %ax, %r10w
-; CHECK-BASELINE-NEXT:    andw 26(%rcx), %r10w
-; CHECK-BASELINE-NEXT:    xorl %r10d, %eax
-; CHECK-BASELINE-NEXT:    movzwl 28(%rdx), %r10d
-; CHECK-BASELINE-NEXT:    movzwl 28(%rsi), %r11d
-; CHECK-BASELINE-NEXT:    xorw %r10w, %r11w
-; CHECK-BASELINE-NEXT:    andw 28(%rcx), %r11w
-; CHECK-BASELINE-NEXT:    xorl %r11d, %r10d
-; CHECK-BASELINE-NEXT:    movzwl 30(%rdx), %edx
-; CHECK-BASELINE-NEXT:    movzwl 30(%rsi), %esi
-; CHECK-BASELINE-NEXT:    xorw %dx, %si
-; CHECK-BASELINE-NEXT:    andw 30(%rcx), %si
-; CHECK-BASELINE-NEXT:    xorl %esi, %edx
-; CHECK-BASELINE-NEXT:    movw %dx, 30(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r10w, 28(%rdi)
-; CHECK-BASELINE-NEXT:    movw %ax, 26(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r8w, 24(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r9w, 22(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r13w, 20(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r15w, 18(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r14w, 16(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bp, 14(%rdi)
-; CHECK-BASELINE-NEXT:    movw %bx, 12(%rdi)
+; CHECK-BASELINE-NEXT:    movzwl 22(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 22(%r9), %esi
+; CHECK-BASELINE-NEXT:    xorw %ax, %si
+; CHECK-BASELINE-NEXT:    andw 22(%r10), %si
+; CHECK-BASELINE-NEXT:    xorl %eax, %esi
+; CHECK-BASELINE-NEXT:    movl 24(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 24(%r9), %edx
+; CHECK-BASELINE-NEXT:    xorw %ax, %dx
+; CHECK-BASELINE-NEXT:    andw 24(%r10), %dx
+; CHECK-BASELINE-NEXT:    xorl %eax, %edx
+; CHECK-BASELINE-NEXT:    movzwl 26(%r8), %eax
+; CHECK-BASELINE-NEXT:    movzwl 26(%r9), %ecx
+; CHECK-BASELINE-NEXT:    xorw %ax, %cx
+; CHECK-BASELINE-NEXT:    andw 26(%r10), %cx
+; CHECK-BASELINE-NEXT:    xorl %eax, %ecx
+; CHECK-BASELINE-NEXT:    movl 28(%r8), %edi
+; CHECK-BASELINE-NEXT:    movzwl 28(%r9), %eax
+; CHECK-BASELINE-NEXT:    xorw %di, %ax
+; CHECK-BASELINE-NEXT:    andw 28(%r10), %ax
+; CHECK-BASELINE-NEXT:    xorl %edi, %eax
+; CHECK-BASELINE-NEXT:    movzwl 30(%r8), %edi
+; CHECK-BASELINE-NEXT:    movzwl 30(%r9), %r8d
+; CHECK-BASELINE-NEXT:    xorw %di, %r8w
+; CHECK-BASELINE-NEXT:    andw 30(%r10), %r8w
+; CHECK-BASELINE-NEXT:    xorl %edi, %r8d
+; CHECK-BASELINE-NEXT:    movw %r8w, 30(%r11)
+; CHECK-BASELINE-NEXT:    movw %ax, 28(%r11)
+; CHECK-BASELINE-NEXT:    movw %cx, 26(%r11)
+; CHECK-BASELINE-NEXT:    movw %dx, 24(%r11)
+; CHECK-BASELINE-NEXT:    movw %si, 22(%r11)
+; CHECK-BASELINE-NEXT:    movw %bp, 20(%r11)
+; CHECK-BASELINE-NEXT:    movw %r15w, 18(%r11)
+; CHECK-BASELINE-NEXT:    movw %r13w, 16(%r11)
+; CHECK-BASELINE-NEXT:    movw %r12w, 14(%r11)
+; CHECK-BASELINE-NEXT:    movw %r14w, 12(%r11)
+; CHECK-BASELINE-NEXT:    movw %bx, 10(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 10(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 8(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 8(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 6(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 6(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 4(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, 4(%rdi)
-; CHECK-BASELINE-NEXT:    movw %r12w, 2(%rdi)
+; CHECK-BASELINE-NEXT:    movw %ax, 2(%r11)
 ; CHECK-BASELINE-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-BASELINE-NEXT:    movw %ax, (%rdi)
-; CHECK-BASELINE-NEXT:    movq %rdi, %rax
+; CHECK-BASELINE-NEXT:    movw %ax, (%r11)
+; CHECK-BASELINE-NEXT:    movq %r11, %rax
 ; CHECK-BASELINE-NEXT:    popq %rbx
 ; CHECK-BASELINE-NEXT:    popq %r12
 ; CHECK-BASELINE-NEXT:    popq %r13
@@ -1882,113 +1876,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind {
 ; CHECK-SSE1-NEXT:    pushq %r13
 ; CHECK-SSE1-NEXT:    pushq %r12
 ; CHECK-SSE1-NEXT:    pushq %rbx
-; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %r15d
-; CHECK-SSE1-NEXT:    movzwl 16(%rdx), %r14d
-; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %ebp
-; CHECK-SSE1-NEXT:    movzwl 12(%rdx), %ebx
-; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movzwl 8(%rdx), %r11d
-; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movzwl 4(%rdx), %r9d
-; CHECK-SSE1-NEXT:    movzwl (%rdx), %r8d
-; CHECK-SSE1-NEXT:    movzwl 2(%rdx), %r12d
-; CHECK-SSE1-NEXT:    movzwl (%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r8w, %ax
-; CHECK-SSE1-NEXT:    andw (%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r8d
-; CHECK-SSE1-NEXT:    movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 2(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r12w, %ax
-; CHECK-SSE1-NEXT:    andw 2(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r12d
-; CHECK-SSE1-NEXT:    movzwl 4(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r9w, %ax
-; CHECK-SSE1-NEXT:    andw 4(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r9d
-; CHECK-SSE1-NEXT:    movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 6(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r10w, %ax
-; CHECK-SSE1-NEXT:    andw 6(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r10d
-; CHECK-SSE1-NEXT:    movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 8(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r11w, %ax
-; CHECK-SSE1-NEXT:    andw 8(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r11d
-; CHECK-SSE1-NEXT:    movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 10(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r13w, %ax
-; CHECK-SSE1-NEXT:    andw 10(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r13d
-; CHECK-SSE1-NEXT:    movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
-; CHECK-SSE1-NEXT:    movzwl 12(%rsi), %eax
+; CHECK-SSE1-NEXT:    movq %rcx, %r10
+; CHECK-SSE1-NEXT:    movq %rdx, %r8
+; CHECK-SSE1-NEXT:    movq %rsi, %r9
+; CHECK-SSE1-NEXT:    movq %rdi, %r11
+; CHECK-SSE1-NEXT:    movzwl 18(%rdx), %ebp
+; CHECK-SSE1-NEXT:    movl 16(%rdx), %r15d
+; CHECK-SSE1-NEXT:    movzwl 14(%rdx), %r13d
+; CHECK-SSE1-NEXT:    movl 12(%rdx), %r12d
+; CHECK-SSE1-NEXT:    movzwl 10(%rdx), %r14d
+; CHECK-SSE1-NEXT:    movl 8(%rdx), %ebx
+; CHECK-SSE1-NEXT:    movzwl 6(%rdx), %eax
+; CHECK-SSE1-NEXT:    movl (%rdx), %ecx
+; CHECK-SSE1-NEXT:    movl 4(%rdx), %edx
+; CHECK-SSE1-NEXT:    movzwl 2(%r8), %esi
+; CHECK-SSE1-NEXT:    movzwl (%r9), %edi
+; CHECK-SSE1-NEXT:    xorw %cx, %di
+; CHECK-SSE1-NEXT:    andw (%r10), %di
+; CHECK-SSE1-NEXT:    xorl %ecx, %edi
+; CHECK-SSE1-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 2(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %si, %cx
+; CHECK-SSE1-NEXT:    andw 2(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %esi, %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 4(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %dx, %cx
+; CHECK-SSE1-NEXT:    andw 4(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %edx, %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 6(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %ax, %cx
+; CHECK-SSE1-NEXT:    andw 6(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %eax, %ecx
+; CHECK-SSE1-NEXT:    movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 8(%r9), %eax
 ; CHECK-SSE1-NEXT:    xorw %bx, %ax
-; CHECK-SSE1-NEXT:    andw 12(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %ebx
-; CHECK-SSE1-NEXT:    movzwl 14(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %bp, %ax
-; CHECK-SSE1-NEXT:    andw 14(%rcx), %ax
+; CHECK-SSE1-NEXT:    andw 8(%r10), %ax
+; CHECK-SSE1-NEXT:    xorl %ebx, %eax
+; CHECK-SSE1-NEXT:    movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-SSE1-NEXT:    movzwl 10(%r9), %ebx
+; CHECK-SSE1-NEXT:    xorw %r14w, %bx
+; CHECK-SSE1-NEXT:    andw 10(%r10), %bx
+; CHECK-SSE1-NEXT:    xorl %r14d, %ebx
+; CHECK-SSE1-NEXT:    movzwl 12(%r9), %r14d
+; CHECK-SSE1-NEXT:    xorw %r12w, %r14w
+; CHECK-SSE1-NEXT:    andw 12(%r10), %r14w
+; CHECK-SSE1-NEXT:    xorl %r12d, %r14d
+; CHECK-SSE1-NEXT:    movzwl 14(%r9), %r12d
+; CHECK-SSE1-NEXT:    xorw %r13w, %r12w
+; CHECK-SSE1-NEXT:    andw 14(%r10), %r12w
+; CHECK-SSE1-NEXT:    xorl %r13d, %r12d
+; CHECK-SSE1-NEXT:    movzwl 16(%r9), %r13d
+; CHECK-SSE1-NEXT:    xorw %r15w, %r13w
+; CHECK-SSE1-NEXT:    andw 16(%r10), %r13w
+; CHECK-SSE1-NEXT:    xorl %r15d, %r13d
+; CHECK-SSE1-NEXT:    movzwl 18(%r9), %r15d
+; CHECK-SSE1-NEXT:    xorw %bp, %r15w
+; CHECK-SSE1-NEXT:    andw 18(%r10), %r15w
+; CHECK-SSE1-NEXT:    xorl %ebp, %r15d
+; CHECK-SSE1-NEXT:    movl 20(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 20(%r9), %ebp
+; CHECK-SSE1-NEXT:    xorw %ax, %bp
+; CHECK-SSE1-NEXT:    andw 20(%r10), %bp
 ; CHECK-SSE1-NEXT:    xorl %eax, %ebp
-; CHECK-SSE1-NEXT:    movzwl 16(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r14w, %ax
-; CHECK-SSE1-NEXT:    andw 16(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r14d
-; CHECK-SSE1-NEXT:    movzwl 18(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r15w, %ax
-; CHECK-SSE1-NEXT:    andw 18(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r15d
-; CHECK-SSE1-NEXT:    movzwl 20(%rdx), %r13d
-; CHECK-SSE1-NEXT:    movzwl 20(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r13w, %ax
-; CHECK-SSE1-NEXT:    andw 20(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r13d
-; CHECK-SSE1-NEXT:    movzwl 22(%rdx), %r9d
-; CHECK-SSE1-NEXT:    movzwl 22(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r9w, %ax
-; CHECK-SSE1-NEXT:    andw 22(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r9d
-; CHECK-SSE1-NEXT:    movzwl 24(%rdx), %r8d
-; CHECK-SSE1-NEXT:    movzwl 24(%rsi), %eax
-; CHECK-SSE1-NEXT:    xorw %r8w, %ax
-; CHECK-SSE1-NEXT:    andw 24(%rcx), %ax
-; CHECK-SSE1-NEXT:    xorl %eax, %r8d
-; CHECK-SSE1-NEXT:    movzwl 26(%rdx), %eax
-; CHECK-SSE1-NEXT:    movzwl 26(%rsi), %r10d
-; CHECK-SSE1-NEXT:    xorw %ax, %r10w
-; CHECK-SSE1-NEXT:    andw 26(%rcx), %r10w
-; CHECK-SSE1-NEXT:    xorl %r10d, %eax
-; CHECK-SSE1-NEXT:    movzwl 28(%rdx), %r10d
-; CHECK-SSE1-NEXT:    movzwl 28(%rsi), %r11d
-; CHECK-SSE1-NEXT:    xorw %r10w, %r11w
-; CHECK-SSE1-NEXT:    andw 28(%rcx), %r11w
-; CHECK-SSE1-NEXT:    xorl %r11d, %r10d
-; CHECK-SSE1-NEXT:    movzwl 30(%rdx), %edx
-; CHECK-SSE1-NEXT:    movzwl 30(%rsi), %esi
-; CHECK-SSE1-NEXT:    xorw %dx, %si
-; CHECK-SSE1-NEXT:    andw 30(%rcx), %si
-; CHECK-SSE1-NEXT:    xorl %esi, %edx
-; CHECK-SSE1-NEXT:    movw %dx, 30(%rdi)
-; CHECK-SSE1-NEXT:    movw %r10w, 28(%rdi)
-; CHECK-SSE1-NEXT:    movw %ax, 26(%rdi)
-; CHECK-SSE1-NEXT:    movw %r8w, 24(%rdi)
-; CHECK-SSE1-NEXT:    movw %r9w, 22(%rdi)
-; CHECK-SSE1-NEXT:    movw %r13w, 20(%rdi)
-; CHECK-SSE1-NEXT:    movw %r15w, 18(%rdi)
-; CHECK-SSE1-NEXT:    movw %r14w, 16(%rdi)
-; CHECK-SSE1-NEXT:    movw %bp, 14(%rdi)
-; CHECK-SSE1-NEXT:    movw %bx, 12(%rdi)
+; CHECK-SSE1-NEXT:    movzwl 22(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 22(%r9), %esi
+; CHECK-SSE1-NEXT:    xorw %ax, %si
+; CHECK-SSE1-NEXT:    andw 22(%r10), %si
+; CHECK-SSE1-NEXT:    xorl %eax, %esi
+; CHECK-SSE1-NEXT:    movl 24(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 24(%r9), %edx
+; CHECK-SSE1-NEXT:    xorw %ax, %dx
+; CHECK-SSE1-NEXT:    andw 24(%r10), %dx
+; CHECK-SSE1-NEXT:    xorl %eax, %edx
+; CHECK-SSE1-NEXT:    movzwl 26(%r8), %eax
+; CHECK-SSE1-NEXT:    movzwl 26(%r9), %ecx
+; CHECK-SSE1-NEXT:    xorw %ax, %cx
+; CHECK-SSE1-NEXT:    andw 26(%r10), %cx
+; CHECK-SSE1-NEXT:    xorl %eax, %ecx
+; CHECK-SSE1-NEXT:    movl 28(%r8), %edi
+; CHECK-SSE1-NEXT:    movzwl 28(%r9), %eax
+; CHECK-SSE1-NEXT:    xorw %di, %ax
+; CHECK-SSE1-NEXT:    andw 28(%r10), %ax
+; CHECK-SSE1-NEXT:    xorl %edi, %eax
+; CHECK-SSE1-NEXT:    movzwl 30(%r8), %edi
+; CHECK-SSE1-NEXT:    movzwl 30(%r9), %r8d
+; CHECK-SSE1-NEXT:    xorw %di, %r8w
+; CHECK-SSE1-NEXT:    andw 30(%r10), %r8w
+; CHECK-SSE1-NEXT:    xorl %edi, %r8d
+; CHECK-SSE1-NEXT:    movw %r8w, 30(%r11)
+; CHECK-SSE1-NEXT:    movw %ax, 28(%r11)
+; CHECK-SSE1-NEXT:    movw %cx, 26(%r11)
+; CHECK-SSE1-NEXT:    movw %dx, 24(%r11)
+; CHECK-SSE1-NEXT:    movw %si, 22(%r11)
+; CHECK-SSE1-NEXT:    movw %bp, 20(%r11)
+; CHECK-SSE1-NEXT:    movw %r15w, 18(%r11)
+; CHECK-SSE1-NEXT:    movw %r13w, 16(%r11)
+; CHECK-SSE1-NEXT:    movw %r12w, 14(%r11)
+; CHECK-SSE1-NEXT:    movw %r14w, 12(%r11)
+; CHECK-SSE1-NEXT:    movw %bx, 10(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 10(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 8(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 8(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 6(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 6(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 4(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, 4(%rdi)
-; CHECK-SSE1-NEXT:    movw %r12w, 2(%rdi)
+; CHECK-SSE1-NEXT:    movw %ax, 2(%r11)
 ; CHECK-SSE1-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload
-; CHECK-SSE1-NEXT:    movw %ax, (%rdi)
-; CHECK-SSE1-NEXT:    movq %rdi, %rax
+; CHECK-SSE1-NEXT:    movw %ax, (%r11)
+; CHECK-SSE1-NEXT:    movq %r11, %rax
 ; CHECK-SSE1-NEXT:    popq %rbx
 ; CHECK-SSE1-NEXT:    popq %r12
 ; CHECK-SSE1-NEXT:    popq %r13

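For reference, every rewritten NOBMI sequence above computes the same masked
merge, just without materializing the inverted mask: instead of
`(x & m) | (y & ~m)` with an explicit `not`, the generated code uses the
equivalent xor-and-xor form `((x ^ y) & m) ^ y` (BMI targets instead fold the
inversion into `andnl`). A small standalone C++ check of that identity,
illustrative only and not part of the patch:

```
#include <cassert>
#include <cstdint>

// Classic masked merge: bits of x where the mask is set, bits of y elsewhere.
static uint32_t mergeClassic(uint32_t x, uint32_t y, uint32_t m) {
  return (x & m) | (y & ~m);
}

// The xor-and-xor form matching the updated NOBMI CHECK lines: a mov, two
// xors, and an and, with no separate 'not' of the mask.
static uint32_t mergeXorForm(uint32_t x, uint32_t y, uint32_t m) {
  return ((x ^ y) & m) ^ y;
}

int main() {
  const uint32_t vals[] = {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu};
  for (uint32_t x : vals)
    for (uint32_t y : vals)
      for (uint32_t m : vals)
        assert(mergeClassic(x, y, m) == mergeXorForm(x, y, m));
  return 0;
}
```
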
>From 937be177528de156922c1b5f6cab08ba3009dbf2 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy at amd.com>
Date: Wed, 11 Jun 2025 10:10:22 +0200
Subject: [PATCH 55/61] [flang] Enable delayed localization by default for `do
 concurrent` (#142567)

This PR is kept small and self-contained so that the switch/flag can be
easily reverted if we discover any problems with enabling it by default.
---
 flang/lib/Lower/Bridge.cpp                            | 6 +-----
 flang/test/Lower/do_concurrent_delayed_locality.f90   | 2 +-
 flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +-
 flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +-
 flang/test/Lower/loops.f90                            | 2 +-
 flang/test/Lower/loops3.f90                           | 2 +-
 6 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 64b16b3abe991..5ff8101dba097 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2033,11 +2033,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     fir::LocalitySpecifierOperands privateClauseOps;
     auto doConcurrentLoopOp =
         mlir::dyn_cast_if_present<fir::DoConcurrentLoopOp>(info.loopOp);
-    // TODO Promote to using `enableDelayedPrivatization` (which is enabled by
-    // default unlike the staging flag) once the implementation of this is more
-    // complete.
-    bool useDelayedPriv =
-        enableDelayedPrivatizationStaging && doConcurrentLoopOp;
+    bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp;
     llvm::SetVector<const Fortran::semantics::Symbol *> allPrivatizedSymbols;
     llvm::SmallSet<const Fortran::semantics::Symbol *, 16> mightHaveReadHostSym;
 
diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90
index 6cae0eb46db13..039b17808d19e 100644
--- a/flang/test/Lower/do_concurrent_delayed_locality.f90
+++ b/flang/test/Lower/do_concurrent_delayed_locality.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine do_concurrent_with_locality_specs
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
index a3d0c34ed8569..67f080eb2c1c5 100644
--- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90
+++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90
@@ -1,4 +1,4 @@
-! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s
 
 subroutine local_assoc
   implicit none
diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90
index d643213854744..798cbb335c8c0 100644
--- a/flang/test/Lower/do_concurrent_local_default_init.f90
+++ b/flang/test/Lower/do_concurrent_local_default_init.f90
@@ -1,5 +1,5 @@
 ! Test default initialization of DO CONCURRENT LOCAL() entities.
-! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s
+! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s
 
 subroutine test_ptr(p)
   interface
diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90
index 60df27a591dc3..64f14ff972272 100644
--- a/flang/test/Lower/loops.f90
+++ b/flang/test/Lower/loops.f90
@@ -1,4 +1,4 @@
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test
diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90
index 84db1972cca16..34d7bcfb7d7ad 100644
--- a/flang/test/Lower/loops3.f90
+++ b/flang/test/Lower/loops3.f90
@@ -1,5 +1,5 @@
 ! Test do concurrent reduction
-! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s
+! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s
 
 ! CHECK-LABEL: loop_test
 subroutine loop_test

>From afbcf9529a1edb88d067e6fca8d9534901310d5e Mon Sep 17 00:00:00 2001
From: CHANDRA GHALE <chandra.nitdgp at gmail.com>
Date: Wed, 11 Jun 2025 14:01:31 +0530
Subject: [PATCH 56/61] [OpenMP 6.0] Codegen for Reduction over private
 variables with reduction clause (#134709)

Codegen support for reduction over a private variable with the reduction
clause, as described in Section 7.6.10 of the OpenMP 6.0 spec.
- An internal shared copy is initialized with an initializer value.
- The shared copy is updated by combining its value with the values from
the private copies created by the clause.
- Once an encountering thread verifies that all updates are complete,
its original list item is updated by merging its value with that of the
shared copy, and the result is then broadcast to all threads.

Sample Test Case from OpenMP 6.0 Example
```
#include <assert.h>
#include <omp.h>
#define N 10

void do_red(int n, int *v, int &sum_v)
{
    sum_v = 0; // sum_v is private
    #pragma omp for reduction(original(private),+: sum_v)
    for (int i = 0; i < n; i++)
    {
        sum_v += v[i];
    }
}

int main(void)
{
    int v[N];
    for (int i = 0; i < N; i++)
        v[i] = i;
    #pragma omp parallel num_threads(4)
    {
        int s_v; // s_v is private
        do_red(N, v, s_v);
        assert(s_v == 45);
    }
    return 0;
}
```
Expected Codegen:
```
 // A shared global/static variable is introduced for the reduction result.
 // This variable is initialized (e.g., using memset or a UDR initializer)
 // e.g., .omp.reduction.internal_private_var

 // Barrier before any thread performs combination
  call void @__kmpc_barrier(...)

 // Initialization block (executed by thread 0)
 // e.g., call void @llvm.memset.p0.i64(...) or call @udr_initializer(...)

  call void @__kmpc_critical(...)
    // Inside critical section:
    // Load the current value from the shared variable
    // Load the thread-local private variable's value
    // Perform the reduction operation
    // Store the result back to the shared variable

  call void @__kmpc_end_critical(...)
  // Barrier after all threads complete their combinations

  call void @__kmpc_barrier(...)
 // Broadcast phase:
 // Load the final result from the shared variable
 // Store the final result to the original private variable in each thread
 // Final barrier after broadcast

  call void @__kmpc_barrier(...)
```
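
As a rough model of that sequence (plain C++ threads standing in for the
OpenMP runtime; all names illustrative, not the emitted code):

```
#include <barrier>
#include <cassert>
#include <mutex>
#include <thread>
#include <vector>

int main() {
  constexpr int kThreads = 4;
  int shared = 0;               // stands in for .omp.reduction.internal_private_var
  std::mutex critical;          // stands in for __kmpc_critical/__kmpc_end_critical
  std::barrier sync(kThreads);  // stands in for __kmpc_barrier

  auto worker = [&](int tid) {
    int priv = 10 * (tid + 1);  // this thread's private partial result
    if (tid == 0)
      shared = 0;               // one thread initializes with the identity value
    sync.arrive_and_wait();     // barrier: initialization done before combining
    {
      std::scoped_lock lock(critical);
      shared += priv;           // combine under the critical section
    }
    sync.arrive_and_wait();     // barrier: all combinations complete
    priv = shared;              // broadcast the final result back
    sync.arrive_and_wait();     // final barrier after the broadcast
    assert(priv == 10 + 20 + 30 + 40);
  };

  std::vector<std::thread> pool;
  for (int t = 0; t < kThreads; ++t)
    pool.emplace_back(worker, t);
  for (auto &th : pool)
    th.join();
  return 0;
}
```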

---------

Co-authored-by: Chandra Ghale <ghale at pe31.hpc.amslabs.hpecorp.net>
---
 clang/docs/OpenMPSupport.rst                  |   3 +-
 clang/docs/ReleaseNotes.rst                   |   1 +
 clang/lib/CodeGen/CGOpenMPRuntime.cpp         | 292 ++++++-
 clang/lib/CodeGen/CGOpenMPRuntime.h           |  12 +
 clang/lib/CodeGen/CGStmtOpenMP.cpp            |  11 +-
 clang/lib/Sema/SemaOpenMP.cpp                 |  41 +-
 .../OpenMP/distribute_simd_misc_messages.c    |   3 +-
 .../OpenMP/for_private_reduction_codegen.cpp  | 710 ++++++++++++++++++
 clang/test/OpenMP/for_reduction_messages.cpp  |   2 +
 .../OpenMP/for_simd_reduction_messages.cpp    |   2 +-
 .../OpenMP/sections_reduction_messages.cpp    |   2 +-
 .../for/omp_for_private_reduction.cpp         | 194 +++++
 12 files changed, 1235 insertions(+), 38 deletions(-)
 create mode 100644 clang/test/OpenMP/for_private_reduction_codegen.cpp
 create mode 100644 openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp

diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst
index d6507071d4693..986aaabe1eed4 100644
--- a/clang/docs/OpenMPSupport.rst
+++ b/clang/docs/OpenMPSupport.rst
@@ -406,7 +406,8 @@ implementation.
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | Extensions to atomic construct                              | :none:`unclaimed`         | :none:`unclaimed`         |                                                                          |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ 
-| Private reductions                                          | :part:`partial`           | :none:`unclaimed`         | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938              |
+| Private reductions                                          | :good:`mostly`            | :none:`unclaimed`         | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938              |
+|                                                             |                           |                           | Codegen: https://github.com/llvm/llvm-project/pull/134709                |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
 | Self maps                                                   | :part:`partial`           | :none:`unclaimed`         | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888      |
 +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index f36c82bff2ef8..5645edc73431b 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1100,6 +1100,7 @@ OpenMP Support
   open parenthesis. (#GH139665)
 - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have
   an argument larger than what can fit within a 64-bit integer.
+- Added support for private variable reduction.
 
 Improvements
 ^^^^^^^^^^^^
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
index 09e3ccc380ae3..4173355491fd4 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@@ -4907,11 +4907,255 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF,
   }
 }
 
+static std::string generateUniqueName(CodeGenModule &CGM,
+                                      llvm::StringRef Prefix, const Expr *Ref);
+
+void CGOpenMPRuntime::emitPrivateReduction(
+    CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates,
+    const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) {
+
+  //  Create a shared global variable (__shared_reduction_var) to accumulate the
+  //  final result.
+  //
+  //  Call __kmpc_barrier to synchronize threads before initialization.
+  //
+  //  The master thread (thread_id == 0) initializes __shared_reduction_var
+  //    with the identity value or initializer.
+  //
+  //  Call __kmpc_barrier to synchronize before combining.
+  //  For each i:
+  //    - Thread enters critical section.
+  //    - Reads its private value from LHSExprs[i].
+  //    - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i],
+  //    Privates[i]).
+  //    - Exits critical section.
+  //
+  //  Call __kmpc_barrier after combining.
+  //
+  //  Each thread copies __shared_reduction_var[i] back to RHSExprs[i].
+  //
+  //  Final __kmpc_barrier to synchronize after broadcasting
+  QualType PrivateType = Privates->getType();
+  llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType);
+
+  const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps);
+  std::string ReductionVarNameStr;
+  if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates->IgnoreParenCasts()))
+    ReductionVarNameStr =
+        generateUniqueName(CGM, DRE->getDecl()->getNameAsString(), Privates);
+  else
+    ReductionVarNameStr = "unnamed_priv_var";
+
+  // Create an internal shared variable
+  std::string SharedName =
+      CGM.getOpenMPRuntime().getName({"internal_private_", ReductionVarNameStr});
+  llvm::GlobalVariable *SharedVar = OMPBuilder.getOrCreateInternalVariable(
+      LLVMType, ".omp.reduction." + SharedName);
+
+  SharedVar->setAlignment(
+      llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8));
+
+  Address SharedResult =
+      CGF.MakeNaturalAlignRawAddrLValue(SharedVar, PrivateType).getAddress();
+
+  llvm::Value *ThreadId = getThreadID(CGF, Loc);
+  llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE);
+  llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId};
+
+  llvm::BasicBlock *InitBB = CGF.createBasicBlock("init");
+  llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end");
+
+  llvm::Value *IsMaster = CGF.Builder.CreateICmpEQ(
+      ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0));
+  CGF.Builder.CreateCondBr(IsMaster, InitBB, InitEndBB);
+
+  CGF.EmitBlock(InitBB);
+
+  auto EmitSharedInit = [&]() {
+    if (UDR) { // Check if it's a User-Defined Reduction
+      if (const Expr *UDRInitExpr = UDR->getInitializer()) {
+        std::pair<llvm::Function *, llvm::Function *> FnPair =
+            getUserDefinedReduction(UDR);
+        llvm::Function *InitializerFn = FnPair.second;
+        if (InitializerFn) {
+          if (const auto *CE =
+                  dyn_cast<CallExpr>(UDRInitExpr->IgnoreParenImpCasts())) {
+            const auto *OutDRE = cast<DeclRefExpr>(
+                cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+                    ->getSubExpr());
+            const VarDecl *OutVD = cast<VarDecl>(OutDRE->getDecl());
+
+            CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+            LocalScope.addPrivate(OutVD, SharedResult);
+
+            (void)LocalScope.Privatize();
+            if (const auto *OVE = dyn_cast<OpaqueValueExpr>(
+                    CE->getCallee()->IgnoreParenImpCasts())) {
+              CodeGenFunction::OpaqueValueMapping OpaqueMap(
+                  CGF, OVE, RValue::get(InitializerFn));
+              CGF.EmitIgnoredExpr(CE);
+            } else {
+              CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+                                   PrivateType.getQualifiers(),
+                                   /*IsInitializer=*/true);
+            }
+          } else {
+            CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+                                 PrivateType.getQualifiers(),
+                                 /*IsInitializer=*/true);
+          }
+        } else {
+          CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult,
+                               PrivateType.getQualifiers(),
+                               /*IsInitializer=*/true);
+        }
+      } else {
+        // EmitNullInitialization handles default construction for C++ classes
+        // and zeroing for scalars, which is a reasonable default.
+        CGF.EmitNullInitialization(SharedResult, PrivateType);
+      }
+      return; // UDR initialization handled
+    }
+    if (const auto *DRE = dyn_cast<DeclRefExpr>(Privates)) {
+      if (const auto *VD = dyn_cast<VarDecl>(DRE->getDecl())) {
+        if (const Expr *InitExpr = VD->getInit()) {
+          CGF.EmitAnyExprToMem(InitExpr, SharedResult,
+                               PrivateType.getQualifiers(), true);
+          return;
+        }
+      }
+    }
+    CGF.EmitNullInitialization(SharedResult, PrivateType);
+  };
+  EmitSharedInit();
+  CGF.Builder.CreateBr(InitEndBB);
+  CGF.EmitBlock(InitEndBB);
+
+  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                          CGM.getModule(), OMPRTL___kmpc_barrier),
+                      BarrierArgs);
+
+  const Expr *ReductionOp = ReductionOps;
+  const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp);
+  LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType);
+  LValue LHSLV = CGF.EmitLValue(Privates);
+
+  auto EmitCriticalReduction = [&](auto ReductionGen) {
+    std::string CriticalName = getName({"reduction_critical"});
+    emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc);
+  };
+
+  if (CurrentUDR) {
+    // Handle user-defined reduction.
+    auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+      Action.Enter(CGF);
+      std::pair<llvm::Function *, llvm::Function *> FnPair =
+          getUserDefinedReduction(CurrentUDR);
+      if (FnPair.first) {
+        if (const auto *CE = dyn_cast<CallExpr>(ReductionOp)) {
+          const auto *OutDRE = cast<DeclRefExpr>(
+              cast<UnaryOperator>(CE->getArg(0)->IgnoreParenImpCasts())
+                  ->getSubExpr());
+          const auto *InDRE = cast<DeclRefExpr>(
+              cast<UnaryOperator>(CE->getArg(1)->IgnoreParenImpCasts())
+                  ->getSubExpr());
+          CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+          LocalScope.addPrivate(cast<VarDecl>(OutDRE->getDecl()),
+                                SharedLV.getAddress());
+          LocalScope.addPrivate(cast<VarDecl>(InDRE->getDecl()),
+                                LHSLV.getAddress());
+          (void)LocalScope.Privatize();
+          emitReductionCombiner(CGF, ReductionOp);
+        }
+      }
+    };
+    EmitCriticalReduction(ReductionGen);
+  } else {
+    // Handle built-in reduction operations.
+#ifndef NDEBUG
+    const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts();
+    if (const auto *Cleanup = dyn_cast<ExprWithCleanups>(ReductionClauseExpr))
+      ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts();
+
+    const Expr *AssignRHS = nullptr;
+    if (const auto *BinOp = dyn_cast<BinaryOperator>(ReductionClauseExpr)) {
+      if (BinOp->getOpcode() == BO_Assign)
+        AssignRHS = BinOp->getRHS();
+    } else if (const auto *OpCall =
+                   dyn_cast<CXXOperatorCallExpr>(ReductionClauseExpr)) {
+      if (OpCall->getOperator() == OO_Equal)
+        AssignRHS = OpCall->getArg(1);
+    }
+
+    assert(AssignRHS &&
+           "Private Variable Reduction : Invalid ReductionOp expression");
+#endif
+
+    auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) {
+      Action.Enter(CGF);
+      const auto *OmpOutDRE =
+          dyn_cast<DeclRefExpr>(LHSExprs->IgnoreParenImpCasts());
+      const auto *OmpInDRE =
+          dyn_cast<DeclRefExpr>(RHSExprs->IgnoreParenImpCasts());
+      assert(
+          OmpOutDRE && OmpInDRE &&
+          "Private Variable Reduction : LHSExpr/RHSExpr must be DeclRefExprs");
+      const VarDecl *OmpOutVD = cast<VarDecl>(OmpOutDRE->getDecl());
+      const VarDecl *OmpInVD = cast<VarDecl>(OmpInDRE->getDecl());
+      CodeGenFunction::OMPPrivateScope LocalScope(CGF);
+      LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress());
+      LocalScope.addPrivate(OmpInVD, LHSLV.getAddress());
+      (void)LocalScope.Privatize();
+      // Emit the actual reduction operation
+      CGF.EmitIgnoredExpr(ReductionOp);
+    };
+    EmitCriticalReduction(ReductionGen);
+  }
+
+  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                          CGM.getModule(), OMPRTL___kmpc_barrier),
+                      BarrierArgs);
+
+  // Broadcast final result
+  bool IsAggregate = PrivateType->isAggregateType();
+  LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType);
+  llvm::Value *FinalResultVal = nullptr;
+  Address FinalResultAddr = Address::invalid();
+
+  if (IsAggregate)
+    FinalResultAddr = SharedResult;
+  else
+    FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc);
+
+  LValue TargetLHSLV = CGF.EmitLValue(RHSExprs);
+  if (IsAggregate) {
+    CGF.EmitAggregateCopy(TargetLHSLV,
+                          CGF.MakeAddrLValue(FinalResultAddr, PrivateType),
+                          PrivateType, AggValueSlot::DoesNotOverlap, false);
+  } else {
+    CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV);
+  }
+  // Final synchronization barrier
+  CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
+                          CGM.getModule(), OMPRTL___kmpc_barrier),
+                      BarrierArgs);
+
+  // Combiner with original list item
+  auto OriginalListCombiner = [&](CodeGenFunction &CGF,
+                                  PrePostActionTy &Action) {
+    Action.Enter(CGF);
+    emitSingleReductionCombiner(CGF, ReductionOps, Privates,
+                                cast<DeclRefExpr>(LHSExprs),
+                                cast<DeclRefExpr>(RHSExprs));
+  };
+  EmitCriticalReduction(OriginalListCombiner);
+}
+
 void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
-                                    ArrayRef<const Expr *> Privates,
-                                    ArrayRef<const Expr *> LHSExprs,
-                                    ArrayRef<const Expr *> RHSExprs,
-                                    ArrayRef<const Expr *> ReductionOps,
+                                    ArrayRef<const Expr *> OrgPrivates,
+                                    ArrayRef<const Expr *> OrgLHSExprs,
+                                    ArrayRef<const Expr *> OrgRHSExprs,
+                                    ArrayRef<const Expr *> OrgReductionOps,
                                     ReductionOptionsTy Options) {
   if (!CGF.HaveInsertPoint())
     return;
@@ -4958,10 +5202,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
 
   if (SimpleReduction) {
     CodeGenFunction::RunCleanupsScope Scope(CGF);
-    const auto *IPriv = Privates.begin();
-    const auto *ILHS = LHSExprs.begin();
-    const auto *IRHS = RHSExprs.begin();
-    for (const Expr *E : ReductionOps) {
+    const auto *IPriv = OrgPrivates.begin();
+    const auto *ILHS = OrgLHSExprs.begin();
+    const auto *IRHS = OrgRHSExprs.begin();
+    for (const Expr *E : OrgReductionOps) {
       emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                   cast<DeclRefExpr>(*IRHS));
       ++IPriv;
@@ -4971,6 +5215,26 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
     return;
   }
 
+  // Filter out private reduction variables based on the IsPrivateVarReduction
+  // flag; only keep entries whose corresponding variable is not private.
+  SmallVector<const Expr *> FilteredPrivates, FilteredLHSExprs,
+      FilteredRHSExprs, FilteredReductionOps;
+  for (unsigned I : llvm::seq<unsigned>(
+           std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) {
+    if (!Options.IsPrivateVarReduction[I]) {
+      FilteredPrivates.emplace_back(OrgPrivates[I]);
+      FilteredLHSExprs.emplace_back(OrgLHSExprs[I]);
+      FilteredRHSExprs.emplace_back(OrgRHSExprs[I]);
+      FilteredReductionOps.emplace_back(OrgReductionOps[I]);
+    }
+  }
+  // Wrap filtered vectors in ArrayRef for downstream shared reduction
+  // processing.
+  ArrayRef<const Expr *> Privates = FilteredPrivates;
+  ArrayRef<const Expr *> LHSExprs = FilteredLHSExprs;
+  ArrayRef<const Expr *> RHSExprs = FilteredRHSExprs;
+  ArrayRef<const Expr *> ReductionOps = FilteredReductionOps;
+
   // 1. Build a list of reduction variables.
   // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
   auto Size = RHSExprs.size();
@@ -5162,7 +5426,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
       } else {
         // Emit as a critical region.
         auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *,
-                                           const Expr *, const Expr *) {
+                                     const Expr *, const Expr *) {
           CGOpenMPRuntime &RT = CGF.CGM.getOpenMPRuntime();
           std::string Name = RT.getName({"atomic_reduction"});
           RT.emitCriticalRegion(
@@ -5209,6 +5473,16 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc,
 
   CGF.EmitBranch(DefaultBB);
   CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
+  assert(OrgLHSExprs.size() == OrgPrivates.size() &&
+         "PrivateVarReduction: Privates size mismatch");
+  assert(OrgLHSExprs.size() == OrgReductionOps.size() &&
+         "PrivateVarReduction: ReductionOps size mismatch");
+  for (unsigned I : llvm::seq<unsigned>(
+           std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) {
+    if (Options.IsPrivateVarReduction[I])
+      emitPrivateReduction(CGF, Loc, OrgPrivates[I], OrgLHSExprs[I],
+                           OrgRHSExprs[I], OrgReductionOps[I]);
+  }
 }
 
 /// Generates unique name for artificial threadprivate variables.
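
For reference, the lowering produced by emitPrivateReduction above follows a
barrier/critical staging protocol. A minimal pseudocode sketch, reconstructed
from the FileCheck IR in the codegen test further below (identifiers such as
shared_result, thread_private, and original are illustrative placeholders,
not symbols the compiler emits):

    // Thread 0 initializes the shared staging copy.
    if (thread_id == 0)
      shared_result = Init();
    __kmpc_barrier(...);            // make the initialization visible
    __kmpc_critical(...);           // serialized per-thread combine
    shared_result = Combine(shared_result, thread_private);
    __kmpc_end_critical(...);
    __kmpc_barrier(...);            // all partial combines complete
    thread_private = shared_result; // broadcast the final value
    __kmpc_barrier(...);
    __kmpc_critical(...);           // fold into the original list item
    original = Combine(original, thread_private);
    __kmpc_end_critical(...);
    __kmpc_barrier(...);            // final synchronization
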
diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h
index 4321712e1521d..5be48b439f4fd 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntime.h
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.h
@@ -1201,8 +1201,20 @@ class CGOpenMPRuntime {
   struct ReductionOptionsTy {
     bool WithNowait;
     bool SimpleReduction;
+    llvm::SmallVector<bool, 8> IsPrivateVarReduction;
     OpenMPDirectiveKind ReductionKind;
   };
+
+  /// Emits code for the reduction of a single private variable.
+  /// \param Privates The private copy of the original reduction argument.
+  /// \param LHSExprs The LHS of the \a ReductionOps reduction operation.
+  /// \param RHSExprs The RHS of the \a ReductionOps reduction operation.
+  /// \param ReductionOps The reduction operation, in the form 'LHS binop RHS'
+  /// or 'operator binop(LHS, RHS)'.
+  void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc,
+                            const Expr *Privates, const Expr *LHSExprs,
+                            const Expr *RHSExprs, const Expr *ReductionOps);
+
   /// Emit a code for reduction clause. Next code should be emitted for
   /// reduction:
   /// \code
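
Note that IsPrivateVarReduction is a per-variable flag vector parallel to the
Privates/LHSExprs/RHSExprs/ReductionOps lists passed to emitReduction. A
hedged sketch of how a call site populates it, mirroring the CGStmtOpenMP.cpp
changes below (the concrete values here are illustrative):

    CGOpenMPRuntime::ReductionOptionsTy Opts{
        /*WithNowait=*/false, /*SimpleReduction=*/false,
        /*IsPrivateVarReduction=*/{true, false}, // one flag per list item
        OMPD_for};
    CGM.getOpenMPRuntime().emitReduction(CGF, Loc, Privates, LHSExprs,
                                         RHSExprs, ReductionOps, Opts);
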
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index 7fa6bfa75c350..d9195d749e056 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -1472,6 +1472,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
   llvm::SmallVector<const Expr *, 8> LHSExprs;
   llvm::SmallVector<const Expr *, 8> RHSExprs;
   llvm::SmallVector<const Expr *, 8> ReductionOps;
+  llvm::SmallVector<bool, 8> IsPrivateVarReduction;
   bool HasAtLeastOneReduction = false;
   bool IsReductionWithTaskMod = false;
   for (const auto *C : D.getClausesOfKind<OMPReductionClause>()) {
@@ -1482,6 +1483,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     Privates.append(C->privates().begin(), C->privates().end());
     LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end());
     RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end());
+    IsPrivateVarReduction.append(C->private_var_reduction_flags().begin(),
+                                 C->private_var_reduction_flags().end());
     ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end());
     IsReductionWithTaskMod =
         IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task;
@@ -1503,7 +1506,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal(
     // parallel directive (it always has implicit barrier).
     CGM.getOpenMPRuntime().emitReduction(
         *this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps,
-        {WithNowait, SimpleReduction, ReductionKind});
+        {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind});
   }
 }
 
@@ -3944,7 +3947,8 @@ static void emitScanBasedDirective(
       PrivScope.Privatize();
       CGF.CGM.getOpenMPRuntime().emitReduction(
           CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
-          {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown});
+          {/*WithNowait=*/true, /*SimpleReduction=*/true,
+           /*IsPrivateVarReduction=*/{}, OMPD_unknown});
     }
     llvm::Value *NextIVal =
         CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1));
@@ -5749,7 +5753,8 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) {
       }
       CGM.getOpenMPRuntime().emitReduction(
           *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps,
-          {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd});
+          {/*WithNowait=*/true, /*SimpleReduction=*/true,
+           /*IsPrivateVarReduction=*/{}, OMPD_simd});
       for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) {
         const Expr *PrivateExpr = Privates[I];
         LValue DestLVal;
diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp
index 4ac3a60ae455f..a3395ac157d96 100644
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@@ -19047,34 +19047,14 @@ static bool actOnOMPReductionKindClause(
         reportOriginalDsa(S, Stack, D, DVar);
         continue;
       }
-      // OpenMP 6.0 [ 7.6.10 ]
-      // Support Reduction over private variables with reduction clause.
-      // A list item in a reduction clause can now be private in the enclosing
-      // context. For orphaned constructs it is assumed to be shared unless the
-      // original(private) modifier appears in the clause.
-      DVar = Stack->getImplicitDSA(D, true);
-      bool IsOrphaned = false;
-      OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective();
-      OpenMPDirectiveKind ParentDir = Stack->getParentDirective();
-      // Check if the construct is orphaned (has no enclosing OpenMP context)
-      IsOrphaned = ParentDir == OMPD_unknown;
-      // OpenMP 6.0: Private DSA check
-      IsPrivate =
-          (S.getLangOpts().OpenMP > 52) &&
-          ((isOpenMPPrivate(DVar.CKind) && DVar.CKind != OMPC_reduction &&
-            isOpenMPWorksharingDirective(CurrDir) &&
-            !isOpenMPParallelDirective(CurrDir) &&
-            !isOpenMPTeamsDirective(CurrDir) &&
-            !isOpenMPSimdDirective(ParentDir)) ||
-           (IsOrphaned && DVar.CKind == OMPC_unknown) ||
-           RD.OrigSharingModifier != OMPC_ORIGINAL_SHARING_shared);
 
       // OpenMP [2.14.3.6, Restrictions, p.1]
       //  A list item that appears in a reduction clause of a worksharing
       //  construct must be shared in the parallel regions to which any of the
       //  worksharing regions arising from the worksharing construct bind.
 
-      if (!IsPrivate && isOpenMPWorksharingDirective(CurrDir) &&
+      if (S.getLangOpts().OpenMP <= 52 &&
+          isOpenMPWorksharingDirective(CurrDir) &&
           !isOpenMPParallelDirective(CurrDir) &&
           !isOpenMPTeamsDirective(CurrDir)) {
         DVar = Stack->getImplicitDSA(D, true);
@@ -19085,6 +19065,23 @@ static bool actOnOMPReductionKindClause(
           reportOriginalDsa(S, Stack, D, DVar);
           continue;
         }
+      } else if (isOpenMPWorksharingDirective(CurrDir) &&
+                 !isOpenMPParallelDirective(CurrDir) &&
+                 !isOpenMPTeamsDirective(CurrDir)) {
+        // OpenMP 6.0 [7.6.10]
+        // Support reduction over private variables with the reduction clause.
+        // A list item in a reduction clause can now be private in the enclosing
+        // context. For orphaned constructs it is assumed to be shared unless
+        // the original(private) modifier appears in the clause.
+        DVar = Stack->getImplicitDSA(D, true);
+        // Determine whether the variable should be considered private.
+        IsPrivate = DVar.CKind != OMPC_shared;
+        bool IsOrphaned = Stack->getParentDirective() == OMPD_unknown;
+        if (IsOrphaned &&
+            RD.OrigSharingModifier == OMPC_ORIGINAL_SHARING_private)
+          IsPrivate = true;
       }
     } else {
       // Threadprivates cannot be shared between threads, so diagnose if the base
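
With this change, under -fopenmp-version=60 a reduction list item may be
private in the enclosing context. A minimal example of the now-accepted form
(previously rejected with "reduction variable must be shared"; n and v are
assumed to be in scope), matching the codegen and runtime tests added below:

    int sum = 0;
    #pragma omp parallel private(sum) num_threads(4)
    {
    #pragma omp for reduction(+ : sum)
      for (int i = 0; i < n; i++)
        sum += v[i];
    }
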
diff --git a/clang/test/OpenMP/distribute_simd_misc_messages.c b/clang/test/OpenMP/distribute_simd_misc_messages.c
index 8cbf96cd7a014..270e17dcb89bb 100644
--- a/clang/test/OpenMP/distribute_simd_misc_messages.c
+++ b/clang/test/OpenMP/distribute_simd_misc_messages.c
@@ -508,6 +508,7 @@ void test_collapse(void) {
 #pragma omp distribute simd collapse(5 - 5)
   for (i = 0; i < 16; ++i)
     ;
+#if defined(_OPENMP) && (_OPENMP <= 202111)
 // expected-note@+3 2 {{defined as reduction}}
 #pragma omp target
 #pragma omp teams
@@ -520,7 +521,7 @@ void test_collapse(void) {
 #pragma omp for reduction(+ : i, j)
       for (int k = 0; k < 16; ++k)
         i += j;
-
+#endif
 #pragma omp target
 #pragma omp teams
   for (i = 0; i < 16; ++i)
diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp
new file mode 100644
index 0000000000000..c8a6863299fb3
--- /dev/null
+++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp
@@ -0,0 +1,710 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex ".omp.reduction..internal[a-zA-Z_0-9.]+"
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17  -emit-llvm %s -o - | FileCheck %s
+// expected-no-diagnostics
+#define N 10
+class Sum {
+  int val;
+
+public:
+  Sum(int v = 0) : val(v) {}
+  Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+  Sum &operator+=(const Sum &rhs) {
+    val += rhs.val;
+    return *this;
+  }
+};
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in)           \
+    initializer(omp_priv = Sum(0))
+
+void func_red() {
+  Sum result(0);
+  Sum array[N];
+
+  for (int i = 0; i < N; i++) {
+    array[i] = Sum(i);
+  }
+
+#pragma omp parallel private(result) num_threads(4)
+  {
+#pragma omp for reduction(sum_reduction : result)
+    for (int i = 0; i < N; i++) {
+      result = result + array[i];
+    }
+  }
+}
+
+void do_red(int n, int *v, int &sum_v) {
+  sum_v = 0;
+#pragma omp for reduction(original(private), + : sum_v)
+  for (int i = 0; i < n; i++) {
+    sum_v += v[i];
+  }
+}
+void do_red_extended(int n, int *v, int &sum_v, int &prod_v) {
+  sum_v = 0;
+  prod_v = 1;
+
+#pragma omp for reduction(original(private), + : sum_v)                        \
+    reduction(original(private), * : prod_v)
+  for (int i = 0; i < n; i++) {
+    sum_v += v[i];
+    prod_v *= v[i];
+  }
+}
+int main(void) {
+  int v[N];
+  for (int i = 0; i < N; i++)
+    v[i] = i;
+#pragma omp parallel num_threads(4)
+  {
+    int s_v;
+    do_red(N, v, s_v);
+  }
+
+  int sum_v_ext = 0, prod_v_ext = 1;
+#pragma omp parallel num_threads(4)
+  {
+    do_red_extended(N, v, sum_v_ext, prod_v_ext);
+  }
+  return 0;
+}
+
+//.
+// CHECK: @.omp.reduction..internal_pivate_.result.result_996 = common global %class.Sum zeroinitializer, align 4
+// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1188 = common global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1392 = common global i32 0, align 4
+// CHECK: @.omp.reduction..internal_pivate_.prod_v.prod_v_1461 = common global i32 0, align 4
+//.
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT:    [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]])
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT:    [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0
+// CHECK-NEXT:    [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10
+// CHECK-NEXT:    br label [[ARRAYCTOR_LOOP:%.*]]
+// CHECK:       arrayctor.loop:
+// CHECK-NEXT:    [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ]
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0)
+// CHECK-NEXT:    [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1
+// CHECK-NEXT:    [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]]
+// CHECK-NEXT:    br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]]
+// CHECK:       arrayctor.cont:
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]])
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC1Ei
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined
+// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[ARRAY_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4
+// CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
+// CHECK-NEXT:    store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT:    store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8
+// CHECK-NEXT:    store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0)
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 9, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT:    call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]])
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9
+// CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ]
+// CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]]
+// CHECK-NEXT:    br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]])
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT:    store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false)
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1
+// CHECK-NEXT:    store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp.loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    switch i32 [[TMP11]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT:    ]
+// CHECK:       .omp.reduction.case1:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.case2:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.default:
+// CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0
+// CHECK-NEXT:    br i1 [[TMP12]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK:       init:
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result.result_996, i32 noundef 0)
+// CHECK-NEXT:    br label [[INIT_END]]
+// CHECK:       init.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result.result_996, ptr noundef [[RESULT1]])
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT:    [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result.result_996, align 4
+// CHECK-NEXT:    store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]])
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_combiner.
+// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZN3SumpLERKS_
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT:    [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[VAL2]], align 4
+// CHECK-NEXT:    ret ptr [[THIS1]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@.omp_initializer.
+// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0)
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZNK3SumplERKS_
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[RHS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[RHS]], ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]]
+// CHECK-NEXT:    call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]])
+// CHECK-NEXT:    [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4
+// CHECK-NEXT:    ret i32 [[TMP3]]
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC2Ei
+// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[THIS_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    store ptr [[THIS]], ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[V]], ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8
+// CHECK-NEXT:    [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[VAL]], align 4
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi
+// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP1:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SUM_V4:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP5:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[I6:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP2]], ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0
+// CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT:    [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT:    store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP5]]
+// CHECK-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK:       omp.precond.then:
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    store ptr [[SUM_V4]], ptr [[_TMP5]], align 8
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]]
+// CHECK-NEXT:    br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ]
+// CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]]
+// CHECK-NEXT:    br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I6]], align 4
+// CHECK-NEXT:    [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[I6]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+// CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]]
+// CHECK-NEXT:    store i32 [[ADD9]], ptr [[TMP19]], align 4
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1
+// CHECK-NEXT:    store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp.loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    switch i32 [[TMP22]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT:    ]
+// CHECK:       .omp.reduction.case1:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.case2:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.default:
+// CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TMP23]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK:       init:
+// CHECK-NEXT:    store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    br label [[INIT_END]]
+// CHECK:       init.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]]
+// CHECK-NEXT:    store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4
+// CHECK-NEXT:    store i32 [[TMP26]], ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[TMP7]], align 4
+// CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[SUM_V4]], align 4
+// CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]]
+// CHECK-NEXT:    store i32 [[ADD12]], ptr [[TMP7]], align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK:       omp.precond.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_
+// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[N_ADDR:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[SUM_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PROD_V_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[TMP:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[_TMP1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTOMP_IV:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP2:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_LB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_UB:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SUM_V5:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP6:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[PROD_V7:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[_TMP8:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[I9:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT:    store i32 [[N]], ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store ptr [[V]], ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[TMP1]], align 4
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT:    store i32 1, ptr [[TMP2]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP3]], ptr [[TMP]], align 8
+// CHECK-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP4]], ptr [[_TMP1]], align 8
+// CHECK-NEXT:    [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4
+// CHECK-NEXT:    store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0
+// CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 [[SUB]], 1
+// CHECK-NEXT:    [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1
+// CHECK-NEXT:    store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 0, [[TMP7]]
+// CHECK-NEXT:    br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]]
+// CHECK:       omp.precond.then:
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[DOTOMP_STRIDE]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[DOTOMP_IS_LAST]], align 4
+// CHECK-NEXT:    [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8
+// CHECK-NEXT:    store i32 0, ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    store ptr [[SUM_V5]], ptr [[_TMP6]], align 8
+// CHECK-NEXT:    [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8
+// CHECK-NEXT:    store i32 1, ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    store ptr [[PROD_V7]], ptr [[_TMP8]], align 8
+// CHECK-NEXT:    call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1)
+// CHECK-NEXT:    [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]]
+// CHECK-NEXT:    br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+// CHECK:       cond.true:
+// CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4
+// CHECK-NEXT:    br label [[COND_END:%.*]]
+// CHECK:       cond.false:
+// CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    br label [[COND_END]]
+// CHECK:       cond.end:
+// CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ]
+// CHECK-NEXT:    store i32 [[COND]], ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4
+// CHECK-NEXT:    store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND:%.*]]
+// CHECK:       omp.inner.for.cond:
+// CHECK-NEXT:    [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4
+// CHECK-NEXT:    [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]]
+// CHECK-NEXT:    br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
+// CHECK:       omp.inner.for.body:
+// CHECK-NEXT:    [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1
+// CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 0, [[MUL]]
+// CHECK-NEXT:    store i32 [[ADD]], ptr [[I9]], align 4
+// CHECK-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP20:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]]
+// CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8
+// CHECK-NEXT:    [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4
+// CHECK-NEXT:    [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]]
+// CHECK-NEXT:    store i32 [[ADD12]], ptr [[TMP22]], align 4
+// CHECK-NEXT:    [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8
+// CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[I9]], align 4
+// CHECK-NEXT:    [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64
+// CHECK-NEXT:    [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]]
+// CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4
+// CHECK-NEXT:    [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8
+// CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4
+// CHECK-NEXT:    [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]]
+// CHECK-NEXT:    store i32 [[MUL15]], ptr [[TMP27]], align 4
+// CHECK-NEXT:    br label [[OMP_BODY_CONTINUE:%.*]]
+// CHECK:       omp.body.continue:
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_INC:%.*]]
+// CHECK:       omp.inner.for.inc:
+// CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1
+// CHECK-NEXT:    store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4
+// CHECK-NEXT:    br label [[OMP_INNER_FOR_COND]]
+// CHECK:       omp.inner.for.end:
+// CHECK-NEXT:    br label [[OMP_LOOP_EXIT:%.*]]
+// CHECK:       omp.loop.exit:
+// CHECK-NEXT:    call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    switch i32 [[TMP30]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [
+// CHECK-NEXT:      i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]]
+// CHECK-NEXT:      i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]]
+// CHECK-NEXT:    ]
+// CHECK:       .omp.reduction.case1:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.case2:
+// CHECK-NEXT:    call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var)
+// CHECK-NEXT:    br label [[DOTOMP_REDUCTION_DEFAULT]]
+// CHECK:       .omp.reduction.default:
+// CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TMP31]], label [[INIT:%.*]], label [[INIT_END:%.*]]
+// CHECK:       init:
+// CHECK-NEXT:    store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    br label [[INIT_END]]
+// CHECK:       init.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]]
+// CHECK-NEXT:    store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4
+// CHECK-NEXT:    store i32 [[TMP34]], ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr [[TMP9]], align 4
+// CHECK-NEXT:    [[TMP36:%.*]] = load i32, ptr [[SUM_V5]], align 4
+// CHECK-NEXT:    [[ADD18:%.*]] = add nsw i32 [[TMP35]], [[TMP36]]
+// CHECK-NEXT:    store i32 [[ADD18]], ptr [[TMP9]], align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TMP37]], label [[INIT19:%.*]], label [[INIT_END20:%.*]]
+// CHECK:       init19:
+// CHECK-NEXT:    store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    br label [[INIT_END20]]
+// CHECK:       init.end20:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP38:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]]
+// CHECK-NEXT:    store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4
+// CHECK-NEXT:    store i32 [[TMP40]], ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]])
+// CHECK-NEXT:    call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    [[TMP41:%.*]] = load i32, ptr [[TMP10]], align 4
+// CHECK-NEXT:    [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4
+// CHECK-NEXT:    [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]]
+// CHECK-NEXT:    store i32 [[MUL22]], ptr [[TMP10]], align 4
+// CHECK-NEXT:    call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var)
+// CHECK-NEXT:    br label [[OMP_PRECOND_END]]
+// CHECK:       omp.precond.end:
+// CHECK-NEXT:    call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]])
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func
+// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[DOTADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    [[DOTADDR1:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[TMP0]], ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    store ptr [[TMP1]], ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8
+// CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8
+// CHECK-NEXT:    ret void
+//
+//
+// CHECK-LABEL: define {{[^@]+}}@main
+// CHECK-SAME: () #[[ATTR7:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RETVAL:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[V:%.*]] = alloca [10 x i32], align 16
+// CHECK-NEXT:    [[I:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[SUM_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[PROD_V_EXT:%.*]] = alloca i32, align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]])
+// CHECK-NEXT:    store i32 0, ptr [[RETVAL]], align 4
+// CHECK-NEXT:    store i32 0, ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND:%.*]]
+// CHECK:       for.cond:
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10
+// CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]]
+// CHECK:       for.body:
+// CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64
+// CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]]
+// CHECK-NEXT:    store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4
+// CHECK-NEXT:    br label [[FOR_INC:%.*]]
+// CHECK:       for.inc:
+// CHECK-NEXT:    [[TMP4:%.*]] = load i32, ptr [[I]], align 4
+// CHECK-NEXT:    [[INC:%.*]] = add nsw i32 [[TMP4]], 1
+// CHECK-NEXT:    store i32 [[INC]], ptr [[I]], align 4
+// CHECK-NEXT:    br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]]
+// CHECK:       for.end:
+// CHECK-NEXT:    call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]])
+// CHECK-NEXT:    store i32 0, ptr [[SUM_V_EXT]], align 4
+// CHECK-NEXT:    store i32 1, ptr [[PROD_V_EXT]], align 4
+// CHECK-NEXT:    call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4)
+// CHECK-NEXT:    call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.1, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]])
+// CHECK-NEXT:    ret i32 0
+
diff --git a/clang/test/OpenMP/for_reduction_messages.cpp b/clang/test/OpenMP/for_reduction_messages.cpp
index de28ba2c3be02..2fdac3048c9cd 100644
--- a/clang/test/OpenMP/for_reduction_messages.cpp
+++ b/clang/test/OpenMP/for_reduction_messages.cpp
@@ -417,10 +417,12 @@ int main(int argc, char **argv) {
 #pragma omp for reduction(+ : qa[1], qa[0])
   for (int i = 0; i < 10; ++i)
     foo();
+#if defined(_OPENMP) && (_OPENMP <= 202111)
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp for reduction(+ : fl)      // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
+#endif
   static int m=0;
 #pragma omp for reduction(+:m)
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/for_simd_reduction_messages.cpp b/clang/test/OpenMP/for_simd_reduction_messages.cpp
index 96b3805b10a86..a9ef6c39cb5d2 100644
--- a/clang/test/OpenMP/for_simd_reduction_messages.cpp
+++ b/clang/test/OpenMP/for_simd_reduction_messages.cpp
@@ -396,11 +396,11 @@ int main(int argc, char **argv) {
 #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
-#endif
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp for simd reduction(+ : fl)      // expected-error {{reduction variable must be shared}}
   for (int i = 0; i < 10; ++i)
     foo();
+#endif
   static int m;
 #pragma omp for simd reduction(+ : m)
   for (int i = 0; i < 10; ++i)
diff --git a/clang/test/OpenMP/sections_reduction_messages.cpp b/clang/test/OpenMP/sections_reduction_messages.cpp
index 42ec3ed6d58e8..8cde6489f325f 100644
--- a/clang/test/OpenMP/sections_reduction_messages.cpp
+++ b/clang/test/OpenMP/sections_reduction_messages.cpp
@@ -461,12 +461,12 @@ int main(int argc, char **argv) {
   {
     foo();
   }
-#endif
 #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}}
 #pragma omp sections reduction(+ : fl) // expected-error {{reduction variable must be shared}}
   {
     foo();
   }
+#endif
   static int m;
 #pragma omp sections reduction(+ : m) // OK
   {
diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
new file mode 100644
index 0000000000000..9bf3be1e9e45d
--- /dev/null
+++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp
@@ -0,0 +1,194 @@
+// RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run
+#include <stdio.h>
+#include <omp.h>
+#include <limits.h>
+#include <complex.h>
+#include <math.h>
+#include "omp_testsuite.h"
+
+#define N 10
+class Sum {
+  int val;
+
+public:
+  Sum(int v = 0) : val(v) {}
+  Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); }
+  Sum &operator+=(const Sum &rhs) {
+    val += rhs.val;
+    return *this;
+  }
+  int getValue() const { return val; }
+};
+
+// Declare OpenMP reduction
+#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in)           \
+    initializer(omp_priv = Sum(0))
+
+#pragma omp declare reduction(sum_pctor_reduction:Sum : omp_out += omp_in)     \
+    initializer(omp_priv = Sum(1)) // non-default ctor
+
+int checkUserDefinedReduction() {
+  Sum final_result_udr(0);
+  Sum final_result_udr_pctor(1);
+  Sum array_sum[N];
+  int error_flag = 0;
+  int expected_value = 0;
+  int expected_value_pctor = 0;
+  for (int i = 0; i < N; ++i) {
+    array_sum[i] = Sum(i);
+    expected_value += i; // Calculate expected sum: 0 + 1 + ... + (N-1)
+    expected_value_pctor += i;
+  }
+  int num_threads_for_pctor_calc = 4; //  num_threads(4)
+  int priv_initializer_val_pctor = 1; //  initializer(omp_priv = Sum(1))
+  expected_value_pctor +=
+      num_threads_for_pctor_calc + priv_initializer_val_pctor;
+#pragma omp parallel num_threads(4) private(final_result_udr) private(         \
+        final_result_udr_pctor)
+  {
+#pragma omp for reduction(sum_reduction : final_result_udr)                    \
+    reduction(sum_pctor_reduction : final_result_udr_pctor)
+    for (int i = 0; i < N; ++i) {
+      final_result_udr += array_sum[i];
+      final_result_udr_pctor += array_sum[i];
+    }
+
+    if (final_result_udr.getValue() != expected_value ||
+        final_result_udr_pctor.getValue() != expected_value_pctor)
+      error_flag += 1;
+  }
+  return error_flag;
+}
+void performMinMaxRed(int &min_val, int &max_val) {
+  int input_data[] = {7, 3, 12, 5, 8};
+  int n_size = sizeof(input_data) / sizeof(input_data[0]);
+  min_val = INT_MAX;
+  max_val = INT_MIN;
+#pragma omp for reduction(original(private), min : min_val)                    \
+    reduction(original(private), max : max_val)
+  for (int i = 0; i < n_size; ++i) {
+    if (input_data[i] < min_val)
+      min_val = input_data[i];
+    if (input_data[i] > max_val)
+      max_val = input_data[i];
+  }
+}
+int performComplexReduction() {
+  double _Complex arr[N];
+  double _Complex expected = 0.0 + 0.0 * I;
+  double _Complex result = 0.0 + 0.0 * I;
+  int error = 0;
+
+  // Initialize the array and compute serial sum
+  for (int i = 0; i < N; ++i) {
+    arr[i] = i - i * I;
+    expected += arr[i];
+  }
+  double real_sum = 0.0, imag_sum = 0.0;
+#pragma omp parallel private(real_sum) private(imag_sum)
+  {
+#pragma omp for reduction(+ : real_sum, imag_sum)
+    for (int i = 0; i < N; ++i) {
+      real_sum += creal(arr[i]);
+      imag_sum += cimag(arr[i]);
+    }
+
+    result = real_sum + imag_sum * I;
+    if (cabs(result - expected) > 1e-6) {
+      error++;
+    }
+  }
+  return error;
+}
+
+std::complex<double> doComplexReduction(std::complex<double> *arr) {
+  std::complex<double> result(1, 0);
+
+#pragma omp declare reduction(* : std::complex<double> : omp_out *= omp_in)    \
+    initializer(omp_priv = std::complex<double>(1, 0))
+
+#pragma omp for reduction(original(private), * : result)
+  for (int i = 0; i < N; ++i)
+    result *= arr[i];
+
+  return result;
+}
+
+void performReductions(int n_elements, const int *input_values,
+                       int &sum_val_out, int &prod_val_out,
+                       float &float_sum_val_out) {
+  // Initialize this thread's private reduction outputs.
+  sum_val_out = 0;
+  prod_val_out = 1;
+  float_sum_val_out = 0.0f;
+
+  const float kPiValue = 3.14f;
+#pragma omp for reduction(original(private), + : sum_val_out)                  \
+    reduction(original(private), * : prod_val_out)                             \
+    reduction(original(private), + : float_sum_val_out)
+  for (int i = 0; i < n_elements; ++i) {
+    sum_val_out += input_values[i];
+    prod_val_out *= (i + 1);
+    float_sum_val_out += kPiValue;
+  }
+}
+int main(void) {
+  int input_array[N];
+  int total_errors = 0;
+  const float kPiVal = 3.14f;
+  const int kExpectedSum = 45; // Sum of 0..9
+  const int kExpectedProd = 3628800; // 10!
+  const float kExpectedFsum = kPiVal * N; // 3.14f * 10
+  const int kExpectedMin = 3;
+  const int kExpectedMax = 12;
+  std::complex<double> arr[N];
+  std::complex<double> kExpectedComplex(1, 0);
+  // Initialize the array
+  for (int i = 1; i <= N; ++i) {
+    arr[i - 1] = std::complex<double>(
+        1.0 + 0.1 * i, 0.5 * i); // Avoid zero entries so the running product stays nonzero
+    kExpectedComplex *= arr[i - 1];
+  }
+
+  for (int i = 0; i < N; i++)
+    input_array[i] = i;
+#pragma omp parallel num_threads(4)
+  {
+
+    int t_sum_v;
+    int t_prod_v;
+    float t_fsum_v;
+    performReductions(N, input_array, t_sum_v, t_prod_v, t_fsum_v);
+    if (t_sum_v != kExpectedSum)
+      total_errors++;
+    if (t_prod_v != kExpectedProd)
+      total_errors++;
+    if (t_fsum_v != kExpectedFsum)
+      total_errors++;
+  }
+#pragma omp parallel num_threads(4)
+  {
+    int t_min_v;
+    int t_max_v;
+    performMinMaxRed(t_min_v, t_max_v);
+    if (t_min_v != kExpectedMin)
+      total_errors++;
+    if (t_max_v != kExpectedMax)
+      total_errors++;
+  }
+  total_errors += checkUserDefinedReduction();
+  total_errors += performComplexReduction();
+#pragma omp parallel num_threads(4)
+  {
+    std::complex<double> result(1, 0);
+    result = doComplexReduction(arr);
+    if (std::abs(result.real() - kExpectedComplex.real()) > 1e-6 ||
+        std::abs(result.imag() - kExpectedComplex.imag()) > 1e-6) {
+      total_errors++;
+    }
+  }
+  if (total_errors != 0)
+    fprintf(stderr, "ERROR: reduction on private variable  %d\n", total_errors);
+
+  return total_errors;
+}

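For readers unfamiliar with the `original(private)` reduction modifier exercised by the test above, here is a minimal standalone sketch (a hypothetical example, assuming an OpenMP 6.0 compiler, not part of the patch). Per the assertions in the test, each thread's private copy of the original list item holds the combined reduction result after the worksharing loop:

#include <omp.h>
#include <stdio.h>

int main(void) {
  int data[10];
  for (int i = 0; i < 10; ++i)
    data[i] = i;
#pragma omp parallel num_threads(4)
  {
    int sum = 0; // thread-private original list item
#pragma omp for reduction(original(private), + : sum)
    for (int i = 0; i < 10; ++i)
      sum += data[i];
    // As the test above asserts, every thread observes the full result (45).
    printf("thread %d: sum = %d\n", omp_get_thread_num(), sum);
  }
  return 0;
}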
>From e44a65ed98ad896d0c0c3b1e10937a19f786b9ef Mon Sep 17 00:00:00 2001
From: Kareem Ergawy <kareem.ergawy at amd.com>
Date: Wed, 11 Jun 2025 10:36:12 +0200
Subject: [PATCH 57/61] [flang][OpenMP] Map basic `local` specifiers to
 `private` clauses (#142735)

Starts the effort to map `do concurrent` locality specifiers to OpenMP
clauses. This PR adds support for basic specifiers (no `init` or `copy`
regions yet).
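The conversion below walks three parallel ranges (the `local` operands, their localizer symbols, and the matching region arguments) with `llvm::zip_equal`, which asserts in debug builds that all ranges have the same length. A minimal sketch of that idiom, independent of this patch:

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"

void zipDemo() {
  llvm::SmallVector<int> Ops{1, 2};
  llvm::SmallVector<llvm::StringRef> Syms{"a", "b"};
  llvm::SmallVector<double> Args{0.5, 1.5};
  // Yields one tuple per position; mismatched lengths assert in debug builds.
  for (auto [Op, Sym, Arg] : llvm::zip_equal(Ops, Syms, Args))
    (void)Op, (void)Sym, (void)Arg;
}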
---
 .../OpenMP/DoConcurrentConversion.cpp         | 55 ++++++++++++++++++-
 .../locality_specifiers_simple.mlir           | 48 ++++++++++++++++
 2 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir

diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
index 0fdb302fe10ca..283c3052c166c 100644
--- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
+++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp
@@ -7,9 +7,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/OpenMP/Passes.h"
 #include "flang/Optimizer/OpenMP/Utils.h"
+#include "flang/Support/OpenMP-utils.h"
 #include "mlir/Analysis/SliceAnalysis.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
 #include "mlir/IR/IRMapping.h"
@@ -308,10 +310,47 @@ class DoConcurrentConversion
               fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper,
               const mlir::omp::LoopNestOperands &clauseOps,
               bool isComposite) const {
+    mlir::omp::WsloopOperands wsloopClauseOps;
+
+    // For `local` (and `local_init`) operands, emit corresponding `private`
+    // clauses and attach these clauses to the workshare loop.
+    if (!loop.getLocalOperands().empty())
+      for (auto [op, sym, arg] : llvm::zip_equal(
+               loop.getLocalOperands(),
+               loop.getLocalSymsAttr().getAsRange<mlir::SymbolRefAttr>(),
+               loop.getRegionLocalArgs())) {
+        auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom<
+            fir::LocalitySpecifierOp>(loop, sym);
+        if (localizer.getLocalitySpecifierType() ==
+            fir::LocalitySpecifierType::LocalInit)
+          TODO(localizer.getLoc(),
+               "local_init conversion is not supported yet");
+
+        if (!localizer.getInitRegion().empty())
+          TODO(localizer.getLoc(),
+               "non-empty `init` regions are not supported yet");
+
+        auto oldIP = rewriter.saveInsertionPoint();
+        rewriter.setInsertionPointAfter(localizer);
+        auto privatizer = rewriter.create<mlir::omp::PrivateClauseOp>(
+            localizer.getLoc(), sym.getLeafReference().str() + ".omp",
+            localizer.getTypeAttr().getValue(),
+            mlir::omp::DataSharingClauseType::Private);
+        rewriter.restoreInsertionPoint(oldIP);
+
+        wsloopClauseOps.privateVars.push_back(op);
+        wsloopClauseOps.privateSyms.push_back(
+            mlir::SymbolRefAttr::get(privatizer));
+      }
 
-    auto wsloopOp = rewriter.create<mlir::omp::WsloopOp>(loop.getLoc());
+    auto wsloopOp =
+        rewriter.create<mlir::omp::WsloopOp>(loop.getLoc(), wsloopClauseOps);
     wsloopOp.setComposite(isComposite);
-    rewriter.createBlock(&wsloopOp.getRegion());
+
+    Fortran::common::openmp::EntryBlockArgs wsloopArgs;
+    wsloopArgs.priv.vars = wsloopClauseOps.privateVars;
+    Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs,
+                                           wsloopOp.getRegion());
 
     auto loopNestOp =
         rewriter.create<mlir::omp::LoopNestOp>(loop.getLoc(), clauseOps);
@@ -324,6 +363,18 @@ class DoConcurrentConversion
     rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back());
     rewriter.create<mlir::omp::YieldOp>(loop->getLoc());
 
+    // `local` region arguments are transferred/cloned from the `do concurrent`
+    // loop to the loopnest op when the region is cloned above. However, these
+    // region arguments should be on the workshare loop's region instead.
+    for (auto [wsloopArg, loopNestArg] :
+         llvm::zip_equal(wsloopOp.getRegion().getArguments(),
+                         loopNestOp.getRegion().getArguments().drop_front(
+                             clauseOps.loopLowerBounds.size())))
+      rewriter.replaceAllUsesWith(loopNestArg, wsloopArg);
+
+    for (unsigned i = 0; i < loop.getLocalVars().size(); ++i)
+      loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size());
+
     return loopNestOp;
   }
 
diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
new file mode 100644
index 0000000000000..160c1df040680
--- /dev/null
+++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir
@@ -0,0 +1,48 @@
+// Tests mapping `local` locality specifier to `private` clauses for a simple
+// case (no `init` or `copy` regions).
+
+// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s
+
+fir.local {type = local} @_QFlocal_spec_translationElocal_var_private_f32 : f32
+
+func.func @_QPlocal_spec_translation() {
+  %3 = fir.alloca f32 {bindc_name = "local_var", uniq_name = "_QFlocal_spec_translationElocal_var"}
+  %4:2 = hlfir.declare %3 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+
+  %c4_i32 = arith.constant 4 : index
+  %c11_i32 = arith.constant 11 : index
+  %c1 = arith.constant 1 : index
+
+  fir.do_concurrent {
+    %7 = fir.alloca i32 {bindc_name = "i"}
+    %8:2 = hlfir.declare %7 {uniq_name = "_QFlocal_spec_translationEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+
+    fir.do_concurrent.loop (%arg0) = (%c4_i32) to (%c11_i32) step (%c1)
+      local(@_QFlocal_spec_translationElocal_var_private_f32 %4#0 -> %arg1 : !fir.ref<f32>) {
+      %9 = fir.convert %arg0 : (index) -> i32
+      fir.store %9 to %8#0 : !fir.ref<i32>
+
+      %10:2 = hlfir.declare %arg1 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+      %cst = arith.constant 4.200000e+01 : f32
+      hlfir.assign %cst to %10#0 : f32, !fir.ref<f32>
+    }
+  }
+  return
+}
+
+// CHECK: omp.private {type = private} @[[PRIVATIZER:.*local_spec_translationElocal_var.*.omp]] : f32
+
+// CHECK: func.func @_QPlocal_spec_translation
+// CHECK:   %[[LOCAL_VAR:.*]] = fir.alloca f32 {bindc_name = "local_var", {{.*}}}
+// CHECK:   %[[LOCAL_VAR_DECL:.*]]:2 = hlfir.declare %[[LOCAL_VAR]]
+// CHECK:   omp.parallel {
+// CHECK:     omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_VAR_DECL]]#0 -> %[[LOCAL_ARG:.*]] : !fir.ref<f32>) {
+// CHECK:       omp.loop_nest {{.*}} {
+// CHECK:       %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_ARG]]
+// CHECK:       %[[C42:.*]] = arith.constant
+// CHECK:       hlfir.assign %[[C42]] to %[[PRIV_DECL]]#0
+// CHECK:       omp.yield
+// CHECK:     }
+// CHECK:   }
+// CHECK:   omp.terminator
+// CHECK: }

>From 7460c700ae3026d927952f911d0e667de6e0c18b Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash at gmail.com>
Date: Wed, 11 Jun 2025 04:42:05 -0400
Subject: [PATCH 58/61] [MemCpyOpt] handle memcpy from memset in more cases
 (#140954)

This aims to reduce the divergence between the initial checks in this
function and processMemCpyMemCpyDependence (in particular, by adding
handling of offsets), with the goal of eventually reducing the duplication
there and improving this pass in other ways.
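In source-level terms, the new offset handling lets the pass fold a copy that starts partway into a memset region. A hedged C++ illustration (hypothetical functions, mirroring the `test_offset_memsetcpy` case added below):

#include <cstring>

void before(char *result) {
  char a[16];
  std::memset(a, 0, 12);         // a[0..11] are zero
  std::memcpy(result, a + 4, 8); // reads only memset-written bytes
}

void after(char *result) { // what MemCpyOpt can now produce
  char a[16];
  std::memset(a, 0, 12);
  std::memset(result, 0, 8); // the copy becomes a direct memset
}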
---
 .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 74 ++++++++++++-------
 .../MemCpyOpt/memset-memcpy-oversized.ll      | 47 ++++++++++++
 .../MemCpyOpt/memset-memcpy-to-2x-memset.ll   |  3 +-
 llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll |  2 +-
 .../MemCpyOpt/variable-sized-memset-memcpy.ll |  2 +-
 5 files changed, 97 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a78e3770384ae..960001bf880c6 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1364,8 +1364,9 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
   return true;
 }
 
-/// Determine whether the instruction has undefined content for the given Size,
-/// either because it was freshly alloca'd or started its lifetime.
+/// Determine whether the pointer V had only undefined content (due to Def) up
+/// to the given Size, either because it was freshly alloca'd or started its
+/// lifetime.
 static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
                              MemoryDef *Def, Value *Size) {
   if (MSSA->isLiveOnEntryDef(Def))
@@ -1400,6 +1401,24 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
   return false;
 }
 
+// If the memcpy is larger than the preceding memset, but the memory was
+// undef prior to that memset, we can just ignore the tail. Technically we're
+// only interested in the
+// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but
+// as we can't easily represent this location (hasUndefContents uses mustAlias
+// which cannot deal with offsets), we use the full 0..CopySize range.
+static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy,
+                                  MemIntrinsic *MemSrc, BatchAAResults &BAA) {
+  Value *CopySize = MemCpy->getLength();
+  MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+  MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc);
+  MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
+      MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA);
+  if (auto *MD = dyn_cast<MemoryDef>(Clobber))
+    if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
+      return true;
+  return false;
+}
+
 /// Transform memcpy to memset when its source was just memset.
 /// In other words, turn:
 /// \code
@@ -1415,19 +1434,25 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V,
 bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
                                                MemSetInst *MemSet,
                                                BatchAAResults &BAA) {
-  // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
-  // memcpying from the same address. Otherwise it is hard to reason about.
-  if (!BAA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
-    return false;
-
   Value *MemSetSize = MemSet->getLength();
   Value *CopySize = MemCpy->getLength();
 
-  if (MemSetSize != CopySize) {
-    // Make sure the memcpy doesn't read any more than what the memset wrote.
-    // Don't worry about sizes larger than i64.
+  int64_t MOffset = 0;
+  const DataLayout &DL = MemCpy->getModule()->getDataLayout();
+  // We can only transform memcpys where the dest of one is the source of the
+  // other, or where the memory transfer has a known offset from the memset.
+  if (MemCpy->getSource() != MemSet->getDest()) {
+    std::optional<int64_t> Offset =
+        MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL);
+    if (!Offset || *Offset < 0)
+      return false;
+    MOffset = *Offset;
+  }
 
-    // A known memset size is required.
+  if (MOffset != 0 || MemSetSize != CopySize) {
+    // Make sure the memcpy doesn't read any more than what the memset wrote,
+    // other than undef. Don't worry about sizes larger than i64. A known memset
+    // size is required.
     auto *CMemSetSize = dyn_cast<ConstantInt>(MemSetSize);
     if (!CMemSetSize)
       return false;
@@ -1436,23 +1461,18 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
     auto *CCopySize = dyn_cast<ConstantInt>(CopySize);
     if (!CCopySize)
       return false;
-    if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) {
-      // If the memcpy is larger than the memset, but the memory was undef prior
-      // to the memset, we can just ignore the tail. Technically we're only
-      // interested in the bytes from MemSetSize..CopySize here, but as we can't
-      // easily represent this location, we use the full 0..CopySize range.
-      MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
-      bool CanReduceSize = false;
-      MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
-      MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
-          MemSetAccess->getDefiningAccess(), MemCpyLoc, BAA);
-      if (auto *MD = dyn_cast<MemoryDef>(Clobber))
-        if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize))
-          CanReduceSize = true;
-
-      if (!CanReduceSize)
+    if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) {
+      if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA))
         return false;
-      CopySize = MemSetSize;
+      // Clip the memcpy to the bounds of the memset
+      if (MOffset == 0)
+        CopySize = MemSetSize;
+      else
+        CopySize =
+            ConstantInt::get(CopySize->getType(),
+                             CMemSetSize->getZExtValue() <= (uint64_t)MOffset
+                                 ? 0
+                                 : CMemSetSize->getZExtValue() - MOffset);
     }
   }
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
index 1c3896407e950..0c16f34590fc7 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll
@@ -187,6 +187,53 @@ define void @test_write_before_memset_in_both_regions(ptr %result) {
   ret void
 }
 
+define void @test_negative_offset_memset(ptr %result) {
+; CHECK-LABEL: @test_negative_offset_memset(
+; CHECK-NEXT:    [[A1:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A1]], i64 12, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [ 16 x i8 ], align 8
+  %b = getelementptr i8, ptr %a, i32 4
+  call void @llvm.memset.p0.i64(ptr align 8 %b, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 12, i1 false)
+  ret void
+}
+
+define void @test_offset_memsetcpy(ptr %result) {
+; CHECK-LABEL: @test_offset_memsetcpy(
+; CHECK-NEXT:    [[A1:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[A1]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 8, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [ 16 x i8 ], align 8
+  %b = getelementptr i8, ptr %a, i32 4
+  call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %b, i64 12, i1 false)
+  ret void
+}
+
+define void @test_two_memset(ptr %result) {
+; CHECK-LABEL: @test_two_memset(
+; CHECK-NEXT:    [[A:%.*]] = alloca [16 x i8], align 8
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i8, ptr [[A]], i32 12
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 8 [[B]], i8 1, i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+  %a = alloca [ 16 x i8 ], align 8
+  %b = getelementptr i8, ptr %a, i32 12
+  call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false)
+  call void @llvm.memset.p0.i64(ptr align 8 %b, i8 1, i64 4, i1 false)
+  call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false)
+  ret void
+}
+
 declare ptr @malloc(i64)
 declare void @free(ptr)
 
diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
index 47474e8dac051..18488f03a2d88 100644
--- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
+++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll
@@ -73,11 +73,10 @@ define void @test_different_source_gep(ptr %dst1, ptr %dst2, i8 %c) {
 ; CHECK-LABEL: @test_different_source_gep(
 ; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
 ; CHECK-NEXT:    [[P:%.*]] = getelementptr i8, ptr [[DST1]], i64 64
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr [[DST2:%.*]], ptr [[P]], i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr [[DST2:%.*]], i8 [[C]], i64 64, i1 false)
 ; CHECK-NEXT:    ret void
 ;
   call void @llvm.memset.p0.i64(ptr %dst1, i8 %c, i64 128, i1 false)
-  ; FIXME: We could optimize this as well.
   %p = getelementptr i8, ptr %dst1, i64 64
   call void @llvm.memcpy.p0.p0.i64(ptr %dst2, ptr %p, i64 64, i1 false)
   ret void
diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
index 5e13432746bf7..0e312bc42d463 100644
--- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
+++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll
@@ -19,7 +19,7 @@ define i32 @foo(i1 %z) {
 ; CHECK:       for.body3.lr.ph:
 ; CHECK-NEXT:    br label [[FOR_INC7_1]]
 ; CHECK:       for.inc7.1:
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.p0.i64(ptr align 4 [[A]], i8 0, i64 4, i1 false)
 ; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[A]], align 4
 ; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
index a834d2465dfa5..d5b1ab9b2f299 100644
--- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
+++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll
@@ -18,7 +18,7 @@ define void @test(ptr %src, i8 %c, i64 %size) {
   ret void
 }
 
-; Differing sizes, so left as it is.
+; Differing sizes, but this would be UB if size1 < size2, since the memcpy would read past the end of the first alloca.
 define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) {
 ; CHECK-LABEL: @negative_test(
 ; CHECK-NEXT:    [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1

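The clipping step in performMemCpyToMemSetOptzn reduces to a small computation once the overread is known to be undef; a standalone sketch (hypothetical helper, mirroring the size-clipping ternary added in this patch):

#include <cstdint>

// New copy size when the copy starts MOffset bytes into a memset of
// MemSetSize bytes and the tail past the memset is known undef.
uint64_t clippedCopySize(uint64_t MemSetSize, uint64_t MOffset) {
  return MemSetSize <= MOffset ? 0 : MemSetSize - MOffset;
}
// e.g. MemSetSize = 12, MOffset = 4 -> copy 8 bytes, as in
// test_offset_memsetcpy above.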
>From 2bcb596f887219f450dc7e895f4f9336f39f738c Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer at sony.com>
Date: Tue, 10 Jun 2025 19:55:58 +0100
Subject: [PATCH 59/61] [DLCov] Origin-Tracking: Add CMake options

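A configure invocation exercising the new value might look like the following (a usage sketch; the generator and source path are placeholders):

  cmake -G Ninja -DLLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=COVERAGE_AND_ORIGIN ../llvm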
---
 llvm/CMakeLists.txt                        |  4 ++--
 llvm/cmake/modules/HandleLLVMOptions.cmake |  3 +++
 llvm/docs/CMake.rst                        | 13 ++++++++-----
 llvm/include/llvm/Config/config.h.cmake    |  8 ++++++++
 4 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 206f009b45f59..9b6f45c57bcd9 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -569,8 +569,8 @@ endif()
 option(LLVM_ENABLE_CRASH_DUMPS "Turn on memory dumps on crashes. Currently only implemented on Windows." OFF)
 
 set(LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING "DISABLED" CACHE STRING
-  "Enhance Debugify's line number coverage tracking; enabling this is ABI-breaking. Can be DISABLED, or COVERAGE.")
-set_property(CACHE LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING PROPERTY STRINGS DISABLED COVERAGE)
+  "Enhance debugify's line number coverage tracking; enabling this is abi-breaking. Can be DISABLED, COVERAGE, or COVERAGE_AND_ORIGIN.")
+set_property(CACHE LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING PROPERTY STRINGS DISABLED COVERAGE COVERAGE_AND_ORIGIN)
 
 option(LLVM_EXPERIMENTAL_KEY_INSTRUCTIONS
   "Add additional fields to DILocations to support Key Instructions" OFF)
diff --git a/llvm/cmake/modules/HandleLLVMOptions.cmake b/llvm/cmake/modules/HandleLLVMOptions.cmake
index 9721dacbcbe84..c35d9763a3301 100644
--- a/llvm/cmake/modules/HandleLLVMOptions.cmake
+++ b/llvm/cmake/modules/HandleLLVMOptions.cmake
@@ -200,6 +200,9 @@ string(TOUPPER "${LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING}" uppercase_LLVM_ENABLE
 
 if( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE" )
   set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "COVERAGE_AND_ORIGIN" )
+  set( LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING 1 )
+  set( LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING 1 )
 elseif( uppercase_LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING STREQUAL "DISABLED" OR NOT DEFINED LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING )
   # The DISABLED setting is default and requires no additional defines.
 else()
diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst
index 674e4969c6912..72f19fd353922 100644
--- a/llvm/docs/CMake.rst
+++ b/llvm/docs/CMake.rst
@@ -482,11 +482,14 @@ enabled sub-projects. Nearly all of these variable names begin with
 **LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING**:STRING
   Enhances Debugify's ability to detect line number errors by storing extra
   information inside Instructions, removing false positives from Debugify's
-  results at the cost of performance. Allowed values are `DISABLED` (default)
-  and `COVERAGE`. `COVERAGE` tracks whether and why a line number was
-  intentionally dropped or not generated for an instruction, allowing Debugify
-  to avoid reporting these as errors; this comes with a small performance cost
-  of ~0.1%. `COVERAGE` is an ABI-breaking option.
+  results at the cost of performance. Allowed values are `DISABLED` (default),
+  `COVERAGE`, and `COVERAGE_AND_ORIGIN`. `COVERAGE` tracks whether and why a
+  line number was intentionally dropped or not generated for an instruction,
+  allowing Debugify to avoid reporting these as errors; this comes with a small
+  performance cost of ~0.1%. `COVERAGE_AND_ORIGIN` additionally stores a
+  stacktrace of the point where each DebugLoc is unintentionally dropped,
+  allowing for much easier bug triaging at the cost of a ~10x performance
+  slowdown. `COVERAGE` and `COVERAGE_AND_ORIGIN` are ABI-breaking options.
 
 **LLVM_ENABLE_DIA_SDK**:BOOL
   Enable building with MSVC DIA SDK for PDB debugging support. Available
diff --git a/llvm/include/llvm/Config/config.h.cmake b/llvm/include/llvm/Config/config.h.cmake
index 06d4756397911..d4397d74b59d6 100644
--- a/llvm/include/llvm/Config/config.h.cmake
+++ b/llvm/include/llvm/Config/config.h.cmake
@@ -19,6 +19,14 @@
 /* Define to 1 to enable crash memory dumps, and to 0 otherwise. */
 #cmakedefine01 LLVM_ENABLE_CRASH_DUMPS
 
+/* Define to 1 to enable expensive debug location coverage checking, and to 0
+   otherwise. */
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+
+/* Define to 1 to enable expensive tracking of the origin of debug location
+   coverage bugs, and to 0 otherwise. */
+#cmakedefine01 LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+
 /* Define to 1 to prefer forward slashes on Windows, and to 0 prefer
    backslashes. */
 #cmakedefine01 LLVM_WINDOWS_PREFER_FORWARD_SLASH

>From fe689ddeffec55986babe116da1a321a804d8093 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer at sony.com>
Date: Tue, 10 Jun 2025 19:58:09 +0100
Subject: [PATCH 60/61] [DLCov] Origin-Tracking: SymbolizeAddresses

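A hedged usage sketch of the new hooks (only available in builds configured with COVERAGE_AND_ORIGIN, and note that only the MaxDepth = 16 instantiation of getStackTrace is provided in this patch):

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/Support/Signals.h"
#include <array>
#include <string>

void demoSymbolize() {
#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
  // Capture up to 16 frames of the current call stack.
  std::array<void *, 16> Frames;
  int Depth = llvm::sys::getStackTrace(Frames);

  // Batch-symbolize the collected addresses.
  llvm::AddressSet Addresses;
  for (int I = 0; I < Depth; ++I)
    Addresses.insert(Frames[I]);
  llvm::SymbolizedAddressMap Symbolized;
  llvm::sys::symbolizeAddresses(Addresses, Symbolized);
#endif
}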
---
 llvm/include/llvm/Support/Signals.h  |  40 +++++++++
 llvm/lib/Support/Signals.cpp         | 116 +++++++++++++++++++++++++++
 llvm/lib/Support/Unix/Signals.inc    |  15 ++++
 llvm/lib/Support/Windows/Signals.inc |   5 ++
 4 files changed, 176 insertions(+)

diff --git a/llvm/include/llvm/Support/Signals.h b/llvm/include/llvm/Support/Signals.h
index 6ce26acdd458e..a6f99d8bbdc95 100644
--- a/llvm/include/llvm/Support/Signals.h
+++ b/llvm/include/llvm/Support/Signals.h
@@ -14,7 +14,9 @@
 #ifndef LLVM_SUPPORT_SIGNALS_H
 #define LLVM_SUPPORT_SIGNALS_H
 
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Compiler.h"
+#include <array>
 #include <cstdint>
 #include <string>
 
@@ -22,6 +24,22 @@ namespace llvm {
 class StringRef;
 class raw_ostream;
 
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+// Typedefs that are convenient but only used by the stack-trace-collection code
+// added if DebugLoc origin-tracking is enabled.
+template <typename T, typename Enable> struct DenseMapInfo;
+template <typename ValueT, typename ValueInfoT> class DenseSet;
+namespace detail {
+template <typename KeyT, typename ValueT> struct DenseMapPair;
+}
+template <typename KeyT, typename ValueT, typename KeyInfoT, typename BucketT>
+class DenseMap;
+using AddressSet = DenseSet<void *, DenseMapInfo<void *, void>>;
+using SymbolizedAddressMap =
+    DenseMap<void *, std::string, DenseMapInfo<void *, void>,
+             detail::DenseMapPair<void *, std::string>>;
+#endif
+
 namespace sys {
 
 /// This function runs all the registered interrupt handlers, including the
@@ -57,6 +75,28 @@ LLVM_ABI void DisableSystemDialogsOnCrash();
 ///        specified, the entire frame is printed.
 LLVM_ABI void PrintStackTrace(raw_ostream &OS, int Depth = 0);
 
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#ifdef NDEBUG
+#error DebugLoc origin-tracking should not be enabled in Release builds.
+#endif
+/// Populates the given array with a stack trace of the current program, up to
+/// MaxDepth frames. Returns the number of frames captured, which will be
+/// inserted into \p StackTrace from index 0. All entries after the returned
+/// depth will be unmodified. NB: This is only intended to be used for
+/// introspection of LLVM by Debugify, will not be enabled in release builds,
+/// and should not be relied on for other purposes.
+template <unsigned long MaxDepth>
+int getStackTrace(std::array<void *, MaxDepth> &StackTrace);
+
+/// Takes a set of \p Addresses, symbolizes them and stores the result in the
+/// provided \p SymbolizedAddresses map.
+/// NB: This is only intended to be used for introspection of LLVM by
+/// Debugify, will not be enabled in release builds, and should not be relied
+/// on for other purposes.
+void symbolizeAddresses(AddressSet &Addresses,
+                        SymbolizedAddressMap &SymbolizedAddresses);
+#endif
+
 // Run all registered signal handlers.
 LLVM_ABI void RunSignalHandlers();
 
diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp
index 9f9030e79d104..50b0d6e78ddd1 100644
--- a/llvm/lib/Support/Signals.cpp
+++ b/llvm/lib/Support/Signals.cpp
@@ -253,6 +253,122 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
   return true;
 }
 
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+void sys::symbolizeAddresses(AddressSet &Addresses,
+                             SymbolizedAddressMap &SymbolizedAddresses) {
+  assert(!DisableSymbolicationFlag && !getenv(DisableSymbolizationEnv) &&
+         "Debugify origin stacktraces require symbolization to be enabled.");
+
+  // Convert Set of Addresses to ordered list.
+  SmallVector<void *, 0> AddressList(Addresses.begin(), Addresses.end());
+  if (AddressList.empty())
+    return;
+  int NumAddresses = AddressList.size();
+  llvm::sort(AddressList);
+
+  // Use the llvm-symbolizer tool to symbolize the stack traces. First check
+  // the symbolizer-path environment variable, then look in $PATH.
+  ErrorOr<std::string> LLVMSymbolizerPathOrErr = std::error_code();
+  if (const char *Path = getenv(LLVMSymbolizerPathEnv)) {
+    LLVMSymbolizerPathOrErr = sys::findProgramByName(Path);
+  }
+  if (!LLVMSymbolizerPathOrErr)
+    LLVMSymbolizerPathOrErr = sys::findProgramByName("llvm-symbolizer");
+  assert(!!LLVMSymbolizerPathOrErr &&
+         "Debugify origin stacktraces require llvm-symbolizer.");
+  const std::string &LLVMSymbolizerPath = *LLVMSymbolizerPathOrErr;
+
+  // Try to guess the main executable name, since we don't have argv0 available
+  // here.
+  std::string MainExecutableName = sys::fs::getMainExecutable(nullptr, nullptr);
+
+  BumpPtrAllocator Allocator;
+  StringSaver StrPool(Allocator);
+  std::vector<const char *> Modules(NumAddresses, nullptr);
+  std::vector<intptr_t> Offsets(NumAddresses, 0);
+  if (!findModulesAndOffsets(AddressList.data(), NumAddresses, Modules.data(),
+                             Offsets.data(), MainExecutableName.c_str(),
+                             StrPool))
+    return;
+  int InputFD;
+  SmallString<32> InputFile, OutputFile;
+  sys::fs::createTemporaryFile("symbolizer-input", "", InputFD, InputFile);
+  sys::fs::createTemporaryFile("symbolizer-output", "", OutputFile);
+  FileRemover InputRemover(InputFile.c_str());
+  FileRemover OutputRemover(OutputFile.c_str());
+
+  {
+    raw_fd_ostream Input(InputFD, true);
+    for (int i = 0; i < NumAddresses; i++) {
+      if (Modules[i])
+        Input << Modules[i] << " " << (void *)Offsets[i] << "\n";
+    }
+  }
+
+  std::optional<StringRef> Redirects[] = {InputFile.str(), OutputFile.str(),
+                                          StringRef("")};
+  StringRef Args[] = {"llvm-symbolizer", "--functions=linkage", "--inlining",
+#ifdef _WIN32
+                      // Pass --relative-address on Windows so that we don't
+                      // have to add ImageBase from PE file.
+                      // FIXME: Make this the default for llvm-symbolizer.
+                      "--relative-address",
+#endif
+                      "--demangle"};
+  int RunResult =
+      sys::ExecuteAndWait(LLVMSymbolizerPath, Args, std::nullopt, Redirects);
+  if (RunResult != 0)
+    return;
+
+  // This report format is based on the sanitizer stack trace printer.  See
+  // sanitizer_stacktrace_printer.cc in compiler-rt.
+  auto OutputBuf = MemoryBuffer::getFile(OutputFile.c_str());
+  if (!OutputBuf)
+    return;
+  StringRef Output = OutputBuf.get()->getBuffer();
+  SmallVector<StringRef, 32> Lines;
+  Output.split(Lines, "\n");
+  auto CurLine = Lines.begin();
+  for (int i = 0; i < NumAddresses; i++) {
+    assert(!SymbolizedAddresses.contains(AddressList[i]));
+    std::string &SymbolizedAddr = SymbolizedAddresses[AddressList[i]];
+    raw_string_ostream OS(SymbolizedAddr);
+    if (!Modules[i]) {
+      OS << format_ptr(AddressList[i]) << '\n';
+      continue;
+    }
+    // Read pairs of lines (function name and file/line info) until we
+    // encounter empty line.
+    for (bool IsFirst = true;; IsFirst = false) {
+      if (CurLine == Lines.end())
+        return;
+      StringRef FunctionName = *CurLine++;
+      if (FunctionName.empty())
+        break;
+      // Add indentation for lines after the first; we use 3 spaces, because
+      // currently that aligns with the expected indentation that will be added
+      // to the first line by Debugify.
+      if (!IsFirst)
+        OS << "   ";
+      OS << format_ptr(AddressList[i]) << ' ';
+      if (!FunctionName.starts_with("??"))
+        OS << FunctionName << ' ';
+      if (CurLine == Lines.end()) {
+        OS << '\n';
+        return;
+      }
+      StringRef FileLineInfo = *CurLine++;
+      if (!FileLineInfo.starts_with("??"))
+        OS << FileLineInfo;
+      else
+        OS << "(" << Modules[i] << '+' << format_hex(Offsets[i], 0) << ")";
+      OS << '\n';
+    }
+  }
+  return;
+}
+#endif
+
 static bool printMarkupContext(raw_ostream &OS, const char *MainExecutableName);
 
 LLVM_ATTRIBUTE_USED
diff --git a/llvm/lib/Support/Unix/Signals.inc b/llvm/lib/Support/Unix/Signals.inc
index 6668a2953b3b2..70b2cf7c756a9 100644
--- a/llvm/lib/Support/Unix/Signals.inc
+++ b/llvm/lib/Support/Unix/Signals.inc
@@ -507,6 +507,21 @@ static int dl_iterate_phdr_cb(dl_phdr_info *info, size_t size, void *arg) {
   return 0;
 }
 
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#if !defined(HAVE_BACKTRACE)
+#error DebugLoc origin-tracking currently requires `backtrace()`.
+#endif
+namespace llvm {
+namespace sys {
+template <unsigned long MaxDepth>
+int getStackTrace(std::array<void *, MaxDepth> &StackTrace) {
+  return backtrace(StackTrace.data(), MaxDepth);
+}
+template int getStackTrace<16ul>(std::array<void *, 16ul> &);
+} // namespace sys
+} // namespace llvm
+#endif
+
 /// If this is an ELF platform, we can find all loaded modules and their virtual
 /// addresses with dl_iterate_phdr.
 static bool findModulesAndOffsets(void **StackTrace, int Depth,
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index f11ad09f37139..0fe6967b291da 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -9,6 +9,7 @@
 // This file provides the Win32 specific implementation of the Signals class.
 //
 //===----------------------------------------------------------------------===//
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ExitCodes.h"
 #include "llvm/Support/FileSystem.h"
@@ -542,6 +543,10 @@ void sys::PrintStackTraceOnErrorSignal(StringRef Argv0,
 extern "C" VOID WINAPI RtlCaptureContext(PCONTEXT ContextRecord);
 #endif
 
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#error DebugLoc origin-tracking currently unimplemented for Windows.
+#endif
+
 static void LocalPrintStackTrace(raw_ostream &OS, PCONTEXT C) {
   STACKFRAME64 StackFrame{};
   CONTEXT Context{};

>From 37c0c6899b38513704a72fb658b5cebd2ceba51d Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer at sony.com>
Date: Tue, 10 Jun 2025 20:00:51 +0100
Subject: [PATCH 61/61] [DLCov] Origin-Tracking: Core implementation

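Taken together, the changes below make empty DebugLocs self-describing: construction records a stack trace, and each copy made through setDebugLoc appends another. A hedged sketch of how a consumer might inspect the result (hypothetical instruction references; requires a COVERAGE_AND_ORIGIN build):

#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instruction.h"

#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
void inspectOrigin(llvm::Instruction &NewInst, llvm::Instruction &OldInst) {
  // setDebugLoc() routes through DebugLoc::getCopied(), which appends a new
  // stack trace whenever the copied location already carries origin traces
  // (i.e. it is an unintentionally empty location).
  NewInst.setDebugLoc(OldInst.getDebugLoc());
  // Each (depth, frames) pair marks one point where the empty location was
  // created or propagated.
  for (const auto &[Depth, Frames] :
       NewInst.getDebugLoc().getOriginStackTraces())
    (void)Depth, (void)Frames;
}
#endif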
---
 llvm/include/llvm/IR/DebugLoc.h    | 49 +++++++++++++++++++++++++-----
 llvm/include/llvm/IR/Instruction.h |  2 +-
 llvm/lib/CodeGen/BranchFolding.cpp |  7 +++++
 llvm/lib/IR/DebugLoc.cpp           | 22 +++++++++++++-
 llvm/lib/IR/Instruction.cpp        |  2 +-
 5 files changed, 72 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/IR/DebugLoc.h b/llvm/include/llvm/IR/DebugLoc.h
index c3d0fb80354a4..1930199607204 100644
--- a/llvm/include/llvm/IR/DebugLoc.h
+++ b/llvm/include/llvm/IR/DebugLoc.h
@@ -27,6 +27,21 @@ namespace llvm {
   class Function;
 
 #if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+  struct DbgLocOrigin {
+    static constexpr unsigned long MaxDepth = 16;
+    using StackTracesTy =
+        SmallVector<std::pair<int, std::array<void *, MaxDepth>>, 0>;
+    StackTracesTy StackTraces;
+    DbgLocOrigin(bool ShouldCollectTrace);
+    void addTrace();
+    const StackTracesTy &getOriginStackTraces() const { return StackTraces; }
+  };
+#else
+  struct DbgLocOrigin {
+    DbgLocOrigin(bool) {}
+  };
+#endif
   // Used to represent different "kinds" of DebugLoc, expressing that the
   // instruction it is part of is either normal and should contain a valid
   // DILocation, or otherwise describing the reason why the instruction does
@@ -55,22 +70,29 @@ namespace llvm {
     Temporary
   };
 
-  // Extends TrackingMDNodeRef to also store a DebugLocKind, allowing Debugify
-  // to ignore intentionally-empty DebugLocs.
-  class DILocAndCoverageTracking : public TrackingMDNodeRef {
+  // Extends TrackingMDNodeRef to also store a DebugLocKind and Origin,
+  // allowing Debugify to ignore intentionally-empty DebugLocs and display the
+  // code responsible for generating unintentionally-empty DebugLocs.
+  // Currently we only need to track the Origin of this DILoc when using a
+  // DebugLoc that is not annotated (i.e. has DebugLocKind::Normal) and has a
+  // null DILocation, so we only collect the origin stacktrace in those cases.
+  class DILocAndCoverageTracking : public TrackingMDNodeRef,
+                                   public DbgLocOrigin {
   public:
     DebugLocKind Kind;
     // Default constructor for empty DebugLocs.
     DILocAndCoverageTracking()
-        : TrackingMDNodeRef(nullptr), Kind(DebugLocKind::Normal) {}
-    // Valid or nullptr MDNode*, normal DebugLocKind.
+        : TrackingMDNodeRef(nullptr), DbgLocOrigin(true),
+          Kind(DebugLocKind::Normal) {}
+    // Valid or nullptr MDNode*, no annotative DebugLocKind.
     DILocAndCoverageTracking(const MDNode *Loc)
-        : TrackingMDNodeRef(const_cast<MDNode *>(Loc)),
+        : TrackingMDNodeRef(const_cast<MDNode *>(Loc)), DbgLocOrigin(!Loc),
           Kind(DebugLocKind::Normal) {}
     LLVM_ABI DILocAndCoverageTracking(const DILocation *Loc);
     // Explicit DebugLocKind, which always means a nullptr MDNode*.
     DILocAndCoverageTracking(DebugLocKind Kind)
-        : TrackingMDNodeRef(nullptr), Kind(Kind) {}
+        : TrackingMDNodeRef(nullptr),
+          DbgLocOrigin(Kind == DebugLocKind::Normal), Kind(Kind) {}
   };
   template <> struct simplify_type<DILocAndCoverageTracking> {
     using SimpleType = MDNode *;
@@ -142,6 +164,19 @@ namespace llvm {
     static inline DebugLoc getDropped() { return DebugLoc(); }
 #endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+    const DbgLocOrigin::StackTracesTy &getOriginStackTraces() const {
+      return Loc.getOriginStackTraces();
+    }
+    DebugLoc getCopied() const {
+      DebugLoc NewDL = *this;
+      NewDL.Loc.addTrace();
+      return NewDL;
+    }
+#else
+    DebugLoc getCopied() const { return *this; }
+#endif
+
     /// Get the underlying \a DILocation.
     ///
     /// \pre !*this or \c isa<DILocation>(getAsMDNode()).
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 10fc9c1298607..1d22bdb0c3f43 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -507,7 +507,7 @@ class Instruction : public User,
   LLVM_ABI bool extractProfTotalWeight(uint64_t &TotalVal) const;
 
   /// Set the debug location information for this instruction.
-  void setDebugLoc(DebugLoc Loc) { DbgLoc = std::move(Loc); }
+  void setDebugLoc(DebugLoc Loc) { DbgLoc = std::move(Loc).getCopied(); }
 
   /// Return the debug location for this node as a DebugLoc.
   const DebugLoc &getDebugLoc() const { return DbgLoc; }
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index e0f7466ceacff..47fc0ec7549e0 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -42,6 +42,7 @@
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
@@ -933,7 +934,13 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
 
   // Sort by hash value so that blocks with identical end sequences sort
   // together.
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+  // If origin-tracking is enabled then MergePotentialElt is no longer a POD
+  // type, so we need std::sort instead.
+  std::sort(MergePotentials.begin(), MergePotentials.end());
+#else
   array_pod_sort(MergePotentials.begin(), MergePotentials.end());
+#endif
 
   // Walk through equivalence sets looking for actual exact matches.
   while (MergePotentials.size() > 1) {
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 0e65ddcec8934..05aad5d393547 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -9,11 +9,31 @@
 #include "llvm/IR/DebugLoc.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/IR/DebugInfo.h"
+
+#if LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING
+#include "llvm/Support/Signals.h"
+
+namespace llvm {
+DbgLocOrigin::DbgLocOrigin(bool ShouldCollectTrace) {
+  if (ShouldCollectTrace) {
+    auto &[Depth, StackTrace] = StackTraces.emplace_back();
+    Depth = sys::getStackTrace(StackTrace);
+  }
+}
+void DbgLocOrigin::addTrace() {
+  if (StackTraces.empty())
+    return;
+  auto &[Depth, StackTrace] = StackTraces.emplace_back();
+  Depth = sys::getStackTrace(StackTrace);
+}
+} // namespace llvm
+#endif
+
 using namespace llvm;
 
 #if LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 DILocAndCoverageTracking::DILocAndCoverageTracking(const DILocation *L)
-    : TrackingMDNodeRef(const_cast<DILocation *>(L)),
+    : TrackingMDNodeRef(const_cast<DILocation *>(L)), DbgLocOrigin(!L),
       Kind(DebugLocKind::Normal) {}
 #endif // LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING
 
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 109d516c61b7c..123bc7ecce01a 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1375,7 +1375,7 @@ void Instruction::copyMetadata(const Instruction &SrcInst,
       setMetadata(MD.first, MD.second);
   }
   if (WL.empty() || WLS.count(LLVMContext::MD_dbg))
-    setDebugLoc(SrcInst.getDebugLoc());
+    setDebugLoc(SrcInst.getDebugLoc().getCopied());
 }
 
 Instruction *Instruction::clone() const {


