[Lldb-commits] [clang] [compiler-rt] [flang] [libc] [libcxx] [lldb] [llvm] [mlir] [X86] Fast AVX-512-VNNI vpdpwssd tuning (PR #85033)

Thu Mar 14 01:02:04 PDT 2024

https://github.com/ganeshgit updated https://github.com/llvm/llvm-project/pull/85033

>From 7fdcfd61e3106e8f98f38699f1355d1bc2e69c6f Mon Sep 17 00:00:00 2001
From: Ganesh Gopalasubramanian <Ganesh.Gopalasubramanian at amd.com>
Date: Wed, 13 Mar 2024 12:08:46 +0530
Subject: [PATCH 01/63] [X86] Fast AVX-512-VNNI vpdpwssd tuning

---
 llvm/lib/Target/X86/X86.td                   | 12 +++++++++++-
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  2 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp         | 13 +++++++++----
 llvm/lib/Target/X86/X86InstrPredicates.td    |  1 +
 llvm/lib/Target/X86/X86TargetTransformInfo.h |  1 +
 llvm/test/CodeGen/X86/vpdpwssd.ll            | 15 +++++++++++++++
 6 files changed, 38 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/vpdpwssd.ll

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 8367f938c0ddfa..3d426ec612bb2a 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -683,6 +683,12 @@ def TuningFastGather
     : SubtargetFeature<"fast-gather", "HasFastGather", "true",
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
+// Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
+def TuningFastPWSSD
+    : SubtargetFeature<
+          "fast-pwssd", "HasFastPWSSD", "true",
+          "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
+
 def TuningPreferNoGather
     : SubtargetFeature<"prefer-no-gather", "PreferGather", "false",
                        "Prefer no gather instructions">;
@@ -1502,7 +1508,11 @@ def ProcessorFeatures {
     !listconcat(ZN2Tuning, ZN3AdditionalTuning);
   list<SubtargetFeature> ZN3Features =
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
-  list<SubtargetFeature> ZN4Tuning = ZN3Tuning;
+
+
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastPWSSD];
+  list<SubtargetFeature> ZN4Tuning =
+    !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
                                                   FeatureEVEX512,
                                                   FeatureCDI,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b5e3c0379a138..2367591152f7e1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -48215,7 +48215,7 @@ static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG,
 
   // We do not split for SSE at all, but we need to split vectors for AVX1 and
   // AVX2.
-  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() && 
+  if (!Subtarget.useAVX512Regs() && VT.is512BitVector() &&
       TLI.isTypeLegal(VT.getHalfNumVectorElementsVT(*DAG.getContext()))) {
     SDValue LoX, HiX;
     std::tie(LoX, HiX) = splitVector(X, DAG, DL);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b65f49527ae5dd..57010f1b02e1f1 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10572,8 +10572,11 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDrm:
   case X86::VPDPWSSDYrr:
   case X86::VPDPWSSDYrm: {
-    Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+    if (!Subtarget.hasFastPWSSD()) {
+      Patterns.push_back(MachineCombinerPattern::DPWSSD);
+      return true;
+    }
+    return false;
   }
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128m:
@@ -10581,9 +10584,11 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-    if (Subtarget.hasBWI())
+    if (Subtarget.hasBWI() && !Subtarget.hasFastPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
-    return true;
+      return true;
+    }
+    return false;
   }
   }
 }
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7dd51ba6c027ae..9fd846dd05c5d0 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,5 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
 def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
+def HasFastPWSSD: Predicate<"Subtarget->hasFastPWSSD()">;
 def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
 def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 1a5e6bc886aa67..691e318cda3d33 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,6 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningNoDomainDelayBlend,
       X86::TuningPreferShiftShuffle,
       X86::TuningFastImmVectorShift,
+      X86::TuningFastPWSSD,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
new file mode 100644
index 00000000000000..a044378fae255c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+
+define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
+; CHECK-LABEL: vpdpwssd_test:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpdpwssd %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
+  ret <16 x i32> %4
+}
+
+declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) #1
+
+attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }

>From 15397aef28a9730769fac5a261ce211cd8d8b3a0 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 13 Mar 2024 12:42:15 +0530
Subject: [PATCH 02/63] AMDGPU: Use list-table for metadata table (#85024)

The table syntax for sphinx is really insufferably whitespace dependent.
I've been meaning to convert the existing attribute and intrinsic tables
to use list-table, which is less painful to merge.
---
 llvm/docs/AMDGPUUsage.rst | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 99d7a482710f5e..fd9ad7fac19a95 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1317,15 +1317,16 @@ LLVM IR Metadata
 
 The AMDGPU backend implements the following LLVM IR metadata.
 
-.. table:: AMDGPU LLVM IR Metadata
+.. list-table:: AMDGPU LLVM IR Metatdata
   :name: amdgpu-llvm-ir-metadata-table
 
-  ==============================================   ==========================================================
-  LLVM IR Metadata                                   Description
-  ==============================================   ==========================================================
-  !amdgpu.last.use                                 Sets TH_LOAD_LU temporal hint on load instructions that support it.
-                                                   Takes priority over nontemporal hint (TH_LOAD_NT).
-  ==============================================   ==========================================================
+  * - Metadata Name
+    - Description
+    - Values
+  * - !amdgpu.last.use
+    - Sets TH_LOAD_LU temporal hint on load instructions that support it.
+      Takes priority over nontemporal hint (TH_LOAD_NT).
+    - {}
 
 LLVM IR Attributes
 ------------------

>From 72417addc2db391c2eb12af0c334351f22452fa7 Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy <vyacheslav.levytskyy at intel.com>
Date: Wed, 13 Mar 2024 08:32:01 +0100
Subject: [PATCH 03/63] [SPIR-V] Add implementation of G_SPLAT_VECTOR opcode
 and fix invalid types processing (#84766)

This PR:
* adds support for G_SPLAT_VECTOR generic opcode that may be legally
generated instead of G_BUILD_VECTOR by previous passes of the translator
(see https://github.com/llvm/llvm-project/pull/80378 for the source of
breaking changes);
* improves deduction of types for opaque pointers.

This PR also fixes the following issues:
* if a function has ptr argument(s), two functions that have different
SPIR-V type definitions may get identical LLVM function types and break
agreements of global register and duplicate checker;
* checks for pointer types do not account for TypedPointerType.

Update of tests:
* A test case is added to cover the issue with function ptr parameters.
* The first case, that is support for G_SPLAT_VECTOR generic opcode, is
covered by existing test cases.
* Multiple additional checks by `spirv-val` is added to cover more
possibilities of generation of invalid code.
---
 llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp   |  49 ++++++-
 llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp | 136 +++++++++++++-----
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp |  32 +++--
 llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h   |   3 +-
 .../Target/SPIRV/SPIRVInstructionSelector.cpp |  41 ++++++
 llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp  |   4 +-
 llvm/lib/Target/SPIRV/SPIRVUtils.h            |  26 ++++
 llvm/test/CodeGen/SPIRV/ComparePointers.ll    |   1 +
 llvm/test/CodeGen/SPIRV/capability-kernel.ll  |   1 +
 .../pointers/getelementptr-addressspace.ll    |   1 +
 .../SPIRV/pointers/getelementptr-base-type.ll |   1 +
 .../kernel-argument-pointer-addressspace.ll   |   1 +
 ...er-type-deduction-no-bitcast-to-generic.ll |   1 +
 .../pointers/kernel-argument-pointer-type.ll  |   1 +
 .../SPIRV/pointers/load-addressspace.ll       |   1 +
 .../pointers/store-operand-ptr-to-struct.ll   |   1 +
 .../SPIRV/pointers/struct-opaque-pointers.ll  |   2 +-
 .../pointers/two-bitcast-or-param-users.ll    |   1 +
 .../SPIRV/pointers/two-subsequent-bitcasts.ll |   1 +
 .../SPIRV/pointers/type-deduce-by-call-rev.ll |  28 ++++
 .../SPIRV/pointers/type-deduce-by-call.ll     |  28 ++++
 .../CodeGen/SPIRV/pointers/typeof-ptr-int.ll  |  29 ++++
 llvm/test/CodeGen/SPIRV/relationals.ll        |   1 +
 llvm/test/CodeGen/SPIRV/simple.ll             |   1 +
 .../AtomicCompareExchangeExplicit_cl20.ll     |   1 +
 .../SPIRV/transcoding/BitReversePref.ll       |   1 +
 .../CodeGen/SPIRV/transcoding/BuildNDRange.ll |   1 +
 .../SPIRV/transcoding/BuildNDRange_2.ll       |   1 +
 .../CodeGen/SPIRV/transcoding/ConvertPtr.ll   |   1 +
 .../SPIRV/transcoding/DecorationAlignment.ll  |   1 +
 .../transcoding/DecorationMaxByteOffset.ll    |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll |   1 +
 .../ExecutionMode_SPIR_to_SPIRV.ll            |   1 +
 .../SPIRV/transcoding/GlobalFunAnnotate.ll    |   1 +
 .../transcoding/OpenCL/atomic_cmpxchg.ll      |   1 +
 .../SPIRV/transcoding/OpenCL/atomic_legacy.ll |   1 +
 .../OpenCL/atomic_work_item_fence.ll          |   1 +
 .../SPIRV/transcoding/OpenCL/barrier.ll       |   1 +
 .../transcoding/OpenCL/sub_group_mask.ll      |   1 +
 .../transcoding/OpenCL/work_group_barrier.ll  |   1 +
 .../CodeGen/SPIRV/transcoding/atomic_flag.ll  |   1 +
 .../SPIRV/transcoding/atomic_load_store.ll    |   1 +
 .../test/CodeGen/SPIRV/transcoding/bitcast.ll |   1 +
 .../transcoding/block_w_struct_return.ll      |   1 +
 .../SPIRV/transcoding/builtin_calls.ll        |   1 +
 .../CodeGen/SPIRV/transcoding/builtin_vars.ll |   1 +
 .../transcoding/builtin_vars_arithmetics.ll   |   1 +
 .../SPIRV/transcoding/builtin_vars_opt.ll     |   1 +
 .../SPIRV/transcoding/check_ro_qualifier.ll   |   1 +
 .../CodeGen/SPIRV/transcoding/cl-types.ll     |   1 +
 .../CodeGen/SPIRV/transcoding/clk_event_t.ll  |   1 +
 .../SPIRV/transcoding/enqueue_kernel.ll       |   1 +
 .../SPIRV/transcoding/explicit-conversions.ll |   1 +
 .../SPIRV/transcoding/extract_insert_value.ll |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fadd.ll   |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll   |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll   |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fmod.ll   |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fmul.ll   |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fneg.ll   |   1 +
 .../fp_contract_reassoc_fast_mode.ll          |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/frem.ll   |   1 +
 llvm/test/CodeGen/SPIRV/transcoding/fsub.ll   |   1 +
 .../transcoding/get_image_num_mip_levels.ll   |   1 +
 .../CodeGen/SPIRV/transcoding/global_block.ll |   1 +
 .../CodeGen/SPIRV/transcoding/group_ops.ll    |   1 +
 .../test/CodeGen/SPIRV/transcoding/isequal.ll |   1 +
 .../SPIRV/transcoding/relationals_double.ll   |   1 +
 .../SPIRV/transcoding/relationals_float.ll    |   1 +
 .../SPIRV/transcoding/relationals_half.ll     |   1 +
 71 files changed, 382 insertions(+), 56 deletions(-)
 create mode 100644 llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll
 create mode 100644 llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll

diff --git a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
index 2d7a00bab38e91..f1fbe2ba1bc416 100644
--- a/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVCallLowering.cpp
@@ -85,6 +85,42 @@ static ConstantInt *getConstInt(MDNode *MD, unsigned NumOp) {
   return nullptr;
 }
 
+// If the function has pointer arguments, we are forced to re-create this
+// function type from the very beginning, changing PointerType by
+// TypedPointerType for each pointer argument. Otherwise, the same `Type*`
+// potentially corresponds to different SPIR-V function type, effectively
+// invalidating logic behind global registry and duplicates tracker.
+static FunctionType *
+fixFunctionTypeIfPtrArgs(SPIRVGlobalRegistry *GR, const Function &F,
+                         FunctionType *FTy, const SPIRVType *SRetTy,
+                         const SmallVector<SPIRVType *, 4> &SArgTys) {
+  if (F.getParent()->getNamedMetadata("spv.cloned_funcs"))
+    return FTy;
+
+  bool hasArgPtrs = false;
+  for (auto &Arg : F.args()) {
+    // check if it's an instance of a non-typed PointerType
+    if (Arg.getType()->isPointerTy()) {
+      hasArgPtrs = true;
+      break;
+    }
+  }
+  if (!hasArgPtrs) {
+    Type *RetTy = FTy->getReturnType();
+    // check if it's an instance of a non-typed PointerType
+    if (!RetTy->isPointerTy())
+      return FTy;
+  }
+
+  // re-create function type, using TypedPointerType instead of PointerType to
+  // properly trace argument types
+  const Type *RetTy = GR->getTypeForSPIRVType(SRetTy);
+  SmallVector<Type *, 4> ArgTys;
+  for (auto SArgTy : SArgTys)
+    ArgTys.push_back(const_cast<Type *>(GR->getTypeForSPIRVType(SArgTy)));
+  return FunctionType::get(const_cast<Type *>(RetTy), ArgTys, false);
+}
+
 // This code restores function args/retvalue types for composite cases
 // because the final types should still be aggregate whereas they're i32
 // during the translation to cope with aggregate flattening etc.
@@ -162,7 +198,7 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
 
   // If OriginalArgType is non-pointer, use the OriginalArgType (the type cannot
   // be legally reassigned later).
-  if (!OriginalArgType->isPointerTy())
+  if (!isPointerTy(OriginalArgType))
     return GR->getOrCreateSPIRVType(OriginalArgType, MIRBuilder, ArgAccessQual);
 
   // In case OriginalArgType is of pointer type, there are three possibilities:
@@ -179,8 +215,7 @@ static SPIRVType *getArgSPIRVType(const Function &F, unsigned ArgIdx,
     SPIRVType *ElementType = GR->getOrCreateSPIRVType(ByValRefType, MIRBuilder);
     return GR->getOrCreateSPIRVPointerType(
         ElementType, MIRBuilder,
-        addressSpaceToStorageClass(Arg->getType()->getPointerAddressSpace(),
-                                   ST));
+        addressSpaceToStorageClass(getPointerAddressSpace(Arg->getType()), ST));
   }
 
   for (auto User : Arg->users()) {
@@ -240,7 +275,6 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
       static_cast<const SPIRVSubtarget *>(&MIRBuilder.getMF().getSubtarget());
 
   // Assign types and names to all args, and store their types for later.
-  FunctionType *FTy = getOriginalFunctionType(F);
   SmallVector<SPIRVType *, 4> ArgTypeVRegs;
   if (VRegs.size() > 0) {
     unsigned i = 0;
@@ -255,7 +289,7 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
 
       if (Arg.hasName())
         buildOpName(VRegs[i][0], Arg.getName(), MIRBuilder);
-      if (Arg.getType()->isPointerTy()) {
+      if (isPointerTy(Arg.getType())) {
         auto DerefBytes = static_cast<unsigned>(Arg.getDereferenceableBytes());
         if (DerefBytes != 0)
           buildOpDecorate(VRegs[i][0], MIRBuilder,
@@ -322,7 +356,9 @@ bool SPIRVCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   MRI->setRegClass(FuncVReg, &SPIRV::IDRegClass);
   if (F.isDeclaration())
     GR->add(&F, &MIRBuilder.getMF(), FuncVReg);
+  FunctionType *FTy = getOriginalFunctionType(F);
   SPIRVType *RetTy = GR->getOrCreateSPIRVType(FTy->getReturnType(), MIRBuilder);
+  FTy = fixFunctionTypeIfPtrArgs(GR, F, FTy, RetTy, ArgTypeVRegs);
   SPIRVType *FuncTy = GR->getOrCreateOpTypeFunctionWithArgs(
       FTy, RetTy, ArgTypeVRegs, MIRBuilder);
   uint32_t FuncControl = getFunctionControl(F);
@@ -429,7 +465,6 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
     return false;
   MachineFunction &MF = MIRBuilder.getMF();
   GR->setCurrentFunc(MF);
-  FunctionType *FTy = nullptr;
   const Function *CF = nullptr;
   std::string DemangledName;
   const Type *OrigRetTy = Info.OrigRet.Ty;
@@ -444,7 +479,7 @@ bool SPIRVCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
     // TODO: support constexpr casts and indirect calls.
     if (CF == nullptr)
       return false;
-    if ((FTy = getOriginalFunctionType(*CF)) != nullptr)
+    if (FunctionType *FTy = getOriginalFunctionType(*CF))
       OrigRetTy = FTy->getReturnType();
   }
 
diff --git a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
index 575e903d05bb97..c5b901235402c1 100644
--- a/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVEmitIntrinsics.cpp
@@ -57,8 +57,14 @@ class SPIRVEmitIntrinsics
   bool TrackConstants = true;
   DenseMap<Instruction *, Constant *> AggrConsts;
   DenseSet<Instruction *> AggrStores;
+
+  // deduce values type
+  DenseMap<Value *, Type *> DeducedElTys;
+  Type *deduceElementType(Value *I);
+
   void preprocessCompositeConstants(IRBuilder<> &B);
   void preprocessUndefs(IRBuilder<> &B);
+
   CallInst *buildIntrWithMD(Intrinsic::ID IntrID, ArrayRef<Type *> Types,
                             Value *Arg, Value *Arg2, ArrayRef<Constant *> Imms,
                             IRBuilder<> &B) {
@@ -72,6 +78,7 @@ class SPIRVEmitIntrinsics
       Args.push_back(Imm);
     return B.CreateIntrinsic(IntrID, {Types}, Args);
   }
+
   void replaceMemInstrUses(Instruction *Old, Instruction *New, IRBuilder<> &B);
   void processInstrAfterVisit(Instruction *I, IRBuilder<> &B);
   void insertAssignPtrTypeIntrs(Instruction *I, IRBuilder<> &B);
@@ -156,6 +163,48 @@ static inline void reportFatalOnTokenType(const Instruction *I) {
                        false);
 }
 
+// Deduce and return a successfully deduced Type of the Instruction,
+// or nullptr otherwise.
+static Type *deduceElementTypeHelper(Value *I,
+                                     std::unordered_set<Value *> &Visited,
+                                     DenseMap<Value *, Type *> &DeducedElTys) {
+  // maybe already known
+  auto It = DeducedElTys.find(I);
+  if (It != DeducedElTys.end())
+    return It->second;
+
+  // maybe a cycle
+  if (Visited.find(I) != Visited.end())
+    return nullptr;
+  Visited.insert(I);
+
+  // fallback value in case when we fail to deduce a type
+  Type *Ty = nullptr;
+  // look for known basic patterns of type inference
+  if (auto *Ref = dyn_cast<AllocaInst>(I))
+    Ty = Ref->getAllocatedType();
+  else if (auto *Ref = dyn_cast<GetElementPtrInst>(I))
+    Ty = Ref->getResultElementType();
+  else if (auto *Ref = dyn_cast<GlobalValue>(I))
+    Ty = Ref->getValueType();
+  else if (auto *Ref = dyn_cast<AddrSpaceCastInst>(I))
+    Ty = deduceElementTypeHelper(Ref->getPointerOperand(), Visited,
+                                 DeducedElTys);
+
+  // remember the found relationship
+  if (Ty)
+    DeducedElTys[I] = Ty;
+
+  return Ty;
+}
+
+Type *SPIRVEmitIntrinsics::deduceElementType(Value *I) {
+  std::unordered_set<Value *> Visited;
+  if (Type *Ty = deduceElementTypeHelper(I, Visited, DeducedElTys))
+    return Ty;
+  return IntegerType::getInt8Ty(I->getContext());
+}
+
 void SPIRVEmitIntrinsics::replaceMemInstrUses(Instruction *Old,
                                               Instruction *New,
                                               IRBuilder<> &B) {
@@ -280,7 +329,7 @@ Instruction *SPIRVEmitIntrinsics::visitBitCastInst(BitCastInst &I) {
   // varying element types. In case of IR coming from older versions of LLVM
   // such bitcasts do not provide sufficient information, should be just skipped
   // here, and handled in insertPtrCastOrAssignTypeInstr.
-  if (I.getType()->isPointerTy()) {
+  if (isPointerTy(I.getType())) {
     I.replaceAllUsesWith(Source);
     I.eraseFromParent();
     return nullptr;
@@ -333,20 +382,10 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
   while (BitCastInst *BC = dyn_cast<BitCastInst>(Pointer))
     Pointer = BC->getOperand(0);
 
-  // Do not emit spv_ptrcast if Pointer is a GlobalValue of expected type.
-  GlobalValue *GV = dyn_cast<GlobalValue>(Pointer);
-  if (GV && GV->getValueType() == ExpectedElementType)
-    return;
-
-  // Do not emit spv_ptrcast if Pointer is a result of alloca with expected
-  // type.
-  AllocaInst *A = dyn_cast<AllocaInst>(Pointer);
-  if (A && A->getAllocatedType() == ExpectedElementType)
-    return;
-
-  // Do not emit spv_ptrcast if Pointer is a result of GEP of expected type.
-  GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Pointer);
-  if (GEPI && GEPI->getResultElementType() == ExpectedElementType)
+  // Do not emit spv_ptrcast if Pointer's element type is ExpectedElementType
+  std::unordered_set<Value *> Visited;
+  Type *PointerElemTy = deduceElementTypeHelper(Pointer, Visited, DeducedElTys);
+  if (PointerElemTy == ExpectedElementType)
     return;
 
   setInsertPointSkippingPhis(B, I);
@@ -356,7 +395,7 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
       ValueAsMetadata::getConstant(ExpectedElementTypeConst);
   MDTuple *TyMD = MDNode::get(F->getContext(), CM);
   MetadataAsValue *VMD = MetadataAsValue::get(F->getContext(), TyMD);
-  unsigned AddressSpace = Pointer->getType()->getPointerAddressSpace();
+  unsigned AddressSpace = getPointerAddressSpace(Pointer->getType());
   bool FirstPtrCastOrAssignPtrType = true;
 
   // Do not emit new spv_ptrcast if equivalent one already exists or when
@@ -401,9 +440,11 @@ void SPIRVEmitIntrinsics::replacePointerOperandWithPtrCast(
   // spv_assign_ptr_type instead.
   if (FirstPtrCastOrAssignPtrType &&
       (isa<Instruction>(Pointer) || isa<Argument>(Pointer))) {
-    buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {Pointer->getType()},
-                    ExpectedElementTypeConst, Pointer,
-                    {B.getInt32(AddressSpace)}, B);
+    CallInst *CI = buildIntrWithMD(
+        Intrinsic::spv_assign_ptr_type, {Pointer->getType()},
+        ExpectedElementTypeConst, Pointer, {B.getInt32(AddressSpace)}, B);
+    DeducedElTys[CI] = ExpectedElementType;
+    DeducedElTys[Pointer] = ExpectedElementType;
     return;
   }
 
@@ -419,7 +460,7 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
   // Handle basic instructions:
   StoreInst *SI = dyn_cast<StoreInst>(I);
   if (SI && F->getCallingConv() == CallingConv::SPIR_KERNEL &&
-      SI->getValueOperand()->getType()->isPointerTy() &&
+      isPointerTy(SI->getValueOperand()->getType()) &&
       isa<Argument>(SI->getValueOperand())) {
     return replacePointerOperandWithPtrCast(
         I, SI->getValueOperand(), IntegerType::getInt8Ty(F->getContext()), 0,
@@ -440,9 +481,34 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
   if (!CI || CI->isIndirectCall() || CI->getCalledFunction()->isIntrinsic())
     return;
 
+  // collect information about formal parameter types
+  Function *CalledF = CI->getCalledFunction();
+  SmallVector<Type *, 4> CalledArgTys;
+  bool HaveTypes = false;
+  for (auto &CalledArg : CalledF->args()) {
+    if (!isPointerTy(CalledArg.getType())) {
+      CalledArgTys.push_back(nullptr);
+      continue;
+    }
+    auto It = DeducedElTys.find(&CalledArg);
+    Type *ParamTy = It != DeducedElTys.end() ? It->second : nullptr;
+    if (!ParamTy) {
+      for (User *U : CalledArg.users()) {
+        if (Instruction *Inst = dyn_cast<Instruction>(U)) {
+          std::unordered_set<Value *> Visited;
+          ParamTy = deduceElementTypeHelper(Inst, Visited, DeducedElTys);
+          if (ParamTy)
+            break;
+        }
+      }
+    }
+    HaveTypes |= ParamTy != nullptr;
+    CalledArgTys.push_back(ParamTy);
+  }
+
   std::string DemangledName =
       getOclOrSpirvBuiltinDemangledName(CI->getCalledFunction()->getName());
-  if (DemangledName.empty())
+  if (DemangledName.empty() && !HaveTypes)
     return;
 
   for (unsigned OpIdx = 0; OpIdx < CI->arg_size(); OpIdx++) {
@@ -455,8 +521,11 @@ void SPIRVEmitIntrinsics::insertPtrCastOrAssignTypeInstr(Instruction *I,
     if (!isa<Instruction>(ArgOperand) && !isa<Argument>(ArgOperand))
       continue;
 
-    Type *ExpectedType = SPIRV::parseBuiltinCallArgumentBaseType(
-        DemangledName, OpIdx, I->getContext());
+    Type *ExpectedType =
+        OpIdx < CalledArgTys.size() ? CalledArgTys[OpIdx] : nullptr;
+    if (!ExpectedType && !DemangledName.empty())
+      ExpectedType = SPIRV::parseBuiltinCallArgumentBaseType(
+          DemangledName, OpIdx, I->getContext());
     if (!ExpectedType)
       continue;
 
@@ -639,30 +708,25 @@ void SPIRVEmitIntrinsics::processGlobalValue(GlobalVariable &GV,
 void SPIRVEmitIntrinsics::insertAssignPtrTypeIntrs(Instruction *I,
                                                    IRBuilder<> &B) {
   reportFatalOnTokenType(I);
-  if (!I->getType()->isPointerTy() || !requireAssignType(I) ||
+  if (!isPointerTy(I->getType()) || !requireAssignType(I) ||
       isa<BitCastInst>(I))
     return;
 
   setInsertPointSkippingPhis(B, I->getNextNode());
 
-  Constant *EltTyConst;
-  unsigned AddressSpace = I->getType()->getPointerAddressSpace();
-  if (auto *AI = dyn_cast<AllocaInst>(I))
-    EltTyConst = UndefValue::get(AI->getAllocatedType());
-  else if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
-    EltTyConst = UndefValue::get(GEP->getResultElementType());
-  else
-    EltTyConst = UndefValue::get(IntegerType::getInt8Ty(I->getContext()));
-
-  buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()}, EltTyConst, I,
-                  {B.getInt32(AddressSpace)}, B);
+  Type *ElemTy = deduceElementType(I);
+  Constant *EltTyConst = UndefValue::get(ElemTy);
+  unsigned AddressSpace = getPointerAddressSpace(I->getType());
+  CallInst *CI = buildIntrWithMD(Intrinsic::spv_assign_ptr_type, {I->getType()},
+                                 EltTyConst, I, {B.getInt32(AddressSpace)}, B);
+  DeducedElTys[CI] = ElemTy;
 }
 
 void SPIRVEmitIntrinsics::insertAssignTypeIntrs(Instruction *I,
                                                 IRBuilder<> &B) {
   reportFatalOnTokenType(I);
   Type *Ty = I->getType();
-  if (!Ty->isVoidTy() && !Ty->isPointerTy() && requireAssignType(I)) {
+  if (!Ty->isVoidTy() && !isPointerTy(Ty) && requireAssignType(I)) {
     setInsertPointSkippingPhis(B, I->getNextNode());
     Type *TypeToAssign = Ty;
     if (auto *II = dyn_cast<IntrinsicInst>(I)) {
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
index 8556581996fede..bda9c57e534c3a 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp
@@ -750,7 +750,7 @@ SPIRVType *SPIRVGlobalRegistry::createSPIRVType(
 SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
     SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
-  if (TypesInProcessing.count(Ty) && !Ty->isPointerTy())
+  if (TypesInProcessing.count(Ty) && !isPointerTy(Ty))
     return nullptr;
   TypesInProcessing.insert(Ty);
   SPIRVType *SpirvType = createSPIRVType(Ty, MIRBuilder, AccessQual, EmitIR);
@@ -762,11 +762,15 @@ SPIRVType *SPIRVGlobalRegistry::restOfCreateSPIRVType(
   // will be added later. For special types it is already added to DT.
   if (SpirvType->getOpcode() != SPIRV::OpTypeForwardPointer && !Reg.isValid() &&
       !isSpecialOpaqueType(Ty)) {
-    if (!Ty->isPointerTy())
+    if (!isPointerTy(Ty))
       DT.add(Ty, &MIRBuilder.getMF(), getSPIRVTypeID(SpirvType));
+    else if (isTypedPointerTy(Ty))
+      DT.add(cast<TypedPointerType>(Ty)->getElementType(),
+             getPointerAddressSpace(Ty), &MIRBuilder.getMF(),
+             getSPIRVTypeID(SpirvType));
     else
       DT.add(Type::getInt8Ty(MIRBuilder.getMF().getFunction().getContext()),
-             Ty->getPointerAddressSpace(), &MIRBuilder.getMF(),
+             getPointerAddressSpace(Ty), &MIRBuilder.getMF(),
              getSPIRVTypeID(SpirvType));
   }
 
@@ -787,12 +791,15 @@ SPIRVType *SPIRVGlobalRegistry::getOrCreateSPIRVType(
     const Type *Ty, MachineIRBuilder &MIRBuilder,
     SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR) {
   Register Reg;
-  if (!Ty->isPointerTy())
+  if (!isPointerTy(Ty))
     Reg = DT.find(Ty, &MIRBuilder.getMF());
+  else if (isTypedPointerTy(Ty))
+    Reg = DT.find(cast<TypedPointerType>(Ty)->getElementType(),
+                  getPointerAddressSpace(Ty), &MIRBuilder.getMF());
   else
     Reg =
         DT.find(Type::getInt8Ty(MIRBuilder.getMF().getFunction().getContext()),
-                Ty->getPointerAddressSpace(), &MIRBuilder.getMF());
+                getPointerAddressSpace(Ty), &MIRBuilder.getMF());
 
   if (Reg.isValid() && !isSpecialOpaqueType(Ty))
     return getSPIRVTypeForVReg(Reg);
@@ -836,11 +843,16 @@ bool SPIRVGlobalRegistry::isScalarOrVectorOfType(Register VReg,
 
 unsigned
 SPIRVGlobalRegistry::getScalarOrVectorComponentCount(Register VReg) const {
-  if (SPIRVType *Type = getSPIRVTypeForVReg(VReg))
-    return Type->getOpcode() == SPIRV::OpTypeVector
-               ? static_cast<unsigned>(Type->getOperand(2).getImm())
-               : 1;
-  return 0;
+  return getScalarOrVectorComponentCount(getSPIRVTypeForVReg(VReg));
+}
+
+unsigned
+SPIRVGlobalRegistry::getScalarOrVectorComponentCount(SPIRVType *Type) const {
+  if (!Type)
+    return 0;
+  return Type->getOpcode() == SPIRV::OpTypeVector
+             ? static_cast<unsigned>(Type->getOperand(2).getImm())
+             : 1;
 }
 
 unsigned
diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
index 9c0061d13fd0cf..25d82ebf9bc79b 100644
--- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
+++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h
@@ -198,9 +198,10 @@ class SPIRVGlobalRegistry {
   // opcode (e.g. OpTypeBool, or OpTypeVector %x 4, where %x is OpTypeBool).
   bool isScalarOrVectorOfType(Register VReg, unsigned TypeOpcode) const;
 
-  // Return number of elements in a vector if the given VReg is associated with
+  // Return number of elements in a vector if the argument is associated with
   // a vector type. Return 1 for a scalar type, and 0 for a missing type.
   unsigned getScalarOrVectorComponentCount(Register VReg) const;
+  unsigned getScalarOrVectorComponentCount(SPIRVType *Type) const;
 
   // For vectors or scalars of booleans, integers and floats, return the scalar
   // type's bitwidth. Otherwise calls llvm_unreachable().
diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
index 74df8de6eb90aa..fd19b7412c4c9c 100644
--- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp
@@ -125,6 +125,8 @@ class SPIRVInstructionSelector : public InstructionSelector {
 
   bool selectConstVector(Register ResVReg, const SPIRVType *ResType,
                          MachineInstr &I) const;
+  bool selectSplatVector(Register ResVReg, const SPIRVType *ResType,
+                         MachineInstr &I) const;
 
   bool selectCmp(Register ResVReg, const SPIRVType *ResType,
                  unsigned comparisonOpcode, MachineInstr &I) const;
@@ -313,6 +315,8 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg,
 
   case TargetOpcode::G_BUILD_VECTOR:
     return selectConstVector(ResVReg, ResType, I);
+  case TargetOpcode::G_SPLAT_VECTOR:
+    return selectSplatVector(ResVReg, ResType, I);
 
   case TargetOpcode::G_SHUFFLE_VECTOR: {
     MachineBasicBlock &BB = *I.getParent();
@@ -1185,6 +1189,43 @@ bool SPIRVInstructionSelector::selectConstVector(Register ResVReg,
   return MIB.constrainAllUses(TII, TRI, RBI);
 }
 
+bool SPIRVInstructionSelector::selectSplatVector(Register ResVReg,
+                                                 const SPIRVType *ResType,
+                                                 MachineInstr &I) const {
+  if (ResType->getOpcode() != SPIRV::OpTypeVector)
+    report_fatal_error("Cannot select G_SPLAT_VECTOR with a non-vector result");
+  unsigned N = GR.getScalarOrVectorComponentCount(ResType);
+  unsigned OpIdx = I.getNumExplicitDefs();
+  if (!I.getOperand(OpIdx).isReg())
+    report_fatal_error("Unexpected argument in G_SPLAT_VECTOR");
+
+  // check if we may construct a constant vector
+  Register OpReg = I.getOperand(OpIdx).getReg();
+  bool IsConst = false;
+  if (SPIRVType *OpDef = MRI->getVRegDef(OpReg)) {
+    if (OpDef->getOpcode() == SPIRV::ASSIGN_TYPE &&
+        OpDef->getOperand(1).isReg()) {
+      if (SPIRVType *RefDef = MRI->getVRegDef(OpDef->getOperand(1).getReg()))
+        OpDef = RefDef;
+    }
+    IsConst = OpDef->getOpcode() == TargetOpcode::G_CONSTANT ||
+              OpDef->getOpcode() == TargetOpcode::G_FCONSTANT;
+  }
+
+  if (!IsConst && N < 2)
+    report_fatal_error(
+        "There must be at least two constituent operands in a vector");
+
+  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(),
+                     TII.get(IsConst ? SPIRV::OpConstantComposite
+                                     : SPIRV::OpCompositeConstruct))
+                 .addDef(ResVReg)
+                 .addUse(GR.getSPIRVTypeID(ResType));
+  for (unsigned i = 0; i < N; ++i)
+    MIB.addUse(OpReg);
+  return MIB.constrainAllUses(TII, TRI, RBI);
+}
+
 bool SPIRVInstructionSelector::selectCmp(Register ResVReg,
                                          const SPIRVType *ResType,
                                          unsigned CmpOpc,
diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
index f81548742a11e2..4b871bdd5d0758 100644
--- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp
@@ -149,7 +149,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) {
   getActionDefinitionsBuilder(G_GLOBAL_VALUE).alwaysLegal();
 
   // TODO: add proper rules for vectors legalization.
-  getActionDefinitionsBuilder({G_BUILD_VECTOR, G_SHUFFLE_VECTOR}).alwaysLegal();
+  getActionDefinitionsBuilder(
+      {G_BUILD_VECTOR, G_SHUFFLE_VECTOR, G_SPLAT_VECTOR})
+      .alwaysLegal();
 
   // Vector Reduction Operations
   getActionDefinitionsBuilder(
diff --git a/llvm/lib/Target/SPIRV/SPIRVUtils.h b/llvm/lib/Target/SPIRV/SPIRVUtils.h
index e5f35aaca9a8ba..d5ed501def9986 100644
--- a/llvm/lib/Target/SPIRV/SPIRVUtils.h
+++ b/llvm/lib/Target/SPIRV/SPIRVUtils.h
@@ -15,6 +15,7 @@
 
 #include "MCTargetDesc/SPIRVBaseInfo.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/TypedPointerType.h"
 #include <string>
 
 namespace llvm {
@@ -100,5 +101,30 @@ bool isEntryPoint(const Function &F);
 
 // Parse basic scalar type name, substring TypeName, and return LLVM type.
 Type *parseBasicTypeName(StringRef TypeName, LLVMContext &Ctx);
+
+// True if this is an instance of TypedPointerType.
+inline bool isTypedPointerTy(const Type *T) {
+  return T->getTypeID() == Type::TypedPointerTyID;
+}
+
+// True if this is an instance of PointerType.
+inline bool isUntypedPointerTy(const Type *T) {
+  return T->getTypeID() == Type::PointerTyID;
+}
+
+// True if this is an instance of PointerType or TypedPointerType.
+inline bool isPointerTy(const Type *T) {
+  return isUntypedPointerTy(T) || isTypedPointerTy(T);
+}
+
+// Get the address space of this pointer or pointer vector type for instances of
+// PointerType or TypedPointerType.
+inline unsigned getPointerAddressSpace(const Type *T) {
+  Type *SubT = T->getScalarType();
+  return SubT->getTypeID() == Type::PointerTyID
+             ? cast<PointerType>(SubT)->getAddressSpace()
+             : cast<TypedPointerType>(SubT)->getAddressSpace();
+}
+
 } // namespace llvm
 #endif // LLVM_LIB_TARGET_SPIRV_SPIRVUTILS_H
diff --git a/llvm/test/CodeGen/SPIRV/ComparePointers.ll b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
index fd2084dbc260a3..9be05944789b6f 100644
--- a/llvm/test/CodeGen/SPIRV/ComparePointers.ll
+++ b/llvm/test/CodeGen/SPIRV/ComparePointers.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown --mattr=+spirv1.3  %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; kernel void test(int global *in, int global *in2) {
 ;;   if (!in)
diff --git a/llvm/test/CodeGen/SPIRV/capability-kernel.ll b/llvm/test/CodeGen/SPIRV/capability-kernel.ll
index 03ea58c985adb8..fea19511d4fdcc 100644
--- a/llvm/test/CodeGen/SPIRV/capability-kernel.ll
+++ b/llvm/test/CodeGen/SPIRV/capability-kernel.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-DAG: OpCapability Addresses
 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll
index 062863a0e3adc9..7e9c6214c2818a 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-addressspace.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK:  %[[#INT8:]] = OpTypeInt 8 0
 ; CHECK:  %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll
index aaf97f8cc836c6..fc999ba1a3cdac 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/getelementptr-base-type.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK:  %[[#FLOAT32:]] = OpTypeFloat 32
 ; CHECK:  %[[#PTR:]] = OpTypePointer CrossWorkgroup %[[#FLOAT32]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll
index 6d1202328197d9..a3a730ac67e782 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-addressspace.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-DAG:  %[[#INT:]] = OpTypeInt 32 0
 ; CHECK-DAG:  %[[#PTR1:]] = OpTypePointer Function %[[#INT]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll
index 9e136ce887468d..b74a3449980d97 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type-deduction-no-bitcast-to-generic.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-DAG: %[[#IMAGE:]] = OpTypeImage %2 2D 0 0 0 0 Unknown ReadOnly
 
diff --git a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll
index 1fcc6d9da9c787..b8f205a68e5616 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/kernel-argument-pointer-type.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-DAG:  %[[#FLOAT32:]] = OpTypeFloat 32
 ; CHECK-DAG:  %[[#PTR1:]] = OpTypePointer Function %[[#FLOAT32]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll
index 1b4e7a3e733fc6..1667abc51be9fc 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/load-addressspace.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK:  %[[#INT8:]] = OpTypeInt 8 0
 ; CHECK:  %[[#PTR1:]] = OpTypePointer CrossWorkgroup %[[#INT8]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll
index 00b03c08e7bbcd..3a0d65e1e95f19 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/store-operand-ptr-to-struct.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; TODO: OpFunctionParameter should be a pointer of struct base type.
 ; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
index 86f5f5bf24f5be..d426fc4dfd4eec 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/struct-opaque-pointers.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
-; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK: %[[TyInt8:.*]] = OpTypeInt 8 0
 ; CHECK: %[[TyInt8Ptr:.*]] = OpTypePointer {{[a-zA-Z]+}} %[[TyInt8]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll
index 52180d53740883..23c3faaf88151f 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/two-bitcast-or-param-users.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-DAG: %[[#INT:]] = OpTypeInt 32
 ; CHECK-DAG: %[[#GLOBAL_PTR_INT:]] = OpTypePointer CrossWorkgroup %[[#INT]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll
index 473c2a8b731115..83234e3986c84f 100644
--- a/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll
+++ b/llvm/test/CodeGen/SPIRV/pointers/two-subsequent-bitcasts.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-DAG: %[[#float:]] = OpTypeFloat 32
 ; CHECK-DAG: %[[#pointer:]] = OpTypePointer CrossWorkgroup %[[#float]]
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll
new file mode 100644
index 00000000000000..76769ab8743082
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call-rev.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar"
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]]
+; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]]
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]]
+
+define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) {
+entry:
+  call spir_func void @foo(ptr addrspace(1) %unknown_type_ptr)
+  ret void
+}
+
+define void @foo(ptr addrspace(1) %known_type_ptr) {
+entry:
+  %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll
new file mode 100644
index 00000000000000..8cbf360a2e38d4
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/type-deduce-by-call.ll
@@ -0,0 +1,28 @@
+; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-SPIRV-DAG: OpName %[[FooArg:.*]] "known_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Foo:.*]] "foo"
+; CHECK-SPIRV-DAG: OpName %[[ArgToDeduce:.*]] "unknown_type_ptr"
+; CHECK-SPIRV-DAG: OpName %[[Bar:.*]] "bar"
+; CHECK-SPIRV-DAG: %[[Long:.*]] = OpTypeInt 32 0
+; CHECK-SPIRV-DAG: %[[Void:.*]] = OpTypeVoid
+; CHECK-SPIRV-DAG: %[[LongPtr:.*]] = OpTypePointer CrossWorkgroup %[[Long]]
+; CHECK-SPIRV-DAG: %[[Fun:.*]] = OpTypeFunction %[[Void]] %[[LongPtr]]
+; CHECK-SPIRV: %[[Foo]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[FooArg]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: %[[Bar]] = OpFunction %[[Void]] None %[[Fun]]
+; CHECK-SPIRV: %[[ArgToDeduce]] = OpFunctionParameter %[[LongPtr]]
+; CHECK-SPIRV: OpFunctionCall %[[Void]] %[[Foo]] %[[ArgToDeduce]]
+
+define void @foo(ptr addrspace(1) %known_type_ptr) {
+entry:
+  %elem = getelementptr inbounds i32, ptr addrspace(1) %known_type_ptr, i64 0
+  ret void
+}
+
+define spir_kernel void @bar(ptr addrspace(1) %unknown_type_ptr) {
+entry:
+  call spir_func void @foo(ptr addrspace(1) %unknown_type_ptr)
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll
new file mode 100644
index 00000000000000..f144418cf54259
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/pointers/typeof-ptr-int.ll
@@ -0,0 +1,29 @@
+; This test is to check that two functions have different SPIR-V type
+; definitions, even though their LLVM function types are identical.
+
+; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
+
+; CHECK-DAG: OpName %[[Fun32:.*]] "tp_arg_i32"
+; CHECK-DAG: OpName %[[Fun64:.*]] "tp_arg_i64"
+; CHECK-DAG: %[[TyI32:.*]] = OpTypeInt 32 0
+; CHECK-DAG: %[[TyVoid:.*]] = OpTypeVoid
+; CHECK-DAG: %[[TyPtr32:.*]] = OpTypePointer Function %[[TyI32]]
+; CHECK-DAG: %[[TyFun32:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtr32]]
+; CHECK-DAG: %[[TyI64:.*]] = OpTypeInt 64 0
+; CHECK-DAG: %[[TyPtr64:.*]] = OpTypePointer Function %[[TyI64]]
+; CHECK-DAG: %[[TyFun64:.*]] = OpTypeFunction %[[TyVoid]] %[[TyPtr64]]
+; CHECK-DAG: %[[Fun32]] = OpFunction %[[TyVoid]] None %[[TyFun32]]
+; CHECK-DAG: %[[Fun64]] = OpFunction %[[TyVoid]] None %[[TyFun64]]
+
+define spir_kernel void @tp_arg_i32(ptr %ptr) {
+entry:
+  store i32 1, ptr %ptr
+  ret void
+}
+
+define spir_kernel void @tp_arg_i64(ptr %ptr) {
+entry:
+  store i64 1, ptr %ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/SPIRV/relationals.ll b/llvm/test/CodeGen/SPIRV/relationals.ll
index 1644dc7c03d911..f4fcf4d9f77b8c 100644
--- a/llvm/test/CodeGen/SPIRV/relationals.ll
+++ b/llvm/test/CodeGen/SPIRV/relationals.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 declare dso_local spir_func <4 x i8> @_Z13__spirv_IsNanIDv4_aDv4_fET_T0_(<4 x float>)
 declare dso_local spir_func <4 x i8> @_Z13__spirv_IsInfIDv4_aDv4_fET_T0_(<4 x float>)
diff --git a/llvm/test/CodeGen/SPIRV/simple.ll b/llvm/test/CodeGen/SPIRV/simple.ll
index de9efa83838582..63c15968c72535 100644
--- a/llvm/test/CodeGen/SPIRV/simple.ll
+++ b/llvm/test/CodeGen/SPIRV/simple.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; Support of doubles is required.
 ; CHECK: OpCapability Float64
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
index fdb26bab60fe14..55cfcea999d84b 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/AtomicCompareExchangeExplicit_cl20.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; __kernel void testAtomicCompareExchangeExplicit_cl20(
 ;;     volatile global atomic_int* object,
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll b/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll
index 55161e670ca133..11b0578a0c9c07 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/BitReversePref.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK:     OpDecorate %[[#FUNC_NAME:]] LinkageAttributes "_Z10BitReversei"
 ; CHECK-NOT: OpBitReverse
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll
index 95f3673d1c968d..b63c1c60d00736 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-DAG: %[[#]] = OpBuildNDRange %[[#]] %[[#GWS:]] %[[#LWS:]] %[[#GWO:]]
 ; CHECK-SPIRV-DAG: %[[#GWS]] = OpConstant %[[#]] 123
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll
index a2ae808259a32a..65c992c9b28ed3 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/BuildNDRange_2.ll
@@ -19,6 +19,7 @@
 ;; bash$ $PATH_TO_GEN/bin/clang -cc1 -x cl -cl-std=CL2.0 -triple spir64-unknown-unknown -emit-llvm  -include opencl-20.h  BuildNDRange_2.cl -o BuildNDRange_2.ll
 
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; TODO(#60133): Requires updates following opaque pointer migration.
 ; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll
index 34036951e31e0b..93aecc5331aa4f 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/ConvertPtr.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; kernel void testConvertPtrToU(global int *a, global unsigned long *res) {
 ;;   res[0] = (unsigned long)&a[0];
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll b/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll
index 2e9b4a494c04d8..d4fc5c3280b714 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/DecorationAlignment.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpDecorate %[[#ALIGNMENT:]] Alignment 16
 ; CHECK-SPIRV: %[[#ALIGNMENT]] = OpFunctionParameter %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll b/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll
index 64f25b7f420355..966d83516bb3ae 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/DecorationMaxByteOffset.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     OpName %[[#PTR_ID:]] "ptr"
 ; CHECK-SPIRV:     OpName %[[#PTR2_ID:]] "ptr2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll b/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll
index 2f423c2518e83a..67c3380941887d 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/DivRem.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-DAG: %[[#int:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG: %[[#int2:]] = OpTypeVector %[[#int]] 2
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll b/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll
index 6d6dd2481b17db..6e8726cf03d442 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/ExecutionMode_SPIR_to_SPIRV.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-DAG: OpEntryPoint Kernel %[[#WORKER:]] "worker"
 ; CHECK-SPIRV-DAG: OpExecutionMode %[[#WORKER]] LocalSizeHint 128 10 1
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll b/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll
index 2796dcbdca948a..33bece5b9c00f7 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/GlobalFunAnnotate.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpDecorate %[[#]] UserSemantic "annotation_on_function"
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll
index 331960cdb341e7..417b89eb36f0f5 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_cmpxchg.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks that the backend is capable to correctly translate
 ;; atomic_cmpxchg OpenCL C 1.2 built-in function [1] into corresponding SPIR-V
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll
index 95eb6ade11a25c..3180b57731d01c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_legacy.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks that the backend is capable to correctly translate
 ;; legacy atomic OpenCL C 1.2 built-in functions [1] into corresponding SPIR-V
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll
index 0f3a62a3e40107..c94c1304418546 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/atomic_work_item_fence.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks that the backend is capable to correctly translate
 ;; atomic_work_item_fence OpenCL C 2.0 built-in function [1] into corresponding
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll
index a126d94e06334f..cf4a24754e7bfa 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/barrier.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks that the backend is capable to correctly translate
 ;; barrier OpenCL C 1.2 built-in function [1] into corresponding SPIR-V
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll
index 42b127cf3b69b6..5d9840d3bd5b9c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/sub_group_mask.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpCapability GroupNonUniformBallot
 ; CHECK-SPIRV: OpDecorate %[[#]] BuiltIn SubgroupGtMask
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll
index 0874e6f71e0407..0702fd0c9cb9b1 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/OpenCL/work_group_barrier.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks that the backend is capable to correctly translate
 ;; sub_group_barrier built-in function [1] from cl_khr_subgroups extension into
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll
index 3c563d373f1bd4..20204acb1ef584 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_flag.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; Types:
 ; CHECK-DAG:  %[[#INT:]] = OpTypeInt 32
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll
index d013abcade8bb4..3e5a3ac356936c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/atomic_load_store.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; Check 'LLVM ==> SPIR-V' conversion of atomic_load and atomic_store.
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll b/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll
index 8dbf4d2c58b4bd..2c0fc393b135a2 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/bitcast.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; Check the bitcast is translated back to bitcast
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll
index 5ecd7f73a52e37..2249cbe4e98a54 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/block_w_struct_return.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; TODO(#60133): Requires updates following opaque pointer migration.
 ; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll
index 9b1ce76631809d..0a02a8bf56ace5 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_calls.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalInvocationId
 ; CHECK-SPIRV-DAG: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
index 82866712c07788..f18f27a6de51d4 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpDecorate %[[#Id:]] BuiltIn GlobalLinearId
 ; CHECK-SPIRV: %[[#Id:]] = OpVariable %[[#]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
index 22aa40c0c7a796..d39ca3c39383c0 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_arithmetics.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; The IR was generated from the following source:
 ;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
index 5b3474f97bfedf..03456aef6b6b2e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/builtin_vars_opt.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-linux %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; The IR was generated from the following source:
 ;; #include <CL/sycl.hpp>
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll
index 6de610b2240da7..824ca1b2d69249 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/check_ro_qualifier.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: %[[#IMAGE_TYPE:]] = OpTypeImage
 ; CHECK-SPIRV: %[[#IMAGE_ARG:]] = OpFunctionParameter %[[#IMAGE_TYPE]]
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
index 52b7dac8866f69..d7e87c05340d14 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/cl-types.ll
@@ -19,6 +19,7 @@
 ;; }
 
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-DAG: OpCapability Sampled1D
 ; CHECK-SPIRV-DAG: OpCapability SampledBuffer
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll
index 9054454879cc26..0cd75bb215ada2 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/clk_event_t.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpTypeDeviceEvent
 ; CHECK-SPIRV: OpFunction
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll
index cf124ec0a2782e..d23b0687face59 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/enqueue_kernel.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; TODO(#60133): Requires updates following opaque pointer migration.
 ; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll b/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll
index c186a8135fee7c..49b84c1e9530a5 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/explicit-conversions.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpSatConvertSToU
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll b/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll
index fd29bc8a1ebf89..0ed1dc76628ca0 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/extract_insert_value.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; TODO(#60133): Requires updates following opaque pointer migration.
 ; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll b/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll
index 78d9a232665585..af76c0e96f9f49 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fadd.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV:     OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll b/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll
index cfdcc728fbe43a..550ec1a6f25507 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fclamp.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     %[[#]] = OpExtInst %[[#]] %[[#]] fclamp
 ; CHECK-SPIRV-NOT: %[[#]] = OpExtInst %[[#]] %[[#]] clamp
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll b/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll
index 572ccc3ed625d2..46eaba9d5ceb1e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fcmp.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll b/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll
index d0ed5640e7066d..79b786814c716e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fdiv.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV:     OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll b/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll
index f506787bcb9ce6..683b5c24f5b712 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fmod.ll
@@ -2,6 +2,7 @@
 ;; { out = fmod( in1, in2 ); }
 
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: %[[#]] = OpExtInst %[[#]] %[[#]] fmod %[[#]] %[[#]]
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll b/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll
index 886077a67b4e60..fdab29c9041cb8 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fmul.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV:     OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll b/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll
index e17601a2c25a7c..60bbfe6b7f3931 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fneg.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV: OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV: OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll b/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll
index c035c35a339ee9..974043c11991f7 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fp_contract_reassoc_fast_mode.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-NOT: OpCapability FPFastMathModeINTEL
 ; CHECK-SPIRV:     OpName %[[#mu:]] "mul"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/frem.ll b/llvm/test/CodeGen/SPIRV/transcoding/frem.ll
index ecb8f6f950cabf..d36ba7f70e453d 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/frem.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/frem.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV:     OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll b/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll
index 99d0d0eb84f95f..3677c00405626f 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/fsub.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV:     OpName %[[#r1:]] "r1"
 ; CHECK-SPIRV:     OpName %[[#r2:]] "r2"
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll
index dc307c70612eba..fd241963d1e98d 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/get_image_num_mip_levels.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; Types:
 ; CHECK-DAG:  %[[#INT:]] = OpTypeInt 32
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll
index 2f44e1943b6a6d..ff1bec4497ba2a 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/global_block.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefixes=CHECK-SPIRV,CHECK-SPIRV1_4
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; TODO(#60133): Requires updates following opaque pointer migration.
 ; XFAIL: *
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll b/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll
index 6aa9faa6c893e9..2412f406a9c62e 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/group_ops.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-DAG: %[[#int:]] = OpTypeInt 32 0
 ; CHECK-SPIRV-DAG: %[[#float:]] = OpTypeFloat 32
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll b/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll
index 3c818afcdb1670..c5f3f9e1e2e74c 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/isequal.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv64-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ; CHECK-SPIRV-NOT: OpSConvert
 
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll
index f771854672ce16..de7673ad7f17eb 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_double.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks following SYCL relational builtins with double and double2
 ;; types:
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll
index 1f55cebb0911b2..69a4a30fd65ef3 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_float.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks following SYCL relational builtins with float and float2
 ;; types:
diff --git a/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll b/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll
index 864fb4f29efdc2..d6a7fda41afd08 100644
--- a/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll
+++ b/llvm/test/CodeGen/SPIRV/transcoding/relationals_half.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -O0 -mtriple=spirv32-unknown-unknown %s -o - | FileCheck %s --check-prefix=CHECK-SPIRV
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32-unknown-unknown %s -o - -filetype=obj | spirv-val %}
 
 ;; This test checks following SYCL relational builtins with half and half2 types:
 ;;   isfinite, isinf, isnan, isnormal, signbit, isequal, isnotequal, isgreater

>From c03dbed1a9d14263b84a003ec7ac035ba9704a4e Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Wed, 13 Mar 2024 00:48:46 -0700
Subject: [PATCH 04/63] Revert "[llvm-exegesis] Use LLVM Support to get thread
 ID"

This reverts commit 1c3b15e9f5bc671e40bcf5d3475f5425466754ce.

This (and/or) a related patch was causing build failures on one of the
buildbots. More information is available at
https://lab.llvm.org/buildbot/#/builders/178/builds/7015.
---
 llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp |  2 +-
 .../tools/llvm-exegesis/lib/SubprocessMemory.cpp | 16 +++++++++++-----
 llvm/tools/llvm-exegesis/lib/SubprocessMemory.h  |  5 ++++-
 3 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 17ce0355ef4fd0..4e97d188d17259 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -301,7 +301,7 @@ class SubProcessFunctionExecutorImpl
     if (AddMemDefError)
       return AddMemDefError;
 
-    long ParentTID = get_threadid();
+    long ParentTID = SubprocessMemory::getCurrentTID();
     pid_t ParentOrChildPID = fork();
 
     if (ParentOrChildPID == -1) {
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 28b341c461804f..1fd81bd407becb 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -10,7 +10,6 @@
 #include "Error.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/Threading.h"
 #include <cerrno>
 
 #ifdef __linux__
@@ -25,6 +24,13 @@ namespace exegesis {
 
 #if defined(__linux__) && !defined(__ANDROID__)
 
+long SubprocessMemory::getCurrentTID() {
+  // We're using the raw syscall here rather than the gettid() function provided
+  // by most libcs for compatibility as gettid() was only added to glibc in
+  // version 2.30.
+  return syscall(SYS_gettid);
+}
+
 Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
   // Add the PID to the shared memory name so that if we're running multiple
   // processes at the same time, they won't interfere with each other.
@@ -32,7 +38,7 @@ Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
   // llvm-lit. Additionally add the TID so that downstream consumers
   // using multiple threads don't run into conflicts.
   std::string AuxiliaryMemoryName =
-      formatv("/{0}auxmem{1}", get_threadid(), ProcessID);
+      formatv("/{0}auxmem{1}", getCurrentTID(), ProcessID);
   int AuxiliaryMemoryFD = shm_open(AuxiliaryMemoryName.c_str(),
                                    O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFD == -1)
@@ -53,7 +59,7 @@ Error SubprocessMemory::addMemoryDefinition(
   SharedMemoryNames.reserve(MemoryDefinitions.size());
   for (auto &[Name, MemVal] : MemoryDefinitions) {
     std::string SharedMemoryName =
-        formatv("/{0}t{1}memdef{2}", ProcessPID, get_threadid(), MemVal.Index);
+        formatv("/{0}t{1}memdef{2}", ProcessPID, getCurrentTID(), MemVal.Index);
     SharedMemoryNames.push_back(SharedMemoryName);
     int SharedMemoryFD =
         shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
@@ -87,7 +93,7 @@ Error SubprocessMemory::addMemoryDefinition(
 
 Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-    pid_t ParentPID, uint64_t ParentTID, int CounterFileDescriptor) {
+    pid_t ParentPID, long ParentTID, int CounterFileDescriptor) {
   std::string AuxiliaryMemoryName =
       formatv("/{0}auxmem{1}", ParentTID, ParentPID);
   int AuxiliaryMemoryFileDescriptor =
@@ -139,7 +145,7 @@ Error SubprocessMemory::addMemoryDefinition(
 
 Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-    pid_t ParentPID, uint64_t ParentTID, int CounterFileDescriptor) {
+    pid_t ParentPID, long ParentTID, int CounterFileDescriptor) {
   return make_error<Failure>(
       "setupAuxiliaryMemoryInSubprocess is only supported on Linux");
 }
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
index 807046e38ce6d2..572d1085d9cffa 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
@@ -35,6 +35,9 @@ class SubprocessMemory {
   static constexpr const size_t AuxiliaryMemoryOffset = 1;
   static constexpr const size_t AuxiliaryMemorySize = 4096;
 
+  // Gets the thread ID for the calling thread.
+  static long getCurrentTID();
+
   Error initializeSubprocessMemory(pid_t ProcessID);
 
   // The following function sets up memory definitions. It creates shared
@@ -54,7 +57,7 @@ class SubprocessMemory {
   // section.
   static Expected<int> setupAuxiliaryMemoryInSubprocess(
       std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-      pid_t ParentPID, uint64_t ParentTID, int CounterFileDescriptor);
+      pid_t ParentPID, long ParentTID, int CounterFileDescriptor);
 
   ~SubprocessMemory();
 

>From 15dfe032328344d9aeae46dde36538e6b585d067 Mon Sep 17 00:00:00 2001
From: Aiden Grossman <agrossman154 at yahoo.com>
Date: Wed, 13 Mar 2024 00:49:23 -0700
Subject: [PATCH 05/63] Revert "Reland "[llvm-exegesis] Add thread IDs to
 subprocess memory names (#84451)""

This reverts commit 8003f553a01a9a2a7eb09fe07e88f1ba9ee7d3a7.

This (and/or a related commit) was causing build failures on one of the
buildbots that needs more investigation. More information is available
at https://lab.llvm.org/buildbot/#/builders/178/builds/7015.
---
 .../llvm-exegesis/lib/BenchmarkRunner.cpp     |  9 +++---
 .../llvm-exegesis/lib/SubprocessMemory.cpp    | 30 ++++++-------------
 .../llvm-exegesis/lib/SubprocessMemory.h      |  5 +---
 .../X86/SubprocessMemoryTest.cpp              |  5 +---
 4 files changed, 15 insertions(+), 34 deletions(-)

diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
index 4e97d188d17259..5c9848f3c68885 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp
@@ -301,7 +301,6 @@ class SubProcessFunctionExecutorImpl
     if (AddMemDefError)
       return AddMemDefError;
 
-    long ParentTID = SubprocessMemory::getCurrentTID();
     pid_t ParentOrChildPID = fork();
 
     if (ParentOrChildPID == -1) {
@@ -315,7 +314,7 @@ class SubProcessFunctionExecutorImpl
       // Unregister handlers, signal handling is now handled through ptrace in
       // the host process.
       sys::unregisterHandlers();
-      prepareAndRunBenchmark(PipeFiles[0], Key, ParentTID);
+      prepareAndRunBenchmark(PipeFiles[0], Key);
       // The child process terminates in the above function, so we should never
       // get to this point.
       llvm_unreachable("Child process didn't exit when expected.");
@@ -416,8 +415,8 @@ class SubProcessFunctionExecutorImpl
     setrlimit(RLIMIT_CORE, &rlim);
   }
 
-  [[noreturn]] void prepareAndRunBenchmark(int Pipe, const BenchmarkKey &Key,
-                                           long ParentTID) const {
+  [[noreturn]] void prepareAndRunBenchmark(int Pipe,
+                                           const BenchmarkKey &Key) const {
     // Disable core dumps in the child process as otherwise everytime we
     // encounter an execution failure like a segmentation fault, we will create
     // a core dump. We report the information directly rather than require the
@@ -474,7 +473,7 @@ class SubProcessFunctionExecutorImpl
 
     Expected<int> AuxMemFDOrError =
         SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
-            Key.MemoryValues, ParentPID, ParentTID, CounterFileDescriptor);
+            Key.MemoryValues, ParentPID, CounterFileDescriptor);
     if (!AuxMemFDOrError)
       exit(ChildProcessExitCodeE::AuxiliaryMemorySetupFailed);
 
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
index 1fd81bd407becb..a49fa077257d00 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.cpp
@@ -9,13 +9,11 @@
 #include "SubprocessMemory.h"
 #include "Error.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Support/FormatVariadic.h"
 #include <cerrno>
 
 #ifdef __linux__
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <sys/syscall.h>
 #include <unistd.h>
 #endif
 
@@ -24,21 +22,12 @@ namespace exegesis {
 
 #if defined(__linux__) && !defined(__ANDROID__)
 
-long SubprocessMemory::getCurrentTID() {
-  // We're using the raw syscall here rather than the gettid() function provided
-  // by most libcs for compatibility as gettid() was only added to glibc in
-  // version 2.30.
-  return syscall(SYS_gettid);
-}
-
 Error SubprocessMemory::initializeSubprocessMemory(pid_t ProcessID) {
   // Add the PID to the shared memory name so that if we're running multiple
   // processes at the same time, they won't interfere with each other.
   // This comes up particularly often when running the exegesis tests with
-  // llvm-lit. Additionally add the TID so that downstream consumers
-  // using multiple threads don't run into conflicts.
-  std::string AuxiliaryMemoryName =
-      formatv("/{0}auxmem{1}", getCurrentTID(), ProcessID);
+  // llvm-lit
+  std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ProcessID);
   int AuxiliaryMemoryFD = shm_open(AuxiliaryMemoryName.c_str(),
                                    O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFD == -1)
@@ -58,8 +47,8 @@ Error SubprocessMemory::addMemoryDefinition(
     pid_t ProcessPID) {
   SharedMemoryNames.reserve(MemoryDefinitions.size());
   for (auto &[Name, MemVal] : MemoryDefinitions) {
-    std::string SharedMemoryName =
-        formatv("/{0}t{1}memdef{2}", ProcessPID, getCurrentTID(), MemVal.Index);
+    std::string SharedMemoryName = "/" + std::to_string(ProcessPID) + "memdef" +
+                                   std::to_string(MemVal.Index);
     SharedMemoryNames.push_back(SharedMemoryName);
     int SharedMemoryFD =
         shm_open(SharedMemoryName.c_str(), O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
@@ -93,9 +82,8 @@ Error SubprocessMemory::addMemoryDefinition(
 
 Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-    pid_t ParentPID, long ParentTID, int CounterFileDescriptor) {
-  std::string AuxiliaryMemoryName =
-      formatv("/{0}auxmem{1}", ParentTID, ParentPID);
+    pid_t ParentPID, int CounterFileDescriptor) {
+  std::string AuxiliaryMemoryName = "/auxmem" + std::to_string(ParentPID);
   int AuxiliaryMemoryFileDescriptor =
       shm_open(AuxiliaryMemoryName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
   if (AuxiliaryMemoryFileDescriptor == -1)
@@ -109,8 +97,8 @@ Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     return make_error<Failure>("Mapping auxiliary memory failed");
   AuxiliaryMemoryMapping[0] = CounterFileDescriptor;
   for (auto &[Name, MemVal] : MemoryDefinitions) {
-    std::string MemoryValueName =
-        formatv("/{0}t{1}memdef{2}", ParentPID, ParentTID, MemVal.Index);
+    std::string MemoryValueName = "/" + std::to_string(ParentPID) + "memdef" +
+                                  std::to_string(MemVal.Index);
     AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] =
         shm_open(MemoryValueName.c_str(), O_RDWR, S_IRUSR | S_IWUSR);
     if (AuxiliaryMemoryMapping[AuxiliaryMemoryOffset + MemVal.Index] == -1)
@@ -145,7 +133,7 @@ Error SubprocessMemory::addMemoryDefinition(
 
 Expected<int> SubprocessMemory::setupAuxiliaryMemoryInSubprocess(
     std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-    pid_t ParentPID, long ParentTID, int CounterFileDescriptor) {
+    pid_t ParentPID, int CounterFileDescriptor) {
   return make_error<Failure>(
       "setupAuxiliaryMemoryInSubprocess is only supported on Linux");
 }
diff --git a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
index 572d1085d9cffa..e20b50cdc8118f 100644
--- a/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
+++ b/llvm/tools/llvm-exegesis/lib/SubprocessMemory.h
@@ -35,9 +35,6 @@ class SubprocessMemory {
   static constexpr const size_t AuxiliaryMemoryOffset = 1;
   static constexpr const size_t AuxiliaryMemorySize = 4096;
 
-  // Gets the thread ID for the calling thread.
-  static long getCurrentTID();
-
   Error initializeSubprocessMemory(pid_t ProcessID);
 
   // The following function sets up memory definitions. It creates shared
@@ -57,7 +54,7 @@ class SubprocessMemory {
   // section.
   static Expected<int> setupAuxiliaryMemoryInSubprocess(
       std::unordered_map<std::string, MemoryValue> MemoryDefinitions,
-      pid_t ParentPID, long ParentTID, int CounterFileDescriptor);
+      pid_t ParentPID, int CounterFileDescriptor);
 
   ~SubprocessMemory();
 
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
index 7c23e7b7e9c5a5..c07ec188a602c4 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/SubprocessMemoryTest.cpp
@@ -17,7 +17,6 @@
 #include <endian.h>
 #include <fcntl.h>
 #include <sys/mman.h>
-#include <sys/syscall.h>
 #include <unistd.h>
 #endif // __linux__
 
@@ -50,9 +49,7 @@ class SubprocessMemoryTest : public X86TestBase {
 
   std::string getSharedMemoryName(const unsigned TestNumber,
                                   const unsigned DefinitionNumber) {
-    long CurrentTID = syscall(SYS_gettid);
-    return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "t" +
-           std::to_string(CurrentTID) + "memdef" +
+    return "/" + std::to_string(getSharedMemoryNumber(TestNumber)) + "memdef" +
            std::to_string(DefinitionNumber);
   }
 

>From 1b99c60357cd3462a568e4641fd6f0a408750b0f Mon Sep 17 00:00:00 2001
From: Danial Klimkin <dklimkin at google.com>
Date: Wed, 13 Mar 2024 09:00:09 +0100
Subject: [PATCH 06/63] Fix missing include past
 a38b7a432d3cbb093af9310eba5b4982dc0a0243 (#85041)

---
 clang/include/clang/InstallAPI/FrontendRecords.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/include/clang/InstallAPI/FrontendRecords.h b/clang/include/clang/InstallAPI/FrontendRecords.h
index 333015b6a11365..1f5bc37798befd 100644
--- a/clang/include/clang/InstallAPI/FrontendRecords.h
+++ b/clang/include/clang/InstallAPI/FrontendRecords.h
@@ -11,6 +11,7 @@
 
 #include "clang/AST/Availability.h"
 #include "clang/AST/DeclObjC.h"
+#include "clang/InstallAPI/HeaderFile.h"
 #include "clang/InstallAPI/MachO.h"
 
 namespace clang {

>From 61c109d7d0a955420bf9f0146b800c9b96368362 Mon Sep 17 00:00:00 2001
From: Marco Elver <elver at google.com>
Date: Wed, 13 Mar 2024 09:01:00 +0100
Subject: [PATCH 07/63] [compiler-rt] Fix interceptors with AArch64 BTI
 (#84061)

On AArch64 with BTI, we have to start functions with the appropriate
BTI hint to indicate that the function is a valid call target.

To support interceptors with AArch64 BTI, add "BTI c".
---
 compiler-rt/lib/interception/interception.h      |  4 ++--
 compiler-rt/lib/sanitizer_common/sanitizer_asm.h | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/compiler-rt/lib/interception/interception.h b/compiler-rt/lib/interception/interception.h
index 00bcd979638b53..38c152952e3232 100644
--- a/compiler-rt/lib/interception/interception.h
+++ b/compiler-rt/lib/interception/interception.h
@@ -204,11 +204,11 @@ const interpose_substitution substitution_##func_name[]             \
        ".type  " SANITIZER_STRINGIFY(TRAMPOLINE(func)) ", "                    \
          ASM_TYPE_FUNCTION_STR "\n"                                            \
        SANITIZER_STRINGIFY(TRAMPOLINE(func)) ":\n"                             \
-       SANITIZER_STRINGIFY(CFI_STARTPROC) "\n"                                 \
+       C_ASM_STARTPROC "\n"                                                    \
        C_ASM_TAIL_CALL(SANITIZER_STRINGIFY(TRAMPOLINE(func)),                  \
                        "__interceptor_"                                        \
                          SANITIZER_STRINGIFY(ASM_PREEMPTIBLE_SYM(func))) "\n"  \
-       SANITIZER_STRINGIFY(CFI_ENDPROC) "\n"                                   \
+       C_ASM_ENDPROC "\n"                                                      \
        ".size  " SANITIZER_STRINGIFY(TRAMPOLINE(func)) ", "                    \
             ".-" SANITIZER_STRINGIFY(TRAMPOLINE(func)) "\n"                    \
      );
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_asm.h b/compiler-rt/lib/sanitizer_common/sanitizer_asm.h
index 3af66a4e449988..30e9d15184e5d5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_asm.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_asm.h
@@ -42,6 +42,16 @@
 # define CFI_RESTORE(reg)
 #endif
 
+#if defined(__aarch64__) && defined(__ARM_FEATURE_BTI_DEFAULT)
+# define ASM_STARTPROC CFI_STARTPROC; hint #34
+# define C_ASM_STARTPROC SANITIZER_STRINGIFY(CFI_STARTPROC) "\nhint #34"
+#else
+# define ASM_STARTPROC CFI_STARTPROC
+# define C_ASM_STARTPROC SANITIZER_STRINGIFY(CFI_STARTPROC)
+#endif
+#define ASM_ENDPROC CFI_ENDPROC
+#define C_ASM_ENDPROC SANITIZER_STRINGIFY(CFI_ENDPROC)
+
 #if defined(__x86_64__) || defined(__i386__) || defined(__sparc__)
 # define ASM_TAIL_CALL jmp
 #elif defined(__arm__) || defined(__aarch64__) || defined(__mips__) || \
@@ -114,9 +124,9 @@
          .globl __interceptor_trampoline_##name;                               \
          ASM_TYPE_FUNCTION(__interceptor_trampoline_##name);                   \
          __interceptor_trampoline_##name:                                      \
-                 CFI_STARTPROC;                                                \
+                 ASM_STARTPROC;                                                \
                  ASM_TAIL_CALL ASM_PREEMPTIBLE_SYM(__interceptor_##name);      \
-                 CFI_ENDPROC;                                                  \
+                 ASM_ENDPROC;                                                  \
          ASM_SIZE(__interceptor_trampoline_##name)
 #  define ASM_INTERCEPTOR_TRAMPOLINE_SUPPORT 1
 # endif  // Architecture supports interceptor trampoline

>From 835e7c1b09ff524d993eaa281864c2fa19e9b6b4 Mon Sep 17 00:00:00 2001
From: Haojian Wu <hokein.wu at gmail.com>
Date: Wed, 13 Mar 2024 09:02:05 +0100
Subject: [PATCH 08/63] [clang] CTAD: Respect requires-clause of the original
 function template for the synthesized deduction guide (#84913)

We ignored the require-clause of the original template when building the deduction guide for type-alias CTAD, this resulted in accepting code which should be rejected (see the test case). This patch fixes it, part of #84492.
---
 clang/lib/Sema/SemaTemplate.cpp              | 19 ++++++++++++++-----
 clang/test/SemaCXX/cxx20-ctad-type-alias.cpp | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index d8c9a5c09944c4..51e8db2dfbaac8 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -2906,18 +2906,27 @@ void DeclareImplicitDeductionGuidesForTypeAlias(
           Context.getCanonicalTemplateArgument(
               Context.getInjectedTemplateArg(NewParam));
     }
-    // FIXME: implement the associated constraint per C++
+    // Substitute new template parameters into requires-clause if present.
+    Expr *RequiresClause = nullptr;
+    if (Expr *InnerRC = F->getTemplateParameters()->getRequiresClause()) {
+      MultiLevelTemplateArgumentList Args;
+      Args.setKind(TemplateSubstitutionKind::Rewrite);
+      Args.addOuterTemplateArguments(TemplateArgsForBuildingFPrime);
+      ExprResult E = SemaRef.SubstExpr(InnerRC, Args);
+      if (E.isInvalid())
+        return;
+      RequiresClause = E.getAs<Expr>();
+    }
+    // FIXME: implement the is_deducible constraint per C++
     // [over.match.class.deduct]p3.3:
-    //    The associated constraints ([temp.constr.decl]) are the
-    //    conjunction of the associated constraints of g and a
-    //    constraint that is satisfied if and only if the arguments
+    //    ... and a constraint that is satisfied if and only if the arguments
     //    of A are deducible (see below) from the return type.
     auto *FPrimeTemplateParamList = TemplateParameterList::Create(
         Context, AliasTemplate->getTemplateParameters()->getTemplateLoc(),
         AliasTemplate->getTemplateParameters()->getLAngleLoc(),
         FPrimeTemplateParams,
         AliasTemplate->getTemplateParameters()->getRAngleLoc(),
-        /*RequiresClause=*/nullptr);
+        /*RequiresClause=*/RequiresClause);
 
     // To form a deduction guide f' from f, we leverage clang's instantiation
     // mechanism, we construct a template argument list where the template
diff --git a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
index 794496ed418489..3ce26c8fcd984e 100644
--- a/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
+++ b/clang/test/SemaCXX/cxx20-ctad-type-alias.cpp
@@ -230,3 +230,20 @@ using AFoo = Foo<U>*; // expected-note {{template is declared here}}
 
 AFoo s = {1}; // expected-error {{alias template 'AFoo' requires template arguments; argument deduction only allowed for}}
 } // namespace test17
+
+namespace test18 {
+template<typename T>
+concept False = false; // expected-note {{because 'false' evaluated to false}}
+
+template <typename T> struct Foo { T t; };
+
+template<typename T> requires False<T> // expected-note {{because 'int' does not satisfy 'False'}}
+Foo(T) -> Foo<int>;
+
+template <typename U>
+using Bar = Foo<U>; // expected-note {{could not match 'Foo<type-parameter-0-0>' against 'int'}} \
+                    // expected-note {{candidate template ignored: constraints not satisfied}} \
+                    // expected-note {{candidate function template not viable}}
+
+Bar s = {1}; // expected-error {{no viable constructor or deduction guide for deduction of template arguments}}
+} // namespace test18

>From 94f6b2a57167063b6bf762865ea2255a82f1a8c8 Mon Sep 17 00:00:00 2001
From: Sander de Smalen <sander.desmalen at arm.com>
Date: Wed, 13 Mar 2024 08:21:33 +0000
Subject: [PATCH 09/63] [AArch64][SME] Don't mark 'smstart za' as
 using/defining VG. (#84775)

VG is only used/defined when changing the streaming mode, using 'smstart
sm' or plainly 'smstart' (same for smstop).
---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 12 +++++++++-
 llvm/lib/Target/AArch64/SMEInstrFormats.td    |  2 --
 llvm/test/CodeGen/AArch64/sme-write-vg.ll     | 24 +++++++++++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/sme-write-vg.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 054311d39e7b83..5b7a36d2eba76f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7751,7 +7751,7 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   // register allocator to pass call args in callee saved regs, without extra
   // copies to avoid these fake clobbers of actually-preserved GPRs.
   if (MI.getOpcode() == AArch64::MSRpstatesvcrImm1 ||
-      MI.getOpcode() == AArch64::MSRpstatePseudo)
+      MI.getOpcode() == AArch64::MSRpstatePseudo) {
     for (unsigned I = MI.getNumOperands() - 1; I > 0; --I)
       if (MachineOperand &MO = MI.getOperand(I);
           MO.isReg() && MO.isImplicit() && MO.isDef() &&
@@ -7759,6 +7759,16 @@ void AArch64TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
            AArch64::GPR64RegClass.contains(MO.getReg())))
         MI.removeOperand(I);
 
+    // The SVE vector length can change when entering/leaving streaming mode.
+    if (MI.getOperand(0).getImm() == AArch64SVCR::SVCRSM ||
+        MI.getOperand(0).getImm() == AArch64SVCR::SVCRSMZA) {
+      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/false,
+                                              /*IsImplicit=*/true));
+      MI.addOperand(MachineOperand::CreateReg(AArch64::VG, /*IsDef=*/true,
+                                              /*IsImplicit=*/true));
+    }
+  }
+
   // Add an implicit use of 'VG' for ADDXri/SUBXri, which are instructions that
   // have nothing to do with VG, were it not that they are used to materialise a
   // frame-address. If they contain a frame-index to a scalable vector, this
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 33cb5f9734b819..44d9a8ac7cb677 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -223,8 +223,6 @@ def MSRpstatesvcrImm1
   let Inst{8} = imm;
   let Inst{7-5} = 0b011; // op2
   let hasPostISelHook = 1;
-  let Uses = [VG];
-  let Defs = [VG];
 }
 
 def : InstAlias<"smstart",    (MSRpstatesvcrImm1 0b011, 0b1)>;
diff --git a/llvm/test/CodeGen/AArch64/sme-write-vg.ll b/llvm/test/CodeGen/AArch64/sme-write-vg.ll
new file mode 100644
index 00000000000000..577606d45484f1
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-write-vg.ll
@@ -0,0 +1,24 @@
+; RUN: llc -mattr=+sme -stop-after=finalize-isel < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; Check that we don't define VG for 'smstart za' and 'smstop za'
+define void @smstart_za() "aarch64_new_za" nounwind {
+  ; CHECK-LABEL:    name: smstart_za
+  ; CHECK-NOT:        implicit-def {{[^,]*}}$vg
+  ret void
+}
+
+; Check that we do define VG for 'smstart sm' and 'smstop sm'
+define void @smstart_sm() nounwind {
+  ; CHECK-LABEL: name: smstart_sm
+  ; CHECK:          MSRpstatesvcrImm1 1, 1,
+  ; CHECK-SAME:       implicit-def {{[^,]*}}$vg
+  ; CHECK:          MSRpstatesvcrImm1 1, 0,
+  ; CHECK-SAME:       implicit-def {{[^,]*}}$vg
+  call void @require_sm()
+  ret void
+}
+
+declare void @require_sm() "aarch64_pstate_sm_enabled"
+declare void @require_za() "aarch64_inout_za"

>From b32cd54d3cb25b3afe1920ec43ee65457252110b Mon Sep 17 00:00:00 2001
From: AtariDreams <83477269+AtariDreams at users.noreply.github.com>
Date: Wed, 13 Mar 2024 04:23:04 -0400
Subject: [PATCH 10/63] [Float2Int] Resolve FIXME: Pick the smallest legal type
 that fits (#79158)

Pick the type based on the smallest bit-width possible, using
DataLayout.
---
 .../llvm/Transforms/Scalar/Float2Int.h        |   2 +-
 llvm/lib/Transforms/Scalar/Float2Int.cpp      |  29 +-
 llvm/test/Transforms/Float2Int/basic.ll       | 251 ++++++++++++++----
 3 files changed, 214 insertions(+), 68 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/Float2Int.h b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
index 83be329bed60ba..337e229efcf379 100644
--- a/llvm/include/llvm/Transforms/Scalar/Float2Int.h
+++ b/llvm/include/llvm/Transforms/Scalar/Float2Int.h
@@ -44,7 +44,7 @@ class Float2IntPass : public PassInfoMixin<Float2IntPass> {
   std::optional<ConstantRange> calcRange(Instruction *I);
   void walkBackwards();
   void walkForwards();
-  bool validateAndTransform();
+  bool validateAndTransform(const DataLayout &DL);
   Value *convert(Instruction *I, Type *ToTy);
   void cleanup();
 
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index ccca8bcc1a56ac..6ad4be169b589a 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -311,7 +311,7 @@ void Float2IntPass::walkForwards() {
 }
 
 // If there is a valid transform to be done, do it.
-bool Float2IntPass::validateAndTransform() {
+bool Float2IntPass::validateAndTransform(const DataLayout &DL) {
   bool MadeChange = false;
 
   // Iterate over every disjoint partition of the def-use graph.
@@ -376,15 +376,23 @@ bool Float2IntPass::validateAndTransform() {
       LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
       continue;
     }
-    if (MinBW > 64) {
-      LLVM_DEBUG(
-          dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
-      continue;
-    }
 
-    // OK, R is known to be representable. Now pick a type for it.
-    // FIXME: Pick the smallest legal type that will fit.
-    Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+    // OK, R is known to be representable.
+    // Pick the smallest legal type that will fit.
+    Type *Ty = DL.getSmallestLegalIntType(*Ctx, MinBW);
+    if (!Ty) {
+      // Every supported target supports 64-bit and 32-bit integers,
+      // so fallback to a 32 or 64-bit integer if the value fits.
+      if (MinBW <= 32) {
+        Ty = Type::getInt32Ty(*Ctx);
+      } else if (MinBW <= 64) {
+        Ty = Type::getInt64Ty(*Ctx);
+      } else {
+        LLVM_DEBUG(dbgs() << "F2I: Value requires more than bits to represent "
+                             "than the target supports!\n");
+        continue;
+      }
+    }
 
     for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
          MI != ME; ++MI)
@@ -491,7 +499,8 @@ bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
   walkBackwards();
   walkForwards();
 
-  bool Modified = validateAndTransform();
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  bool Modified = validateAndTransform(DL);
   if (Modified)
     cleanup();
   return Modified;
diff --git a/llvm/test/Transforms/Float2Int/basic.ll b/llvm/test/Transforms/Float2Int/basic.ll
index 2854a83179b7eb..a454b773f4ebab 100644
--- a/llvm/test/Transforms/Float2Int/basic.ll
+++ b/llvm/test/Transforms/Float2Int/basic.ll
@@ -1,16 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes='float2int' -S | FileCheck %s
+; RUN: opt < %s -passes=float2int -S | FileCheck %s -check-prefixes=CHECK,NONE
+; RUN: opt < %s -passes=float2int -S --data-layout="n64" | FileCheck %s -check-prefixes=CHECK,ONLY64
+; RUN: opt < %s -passes=float2int -S --data-layout="n8:16:32:64"| FileCheck %s -check-prefixes=CHECK,MULTIPLE
 
 ;
 ; Positive tests
 ;
 
 define i16 @simple1(i8 %a) {
-; CHECK-LABEL: @simple1(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; CHECK-NEXT:    ret i16 [[TMP2]]
+; NONE-LABEL: @simple1(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 1
+; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; NONE-NEXT:    ret i16 [[TMP2]]
+;
+; ONLY64-LABEL: @simple1(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[T21:%.*]] = add i64 [[TMP1]], 1
+; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
+; ONLY64-NEXT:    ret i16 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple1(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[T21:%.*]] = add i16 [[TMP1]], 1
+; MULTIPLE-NEXT:    ret i16 [[T21]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fadd float %t1, 1.0
@@ -19,11 +32,23 @@ define i16 @simple1(i8 %a) {
 }
 
 define i8 @simple2(i8 %a) {
-; CHECK-LABEL: @simple2(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i8
-; CHECK-NEXT:    ret i8 [[TMP2]]
+; NONE-LABEL: @simple2(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
+; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i8
+; NONE-NEXT:    ret i8 [[TMP2]]
+;
+; ONLY64-LABEL: @simple2(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[T21:%.*]] = sub i64 [[TMP1]], 1
+; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i8
+; ONLY64-NEXT:    ret i8 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple2(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 [[TMP1]], 1
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = trunc i16 [[T21]] to i8
+; MULTIPLE-NEXT:    ret i8 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fsub float %t1, 1.0
@@ -32,10 +57,22 @@ define i8 @simple2(i8 %a) {
 }
 
 define i32 @simple3(i8 %a) {
-; CHECK-LABEL: @simple3(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
-; CHECK-NEXT:    ret i32 [[T21]]
+; NONE-LABEL: @simple3(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[T21:%.*]] = sub i32 [[TMP1]], 1
+; NONE-NEXT:    ret i32 [[T21]]
+;
+; ONLY64-LABEL: @simple3(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[T21:%.*]] = sub i64 [[TMP1]], 1
+; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i32
+; ONLY64-NEXT:    ret i32 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple3(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 [[TMP1]], 1
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i16 [[T21]] to i32
+; MULTIPLE-NEXT:    ret i32 [[TMP2]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fsub float %t1, 1.0
@@ -44,11 +81,23 @@ define i32 @simple3(i8 %a) {
 }
 
 define i1 @cmp(i8 %a, i8 %b) {
-; CHECK-LABEL: @cmp(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    ret i1 [[T31]]
+; NONE-LABEL: @cmp(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT:    [[T31:%.*]] = icmp slt i32 [[TMP1]], [[TMP2]]
+; NONE-NEXT:    ret i1 [[T31]]
+;
+; ONLY64-LABEL: @cmp(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT:    [[T31:%.*]] = icmp slt i64 [[TMP1]], [[TMP2]]
+; ONLY64-NEXT:    ret i1 [[T31]]
+;
+; MULTIPLE-LABEL: @cmp(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
+; MULTIPLE-NEXT:    [[T31:%.*]] = icmp slt i16 [[TMP1]], [[TMP2]]
+; MULTIPLE-NEXT:    ret i1 [[T31]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -70,12 +119,27 @@ define i32 @simple4(i32 %a) {
 }
 
 define i32 @simple5(i8 %a, i8 %b) {
-; CHECK-LABEL: @simple5(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
-; CHECK-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; CHECK-NEXT:    ret i32 [[T42]]
+; NONE-LABEL: @simple5(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
+; NONE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; NONE-NEXT:    ret i32 [[T42]]
+;
+; ONLY64-LABEL: @simple5(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT:    [[T31:%.*]] = add i64 [[TMP1]], 1
+; ONLY64-NEXT:    [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
+; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[T42]] to i32
+; ONLY64-NEXT:    ret i32 [[TMP3]]
+;
+; MULTIPLE-LABEL: @simple5(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; MULTIPLE-NEXT:    [[T31:%.*]] = add i32 [[TMP1]], 1
+; MULTIPLE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; MULTIPLE-NEXT:    ret i32 [[T42]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -86,12 +150,27 @@ define i32 @simple5(i8 %a, i8 %b) {
 }
 
 define i32 @simple6(i8 %a, i8 %b) {
-; CHECK-LABEL: @simple6(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
-; CHECK-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
-; CHECK-NEXT:    ret i32 [[T42]]
+; NONE-LABEL: @simple6(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
+; NONE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; NONE-NEXT:    ret i32 [[T42]]
+;
+; ONLY64-LABEL: @simple6(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT:    [[T31:%.*]] = sub i64 0, [[TMP1]]
+; ONLY64-NEXT:    [[T42:%.*]] = mul i64 [[T31]], [[TMP2]]
+; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[T42]] to i32
+; ONLY64-NEXT:    ret i32 [[TMP3]]
+;
+; MULTIPLE-LABEL: @simple6(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; MULTIPLE-NEXT:    [[T31:%.*]] = sub i32 0, [[TMP1]]
+; MULTIPLE-NEXT:    [[T42:%.*]] = mul i32 [[T31]], [[TMP2]]
+; MULTIPLE-NEXT:    ret i32 [[T42]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = uitofp i8 %b to float
@@ -105,15 +184,37 @@ define i32 @simple6(i8 %a, i8 %b) {
 ; cause failure of the other.
 
 define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
-; CHECK-LABEL: @multi1(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
-; CHECK-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
-; CHECK-NEXT:    [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
-; CHECK-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
-; CHECK-NEXT:    [[R:%.*]] = add i32 [[X1]], [[W]]
-; CHECK-NEXT:    ret i32 [[R]]
+; NONE-LABEL: @multi1(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i32
+; NONE-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; NONE-NEXT:    [[X1:%.*]] = add i32 [[TMP1]], [[TMP2]]
+; NONE-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; NONE-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
+; NONE-NEXT:    [[R:%.*]] = add i32 [[X1]], [[W]]
+; NONE-NEXT:    ret i32 [[R]]
+;
+; ONLY64-LABEL: @multi1(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i64
+; ONLY64-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; ONLY64-NEXT:    [[X1:%.*]] = add i64 [[TMP1]], [[TMP2]]
+; ONLY64-NEXT:    [[TMP3:%.*]] = trunc i64 [[X1]] to i32
+; ONLY64-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; ONLY64-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
+; ONLY64-NEXT:    [[R:%.*]] = add i32 [[TMP3]], [[W]]
+; ONLY64-NEXT:    ret i32 [[R]]
+;
+; MULTIPLE-LABEL: @multi1(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = zext i8 [[B:%.*]] to i16
+; MULTIPLE-NEXT:    [[FC:%.*]] = uitofp i8 [[C:%.*]] to float
+; MULTIPLE-NEXT:    [[X1:%.*]] = add i16 [[TMP1]], [[TMP2]]
+; MULTIPLE-NEXT:    [[TMP3:%.*]] = zext i16 [[X1]] to i32
+; MULTIPLE-NEXT:    [[Z:%.*]] = fadd float [[FC]], [[D:%.*]]
+; MULTIPLE-NEXT:    [[W:%.*]] = fptoui float [[Z]] to i32
+; MULTIPLE-NEXT:    [[R:%.*]] = add i32 [[TMP3]], [[W]]
+; MULTIPLE-NEXT:    ret i32 [[R]]
 ;
   %fa = uitofp i8 %a to float
   %fb = uitofp i8 %b to float
@@ -127,11 +228,22 @@ define i32 @multi1(i8 %a, i8 %b, i8 %c, float %d) {
 }
 
 define i16 @simple_negzero(i8 %a) {
-; CHECK-LABEL: @simple_negzero(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 0
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; CHECK-NEXT:    ret i16 [[TMP2]]
+; NONE-LABEL: @simple_negzero(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[T21:%.*]] = add i32 [[TMP1]], 0
+; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; NONE-NEXT:    ret i16 [[TMP2]]
+;
+; ONLY64-LABEL: @simple_negzero(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[T21:%.*]] = add i64 [[TMP1]], 0
+; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
+; ONLY64-NEXT:    ret i16 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple_negzero(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[T21:%.*]] = add i16 [[TMP1]], 0
+; MULTIPLE-NEXT:    ret i16 [[T21]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fadd fast float %t1, -0.0
@@ -140,12 +252,26 @@ define i16 @simple_negzero(i8 %a) {
 }
 
 define i32 @simple_negative(i8 %call) {
-; CHECK-LABEL: @simple_negative(
-; CHECK-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
-; CHECK-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], -3
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
-; CHECK-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
-; CHECK-NEXT:    ret i32 [[CONV3]]
+; NONE-LABEL: @simple_negative(
+; NONE-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i32
+; NONE-NEXT:    [[MUL1:%.*]] = mul i32 [[TMP1]], -3
+; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[MUL1]] to i8
+; NONE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; NONE-NEXT:    ret i32 [[CONV3]]
+;
+; ONLY64-LABEL: @simple_negative(
+; ONLY64-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i64
+; ONLY64-NEXT:    [[MUL1:%.*]] = mul i64 [[TMP1]], -3
+; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[MUL1]] to i8
+; ONLY64-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; ONLY64-NEXT:    ret i32 [[CONV3]]
+;
+; MULTIPLE-LABEL: @simple_negative(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = sext i8 [[CALL:%.*]] to i16
+; MULTIPLE-NEXT:    [[MUL1:%.*]] = mul i16 [[TMP1]], -3
+; MULTIPLE-NEXT:    [[TMP2:%.*]] = trunc i16 [[MUL1]] to i8
+; MULTIPLE-NEXT:    [[CONV3:%.*]] = sext i8 [[TMP2]] to i32
+; MULTIPLE-NEXT:    ret i32 [[CONV3]]
 ;
   %conv1 = sitofp i8 %call to float
   %mul = fmul float %conv1, -3.000000e+00
@@ -155,11 +281,22 @@ define i32 @simple_negative(i8 %call) {
 }
 
 define i16 @simple_fneg(i8 %a) {
-; CHECK-LABEL: @simple_fneg(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
-; CHECK-NEXT:    [[T21:%.*]] = sub i32 0, [[TMP1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
-; CHECK-NEXT:    ret i16 [[TMP2]]
+; NONE-LABEL: @simple_fneg(
+; NONE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i32
+; NONE-NEXT:    [[T21:%.*]] = sub i32 0, [[TMP1]]
+; NONE-NEXT:    [[TMP2:%.*]] = trunc i32 [[T21]] to i16
+; NONE-NEXT:    ret i16 [[TMP2]]
+;
+; ONLY64-LABEL: @simple_fneg(
+; ONLY64-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i64
+; ONLY64-NEXT:    [[T21:%.*]] = sub i64 0, [[TMP1]]
+; ONLY64-NEXT:    [[TMP2:%.*]] = trunc i64 [[T21]] to i16
+; ONLY64-NEXT:    ret i16 [[TMP2]]
+;
+; MULTIPLE-LABEL: @simple_fneg(
+; MULTIPLE-NEXT:    [[TMP1:%.*]] = zext i8 [[A:%.*]] to i16
+; MULTIPLE-NEXT:    [[T21:%.*]] = sub i16 0, [[TMP1]]
+; MULTIPLE-NEXT:    ret i16 [[T21]]
 ;
   %t1 = uitofp i8 %a to float
   %t2 = fneg fast float %t1

>From 9ad0929568395567d0157501fe71b6470fcaaaa3 Mon Sep 17 00:00:00 2001
From: lcvon007 <141613945+lcvon007 at users.noreply.github.com>
Date: Wed, 13 Mar 2024 16:38:48 +0800
Subject: [PATCH 11/63] [Attributor][FIX] Register right new created BB.
 (#84929)

CBBB will keep same after the first iteration so
registerManifestAddedBasicBlock would always register the same basic
block later.

Co-authored-by: laichunfeng <laichunfeng at tencent.com>
---
 llvm/lib/Transforms/IPO/AttributorAttributes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 488a6f0bb153a4..f98833bd119891 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12371,7 +12371,7 @@ struct AAIndirectCallInfoCallSite : public AAIndirectCallInfo {
           SplitBlockAndInsertIfThen(LastCmp, IP, /* Unreachable */ false);
       BasicBlock *CBBB = CB->getParent();
       A.registerManifestAddedBasicBlock(*ThenTI->getParent());
-      A.registerManifestAddedBasicBlock(*CBBB);
+      A.registerManifestAddedBasicBlock(*IP->getParent());
       auto *SplitTI = cast<BranchInst>(LastCmp->getNextNode());
       BasicBlock *ElseBB;
       if (&*IP == CB) {

>From e731275d96cb051f38c6fe79d6271dfbad867a67 Mon Sep 17 00:00:00 2001
From: mikaelholmen <mikael.holmen at ericsson.com>
Date: Wed, 13 Mar 2024 09:58:47 +0100
Subject: [PATCH 12/63] [ValueTracking] Remove faulty dereference of
 "InsertBefore" (#85034)

In 2fe81edef6f
 [NFC][RemoveDIs] Insert instruction using iterators in Transforms/
we changed
       if (*req_idx != *i)
         return FindInsertedValue(I->getAggregateOperand(), idx_range,
-                                 InsertBefore);
+                                 *InsertBefore);
     }
but there is no guarantee that is InsertBefore is non-empty at that
point,
which we e.g can see in the added testcase.

Instead just pass on the optional InsertBefore in the recursive call to
FindInsertedValue, as we do at several other places already.
---
 llvm/lib/Analysis/ValueTracking.cpp           |  2 +-
 .../Analysis/Lint/crash_empty_iterator.ll     | 22 +++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/Analysis/Lint/crash_empty_iterator.ll

diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 371ad41ee96562..8a4a2c4f92a0dc 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -5709,7 +5709,7 @@ llvm::FindInsertedValue(Value *V, ArrayRef<unsigned> idx_range,
       // looking for, then.
       if (*req_idx != *i)
         return FindInsertedValue(I->getAggregateOperand(), idx_range,
-                                 *InsertBefore);
+                                 InsertBefore);
     }
     // If we end up here, the indices of the insertvalue match with those
     // requested (though possibly only partially). Now we recursively look at
diff --git a/llvm/test/Analysis/Lint/crash_empty_iterator.ll b/llvm/test/Analysis/Lint/crash_empty_iterator.ll
new file mode 100644
index 00000000000000..2fbecbcef5cfb4
--- /dev/null
+++ b/llvm/test/Analysis/Lint/crash_empty_iterator.ll
@@ -0,0 +1,22 @@
+; RUN: opt -passes="lint" -S < %s | FileCheck %s
+
+; After 2fe81edef6f0b
+;  [NFC][RemoveDIs] Insert instruction using iterators in Transforms/
+; this crashed in FindInsertedValue when dereferencing an empty
+; optional iterator.
+; Just see that it doesn't crash anymore.
+
+; CHECK-LABEL: @test1
+
+%struct = type { i32, i32 }
+
+define void @test1() {
+entry:
+  %.fca.1.insert = insertvalue %struct zeroinitializer, i32 0, 1
+  %0 = extractvalue %struct %.fca.1.insert, 0
+  %1 = tail call %struct @foo(i32 %0)
+  ret void
+}
+
+declare %struct @foo(i32)
+

>From 8480f7aaed7fe5521a1e531a34bcd2847df6de0f Mon Sep 17 00:00:00 2001
From: David CARLIER <devnexen at gmail.com>
Date: Wed, 13 Mar 2024 09:25:43 +0000
Subject: [PATCH 13/63] [compiler-rt] reimplements GetMemoryProfile for netbsd.
 (#84841)

The actual solution relies on the premise /proc/self/smaps existence.
instead relying on native api like freebsd.
fixing fuzzer build too.
---
 compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp          |  2 +-
 .../lib/sanitizer_common/sanitizer_procmaps_bsd.cpp | 13 +++++++++++++
 .../sanitizer_common/sanitizer_procmaps_common.cpp  |  2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp b/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp
index 5729448b0beb13..e5409f22f0e3a2 100644
--- a/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp
+++ b/compiler-rt/lib/fuzzer/FuzzerUtilLinux.cpp
@@ -44,7 +44,7 @@ void SetThreadName(std::thread &thread, const std::string &name) {
 #if LIBFUZZER_LINUX || LIBFUZZER_FREEBSD
   (void)pthread_setname_np(thread.native_handle(), name.c_str());
 #elif LIBFUZZER_NETBSD
-  (void)pthread_set_name_np(thread.native_handle(), "%s", name.c_str());
+  (void)pthread_setname_np(thread.native_handle(), "%s", const_cast<char *>(name.c_str()));
 #endif
 }
 
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
index 7c2d8e6f173131..7d7c009f644426 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_bsd.cpp
@@ -42,6 +42,19 @@ void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   cb(0, InfoProc->ki_rssize * GetPageSizeCached(), false, stats);
   UnmapOrDie(InfoProc, Size, true);
 }
+#elif SANITIZER_NETBSD
+void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
+  struct kinfo_proc2 *InfoProc;
+  uptr Len = sizeof(*InfoProc);
+  uptr Size = Len;
+  const int Mib[] = {CTL_KERN, KERN_PROC2, KERN_PROC_PID, getpid(), Size, 1};
+  InfoProc = (struct kinfo_proc2 *)MmapOrDie(Size, "GetMemoryProfile()");
+  CHECK_EQ(
+      internal_sysctl(Mib, ARRAY_SIZE(Mib), nullptr, (uptr *)InfoProc, &Len, 0),
+      0);
+  cb(0, InfoProc->p_vm_rssize * GetPageSizeCached(), false, stats);
+  UnmapOrDie(InfoProc, Size, true);
+}
 #endif
 
 void ReadProcMaps(ProcSelfMapsBuff *proc_maps) {
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp
index a7805ad1b083b0..7214a2b9ea4681 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_procmaps_common.cpp
@@ -145,7 +145,7 @@ void MemoryMappingLayout::DumpListOfModules(
   }
 }
 
-#if SANITIZER_LINUX || SANITIZER_ANDROID || SANITIZER_SOLARIS || SANITIZER_NETBSD
+#if SANITIZER_LINUX || SANITIZER_ANDROID || SANITIZER_SOLARIS
 void GetMemoryProfile(fill_profile_f cb, uptr *stats) {
   char *smaps = nullptr;
   uptr smaps_cap = 0;

>From b6d13e0c93580a34b689d0813889c7bc35f7fc27 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 13 Mar 2024 09:47:16 +0000
Subject: [PATCH 14/63] [SjLjEHPrepare] Use inverse_depth_first() instead of
 _ext variant (NFC). (#84920)

inverse_depth_first df_iterator_default_set as default set, so there's
no need to explicitly use inverse_depth_first_ext.

PR: https://github.com/llvm/llvm-project/pull/84920
---
 llvm/lib/CodeGen/SjLjEHPrepare.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 515b5764a09479..4bad57d279e9e4 100644
--- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -150,9 +150,7 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
   if (!LiveBBs.insert(BB).second)
     return; // already been here.
 
-  df_iterator_default_set<BasicBlock*> Visited;
-
-  for (BasicBlock *B : inverse_depth_first_ext(BB, Visited))
+  for (BasicBlock *B : inverse_depth_first(BB))
     LiveBBs.insert(B);
 }
 

>From f8b826e30baf557bf9b5b19f3541b7b117d7c0b1 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Wed, 13 Mar 2024 11:48:33 +0100
Subject: [PATCH 15/63] [Tests] Drop inrange attribute from some tests (NFC)

These don't actually test anything related to inrange, so drop the
attribute.
---
 llvm/test/CodeGen/AArch64/fold-global-offsets.ll     |  2 +-
 llvm/test/CodeGen/Hexagon/addrmode-immop.mir         |  4 ++--
 llvm/test/CodeGen/NVPTX/b52037.ll                    |  2 +-
 llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll        | 12 ++++++------
 llvm/test/CodeGen/X86/tls-align.ll                   |  2 +-
 llvm/test/DebugInfo/X86/tu-to-non-tu.ll              |  8 ++++----
 .../Transforms/Utils/CallPromotionUtilsTest.cpp      | 12 ++++++------
 7 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
index 897d35a78495b1..8de0f0d121f90a 100644
--- a/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
+++ b/llvm/test/CodeGen/AArch64/fold-global-offsets.ll
@@ -131,7 +131,7 @@ define i32 @f7() {
 ; GISEL-NEXT:    ret
 
 entry:
-  %lshr = lshr i128 bitcast (<2 x i64> <i64 undef, i64 ptrtoint (ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @x3, i64 0, inrange i32 1, i64 2) to i64)> to i128), 64
+  %lshr = lshr i128 bitcast (<2 x i64> <i64 undef, i64 ptrtoint (ptr getelementptr inbounds ({ [9 x ptr], [8 x ptr] }, ptr @x3, i64 0, i32 1, i64 2) to i64)> to i128), 64
   %trunc = trunc i128 %lshr to i64
   %inttoptr = inttoptr i64 %trunc to ptr
   %gep = getelementptr i32, ptr %inttoptr, i64 5
diff --git a/llvm/test/CodeGen/Hexagon/addrmode-immop.mir b/llvm/test/CodeGen/Hexagon/addrmode-immop.mir
index 3069cbe5969d66..1412d31f35ac10 100644
--- a/llvm/test/CodeGen/Hexagon/addrmode-immop.mir
+++ b/llvm/test/CodeGen/Hexagon/addrmode-immop.mir
@@ -15,7 +15,7 @@
   ; Function Attrs: norecurse
   define void @f0() #0 {
   b0:
-    %v0 = load ptr, ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, inrange i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1)), align 4
+    %v0 = load ptr, ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1)), align 4
     %v1 = call i32 %v0(ptr nonnull undef)
     unreachable
   }
@@ -33,7 +33,7 @@ tracksRegLiveness: true
 body: |
   bb.0.b0:
     $r2 = A2_tfrsi @g0 + 12
-    $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load (s32) from `ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, inrange i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1))`)
+    $r2 = L2_loadri_io killed $r2, @f1 - 1 :: (load (s32) from `ptr getelementptr (i8, ptr getelementptr inbounds ({ [3 x ptr], [3 x ptr] }, ptr @g0, i32 0, i32 0, i32 3), i32 sub (i32 ptrtoint (ptr @f1 to i32), i32 1))`)
     ADJCALLSTACKDOWN 0, 0, implicit-def $r29, implicit-def dead $r30, implicit $r31, implicit $r30, implicit $r29
     PS_callr_nr killed $r2, hexagoncsr, implicit undef $r0, implicit-def $r29, implicit-def dead $r0
     ADJCALLSTACKUP 0, 0, implicit-def dead $r29, implicit-def dead $r30, implicit-def dead $r31, implicit $r29
diff --git a/llvm/test/CodeGen/NVPTX/b52037.ll b/llvm/test/CodeGen/NVPTX/b52037.ll
index d9322dabfa065e..5d1c390909f6a5 100644
--- a/llvm/test/CodeGen/NVPTX/b52037.ll
+++ b/llvm/test/CodeGen/NVPTX/b52037.ll
@@ -47,7 +47,7 @@ bb:
   %tmp5 = load ptr, ptr %tmp4, align 8
   %tmp9 = getelementptr inbounds %struct.zot, ptr %tmp, i64 0, i32 2, i32 1
   store ptr %tmp5, ptr %tmp9, align 8
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global_1, i64 0, inrange i32 0, i64 3), ptr %tmp, align 16
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @global_1, i64 0, i32 0, i64 3), ptr %tmp, align 16
   %tmp.i1 = tail call i64 @foo()
   %tmp44.i16 = getelementptr inbounds i16, ptr %tmp5, i64 undef
   %tmp45.i17 = load i16, ptr %tmp44.i16, align 2
diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
index 085cde8c61693a..7a5baa09f95e94 100644
--- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
+++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll
@@ -97,7 +97,7 @@ $_ZTI7Derived = comdat any
 
 ; Function Attrs: nounwind uwtable
 define weak_odr dso_local dllexport void @_ZN4BaseC2Ev(ptr noundef nonnull align 8 dereferenceable(12) %0) unnamed_addr #0 comdat align 2 {
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
   %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
   store i32 0, ptr %2, align 8, !tbaa !8
   ret void
@@ -105,7 +105,7 @@ define weak_odr dso_local dllexport void @_ZN4BaseC2Ev(ptr noundef nonnull align
 
 ; Function Attrs: nounwind uwtable
 define weak_odr dso_local dllexport void @_ZN4BaseC1Ev(ptr noundef nonnull align 8 dereferenceable(12) %0) unnamed_addr #0 comdat align 2 {
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
   %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
   store i32 0, ptr %2, align 8, !tbaa !8
   ret void
@@ -140,10 +140,10 @@ declare dso_local void @_ZdlPv(ptr noundef) local_unnamed_addr #2
 
 ; Function Attrs: nounwind uwtable
 define weak_odr dso_local dllexport void @_ZN7DerivedC2Ev(ptr noundef nonnull align 8 dereferenceable(16) %0) unnamed_addr #0 comdat align 2 {
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
   %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
   store i32 0, ptr %2, align 8, !tbaa !8
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
   %3 = getelementptr inbounds %class.Derived, ptr %0, i64 0, i32 1
   store i32 0, ptr %3, align 4, !tbaa !12
   ret void
@@ -151,10 +151,10 @@ define weak_odr dso_local dllexport void @_ZN7DerivedC2Ev(ptr noundef nonnull al
 
 ; Function Attrs: nounwind uwtable
 define weak_odr dso_local dllexport void @_ZN7DerivedC1Ev(ptr noundef nonnull align 8 dereferenceable(16) %0) unnamed_addr #0 comdat align 2 {
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV4Base, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
   %2 = getelementptr inbounds %class.Base, ptr %0, i64 0, i32 1
   store i32 0, ptr %2, align 8, !tbaa !8
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %0, align 8, !tbaa !5
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %0, align 8, !tbaa !5
   %3 = getelementptr inbounds %class.Derived, ptr %0, i64 0, i32 1
   store i32 0, ptr %3, align 4, !tbaa !12
   ret void
diff --git a/llvm/test/CodeGen/X86/tls-align.ll b/llvm/test/CodeGen/X86/tls-align.ll
index 3c8ee6b3f8ab2b..e996c00dbf1d4a 100644
--- a/llvm/test/CodeGen/X86/tls-align.ll
+++ b/llvm/test/CodeGen/X86/tls-align.ll
@@ -12,7 +12,7 @@
 
 define internal fastcc void @foo() unnamed_addr {
 entry:
-  store <8 x ptr> <ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr null>, ptr @array, align 32
+  store <8 x ptr> <ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr null>, ptr @array, align 32
   ret void
 }
 
diff --git a/llvm/test/DebugInfo/X86/tu-to-non-tu.ll b/llvm/test/DebugInfo/X86/tu-to-non-tu.ll
index 3ad97adbb5adaf..f80bd8b978963e 100644
--- a/llvm/test/DebugInfo/X86/tu-to-non-tu.ll
+++ b/llvm/test/DebugInfo/X86/tu-to-non-tu.ll
@@ -156,14 +156,14 @@
 %struct.templ_non_tu.1 = type { ptr }
 
 @_ZTV6non_tu = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI6non_tu, ptr @_ZN6non_tu2f1Ev] }, align 8
- at v1 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV6non_tu, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !0
+ at v1 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV6non_tu, i32 0, i32 0, i32 2) } }, align 8, !dbg !0
 @v5 = dso_local global %struct.ref_internal zeroinitializer, align 1, !dbg !5
 @_ZTV12templ_non_tuIiE = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI12templ_non_tuIiE, ptr @_ZN12templ_non_tuIiE2f1Ev] }, align 8
- at v2 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIiE, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !13
+ at v2 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIiE, i32 0, i32 0, i32 2) } }, align 8, !dbg !13
 @_ZTV12templ_non_tuIlE = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI12templ_non_tuIlE, ptr @_ZN12templ_non_tuIlE2f1Ev] }, align 8
- at v3 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIlE, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !32
+ at v3 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIlE, i32 0, i32 0, i32 2) } }, align 8, !dbg !32
 @_ZTV12templ_non_tuIbE = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI12templ_non_tuIbE, ptr @_ZN12templ_non_tuIbE2f1Ev] }, align 8
- at v4 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIbE, i32 0, inrange i32 0, i32 2) } }, align 8, !dbg !46
+ at v4 = dso_local global { { ptr } } { { ptr } { ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV12templ_non_tuIbE, i32 0, i32 0, i32 2) } }, align 8, !dbg !46
 @v6 = dso_local global %class.ref_internal_template zeroinitializer, align 1, !dbg !60
 @v7 = dso_local global %class.ref_from_ref_internal_template zeroinitializer, align 1, !dbg !69
 @_ZTVN10__cxxabiv117__class_type_infoE = external dso_local global ptr
diff --git a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
index eff8e27d36d641..0e9641c5846f35 100644
--- a/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CallPromotionUtilsTest.cpp
@@ -37,7 +37,7 @@ define void @f() {
 entry:
   %o = alloca %class.Impl
   %base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
-  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
   %f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
   store i32 3, i32* %f
   %base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
@@ -171,7 +171,7 @@ define void @f() {
 entry:
   %o = alloca %class.Impl
   %base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
-  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
   %f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
   store i32 3, i32* %f
   %base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
@@ -213,7 +213,7 @@ define void @f() {
 entry:
   %o = alloca %class.Impl
   %base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
-  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
   %f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
   store i32 3, i32* %f
   %base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0
@@ -256,7 +256,7 @@ define i32 @_Z2g1v() {
   %a = alloca %struct.A, align 8
   %0 = bitcast %struct.A* %a to i8*
   %1 = getelementptr %struct.A, %struct.A* %a, i64 0, i32 0
-  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
   %2 = bitcast %struct.A* %a to i8*
   %3 = bitcast i8* %2 to i8**
   %vtable.i = load i8*, i8** %3, align 8
@@ -271,7 +271,7 @@ define i32 @_Z2g2v() {
   %a = alloca %struct.A, align 8
   %0 = bitcast %struct.A* %a to i8*
   %1 = getelementptr %struct.A, %struct.A* %a, i64 0, i32 0
-  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [4 x i8*] }, { [4 x i8*] }* @_ZTV1A, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %1, align 8
   %2 = bitcast %struct.A* %a to i8*
   %3 = bitcast i8* %2 to i8**
   %vtable.i = load i8*, i8** %3, align 8
@@ -340,7 +340,7 @@ define %struct1 @f() {
 entry:
   %o = alloca %class.Impl
   %base = getelementptr %class.Impl, %class.Impl* %o, i64 0, i32 0, i32 0
-  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, inrange i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
+  store i32 (...)** bitcast (i8** getelementptr inbounds ({ [3 x i8*] }, { [3 x i8*] }* @_ZTV4Impl, i64 0, i32 0, i64 2) to i32 (...)**), i32 (...)*** %base
   %f = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 1
   store i32 3, i32* %f
   %base.i = getelementptr inbounds %class.Impl, %class.Impl* %o, i64 0, i32 0

>From 8f895a87e811bd22ea1d1728f4c48d8497f4727c Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Wed, 13 Mar 2024 11:53:35 +0100
Subject: [PATCH 16/63] [ThinLTO] Drop inrange attribute from tests (NFC)

The inrange attribute is not relevant to the optimizations being
tested here. Additionally, all the inrange attributes in these
files don't actually carry any additional information, as the
"range" covers the whole object.
---
 llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll    | 2 +-
 llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll | 2 +-
 llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll   | 2 +-
 llvm/test/ThinLTO/X86/devirt_local_same_guid.ll             | 2 +-
 llvm/test/ThinLTO/X86/lower_type_test_phi.ll                | 4 ++--
 llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll        | 2 +-
 llvm/test/ThinLTO/X86/type_test_noindircall.ll              | 4 ++--
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll
index 721d6efb7b53f8..d8c6525ed6065f 100644
--- a/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll
+++ b/llvm/test/ThinLTO/X86/Inputs/devirt_single_hybrid_bar.ll
@@ -23,7 +23,7 @@ define hidden i32 @_Z3barv() local_unnamed_addr #0 {
 entry:
   %b = alloca %struct.A, align 8
   call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %b)
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, inrange i32 0, i64 2), ptr %b, align 8, !tbaa !4
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i64 0, i32 0, i64 2), ptr %b, align 8, !tbaa !4
   %call = call i32 @_Z3fooP1A(ptr nonnull %b)
   %add = add nsw i32 %call, 10
   call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %b) #4
diff --git a/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll b/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll
index 68b83debef7d35..39f42da77d8161 100644
--- a/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll
+++ b/llvm/test/ThinLTO/X86/devirt_after_filtering_unreachable.ll
@@ -71,7 +71,7 @@ target triple = "x86_64-unknown-linux-gnu"
 define hidden i32 @main() {
 entry:
   %call = tail call ptr @_Znwm(i64 8)
-  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, inrange i32 0, i64 2), ptr %call
+  store ptr getelementptr inbounds ({ [5 x ptr] }, ptr @_ZTV7Derived, i64 0, i32 0, i64 2), ptr %call
   tail call void @_Z3fooP4Base(ptr nonnull %call)
   ret i32 0
 }
diff --git a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
index 24175325680436..1f0737b7192541 100644
--- a/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_external_comdat_same_guid.ll
@@ -51,7 +51,7 @@ define i32 @_ZN1B1nEi(ptr %this, i32 %a) #0 comdat($_ZTV1B) {
 ; Ensures that vtable of B is live so that we will attempt devirt.
 define dso_local i32 @use_B(ptr %a) {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %a, align 8
+  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %a, align 8
   ret i32 0
 }
 
diff --git a/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll b/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll
index 3efea8de3fbc6b..2205545e32507a 100644
--- a/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll
+++ b/llvm/test/ThinLTO/X86/devirt_local_same_guid.ll
@@ -37,7 +37,7 @@ define internal i32 @_ZN1B1nEi(ptr %this, i32 %a) #0 {
 ; Ensures that vtable of B is live so that we will attempt devirt.
 define dso_local i32 @use_B(ptr %a) {
 entry:
-  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, inrange i32 0, i64 2), ptr %a, align 8
+  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i64 0, i32 0, i64 2), ptr %a, align 8
   ret i32 0
 }
 
diff --git a/llvm/test/ThinLTO/X86/lower_type_test_phi.ll b/llvm/test/ThinLTO/X86/lower_type_test_phi.ll
index 722ffe3cc2282c..81d85f64584fa2 100644
--- a/llvm/test/ThinLTO/X86/lower_type_test_phi.ll
+++ b/llvm/test/ThinLTO/X86/lower_type_test_phi.ll
@@ -117,7 +117,7 @@ $_ZTV2D2 = comdat any
 define ptr @_Z2b1v() {
 entry:
   %call = tail call ptr @_Znwm(i64 8)
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D1, i64 0, inrange i32 0, i64 2), ptr %call, align 8
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D1, i64 0, i32 0, i64 2), ptr %call, align 8
   ret ptr %call
 }
 
@@ -126,7 +126,7 @@ declare ptr @_Znwm(i64)
 define ptr @_Z2b2v() {
 entry:
   %call = tail call ptr @_Znwm(i64 8)
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D2, i64 0, inrange i32 0, i64 2), ptr %call, align 8
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV2D2, i64 0, i32 0, i64 2), ptr %call, align 8
   ret ptr %call
 }
 
diff --git a/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll b/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll
index c6e61edcd97a94..7d71c595fbb168 100644
--- a/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll
+++ b/llvm/test/ThinLTO/X86/nodevirt-nonpromoted-typeid.ll
@@ -55,7 +55,7 @@ entry:
   %this.addr = alloca ptr, align 8
   store ptr %this, ptr %this.addr, align 8
   %this1 = load ptr, ptr %this.addr
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1D, i64 0, inrange i32 0, i64 2), ptr %this1, align 8
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1D, i64 0, i32 0, i64 2), ptr %this1, align 8
   ret void
 }
 
diff --git a/llvm/test/ThinLTO/X86/type_test_noindircall.ll b/llvm/test/ThinLTO/X86/type_test_noindircall.ll
index 2d0faaa2560237..cc85e44c280621 100644
--- a/llvm/test/ThinLTO/X86/type_test_noindircall.ll
+++ b/llvm/test/ThinLTO/X86/type_test_noindircall.ll
@@ -38,8 +38,8 @@ target triple = "x86_64-grtev4-linux-gnu"
 define internal void @_ZN12_GLOBAL__N_18RealFileD2Ev(ptr %this) unnamed_addr #0 align 2 {
 entry:
 ; CHECK-IR: store
-  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, inrange i32 0, i64 2), ptr %this, align 8
-  %0 = tail call i1 @llvm.type.test(ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, inrange i32 0, i64 2), metadata !"4$09c6cc733fc6accb91e5d7b87cb48f2d")
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, i32 0, i64 2), ptr %this, align 8
+  %0 = tail call i1 @llvm.type.test(ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTVN12_GLOBAL__N_18RealFileE, i64 0, i32 0, i64 2), metadata !"4$09c6cc733fc6accb91e5d7b87cb48f2d")
   tail call void @llvm.assume(i1 %0)
 ; CHECK-IR-NEXT: ret void
   ret void

>From ac3d642cace83aad12a435ed7f6fc0cd3e986bdf Mon Sep 17 00:00:00 2001
From: dyung <douglas.yung at sony.com>
Date: Wed, 13 Mar 2024 04:10:03 -0700
Subject: [PATCH 17/63] Mark test as XFAIL that started failing after
 418f0066eb. (#85027)

Similar failures were previously seen and XFAILed in
https://reviews.llvm.org/D118468.

See the phabricator review for a description of the problem, and the
linked discourse thread for what the failing output looks like.

This change should fix the issue on two buildbots that are running older
versions of GDB:
- https://lab.llvm.org/buildbot/#/builders/217/builds/37559
- https://lab.llvm.org/buildbot/#/builders/247/builds/15173
---
 .../debuginfo-tests/llgdb-tests/forward-declare-class.cpp        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cross-project-tests/debuginfo-tests/llgdb-tests/forward-declare-class.cpp b/cross-project-tests/debuginfo-tests/llgdb-tests/forward-declare-class.cpp
index 850eed6ad95d98..130c439faec8cd 100644
--- a/cross-project-tests/debuginfo-tests/llgdb-tests/forward-declare-class.cpp
+++ b/cross-project-tests/debuginfo-tests/llgdb-tests/forward-declare-class.cpp
@@ -1,6 +1,7 @@
 // RUN: %clangxx %target_itanium_abi_host_triple -O0 -g %s -c -o %t.o
 // RUN: %test_debuginfo %s %t.o
 // Radar 9168773
+// XFAIL: !system-darwin && gdb-clang-incompatibility
 
 // DEBUGGER: ptype A
 // Work around a gdb bug where it believes that a class is a

>From 6444d3624bd1fb8114002adc1c8954fa93fe5ef5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 13 Mar 2024 11:13:28 +0000
Subject: [PATCH 18/63] [DAG] Add SDPatternMatch m_And/m_Or/m_Xor matchers for
 logic ops

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h        | 15 +++++++++++++++
 .../CodeGen/SelectionDAGPatternMatchTest.cpp      | 11 +++++++++++
 2 files changed, 26 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 412bf42677cc5d..92b478cec0a077 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -495,6 +495,21 @@ inline BinaryOpc_match<LHS, RHS, true> m_Mul(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS, true>(ISD::MUL, L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_And(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::AND, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Or(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::OR, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Xor(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS, true>(ISD::XOR, L, R);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOpc_match<LHS, RHS, false> m_UDiv(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS, false>(ISD::UDIV, L, R);
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 17fc3ce8af2677..2b764c9e1f0fd6 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -130,6 +130,9 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   SDValue Add = DAG->getNode(ISD::ADD, DL, Int32VT, Op0, Op1);
   SDValue Sub = DAG->getNode(ISD::SUB, DL, Int32VT, Add, Op0);
   SDValue Mul = DAG->getNode(ISD::MUL, DL, Int32VT, Add, Sub);
+  SDValue And = DAG->getNode(ISD::AND, DL, Int32VT, Op0, Op1);
+  SDValue Xor = DAG->getNode(ISD::XOR, DL, Int32VT, Op1, Op0);
+  SDValue Or  = DAG->getNode(ISD::OR, DL, Int32VT, Op0, Op1);
 
   SDValue SFAdd = DAG->getNode(ISD::STRICT_FADD, DL, {Float32VT, MVT::Other},
                                {DAG->getEntryNode(), Op2, Op2});
@@ -144,6 +147,14 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   EXPECT_TRUE(
       sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_SpecificVT(Float32VT),
                                      m_SpecificVT(Float32VT))));
+
+  EXPECT_TRUE(sd_match(And, m_c_BinOp(ISD::AND, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(And, m_And(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Xor, m_c_BinOp(ISD::XOR, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Xor, m_Xor(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Or, m_c_BinOp(ISD::OR, m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Or, m_Or(m_Value(), m_Value())));
+
   SDValue BindVal;
   EXPECT_TRUE(sd_match(SFAdd, m_ChainedBinOp(ISD::STRICT_FADD, m_Value(BindVal),
                                              m_Deferred(BindVal))));

>From dae250e37563c19ea00a105ce9e4f90d6fbdee90 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 13 Mar 2024 11:19:32 +0000
Subject: [PATCH 19/63] [ARM][AArch64] Add missing Arm CPU part-ids to enable
 -mcpu=native (#84899)

Update Host.cpp with some missing Arm CPU part identifiers,
to enable `-mcpu=native` on these processors. These are found in
the Technical Reference Manuals listed under "part num" or "part no"
---
 llvm/lib/TargetParser/Host.cpp | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index ee4fd0425ca5e8..f65ed259eade4c 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -196,14 +196,24 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xb36", "arm1136j-s")
         .Case("0xb56", "arm1156t2-s")
         .Case("0xb76", "arm1176jz-s")
+        .Case("0xc05", "cortex-a5")
+        .Case("0xc07", "cortex-a7")
         .Case("0xc08", "cortex-a8")
         .Case("0xc09", "cortex-a9")
         .Case("0xc0f", "cortex-a15")
+        .Case("0xc0e", "cortex-a17")
         .Case("0xc20", "cortex-m0")
         .Case("0xc23", "cortex-m3")
         .Case("0xc24", "cortex-m4")
+        .Case("0xc27", "cortex-m7")
+        .Case("0xd20", "cortex-m23")
+        .Case("0xd21", "cortex-m33")
         .Case("0xd24", "cortex-m52")
         .Case("0xd22", "cortex-m55")
+        .Case("0xd23", "cortex-m85")
+        .Case("0xc18", "cortex-r8")
+        .Case("0xd13", "cortex-r52")
+        .Case("0xd15", "cortex-r82")
         .Case("0xd02", "cortex-a34")
         .Case("0xd04", "cortex-a35")
         .Case("0xd03", "cortex-a53")
@@ -211,13 +221,17 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd46", "cortex-a510")
         .Case("0xd80", "cortex-a520")
         .Case("0xd07", "cortex-a57")
+        .Case("0xd06", "cortex-a65")
+        .Case("0xd43", "cortex-a65ae")
         .Case("0xd08", "cortex-a72")
         .Case("0xd09", "cortex-a73")
         .Case("0xd0a", "cortex-a75")
         .Case("0xd0b", "cortex-a76")
+        .Case("0xd0e", "cortex-a76ae")
         .Case("0xd0d", "cortex-a77")
         .Case("0xd41", "cortex-a78")
         .Case("0xd42", "cortex-a78ae")
+        .Case("0xd4b", "cortex-a78c")
         .Case("0xd47", "cortex-a710")
         .Case("0xd4d", "cortex-a715")
         .Case("0xd81", "cortex-a720")
@@ -226,6 +240,7 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
         .Case("0xd48", "cortex-x2")
         .Case("0xd4e", "cortex-x3")
         .Case("0xd82", "cortex-x4")
+        .Case("0xd4a", "neoverse-e1")
         .Case("0xd0c", "neoverse-n1")
         .Case("0xd49", "neoverse-n2")
         .Case("0xd40", "neoverse-v1")

>From 76597c022895a53fd1bc2dca6c7a5380ec2050dd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 13 Mar 2024 12:00:15 +0000
Subject: [PATCH 20/63] [DAG] visitSUB - convert some folds to use
 SDPatternMatch

General cleanup and allows us to handle several commutable matches with a single pattern
---
 llvm/include/llvm/CodeGen/SDPatternMatch.h    |  1 +
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 78 ++++++-------------
 llvm/test/CodeGen/AArch64/xor.ll              |  2 +-
 3 files changed, 25 insertions(+), 56 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 92b478cec0a077..a86c7400fa0945 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -663,6 +663,7 @@ inline SpecificInt_match m_SpecificInt(uint64_t V) {
 }
 
 inline SpecificInt_match m_Zero() { return m_SpecificInt(0U); }
+inline SpecificInt_match m_One() { return m_SpecificInt(1U); }
 inline SpecificInt_match m_AllOnes() { return m_SpecificInt(~0U); }
 
 /// Match true boolean value based on the information provided by
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 735cec8ecc0627..87033e824aa593 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -3789,63 +3789,34 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
       return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
   }
 
-  // fold ((A+(B+or-C))-B) -> A+or-C
-  if (N0.getOpcode() == ISD::ADD &&
-      (N0.getOperand(1).getOpcode() == ISD::SUB ||
-       N0.getOperand(1).getOpcode() == ISD::ADD) &&
-      N0.getOperand(1).getOperand(0) == N1)
-    return DAG.getNode(N0.getOperand(1).getOpcode(), DL, VT, N0.getOperand(0),
-                       N0.getOperand(1).getOperand(1));
-
-  // fold ((A+(C+B))-B) -> A+C
-  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1).getOpcode() == ISD::ADD &&
-      N0.getOperand(1).getOperand(1) == N1)
-    return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0),
-                       N0.getOperand(1).getOperand(0));
+  SDValue A, B, C;
+
+  // fold ((A+(B+C))-B) -> A+C
+  if (sd_match(N0, m_Add(m_Value(A), m_Add(m_Specific(N1), m_Value(C)))))
+    return DAG.getNode(ISD::ADD, DL, VT, A, C);
+
+  // fold ((A+(B-C))-B) -> A-C
+  if (sd_match(N0, m_Add(m_Value(A), m_Sub(m_Specific(N1), m_Value(C)))))
+    return DAG.getNode(ISD::SUB, DL, VT, A, C);
 
   // fold ((A-(B-C))-C) -> A-B
-  if (N0.getOpcode() == ISD::SUB && N0.getOperand(1).getOpcode() == ISD::SUB &&
-      N0.getOperand(1).getOperand(1) == N1)
-    return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0),
-                       N0.getOperand(1).getOperand(0));
+  if (sd_match(N0, m_Sub(m_Value(A), m_Sub(m_Value(B), m_Specific(N1)))))
+    return DAG.getNode(ISD::SUB, DL, VT, A, B);
 
   // fold (A-(B-C)) -> A+(C-B)
-  if (N1.getOpcode() == ISD::SUB && N1.hasOneUse())
+  if (sd_match(N1, m_OneUse(m_Sub(m_Value(B), m_Value(C)))))
     return DAG.getNode(ISD::ADD, DL, VT, N0,
-                       DAG.getNode(ISD::SUB, DL, VT, N1.getOperand(1),
-                                   N1.getOperand(0)));
+                       DAG.getNode(ISD::SUB, DL, VT, C, B));
 
   // A - (A & B)  ->  A & (~B)
-  if (N1.getOpcode() == ISD::AND) {
-    SDValue A = N1.getOperand(0);
-    SDValue B = N1.getOperand(1);
-    if (A != N0)
-      std::swap(A, B);
-    if (A == N0 &&
-        (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true))) {
-      SDValue InvB =
-          DAG.getNode(ISD::XOR, DL, VT, B, DAG.getAllOnesConstant(DL, VT));
-      return DAG.getNode(ISD::AND, DL, VT, A, InvB);
-    }
-  }
+  if (sd_match(N1, m_And(m_Specific(N0), m_Value(B))) &&
+      (N1.hasOneUse() || isConstantOrConstantVector(B, /*NoOpaques=*/true)))
+    return DAG.getNode(ISD::AND, DL, VT, N0, DAG.getNOT(DL, B, VT));
 
-  // fold (X - (-Y * Z)) -> (X + (Y * Z))
-  if (N1.getOpcode() == ISD::MUL && N1.hasOneUse()) {
-    if (N1.getOperand(0).getOpcode() == ISD::SUB &&
-        isNullOrNullSplat(N1.getOperand(0).getOperand(0))) {
-      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
-                                N1.getOperand(0).getOperand(1),
-                                N1.getOperand(1));
-      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
-    }
-    if (N1.getOperand(1).getOpcode() == ISD::SUB &&
-        isNullOrNullSplat(N1.getOperand(1).getOperand(0))) {
-      SDValue Mul = DAG.getNode(ISD::MUL, DL, VT,
-                                N1.getOperand(0),
-                                N1.getOperand(1).getOperand(1));
-      return DAG.getNode(ISD::ADD, DL, VT, N0, Mul);
-    }
-  }
+  // fold (A - (-B * C)) -> (A + (B * C))
+  if (sd_match(N1, m_OneUse(m_Mul(m_Sub(m_Zero(), m_Value(B)), m_Value(C)))))
+    return DAG.getNode(ISD::ADD, DL, VT, N0,
+                       DAG.getNode(ISD::MUL, DL, VT, B, C));
 
   // If either operand of a sub is undef, the result is undef
   if (N0.isUndef())
@@ -3865,12 +3836,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
   if (SDValue V = foldSubToUSubSat(VT, N))
     return V;
 
-  // (x - y) - 1  ->  add (xor y, -1), x
-  if (N0.getOpcode() == ISD::SUB && N0.hasOneUse() && isOneOrOneSplat(N1)) {
-    SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
-                              DAG.getAllOnesConstant(DL, VT));
-    return DAG.getNode(ISD::ADD, DL, VT, Xor, N0.getOperand(0));
-  }
+  // (A - B) - 1  ->  add (xor B, -1), A
+  if (sd_match(N, m_Sub(m_OneUse(m_Sub(m_Value(A), m_Value(B))), m_One())))
+    return DAG.getNode(ISD::ADD, DL, VT, A, DAG.getNOT(DL, B, VT));
 
   // Look for:
   //   sub y, (xor x, -1)
diff --git a/llvm/test/CodeGen/AArch64/xor.ll b/llvm/test/CodeGen/AArch64/xor.ll
index d92402cf43b33e..7d7f7bfdbf38e8 100644
--- a/llvm/test/CodeGen/AArch64/xor.ll
+++ b/llvm/test/CodeGen/AArch64/xor.ll
@@ -51,7 +51,7 @@ define <4 x i32> @vec_add_of_not_decrement(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: vec_add_of_not_decrement:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mvn v1.16b, v1.16b
-; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %t0 = sub <4 x i32> %x, %y
   %r = sub <4 x i32> %t0, <i32 1, i32 1, i32 1, i32 1>

>From 1efeb68e22d27b7ce8f743c048feacb0fcd1047e Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 13 Mar 2024 11:50:58 +0000
Subject: [PATCH 21/63] [AMDGPU] Remove unneeded MnemonicAlias. NFC.

This is unneeded because MUBUF_Real_Atomic_gfx11_gfx12 on the line above
generates it automatically.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index c7091028b3b5e5..4ae514ffcf7850 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -2616,7 +2616,6 @@ defm BUFFER_ATOMIC_CMPSWAP_X2     : MUBUF_Real_Atomic_gfx11_gfx12<0x042, "buffer
 defm BUFFER_ATOMIC_FCMPSWAP       : MUBUF_Real_Atomic_gfx11<0x050, "buffer_atomic_cmpswap_f32">;
 defm BUFFER_ATOMIC_COND_SUB_U32   : MUBUF_Real_Atomic_gfx12<0x050>;
 defm BUFFER_ATOMIC_CSUB           : MUBUF_Real_Atomic_gfx11_gfx12<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">;
-def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
 defm BUFFER_ATOMIC_DEC            : MUBUF_Real_Atomic_gfx11_gfx12<0x040, "buffer_atomic_dec_u32">;
 defm BUFFER_ATOMIC_DEC_X2         : MUBUF_Real_Atomic_gfx11_gfx12<0x04D, "buffer_atomic_dec_u64">;
 defm BUFFER_ATOMIC_INC            : MUBUF_Real_Atomic_gfx11_gfx12<0x03F, "buffer_atomic_inc_u32">;

>From b148af6a4d709595e2e2f6126b90e9d67852ca8f Mon Sep 17 00:00:00 2001
From: Harald van Dijk <harald at gigawatt.nl>
Date: Wed, 13 Mar 2024 12:08:39 +0000
Subject: [PATCH 22/63] [AMDGPU] Fix canonicalization of truncated values.
 (#83054)

We were relying on roundings to implicitly canonicalize, which is
generally safe, except with roundings that may be optimized away.

Fixes #82937.
---
 llvm/lib/Target/AMDGPU/AMDGPUInstructions.td  |  24 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  65 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.h       |   4 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  51 +-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 911 ++++--------------
 llvm/test/CodeGen/AMDGPU/clamp.ll             |  64 +-
 .../AMDGPU/fcanonicalize-elimination.ll       | 103 +-
 llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 728 ++++++++------
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |  22 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |  22 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll |   2 -
 llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll   | 122 +--
 llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll   | 122 +--
 llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll        | 191 ++--
 14 files changed, 1021 insertions(+), 1410 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 5ed82c0c4b1b8e..86f77f7b64e88d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -194,7 +194,25 @@ class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
   }];
 }
 
-class is_canonicalized<SDPatternOperator op> : PatFrag<
+class is_canonicalized_1<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{
+    const SITargetLowering &Lowering =
+              *static_cast<const SITargetLowering *>(getTargetLowering());
+
+    return Lowering.isCanonicalized(*CurDAG, N->getOperand(0));
+   }]> {
+
+  let GISelPredicateCode = [{
+    const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
+      MF.getSubtarget().getTargetLowering());
+
+    return TLI->isCanonicalized(MI.getOperand(1).getReg(), MF);
+  }];
+}
+
+class is_canonicalized_2<SDPatternOperator op> : PatFrag<
   (ops node:$src0, node:$src1),
   (op $src0, $src1),
   [{
@@ -210,8 +228,8 @@ class is_canonicalized<SDPatternOperator op> : PatFrag<
     const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
       MF.getSubtarget().getTargetLowering());
 
-    return TLI->isCanonicalized(MI.getOperand(1).getReg(), const_cast<MachineFunction&>(MF)) &&
-      TLI->isCanonicalized(MI.getOperand(2).getReg(), const_cast<MachineFunction&>(MF));
+    return TLI->isCanonicalized(MI.getOperand(1).getReg(), MF) &&
+      TLI->isCanonicalized(MI.getOperand(2).getReg(), MF);
   }];
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9bc1b8eb598f3a..5ccf21f76015de 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12572,6 +12572,10 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   case ISD::FREM:
   case ISD::FP_ROUND:
   case ISD::FP_EXTEND:
+  case ISD::FP16_TO_FP:
+  case ISD::FP_TO_FP16:
+  case ISD::BF16_TO_FP:
+  case ISD::FP_TO_BF16:
   case ISD::FLDEXP:
   case AMDGPUISD::FMUL_LEGACY:
   case AMDGPUISD::FMAD_FTZ:
@@ -12591,6 +12595,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   case AMDGPUISD::CVT_F32_UBYTE1:
   case AMDGPUISD::CVT_F32_UBYTE2:
   case AMDGPUISD::CVT_F32_UBYTE3:
+  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::COS_HW:
     return true;
 
   // It can/will be lowered or combined as a bit operation.
@@ -12600,6 +12607,20 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
   case ISD::FCOPYSIGN:
     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
 
+  case ISD::AND:
+    if (Op.getValueType() == MVT::i32) {
+      // Be careful as we only know it is a bitcast floating point type. It
+      // could be f32, v2f16, we have no way of knowing. Luckily the constant
+      // value that we optimize for, which comes up in fp32 to bf16 conversions,
+      // is valid to optimize for all types.
+      if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+        if (RHS->getZExtValue() == 0xffff0000) {
+          return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
+        }
+      }
+    }
+    break;
+
   case ISD::FSIN:
   case ISD::FCOS:
   case ISD::FSINCOS:
@@ -12665,6 +12686,9 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     return false;
 
   case ISD::BITCAST:
+    // TODO: This is incorrect as it loses track of the operand's type. We may
+    // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
+    // same bits that are canonicalized in one type need not be in the other.
     return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
   case ISD::TRUNCATE: {
     // Hack round the mess we make when legalizing extract_vector_elt
@@ -12694,25 +12718,26 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     case Intrinsic::amdgcn_trig_preop:
     case Intrinsic::amdgcn_log:
     case Intrinsic::amdgcn_exp2:
+    case Intrinsic::amdgcn_sqrt:
       return true;
     default:
       break;
     }
 
-    [[fallthrough]];
+    break;
   }
   default:
-    // FIXME: denormalsEnabledForType is broken for dynamic
-    return denormalsEnabledForType(DAG, Op.getValueType()) &&
-           DAG.isKnownNeverSNaN(Op);
+    break;
   }
 
-  llvm_unreachable("invalid operation");
+  // FIXME: denormalsEnabledForType is broken for dynamic
+  return denormalsEnabledForType(DAG, Op.getValueType()) &&
+         DAG.isKnownNeverSNaN(Op);
 }
 
-bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
+bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
                                        unsigned MaxDepth) const {
-  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
   MachineInstr *MI = MRI.getVRegDef(Reg);
   unsigned Opcode = MI->getOpcode();
 
@@ -12931,27 +12956,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
     }
   }
 
-  unsigned SrcOpc = N0.getOpcode();
-
-  // If it's free to do so, push canonicalizes further up the source, which may
-  // find a canonical source.
-  //
-  // TODO: More opcodes. Note this is unsafe for the _ieee minnum/maxnum for
-  // sNaNs.
-  if (SrcOpc == ISD::FMINNUM || SrcOpc == ISD::FMAXNUM) {
-    auto *CRHS = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
-    if (CRHS && N0.hasOneUse()) {
-      SDLoc SL(N);
-      SDValue Canon0 = DAG.getNode(ISD::FCANONICALIZE, SL, VT,
-                                   N0.getOperand(0));
-      SDValue Canon1 = getCanonicalConstantFP(DAG, SL, VT, CRHS->getValueAPF());
-      DCI.AddToWorklist(Canon0.getNode());
-
-      return DAG.getNode(N0.getOpcode(), SL, VT, Canon0, Canon1);
-    }
-  }
-
-  return isCanonicalized(DAG, N0) ? N0 : SDValue();
+  return SDValue();
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -15939,8 +15944,8 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
   }
 }
 
-bool SITargetLowering::denormalsEnabledForType(LLT Ty,
-                                               MachineFunction &MF) const {
+bool SITargetLowering::denormalsEnabledForType(
+    LLT Ty, const MachineFunction &MF) const {
   switch (Ty.getScalarSizeInBits()) {
   case 32:
     return !denormalModeIsFlushAllF32(MF);
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index a20442e3737ee1..89da4428e3ab0a 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -523,10 +523,10 @@ class SITargetLowering final : public AMDGPUTargetLowering {
 
   bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
                        unsigned MaxDepth = 5) const;
-  bool isCanonicalized(Register Reg, MachineFunction &MF,
+  bool isCanonicalized(Register Reg, const MachineFunction &MF,
                        unsigned MaxDepth = 5) const;
   bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
-  bool denormalsEnabledForType(LLT Ty, MachineFunction &MF) const;
+  bool denormalsEnabledForType(LLT Ty, const MachineFunction &MF) const;
 
   bool checkForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
                                  const TargetRegisterInfo *TRI,
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 33c93cdf20c43b..3ab788406ecb28 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2944,6 +2944,34 @@ def : GCNPat<
    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1))), sub0,
    (V_BFREV_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0))), sub1)>;
 
+// If fcanonicalize's operand is implicitly canonicalized, we only need a copy.
+let AddedComplexity = 1000 in {
+def : GCNPat<
+  (is_canonicalized_1<fcanonicalize> f16:$src),
+  (COPY f16:$src)
+>;
+
+def : GCNPat<
+  (is_canonicalized_1<fcanonicalize> v2f16:$src),
+  (COPY v2f16:$src)
+>;
+
+def : GCNPat<
+  (is_canonicalized_1<fcanonicalize> f32:$src),
+  (COPY f32:$src)
+>;
+
+def : GCNPat<
+  (is_canonicalized_1<fcanonicalize> v2f32:$src),
+  (COPY v2f32:$src)
+>;
+
+def : GCNPat<
+  (is_canonicalized_1<fcanonicalize> f64:$src),
+  (COPY f64:$src)
+>;
+}
+
 // Prefer selecting to max when legal, but using mul is always valid.
 let AddedComplexity = -5 in {
 
@@ -3277,8 +3305,8 @@ def : GCNPat <
 
 let AddedComplexity = 5 in {
 def : GCNPat <
-  (v2f16 (is_canonicalized<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
-                                         (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
+  (v2f16 (is_canonicalized_2<build_vector> (f16 (VOP3Mods (f16 VGPR_32:$src0), i32:$src0_mods)),
+                                           (f16 (VOP3Mods (f16 VGPR_32:$src1), i32:$src1_mods)))),
   (V_PACK_B32_F16_e64 $src0_mods, VGPR_32:$src0, $src1_mods, VGPR_32:$src1)
 >;
 }
@@ -3590,6 +3618,17 @@ FPMinMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
               DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
+class
+FPMinCanonMaxPat<Instruction minmaxInst, ValueType vt, SDPatternOperator min_or_max,
+                 SDPatternOperator max_or_min_oneuse> : GCNPat <
+  (min_or_max (is_canonicalized_1<fcanonicalize>
+                  (max_or_min_oneuse (VOP3Mods vt:$src0, i32:$src0_mods),
+                                     (VOP3Mods vt:$src1, i32:$src1_mods))),
+               (vt (VOP3Mods vt:$src2, i32:$src2_mods))),
+  (minmaxInst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+              DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
 let OtherPredicates = [isGFX11Plus] in {
 def : IntMinMaxPat<V_MAXMIN_I32_e64, smin, smax_oneuse>;
 def : IntMinMaxPat<V_MINMAX_I32_e64, smax, smin_oneuse>;
@@ -3599,6 +3638,10 @@ def : FPMinMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
 def : FPMinMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
 def : FPMinMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F32_e64, f32, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F32_e64, f32, fminnum_like, fmaxnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MINMAX_F16_e64, f16, fmaxnum_like, fminnum_like_oneuse>;
+def : FPMinCanonMaxPat<V_MAXMIN_F16_e64, f16, fminnum_like, fmaxnum_like_oneuse>;
 }
 
 let OtherPredicates = [isGFX9Plus] in {
@@ -3612,6 +3655,10 @@ def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fmi
 def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
 def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
 def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinCanonMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinCanonMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
 }
 
 // Convert a floating-point power of 2 to the integer exponent.
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ebb77c13c4af7b..98658834e89784 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, -1.0, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, -1.0, v0
+; GFX7-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_min_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GCN-NEXT:    v_min_f32_e32 v1, v1, v4
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v3
@@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_min_f32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_min_f32_e32 v0, v0, v3
@@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v6
 ; GCN-NEXT:    v_min_f32_e32 v1, v1, v5
@@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_min_f32_e32 v2, v2, v6
 ; GFX7-NEXT:    v_min_f32_e32 v1, v1, v5
@@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v7, v7, v15
 ; GCN-NEXT:    v_min_f32_e32 v6, v6, v14
 ; GCN-NEXT:    v_min_f32_e32 v5, v5, v13
@@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_min_f32_e32 v7, v7, v15
 ; GFX7-NEXT:    v_min_f32_e32 v6, v6, v14
 ; GFX7-NEXT:    v_min_f32_e32 v5, v5, v13
@@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    v_min_f32_e32 v14, v14, v30
 ; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    v_min_f32_e32 v13, v13, v29
 ; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    v_min_f32_e32 v12, v12, v28
 ; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_min_f32_e32 v11, v11, v27
 ; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_min_f32_e32 v10, v10, v26
 ; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    v_min_f32_e32 v9, v9, v25
 ; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_min_f32_e32 v8, v8, v24
 ; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    v_min_f32_e32 v7, v7, v23
 ; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_min_f32_e32 v6, v6, v22
 ; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_min_f32_e32 v5, v5, v21
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
@@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_min_f32_e32 v3, v3, v19
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v18
 ; GCN-NEXT:    v_min_f32_e32 v1, v1, v17
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v16
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v20
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v20
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    v_min_f32_e32 v15, v15, v16
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-LABEL: v_minnum_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_min_f32_e32 v9, v9, v25
-; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
@@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
@@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_min_f32_e32 v14, v14, v30
 ; GFX7-NEXT:    v_min_f32_e32 v13, v13, v29
 ; GFX7-NEXT:    v_min_f32_e32 v12, v12, v28
 ; GFX7-NEXT:    v_min_f32_e32 v11, v11, v27
 ; GFX7-NEXT:    v_min_f32_e32 v10, v10, v26
-; GFX7-NEXT:    v_min_f32_e32 v15, v15, v25
+; GFX7-NEXT:    v_min_f32_e32 v9, v9, v25
 ; GFX7-NEXT:    v_min_f32_e32 v8, v8, v24
 ; GFX7-NEXT:    v_min_f32_e32 v7, v7, v23
-; GFX7-NEXT:    v_min_f32_e32 v6, v6, v22
 ; GFX7-NEXT:    v_min_f32_e32 v5, v5, v21
 ; GFX7-NEXT:    v_min_f32_e32 v4, v4, v20
 ; GFX7-NEXT:    v_min_f32_e32 v3, v3, v19
@@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_min_f32_e32 v15, v15, v22
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
@@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
 ; GCN-NEXT:    v_min_f32_e32 v31, v31, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    v_min_f32_e32 v30, v30, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    v_min_f32_e32 v29, v29, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
 ; GCN-NEXT:    v_min_f32_e32 v28, v28, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
 ; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    v_min_f32_e32 v27, v27, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
 ; GCN-NEXT:    v_min_f32_e32 v26, v26, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
 ; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
 ; GCN-NEXT:    v_min_f32_e32 v25, v25, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
 ; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
 ; GCN-NEXT:    v_min_f32_e32 v24, v24, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
 ; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
 ; GCN-NEXT:    v_min_f32_e32 v23, v23, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
 ; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
 ; GCN-NEXT:    v_min_f32_e32 v22, v22, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
 ; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    v_min_f32_e32 v21, v21, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
 ; GCN-NEXT:    v_min_f32_e32 v20, v20, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
 ; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
 ; GCN-NEXT:    v_min_f32_e32 v19, v19, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
 ; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
 ; GCN-NEXT:    v_min_f32_e32 v18, v18, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
 ; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
 ; GCN-NEXT:    v_min_f32_e32 v17, v17, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
 ; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    v_min_f32_e32 v16, v16, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    v_min_f32_e32 v15, v15, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    v_min_f32_e32 v14, v14, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    v_min_f32_e32 v13, v13, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    v_min_f32_e32 v12, v12, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
 ; GCN-NEXT:    v_min_f32_e32 v11, v11, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
 ; GCN-NEXT:    v_min_f32_e32 v10, v10, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
 ; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
 ; GCN-NEXT:    v_min_f32_e32 v9, v9, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
 ; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; GCN-NEXT:    v_min_f32_e32 v8, v8, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
 ; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
 ; GCN-NEXT:    v_min_f32_e32 v7, v7, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    v_min_f32_e32 v6, v6, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    v_min_f32_e32 v5, v5, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
 ; GCN-NEXT:    v_min_f32_e32 v4, v4, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; GCN-NEXT:    v_min_f32_e32 v3, v3, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
 ; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    v_min_f32_e32 v2, v2, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    v_min_f32_e32 v1, v1, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GCN-NEXT:    v_min_f32_e32 v0, v0, v32
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_min_f32_e32 v31, v31, v32
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GFX7-NEXT:    v_min_f32_e32 v30, v30, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
 ; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v29, v29, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v28, v28, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
 ; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v27, v27, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v26, v26, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v25, v25, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v24, v24, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v23, v23, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v22, v22, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
 ; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v21, v21, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v20, v20, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v19, v19, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v18, v18, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
 ; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v17, v17, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v16, v16, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v15, v15, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v14, v14, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v13, v13, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v12, v12, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v11, v11, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v10, v10, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v9, v9, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v8, v8, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v7, v7, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v6, v6, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v5, v5, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v4, v4, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v3, v3, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v2, v2, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v1, v1, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_min_f32_e32 v0, v0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX7-NEXT:    v_max_f32_e32 v0, v0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
@@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GCN-NEXT:    v_max_f32_e32 v1, v1, v4
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v3
@@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX7-NEXT:    v_max_f32_e32 v1, v1, v4
 ; GFX7-NEXT:    v_max_f32_e32 v0, v0, v3
@@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v6
 ; GCN-NEXT:    v_max_f32_e32 v1, v1, v5
@@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX7-NEXT:    v_max_f32_e32 v2, v2, v6
 ; GFX7-NEXT:    v_max_f32_e32 v1, v1, v5
@@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GCN-NEXT:    v_max_f32_e32 v6, v6, v14
 ; GCN-NEXT:    v_max_f32_e32 v5, v5, v13
@@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_max_f32_e32 v7, v7, v15
 ; GFX7-NEXT:    v_max_f32_e32 v6, v6, v14
 ; GFX7-NEXT:    v_max_f32_e32 v5, v5, v13
@@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    v_max_f32_e32 v14, v14, v30
 ; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    v_max_f32_e32 v13, v13, v29
 ; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    v_max_f32_e32 v12, v12, v28
 ; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_max_f32_e32 v11, v11, v27
 ; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_max_f32_e32 v10, v10, v26
 ; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    v_max_f32_e32 v9, v9, v25
 ; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_max_f32_e32 v8, v8, v24
 ; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    v_max_f32_e32 v7, v7, v23
 ; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_max_f32_e32 v6, v6, v22
 ; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_max_f32_e32 v5, v5, v21
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
@@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GCN-NEXT:    buffer_load_dword v20, off, s[0:3], s32
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_max_f32_e32 v3, v3, v19
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v18
 ; GCN-NEXT:    v_max_f32_e32 v1, v1, v17
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v16
-; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v20
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
@@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v20
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    v_max_f32_e32 v15, v15, v16
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
@@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-LABEL: v_maxnum_v16bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT:    v_max_f32_e32 v9, v9, v25
-; GFX7-NEXT:    buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT:    buffer_load_dword v22, off, s[0:3], s32
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
@@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
@@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
@@ -22392,52 +22022,18 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    v_max_f32_e32 v14, v14, v30
 ; GFX7-NEXT:    v_max_f32_e32 v13, v13, v29
 ; GFX7-NEXT:    v_max_f32_e32 v12, v12, v28
 ; GFX7-NEXT:    v_max_f32_e32 v11, v11, v27
 ; GFX7-NEXT:    v_max_f32_e32 v10, v10, v26
-; GFX7-NEXT:    v_max_f32_e32 v15, v15, v25
+; GFX7-NEXT:    v_max_f32_e32 v9, v9, v25
 ; GFX7-NEXT:    v_max_f32_e32 v8, v8, v24
 ; GFX7-NEXT:    v_max_f32_e32 v7, v7, v23
-; GFX7-NEXT:    v_max_f32_e32 v6, v6, v22
 ; GFX7-NEXT:    v_max_f32_e32 v5, v5, v21
 ; GFX7-NEXT:    v_max_f32_e32 v4, v4, v20
 ; GFX7-NEXT:    v_max_f32_e32 v3, v3, v19
@@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT:    v_max_f32_e32 v15, v15, v22
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
@@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GCN-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT:    v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:124
 ; GCN-NEXT:    v_max_f32_e32 v31, v31, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GCN-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    v_max_f32_e32 v30, v30, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
 ; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GCN-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    v_max_f32_e32 v29, v29, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GCN-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:112
 ; GCN-NEXT:    v_max_f32_e32 v28, v28, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
 ; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GCN-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    v_max_f32_e32 v27, v27, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
 ; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GCN-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:104
 ; GCN-NEXT:    v_max_f32_e32 v26, v26, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
 ; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GCN-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:100
 ; GCN-NEXT:    v_max_f32_e32 v25, v25, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
 ; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GCN-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:96
 ; GCN-NEXT:    v_max_f32_e32 v24, v24, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
 ; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GCN-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:92
 ; GCN-NEXT:    v_max_f32_e32 v23, v23, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
 ; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GCN-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:88
 ; GCN-NEXT:    v_max_f32_e32 v22, v22, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
 ; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GCN-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    v_max_f32_e32 v21, v21, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GCN-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:80
 ; GCN-NEXT:    v_max_f32_e32 v20, v20, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
 ; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GCN-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:76
 ; GCN-NEXT:    v_max_f32_e32 v19, v19, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
 ; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GCN-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:72
 ; GCN-NEXT:    v_max_f32_e32 v18, v18, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
 ; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GCN-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:68
 ; GCN-NEXT:    v_max_f32_e32 v17, v17, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
 ; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    v_max_f32_e32 v16, v16, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    v_max_f32_e32 v15, v15, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    v_max_f32_e32 v14, v14, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
 ; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    v_max_f32_e32 v13, v13, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
 ; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    v_max_f32_e32 v12, v12, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
 ; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:44
 ; GCN-NEXT:    v_max_f32_e32 v11, v11, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:40
 ; GCN-NEXT:    v_max_f32_e32 v10, v10, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
 ; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:36
 ; GCN-NEXT:    v_max_f32_e32 v9, v9, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
 ; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:32
 ; GCN-NEXT:    v_max_f32_e32 v8, v8, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
 ; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:28
 ; GCN-NEXT:    v_max_f32_e32 v7, v7, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    v_max_f32_e32 v6, v6, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    v_max_f32_e32 v5, v5, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:16
 ; GCN-NEXT:    v_max_f32_e32 v4, v4, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GCN-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:12
 ; GCN-NEXT:    v_max_f32_e32 v3, v3, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
 ; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    v_max_f32_e32 v2, v2, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    v_max_f32_e32 v1, v1, v32
-; GCN-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v33
 ; GCN-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GCN-NEXT:    v_max_f32_e32 v0, v0, v32
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
@@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:128
 ; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT:    v_mul_f32_e32 v30, 1.0, v30
 ; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT:    v_mul_f32_e32 v29, 1.0, v29
 ; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT:    v_mul_f32_e32 v28, 1.0, v28
 ; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT:    v_mul_f32_e32 v27, 1.0, v27
 ; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT:    v_mul_f32_e32 v26, 1.0, v26
 ; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT:    v_mul_f32_e32 v25, 1.0, v25
 ; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT:    v_mul_f32_e32 v24, 1.0, v24
 ; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT:    v_mul_f32_e32 v23, 1.0, v23
 ; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT:    v_mul_f32_e32 v22, 1.0, v22
 ; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT:    v_mul_f32_e32 v21, 1.0, v21
 ; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT:    v_mul_f32_e32 v20, 1.0, v20
 ; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT:    v_mul_f32_e32 v19, 1.0, v19
 ; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT:    v_mul_f32_e32 v18, 1.0, v18
 ; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT:    v_mul_f32_e32 v17, 1.0, v17
 ; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT:    v_mul_f32_e32 v16, 1.0, v16
 ; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT:    v_mul_f32_e32 v15, 1.0, v15
 ; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT:    v_mul_f32_e32 v14, 1.0, v14
 ; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT:    v_mul_f32_e32 v13, 1.0, v13
 ; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT:    v_mul_f32_e32 v12, 1.0, v12
 ; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT:    v_mul_f32_e32 v11, 1.0, v11
 ; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT:    v_mul_f32_e32 v10, 1.0, v10
 ; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT:    v_mul_f32_e32 v9, 1.0, v9
 ; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT:    v_mul_f32_e32 v8, 1.0, v8
 ; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT:    v_mul_f32_e32 v7, 1.0, v7
 ; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v6
 ; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT:    s_waitcnt vmcnt(1)
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    s_waitcnt vmcnt(0)
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT:    v_max_f32_e32 v31, v31, v32
-; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
 ; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
 ; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(1)
+; GFX7-NEXT:    v_mul_f32_e32 v31, 1.0, v31
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT:    v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
 ; GFX7-NEXT:    v_max_f32_e32 v30, v30, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:120
 ; GFX7-NEXT:    v_and_b32_e32 v30, 0xffff0000, v30
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v29, v29, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:116
 ; GFX7-NEXT:    v_and_b32_e32 v29, 0xffff0000, v29
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v28, v28, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:112
 ; GFX7-NEXT:    v_and_b32_e32 v28, 0xffff0000, v28
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v27, v27, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
 ; GFX7-NEXT:    v_and_b32_e32 v27, 0xffff0000, v27
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v26, v26, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:104
 ; GFX7-NEXT:    v_and_b32_e32 v26, 0xffff0000, v26
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v25, v25, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:100
 ; GFX7-NEXT:    v_and_b32_e32 v25, 0xffff0000, v25
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v24, v24, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:96
 ; GFX7-NEXT:    v_and_b32_e32 v24, 0xffff0000, v24
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v23, v23, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:92
 ; GFX7-NEXT:    v_and_b32_e32 v23, 0xffff0000, v23
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v22, v22, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:88
 ; GFX7-NEXT:    v_and_b32_e32 v22, 0xffff0000, v22
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v21, v21, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:84
 ; GFX7-NEXT:    v_and_b32_e32 v21, 0xffff0000, v21
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v20, v20, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:80
 ; GFX7-NEXT:    v_and_b32_e32 v20, 0xffff0000, v20
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v19, v19, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:76
 ; GFX7-NEXT:    v_and_b32_e32 v19, 0xffff0000, v19
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v18, v18, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:72
 ; GFX7-NEXT:    v_and_b32_e32 v18, 0xffff0000, v18
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v17, v17, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:68
 ; GFX7-NEXT:    v_and_b32_e32 v17, 0xffff0000, v17
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v16, v16, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:64
 ; GFX7-NEXT:    v_and_b32_e32 v16, 0xffff0000, v16
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v15, v15, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:60
 ; GFX7-NEXT:    v_and_b32_e32 v15, 0xffff0000, v15
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v14, v14, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:56
 ; GFX7-NEXT:    v_and_b32_e32 v14, 0xffff0000, v14
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v13, v13, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:52
 ; GFX7-NEXT:    v_and_b32_e32 v13, 0xffff0000, v13
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v12, v12, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:48
 ; GFX7-NEXT:    v_and_b32_e32 v12, 0xffff0000, v12
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v11, v11, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:44
 ; GFX7-NEXT:    v_and_b32_e32 v11, 0xffff0000, v11
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v10, v10, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:40
 ; GFX7-NEXT:    v_and_b32_e32 v10, 0xffff0000, v10
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v9, v9, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:36
 ; GFX7-NEXT:    v_and_b32_e32 v9, 0xffff0000, v9
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v8, v8, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:32
 ; GFX7-NEXT:    v_and_b32_e32 v8, 0xffff0000, v8
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v7, v7, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:28
 ; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v7
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v6, v6, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:24
 ; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v6
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v5, v5, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:20
 ; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v5
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v4, v4, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:16
 ; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v4
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v3, v3, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:12
 ; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v2, v2, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:8
 ; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v1, v1, v32
 ; GFX7-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:4
 ; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT:    v_mul_f32_e32 v32, 1.0, v32
 ; GFX7-NEXT:    v_max_f32_e32 v0, v0, v32
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
 ; GCN-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
 ; GCN-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
 ; GCN-LABEL: v_canonicalize_bf16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: v_canonicalize_bf16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_canonicalize_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index dfadd8d205b04e..947284506a2970 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT:    v_med3_f32 v3, v3, 0, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -3095,16 +3093,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e64 v2, v2 clamp
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, 2.0, v3
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
+; GFX6-NEXT:    v_max_f32_e32 v2, 2.0, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_med3_f32 v2, v2, 0, 0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
@@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_med3_f32 v3, v3, s2, 1.0
+; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT:    v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT:    v_med3_f32 v3, v3, 0, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT:    v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 4ed1b8a520b8b5..e1981972f58d1b 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -471,25 +471,15 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
   ret void
 }
 
-; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GFX9: {{flat|global}}_store_dword
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
-  %load = load float, ptr addrspace(1) %gep, align 4
-  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
-  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, ptr addrspace(1) %gep, align 4
-  ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
+;   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+;   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+;   %load = load float, ptr addrspace(1) %gep, align 4
+;   %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+;   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+;   store float %canonicalized, ptr addrspace(1) %gep, align 4
+;   ret void
+; }
 
 ; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
 ; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
@@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1
   ret void
 }
 
-; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-
-; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
-
-; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-
-; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
-; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
-
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
-
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN:   {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
-  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
-  %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
-  %load = load float, ptr addrspace(1) %gep, align 4
-  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
-  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
-  store float %canonicalized, ptr addrspace(1) %gep, align 4
-  ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
+;   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+;   %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+;   %load = load float, ptr addrspace(1) %gep, align 4
+;   %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+;   %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+;   store float %canonicalized, ptr addrspace(1) %gep, align 4
+;   ret void
+; }
 
 ; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
 ; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
@@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp
 }
 
 ; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
-; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
+; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]],
+; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]]
 define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
@@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
   ret half %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_mul_f16_e32
-; GFX9: v_pk_mul_f16
-; GFX9-NOT: v_max
-; GFX9-NOT: v_pk_max
-define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
-  %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
-  %ins.op = fmul half %val, 8.0
-  %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
-  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
-  ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
+;   %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
+;   %ins.op = fmul half %val, 8.0
+;   %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
+;   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
+;   ret <2 x half> %canonicalized
+; }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
 ; GFX9: v_mul_f16
@@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x
   ret <2 x half> %canonicalized
 }
 
-; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
-; GCN: s_waitcnt
-; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
-  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
-  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
-  ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
+;   %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+;   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
+;   ret <2 x half> %canonicalized
+; }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
 ; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 274621307f540d..581b7b4cff9ed0 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
 ; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, s2
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
   ret void
 }
 
+define half @s_test_canonicalize_arg(half %x) #1 {
+; VI-LABEL: s_test_canonicalize_arg:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_max_f16_e32 v0, v0, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_test_canonicalize_arg:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; CI-LABEL: s_test_canonicalize_arg:
+; CI:       ; %bb.0:
+; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_test_canonicalize_arg:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+  %canonicalized = call half @llvm.canonicalize.f16(half %x)
+  ret half %canonicalized
+}
+
 define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
 ; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
 ; VI:       ; %bb.0:
@@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
 ; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
 ; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
 ; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
 ; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
 ; CI-NEXT:    buffer_load_ushort v0, off, s[0:3], 0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
@@ -1246,9 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; CI-NEXT:    v_cvt_f32_f16_e64 v1, |v1|
 ; CI-NEXT:    v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; CI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, s2
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
 ; CI-NEXT:    s_mov_b32 s2, -1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; CI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
@@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v0
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; CI-NEXT:    v_mov_b32_e32 v0, 0x7fc00000
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
 ; CI-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
 ; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
@@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
 ; CI-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
@@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT:    v_cvt_f32_f16_e32 v3, v2
-; CI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT:    v_mul_f32_e32 v2, 1.0, v1
-; CI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; CI-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v6f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v6f16:
@@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v8f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v8f16:
@@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v12f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v12f16:
@@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
 ; CI-LABEL: v_test_canonicalize_var_v16f16:
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
 ; CI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_test_canonicalize_var_v16f16:
@@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
 ; CI:       ; %bb.0:
 ; CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT:    v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT:    v_cvt_f32_f16_e32 v30, v30
+; CI-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT:    v_cvt_f32_f16_e32 v29, v29
-; CI-NEXT:    v_cvt_f32_f16_e32 v30, v30
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v31, v31
 ; CI-NEXT:    v_cvt_f32_f16_e32 v31, v31
@@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    buffer_load_dword v31, off, s[0:3], s32 offset:104
+; CI-NEXT:    buffer_load_dword v32, off, s[0:3], s32 offset:108
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_or_b32_e32 v1, v1, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v2, v4
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v5, v7
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; CI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; CI-NEXT:    v_or_b32_e32 v2, v3, v2
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v6
 ; CI-NEXT:    v_cvt_f16_f32_e32 v6, v10
 ; CI-NEXT:    v_cvt_f16_f32_e32 v9, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v18
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v16
+; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v13
 ; CI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; CI-NEXT:    v_or_b32_e32 v3, v4, v3
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v8
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v14
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v21
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v26
+; CI-NEXT:    buffer_load_dword v14, off, s[0:3], s32
+; CI-NEXT:    buffer_load_dword v33, off, s[0:3], s32 offset:4
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
 ; CI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
 ; CI-NEXT:    v_or_b32_e32 v4, v5, v4
 ; CI-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
 ; CI-NEXT:    v_cvt_f16_f32_e32 v6, v12
 ; CI-NEXT:    v_or_b32_e32 v5, v7, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v7, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v15
+; CI-NEXT:    v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT:    v_cvt_f16_f32_e32 v7, v7
 ; CI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v22
 ; CI-NEXT:    v_or_b32_e32 v6, v7, v6
 ; CI-NEXT:    v_lshlrev_b32_e32 v7, 16, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v8, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v19
 ; CI-NEXT:    v_or_b32_e32 v7, v9, v7
-; CI-NEXT:    v_cvt_f16_f32_e32 v9, v15
-; CI-NEXT:    v_cvt_f16_f32_e32 v15, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v9, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v8
+; CI-NEXT:    v_cvt_f32_f16_e32 v8, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v10, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v18
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:124
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:112
+; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:116
+; CI-NEXT:    v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; CI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v29
-; CI-NEXT:    v_or_b32_e32 v8, v9, v8
+; CI-NEXT:    v_or_b32_e32 v8, v10, v8
+; CI-NEXT:    v_cvt_f16_f32_e32 v10, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v9
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v12
 ; CI-NEXT:    v_lshlrev_b32_e32 v9, 16, v10
-; CI-NEXT:    v_cvt_f16_f32_e32 v10, v20
 ; CI-NEXT:    v_or_b32_e32 v9, v11, v9
-; CI-NEXT:    v_cvt_f16_f32_e32 v11, v19
-; CI-NEXT:    buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32
-; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:8
-; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; CI-NEXT:    v_or_b32_e32 v10, v11, v10
-; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v12
-; CI-NEXT:    v_cvt_f16_f32_e32 v12, v24
+; CI-NEXT:    v_lshlrev_b32_e32 v10, 16, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v22
+; CI-NEXT:    v_or_b32_e32 v10, v12, v10
+; CI-NEXT:    v_cvt_f16_f32_e32 v12, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v16, v11
+; CI-NEXT:    v_cvt_f32_f16_e32 v11, v13
+; CI-NEXT:    v_cvt_f32_f16_e32 v13, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v30
+; CI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; CI-NEXT:    v_or_b32_e32 v11, v13, v11
-; CI-NEXT:    v_cvt_f16_f32_e32 v13, v23
-; CI-NEXT:    buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:16
-; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:28
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:24
-; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v30
-; CI-NEXT:    v_or_b32_e32 v12, v13, v12
-; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v14
-; CI-NEXT:    v_or_b32_e32 v13, v15, v13
-; CI-NEXT:    v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v13, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v12
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v29
+; CI-NEXT:    v_lshlrev_b32_e32 v12, 16, v13
+; CI-NEXT:    v_or_b32_e32 v12, v15, v12
+; CI-NEXT:    s_waitcnt vmcnt(6)
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v31
+; CI-NEXT:    v_lshlrev_b32_e32 v13, 16, v17
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:128
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:132
+; CI-NEXT:    buffer_load_dword v34, off, s[0:3], s32 offset:120
+; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v15
 ; CI-NEXT:    v_cvt_f16_f32_e32 v15, v27
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:36
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:32
-; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:44
-; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:40
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    s_waitcnt vmcnt(7)
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    s_waitcnt vmcnt(6)
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v33
+; CI-NEXT:    v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT:    v_or_b32_e32 v13, v16, v13
+; CI-NEXT:    v_cvt_f16_f32_e32 v16, v32
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:12
 ; CI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; CI-NEXT:    v_or_b32_e32 v14, v15, v14
-; CI-NEXT:    v_lshlrev_b32_e32 v15, 16, v24
+; CI-NEXT:    v_lshlrev_b32_e32 v15, 16, v22
 ; CI-NEXT:    v_or_b32_e32 v15, v25, v15
-; CI-NEXT:    s_waitcnt vmcnt(11)
-; CI-NEXT:    v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT:    s_waitcnt vmcnt(10)
-; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v21
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:96
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:100
+; CI-NEXT:    v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:64
+; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v16
+; CI-NEXT:    v_or_b32_e32 v16, v24, v25
+; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v27
+; CI-NEXT:    v_or_b32_e32 v25, v28, v24
 ; CI-NEXT:    s_waitcnt vmcnt(9)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; CI-NEXT:    s_waitcnt vmcnt(8)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT:    v_lshlrev_b32_e32 v16, 16, v16
-; CI-NEXT:    v_or_b32_e32 v16, v17, v16
-; CI-NEXT:    v_lshlrev_b32_e32 v17, 16, v18
-; CI-NEXT:    v_or_b32_e32 v17, v19, v17
 ; CI-NEXT:    s_waitcnt vmcnt(7)
-; CI-NEXT:    v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
+; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT:    v_or_b32_e32 v20, v19, v20
+; CI-NEXT:    buffer_load_dword v19, off, s[0:3], s32 offset:20
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:8
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    s_waitcnt vmcnt(7)
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
 ; CI-NEXT:    s_waitcnt vmcnt(6)
-; CI-NEXT:    v_cvt_f16_f32_e32 v19, v21
-; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v34
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT:    v_or_b32_e32 v17, v17, v26
+; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x7c, v0
+; CI-NEXT:    v_or_b32_e32 v18, v27, v18
+; CI-NEXT:    buffer_store_dword v17, v26, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x78, v0
+; CI-NEXT:    buffer_store_dword v18, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x74, v0
+; CI-NEXT:    buffer_store_dword v20, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x70, v0
+; CI-NEXT:    buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    s_waitcnt vmcnt(7)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v20, v22
-; CI-NEXT:    s_waitcnt vmcnt(4)
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v23
-; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT:    v_or_b32_e32 v18, v19, v18
-; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v20
-; CI-NEXT:    v_or_b32_e32 v19, v21, v19
-; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v20, v26
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v21, v27
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:88
+; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:92
+; CI-NEXT:    buffer_load_dword v17, off, s[0:3], s32 offset:80
+; CI-NEXT:    buffer_load_dword v18, off, s[0:3], s32 offset:84
+; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:72
+; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:76
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT:    s_waitcnt vmcnt(12)
+; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT:    v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
 ; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v20
 ; CI-NEXT:    v_or_b32_e32 v20, v21, v20
-; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v26
-; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:52
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:48
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:60
-; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:56
-; CI-NEXT:    v_or_b32_e32 v21, v27, v21
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:132
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:128
-; CI-NEXT:    s_waitcnt vmcnt(5)
-; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT:    s_waitcnt vmcnt(4)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT:    s_waitcnt vmcnt(3)
+; CI-NEXT:    v_add_i32_e32 v21, vcc, 0x6c, v0
+; CI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
+; CI-NEXT:    v_lshlrev_b32_e32 v20, 16, v22
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:24
+; CI-NEXT:    v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT:    s_waitcnt vmcnt(13)
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT:    s_waitcnt vmcnt(12)
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v24
+; CI-NEXT:    buffer_load_dword v21, off, s[0:3], s32 offset:28
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:16
+; CI-NEXT:    v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v19, v19
 ; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT:    s_waitcnt vmcnt(1)
+; CI-NEXT:    v_lshlrev_b32_e32 v19, 16, v19
+; CI-NEXT:    v_or_b32_e32 v20, v23, v20
+; CI-NEXT:    s_waitcnt vmcnt(9)
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    s_waitcnt vmcnt(8)
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v28
+; CI-NEXT:    s_waitcnt vmcnt(7)
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    s_waitcnt vmcnt(6)
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT:    s_waitcnt vmcnt(4)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_lshlrev_b32_e32 v24, 16, v24
-; CI-NEXT:    v_or_b32_e32 v24, v25, v24
-; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT:    v_or_b32_e32 v26, v27, v26
-; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x7c, v0
-; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:124
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:120
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT:    v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT:    v_cvt_f32_f16_e32 v17, v17
 ; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT:    v_or_b32_e32 v22, v22, v23
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:88
-; CI-NEXT:    s_waitcnt vmcnt(2)
+; CI-NEXT:    v_or_b32_e32 v23, v27, v23
+; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x68, v0
+; CI-NEXT:    buffer_store_dword v23, v27, s[0:3], 0 offen
+; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:32
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:36
+; CI-NEXT:    v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT:    v_cvt_f16_f32_e32 v18, v18
 ; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_lshlrev_b32_e32 v18, 16, v18
 ; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT:    v_or_b32_e32 v26, v27, v26
-; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x78, v0
-; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:116
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:112
-; CI-NEXT:    s_waitcnt vmcnt(3)
+; CI-NEXT:    v_or_b32_e32 v17, v17, v18
+; CI-NEXT:    v_add_i32_e32 v18, vcc, 0x64, v0
+; CI-NEXT:    v_or_b32_e32 v25, v25, v26
+; CI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x60, v0
+; CI-NEXT:    buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x5c, v0
+; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT:    v_or_b32_e32 v19, v24, v19
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:44
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    v_lshlrev_b32_e32 v21, 16, v21
+; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    v_or_b32_e32 v21, v22, v21
+; CI-NEXT:    buffer_load_dword v22, off, s[0:3], s32 offset:40
+; CI-NEXT:    s_waitcnt vmcnt(5)
+; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    s_waitcnt vmcnt(4)
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
 ; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    s_waitcnt vmcnt(0)
+; CI-NEXT:    v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT:    v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v22
+; CI-NEXT:    v_or_b32_e32 v22, v23, v27
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:52
+; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v24
+; CI-NEXT:    v_or_b32_e32 v23, v28, v23
+; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:56
+; CI-NEXT:    buffer_load_dword v24, off, s[0:3], s32 offset:48
+; CI-NEXT:    s_waitcnt vmcnt(2)
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT:    v_or_b32_e32 v26, v27, v26
-; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x74, v0
-; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:108
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:104
 ; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v26
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v27
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:92
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
-; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x70, v0
-; CI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:100
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:96
-; CI-NEXT:    s_waitcnt vmcnt(3)
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT:    v_cvt_f32_f16_e32 v24, v24
 ; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v24, v24
 ; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT:    v_or_b32_e32 v23, v23, v27
-; CI-NEXT:    s_waitcnt vmcnt(1)
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT:    v_or_b32_e32 v24, v24, v27
+; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:60
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
-; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x6c, v0
-; CI-NEXT:    buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v25, off, s[0:3], s32 offset:68
-; CI-NEXT:    buffer_load_dword v26, off, s[0:3], s32 offset:64
-; CI-NEXT:    buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT:    buffer_load_dword v23, off, s[0:3], s32 offset:76
-; CI-NEXT:    buffer_load_dword v27, off, s[0:3], s32 offset:72
-; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:84
-; CI-NEXT:    buffer_load_dword v29, off, s[0:3], s32 offset:80
-; CI-NEXT:    s_waitcnt vmcnt(3)
-; CI-NEXT:    v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT:    v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT:    v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT:    v_or_b32_e32 v25, v26, v25
-; CI-NEXT:    s_waitcnt vmcnt(2)
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT:    v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT:    v_lshlrev_b32_e32 v27, 16, v27
+; CI-NEXT:    v_or_b32_e32 v27, v28, v27
+; CI-NEXT:    buffer_load_dword v28, off, s[0:3], s32 offset:68
 ; CI-NEXT:    s_waitcnt vmcnt(0)
-; CI-NEXT:    v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT:    v_or_b32_e32 v23, v26, v23
-; CI-NEXT:    v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT:    v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT:    v_or_b32_e32 v26, v27, v26
-; CI-NEXT:    v_add_i32_e32 v27, vcc, 0x64, v0
-; CI-NEXT:    buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v26, vcc, 0x60, v0
-; CI-NEXT:    buffer_store_dword v23, v26, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x5c, v0
-; CI-NEXT:    buffer_store_dword v25, v23, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v23, vcc, 0x58, v0
-; CI-NEXT:    buffer_store_dword v22, v23, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v22, vcc, 0x54, v0
-; CI-NEXT:    buffer_store_dword v24, v22, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v22, vcc, 0x50, v0
-; CI-NEXT:    buffer_store_dword v21, v22, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v21, vcc, 0x4c, v0
-; CI-NEXT:    buffer_store_dword v20, v21, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v20, vcc, 0x48, v0
-; CI-NEXT:    buffer_store_dword v19, v20, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v19, vcc, 0x44, v0
-; CI-NEXT:    buffer_store_dword v18, v19, s[0:3], 0 offen
-; CI-NEXT:    v_add_i32_e32 v18, vcc, 64, v0
-; CI-NEXT:    buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT:    v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT:    v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT:    v_lshlrev_b32_e32 v28, 16, v28
+; CI-NEXT:    v_or_b32_e32 v28, v29, v28
+; CI-NEXT:    buffer_store_dword v28, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x58, v0
+; CI-NEXT:    buffer_store_dword v27, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x54, v0
+; CI-NEXT:    buffer_store_dword v24, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x50, v0
+; CI-NEXT:    buffer_store_dword v23, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x4c, v0
+; CI-NEXT:    buffer_store_dword v22, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x48, v0
+; CI-NEXT:    buffer_store_dword v21, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 0x44, v0
+; CI-NEXT:    buffer_store_dword v19, v17, s[0:3], 0 offen
+; CI-NEXT:    v_add_i32_e32 v17, vcc, 64, v0
+; CI-NEXT:    buffer_store_dword v20, v17, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v17, vcc, 60, v0
 ; CI-NEXT:    buffer_store_dword v16, v17, s[0:3], 0 offen
 ; CI-NEXT:    v_add_i32_e32 v16, vcc, 56, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index c1093a1e89c886..d53c0411ad88c1 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX6-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v4, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX6-NEXT:    flat_store_dword v[0:1], v4
@@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX6-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX6-NEXT:    flat_load_dword v0, v[0:1]
 ; GFX6-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_or_b32_e32 v4, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, s2, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v3, vcc
 ; GFX6-NEXT:    flat_store_dword v[0:1], v4
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 78fb89c71e2e6a..b32630a97b3ad0 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, -4.0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
 ; SI-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, -4.0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0, v0
 ; SI-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e64 v1, -v1
 ; SI-NEXT:    v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_mul_f32_e32 v1, -4.0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
@@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
 ; SI-LABEL: v_fneg_canonicalize_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_fneg_canonicalize_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 17f67615c29f2e..b5440b9c38c9f2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0xbe230000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e64 v0, -v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0x3e230000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index ab7ab4de186142..d056a97dc54442 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16(
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16(
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s2, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT:    s_lshr_b32 s0, s0, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_max_f32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    s_lshr_b32 s3, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_max_f32_e32 v1, v2, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    s_lshr_b32 s3, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    s_lshr_b32 s3, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_max_f32_e32 v0, 4.0, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_max_f32_e32 v0, 0x40400000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16(
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT:    s_lshr_b32 s2, s2, 16
-; SI-NEXT:    s_lshr_b32 s3, s0, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT:    s_lshr_b32 s3, s2, 16
+; SI-NEXT:    s_lshr_b32 s8, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_max_f32_e32 v2, v3, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_max_f32_e32 v1, v1, v3
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_max_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT:    v_max_f32_e32 v1, v1, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_max_f32_e32 v2, v3, v4
+; SI-NEXT:    v_max_f32_e32 v0, v0, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16(
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
 ; SI-NEXT:    s_lshr_b32 s6, s7, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
 ; SI-NEXT:    s_lshr_b32 s6, s5, 16
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT:    s_lshr_b32 s4, s4, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s5
 ; SI-NEXT:    v_max_f32_e32 v3, v3, v5
-; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_max_f32_e32 v1, v1, v5
-; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_max_f32_e32 v2, v2, v5
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_max_f32_e32 v2, v2, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_max_f32_e32 v1, v1, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_max_f32_e32 v0, v0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s5, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
 ; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_max_f32_e32 v2, 4.0, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_max_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_max_f32_e32 v0, 0x41000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b7370ce0fde1ab..f934a2de9247f0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee(
 ; SI-NEXT:    s_mov_b32 s1, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
@@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a(
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b(
 ; SI-NEXT:    s_mov_b32 s5, s1
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
@@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_lshr_b32 s1, s2, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT:    s_lshr_b32 s0, s0, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT:    v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_min_f32_e32 v2, v3, v2
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT:    s_lshr_b32 s3, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT:    v_cvt_f32_f16_e32 v3, s0
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_min_f32_e32 v1, v2, v3
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    s_lshr_b32 s3, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
 ; SI-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    s_load_dword s2, s[2:3], 0x0
-; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT:    s_lshr_b32 s2, s2, 16
+; SI-NEXT:    s_lshr_b32 s3, s2, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-NEXT:    s_mov_b32 s2, -1
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_min_f32_e32 v0, 4.0, v0
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_min_f32_e32 v0, 0x40400000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT:    v_or_b32_e32 v0, v0, v1
+; SI-NEXT:    v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16(
 ; SI-NEXT:    s_mov_b32 s6, -1
 ; SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT:    v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT:    s_lshr_b32 s2, s2, 16
-; SI-NEXT:    s_lshr_b32 s3, s0, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT:    s_lshr_b32 s3, s2, 16
+; SI-NEXT:    s_lshr_b32 s8, s0, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s8
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT:    v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT:    v_min_f32_e32 v2, v3, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_min_f32_e32 v1, v1, v3
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT:    v_min_f32_e32 v0, v0, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT:    v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT:    v_min_f32_e32 v1, v1, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    v_min_f32_e32 v2, v3, v4
+; SI-NEXT:    v_min_f32_e32 v0, v0, v5
+; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT:    v_or_b32_e32 v1, v1, v2
+; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT:    v_or_b32_e32 v1, v2, v1
 ; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0 offset:4
 ; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16(
 ; SI-NEXT:    v_cvt_f32_f16_e32 v2, s6
 ; SI-NEXT:    s_lshr_b32 s6, s7, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
 ; SI-NEXT:    s_lshr_b32 s6, s5, 16
+; SI-NEXT:    s_lshr_b32 s4, s4, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT:    v_cvt_f32_f16_e32 v7, s4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT:    v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT:    s_lshr_b32 s4, s4, 16
-; SI-NEXT:    v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT:    v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT:    v_cvt_f32_f16_e32 v6, s5
 ; SI-NEXT:    v_min_f32_e32 v3, v3, v5
-; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_min_f32_e32 v1, v1, v5
-; SI-NEXT:    v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_min_f32_e32 v2, v2, v5
-; SI-NEXT:    v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT:    v_min_f32_e32 v2, v2, v7
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT:    v_min_f32_e32 v1, v1, v6
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_min_f32_e32 v0, v0, v4
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, s5
 ; SI-NEXT:    s_lshr_b32 s5, s5, 16
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
 ; SI-NEXT:    s_lshr_b32 s4, s4, 16
+; SI-NEXT:    v_cvt_f32_f16_e32 v2, s5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT:    v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
-; SI-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; SI-NEXT:    v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT:    v_min_f32_e32 v2, 4.0, v2
 ; SI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; SI-NEXT:    v_min_f32_e32 v3, 2.0, v3
-; SI-NEXT:    v_mul_f32_e32 v0, 1.0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT:    v_min_f32_e32 v0, 0x41000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; SI-NEXT:    v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index fb3e79b2cf2934..5b7f0e72b70da5 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 ; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-GFX1100:       ; %bb.0:
 ; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT:    v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT:    v_pack_b32_f16 v0, v1, 0
 ; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT:    v_pk_max_f16 v1, v6, 0
+; SDAG-GFX1100-NEXT:    v_pk_max_f16 v2, v0, 0
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT:    v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0]
 ; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-GFX900:       ; %bb.0:
 ; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX900-NEXT:    v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_pk_max_f16 v1, v1, 0
+; SDAG-GFX900-NEXT:    v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT:    v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-GFX906:       ; %bb.0:
 ; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
 ; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
 ; SDAG-GFX906-NEXT:    v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT:    v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT:    v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_pk_max_f16 v1, v1, 0
+; SDAG-GFX906-NEXT:    v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT:    v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
 ; SDAG-VI:       ; %bb.0:
 ; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v8, v6, v7
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v2, v4 clamp
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
-; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v1, v5 clamp
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v0, v8
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v4
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v2, v5
+; SDAG-VI-NEXT:    v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT:    v_max_f16_e32 v3, 0, v1
+; SDAG-VI-NEXT:    v_max_f16_e32 v1, 0, v2
+; SDAG-VI-NEXT:    v_mov_b32_e32 v2, 0x3c00
+; SDAG-VI-NEXT:    v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT:    v_min_f16_e32 v2, 1.0, v3
+; SDAG-VI-NEXT:    v_min_f16_e32 v1, 1.0, v1
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
 }
 
 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100:       ; %bb.0:
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_pk_max_f16 v0, v6, 0
+; SDAG-GFX1100-NEXT:    v_pk_max_f16 v1, v7, 0
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX900:       ; %bb.0:
-; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT:    v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT:    v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT:    v_mov_b32_e32 v0, v6
-; GFX900-NEXT:    v_mov_b32_e32 v1, v2
-; GFX900-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX900:       ; %bb.0:
+; SDAG-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT:    v_pk_max_f16 v1, v7, 0
+; SDAG-GFX900-NEXT:    v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT:    v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX906:       ; %bb.0:
-; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT:    v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT:    v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT:    v_mov_b32_e32 v0, v6
-; GFX906-NEXT:    v_mov_b32_e32 v1, v2
-; GFX906-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX906:       ; %bb.0:
+; SDAG-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT:    v_pk_max_f16 v1, v7, 0
+; SDAG-GFX906-NEXT:    v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT:    v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT:    v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; SDAG-VI:       ; %bb.0:
 ; SDAG-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT:    v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v4, v4
+; SDAG-VI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v10, v7, v9
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v11, v6, v8
-; SDAG-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
 ; SDAG-VI-NEXT:    v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT:    v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v2, v4 clamp
-; SDAG-VI-NEXT:    v_cvt_f16_f32_e64 v3, v5 clamp
+; SDAG-VI-NEXT:    v_mac_f32_e32 v5, v1, v3
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v0, v10
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v1, v11
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v2, v4
+; SDAG-VI-NEXT:    v_cvt_f16_f32_e32 v3, v5
+; SDAG-VI-NEXT:    v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT:    v_max_f16_e32 v1, 0, v1
+; SDAG-VI-NEXT:    v_max_f16_e32 v2, 0, v2
+; SDAG-VI-NEXT:    v_max_f16_e32 v3, 0, v3
+; SDAG-VI-NEXT:    v_mov_b32_e32 v4, 0x3c00
+; SDAG-VI-NEXT:    v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT:    v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT:    v_min_f16_e32 v3, 1.0, v3
+; SDAG-VI-NEXT:    v_min_f16_e32 v2, 1.0, v2
 ; SDAG-VI-NEXT:    v_or_b32_e32 v0, v2, v0
 ; SDAG-VI-NEXT:    v_or_b32_e32 v1, v3, v1
 ; SDAG-VI-NEXT:    s_setpc_b64 s[30:31]
@@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
 ; SDAG-CI-NEXT:    v_cvt_f32_f16_e64 v3, v3 clamp
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT:    v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX900:       ; %bb.0:
+; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT:    v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT:    v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v1, v2
+; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX906:       ; %bb.0:
+; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT:    v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT:    v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v1, v2
+; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

>From cd28373a232ff55708b014f8e97d6120ad173a74 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Wed, 13 Mar 2024 20:15:29 +0800
Subject: [PATCH 23/63] [InstCombine] Simplify `zext nneg i1 X` to zero
 (#85043)

Alive2: https://alive2.llvm.org/ce/z/Wm6kCk
---
 .../InstCombine/InstCombineCasts.cpp          |  4 +++
 llvm/test/Transforms/InstCombine/zext.ll      | 31 +++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 45afa6363ae01f..a9817f1af8c1ac 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1121,6 +1121,10 @@ Instruction *InstCombinerImpl::visitZExt(ZExtInst &Zext) {
   Value *Src = Zext.getOperand(0);
   Type *SrcTy = Src->getType(), *DestTy = Zext.getType();
 
+  // zext nneg bool x -> 0
+  if (SrcTy->isIntOrIntVectorTy(1) && Zext.hasNonNeg())
+    return replaceInstUsesWith(Zext, Constant::getNullValue(Zext.getType()));
+
   // Try to extend the entire expression tree to the wide destination type.
   unsigned BitsToClear;
   if (shouldChangeType(SrcTy, DestTy) &&
diff --git a/llvm/test/Transforms/InstCombine/zext.ll b/llvm/test/Transforms/InstCombine/zext.ll
index edbd4850fb1190..88cd9c70af40d8 100644
--- a/llvm/test/Transforms/InstCombine/zext.ll
+++ b/llvm/test/Transforms/InstCombine/zext.ll
@@ -836,3 +836,34 @@ define i64 @zext_nneg_demanded_constant(i8 %a) nounwind {
   %c = and i64 %b, 254
   ret i64 %c
 }
+
+define i32 @zext_nneg_i1(i1 %x) {
+; CHECK-LABEL: @zext_nneg_i1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %res = zext nneg i1 %x to i32
+  ret i32 %res
+}
+
+define <2 x i32> @zext_nneg_i1_vec(<2 x i1> %x) {
+; CHECK-LABEL: @zext_nneg_i1_vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    ret <2 x i32> zeroinitializer
+;
+entry:
+  %res = zext nneg <2 x i1> %x to <2 x i32>
+  ret <2 x i32> %res
+}
+
+define i32 @zext_nneg_i2(i2 %x) {
+; CHECK-LABEL: @zext_nneg_i2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = zext nneg i2 [[X:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %res = zext nneg i2 %x to i32
+  ret i32 %res
+}

>From 14bc2d7a612fc6d84883a619f607516b0c85137b Mon Sep 17 00:00:00 2001
From: Jacek Caban <jacek at codeweavers.com>
Date: Wed, 13 Mar 2024 13:27:20 +0100
Subject: [PATCH 24/63] [llvm-ar] Use COFF archive format for COFF targets.
 (#82898)

Detect COFF files by default and allow specifying it with --format
argument.

This is important for ARM64EC, which uses a separated symbol map for EC
symbols. Since K_COFF is mostly compatible with K_GNU, this shouldn't
really make a difference for other targets.

This originally landed as #82642, but was reverted due to test failures
in tests using no symbol table. Since COFF symbol can't express it,
fallback to GNU format in that case.
---
 llvm/docs/CommandGuide/llvm-ar.rst       |  2 +-
 llvm/docs/ReleaseNotes.rst               |  3 +
 llvm/include/llvm/Object/Archive.h       |  1 +
 llvm/lib/Object/Archive.cpp              | 15 ++--
 llvm/lib/Object/ArchiveWriter.cpp        | 28 ++++----
 llvm/test/tools/llvm-ar/coff-symtab.test | 91 ++++++++++++++++++++++++
 llvm/test/tools/llvm-ar/no-symtab.yaml   | 32 +++++++++
 llvm/tools/llvm-ar/llvm-ar.cpp           | 22 ++++--
 8 files changed, 171 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/tools/llvm-ar/coff-symtab.test
 create mode 100644 llvm/test/tools/llvm-ar/no-symtab.yaml

diff --git a/llvm/docs/CommandGuide/llvm-ar.rst b/llvm/docs/CommandGuide/llvm-ar.rst
index 03d5b9e41ada38..63b3a519550bc0 100644
--- a/llvm/docs/CommandGuide/llvm-ar.rst
+++ b/llvm/docs/CommandGuide/llvm-ar.rst
@@ -261,7 +261,7 @@ Other
 
 .. option:: --format=<type>
 
- This option allows for default, gnu, darwin or bsd ``<type>`` to be selected.
+ This option allows for default, gnu, darwin, bsd or coff ``<type>`` to be selected.
  When creating an ``archive`` with the default ``<type>``, :program:``llvm-ar``
  will attempt to infer it from the input files and fallback to the default
  toolchain target if unable to do so.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index b34a5f31c5eb0a..7be51730663bd1 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -153,6 +153,9 @@ Changes to the LLVM tools
   if it's not specified with the ``--format`` argument and cannot be inferred from
   input files.
 
+* llvm-ar now allows specifying COFF archive format with ``--format`` argument
+  and uses it by default for COFF targets.
+
 * llvm-objcopy now supports ``--set-symbol-visibility`` and
   ``--set-symbols-visibility`` options for ELF input to change the
   visibility of symbols.
diff --git a/llvm/include/llvm/Object/Archive.h b/llvm/include/llvm/Object/Archive.h
index f71630054dc637..a3165c3235e0ed 100644
--- a/llvm/include/llvm/Object/Archive.h
+++ b/llvm/include/llvm/Object/Archive.h
@@ -339,6 +339,7 @@ class Archive : public Binary {
   Kind kind() const { return (Kind)Format; }
   bool isThin() const { return IsThin; }
   static object::Archive::Kind getDefaultKind();
+  static object::Archive::Kind getDefaultKindForTriple(Triple &T);
 
   child_iterator child_begin(Error &Err, bool SkipInternal = true) const;
   child_iterator child_end() const;
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 9000e9aa81ff41..6139d9996bdad3 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -969,12 +969,19 @@ Archive::Archive(MemoryBufferRef Source, Error &Err)
   Err = Error::success();
 }
 
+object::Archive::Kind Archive::getDefaultKindForTriple(Triple &T) {
+  if (T.isOSDarwin())
+    return object::Archive::K_DARWIN;
+  if (T.isOSAIX())
+    return object::Archive::K_AIXBIG;
+  if (T.isOSWindows())
+    return object::Archive::K_COFF;
+  return object::Archive::K_GNU;
+}
+
 object::Archive::Kind Archive::getDefaultKind() {
   Triple HostTriple(sys::getDefaultTargetTriple());
-  return HostTriple.isOSDarwin()
-             ? object::Archive::K_DARWIN
-             : (HostTriple.isOSAIX() ? object::Archive::K_AIXBIG
-                                     : object::Archive::K_GNU);
+  return getDefaultKindForTriple(HostTriple);
 }
 
 Archive::child_iterator Archive::child_begin(Error &Err,
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index e0629747b40cd7..aa57e55de70c8d 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -62,12 +62,16 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const {
   Expected<std::unique_ptr<object::ObjectFile>> OptionalObject =
       object::ObjectFile::createObjectFile(MemBufferRef);
 
-  if (OptionalObject)
-    return isa<object::MachOObjectFile>(**OptionalObject)
-               ? object::Archive::K_DARWIN
-               : (isa<object::XCOFFObjectFile>(**OptionalObject)
-                      ? object::Archive::K_AIXBIG
-                      : object::Archive::K_GNU);
+  if (OptionalObject) {
+    if (isa<object::MachOObjectFile>(**OptionalObject))
+      return object::Archive::K_DARWIN;
+    if (isa<object::XCOFFObjectFile>(**OptionalObject))
+      return object::Archive::K_AIXBIG;
+    if (isa<object::COFFObjectFile>(**OptionalObject) ||
+        isa<object::COFFImportFile>(**OptionalObject))
+      return object::Archive::K_COFF;
+    return object::Archive::K_GNU;
+  }
 
   // Squelch the error in case we had a non-object file.
   consumeError(OptionalObject.takeError());
@@ -80,10 +84,7 @@ object::Archive::Kind NewArchiveMember::detectKindFromObject() const {
             MemBufferRef, file_magic::bitcode, &Context)) {
       auto &IRObject = cast<object::IRObjectFile>(**ObjOrErr);
       auto TargetTriple = Triple(IRObject.getTargetTriple());
-      return TargetTriple.isOSDarwin()
-                 ? object::Archive::K_DARWIN
-                 : (TargetTriple.isOSAIX() ? object::Archive::K_AIXBIG
-                                           : object::Archive::K_GNU);
+      return object::Archive::getDefaultKindForTriple(TargetTriple);
     } else {
       // Squelch the error in case this was not a SymbolicFile.
       consumeError(ObjOrErr.takeError());
@@ -976,10 +977,12 @@ static Error writeArchiveToStream(raw_ostream &Out,
   SmallString<0> StringTableBuf;
   raw_svector_ostream StringTable(StringTableBuf);
   SymMap SymMap;
+  bool ShouldWriteSymtab = WriteSymtab != SymtabWritingMode::NoSymtab;
 
   // COFF symbol map uses 16-bit indexes, so we can't use it if there are too
-  // many members.
-  if (isCOFFArchive(Kind) && NewMembers.size() > 0xfffe)
+  // many members. COFF format also requires symbol table presence, so use
+  // GNU format when NoSymtab is requested.
+  if (isCOFFArchive(Kind) && (NewMembers.size() > 0xfffe || !ShouldWriteSymtab))
     Kind = object::Archive::K_GNU;
 
   // In the scenario when LLVMContext is populated SymbolicFile will contain a
@@ -1008,7 +1011,6 @@ static Error writeArchiveToStream(raw_ostream &Out,
   uint64_t LastMemberHeaderOffset = 0;
   uint64_t NumSyms = 0;
   uint64_t NumSyms32 = 0; // Store symbol number of 32-bit member files.
-  bool ShouldWriteSymtab = WriteSymtab != SymtabWritingMode::NoSymtab;
 
   for (const auto &M : Data) {
     // Record the start of the member's offset
diff --git a/llvm/test/tools/llvm-ar/coff-symtab.test b/llvm/test/tools/llvm-ar/coff-symtab.test
new file mode 100644
index 00000000000000..4f7270d9e2c6e7
--- /dev/null
+++ b/llvm/test/tools/llvm-ar/coff-symtab.test
@@ -0,0 +1,91 @@
+Verify that llvm-ar uses COFF archive format by ensuring that archive map is sorted.
+
+RUN: rm -rf %t.dir && split-file %s %t.dir && cd %t.dir
+
+RUN: yaml2obj coff-symtab.yaml -o coff-symtab.obj
+RUN: llvm-ar crs out.a coff-symtab.obj
+RUN: llvm-nm --print-armap out.a | FileCheck %s
+
+RUN: llvm-as coff-symtab.ll -o coff-symtab.bc
+RUN: llvm-ar crs out2.a coff-symtab.bc
+RUN: llvm-nm --print-armap out2.a | FileCheck %s
+
+RUN: yaml2obj elf.yaml -o coff-symtab.o
+RUN: llvm-ar crs --format coff out3.a coff-symtab.o
+RUN: llvm-nm --print-armap out3.a | FileCheck %s
+
+Create an empty archive with no symbol map, add a COFF file to it and check that the output archive is a COFF archive.
+
+RUN: llvm-ar rcS out4.a
+RUN: llvm-ar rs out4.a coff-symtab.obj
+RUN: llvm-nm --print-armap out4.a | FileCheck %s
+
+CHECK: Archive map
+CHECK-NEXT: a in coff-symtab
+CHECK-NEXT: b in coff-symtab
+CHECK-NEXT: c in coff-symtab
+CHECK-EMPTY:
+
+#--- coff-symtab.yaml
+--- !COFF
+header:
+  Machine:           IMAGE_FILE_MACHINE_UNKNOWN
+  Characteristics:   [  ]
+sections:
+  - Name:            .text
+    Characteristics: [ IMAGE_SCN_CNT_CODE, IMAGE_SCN_MEM_EXECUTE, IMAGE_SCN_MEM_READ ]
+    Alignment:       4
+    SectionData:     ''
+symbols:
+  - Name:            b
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            c
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+  - Name:            a
+    Value:           0
+    SectionNumber:   1
+    SimpleType:      IMAGE_SYM_TYPE_NULL
+    ComplexType:     IMAGE_SYM_DTYPE_FUNCTION
+    StorageClass:    IMAGE_SYM_CLASS_EXTERNAL
+...
+
+
+#--- coff-symtab.ll
+target triple = "x86_64-unknown-windows-msvc"
+
+define void @b() { ret void }
+define void @c() { ret void }
+define void @a() { ret void }
+
+#--- elf.yaml
+--- !ELF
+FileHeader:
+  Class:             ELFCLASS64
+  Data  :            ELFDATA2LSB
+  Type:              ET_REL
+  Machine:           EM_X86_64
+Sections:
+  - Name:            .text
+    Type:            SHT_PROGBITS
+    Flags:           [ SHF_ALLOC, SHF_EXECINSTR ]
+    AddressAlign:    0x0000000000000004
+    Content:         ''
+Symbols:
+  - Name:            b
+    Binding:         STB_GLOBAL
+    Section:         .text
+  - Name:            c
+    Binding:         STB_GLOBAL
+    Section:         .text
+  - Name:            a
+    Binding:         STB_GLOBAL
+    Section:         .text
+...
diff --git a/llvm/test/tools/llvm-ar/no-symtab.yaml b/llvm/test/tools/llvm-ar/no-symtab.yaml
new file mode 100644
index 00000000000000..7370c9b3235522
--- /dev/null
+++ b/llvm/test/tools/llvm-ar/no-symtab.yaml
@@ -0,0 +1,32 @@
+## Create archives with no symtab in various formats and check that we can read them.
+
+# RUN: yaml2obj %s -o %t.o
+# RUN: rm -f %t.*.a
+
+# RUN: llvm-ar --format=gnu rcS %t.gnu.a %t.o
+# RUN: llvm-ar --format=coff rcS %t.coff.a %t.o
+# RUN: llvm-ar --format=darwin rcS %t.darwin.a %t.o
+# RUN: llvm-ar --format=bsd rcS %t.bsd.a %t.o
+# RUN: llvm-ar --format=bigarchive rcS %t.bigarchive.a %t.o
+
+# RUN: llvm-nm --print-armap %t.gnu.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.coff.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.darwin.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.bsd.a | FileCheck %s
+# RUN: llvm-nm --print-armap %t.bigarchive.a | FileCheck %s
+
+# CHECK-NOT: Archive map
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS64
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_X86_64
+Sections:
+  - Name: .text
+    Type: SHT_PROGBITS
+Symbols:
+  - Name:    symbol
+    Binding: STB_GLOBAL
+    Section: .text
diff --git a/llvm/tools/llvm-ar/llvm-ar.cpp b/llvm/tools/llvm-ar/llvm-ar.cpp
index 81cb2a21daf1f2..294b8531b08f13 100644
--- a/llvm/tools/llvm-ar/llvm-ar.cpp
+++ b/llvm/tools/llvm-ar/llvm-ar.cpp
@@ -82,6 +82,7 @@ static void printArHelp(StringRef ToolName) {
     =darwin             -   darwin
     =bsd                -   bsd
     =bigarchive         -   big archive (AIX OS)
+    =coff               -   coff
   --plugin=<string>     - ignored for compatibility
   -h --help             - display this help and exit
   --output              - the directory to extract archive members to
@@ -193,7 +194,7 @@ static SmallVector<const char *, 256> PositionalArgs;
 static bool MRI;
 
 namespace {
-enum Format { Default, GNU, BSD, DARWIN, BIGARCHIVE, Unknown };
+enum Format { Default, GNU, COFF, BSD, DARWIN, BIGARCHIVE, Unknown };
 }
 
 static Format FormatType = Default;
@@ -1025,14 +1026,21 @@ static void performWriteOperation(ArchiveOperation Operation,
       Kind = object::Archive::K_GNU;
     else if (OldArchive) {
       Kind = OldArchive->kind();
-      if (Kind == object::Archive::K_BSD) {
-        auto InferredKind = object::Archive::K_BSD;
+      std::optional<object::Archive::Kind> AltKind;
+      if (Kind == object::Archive::K_BSD)
+        AltKind = object::Archive::K_DARWIN;
+      else if (Kind == object::Archive::K_GNU && !OldArchive->hasSymbolTable())
+        // If there is no symbol table, we can't tell GNU from COFF format
+        // from the old archive type.
+        AltKind = object::Archive::K_COFF;
+      if (AltKind) {
+        auto InferredKind = Kind;
         if (NewMembersP && !NewMembersP->empty())
           InferredKind = NewMembersP->front().detectKindFromObject();
         else if (!NewMembers.empty())
           InferredKind = NewMembers.front().detectKindFromObject();
-        if (InferredKind == object::Archive::K_DARWIN)
-          Kind = object::Archive::K_DARWIN;
+        if (InferredKind == AltKind)
+          Kind = *AltKind;
       }
     } else if (NewMembersP)
       Kind = !NewMembersP->empty() ? NewMembersP->front().detectKindFromObject()
@@ -1044,6 +1052,9 @@ static void performWriteOperation(ArchiveOperation Operation,
   case GNU:
     Kind = object::Archive::K_GNU;
     break;
+  case COFF:
+    Kind = object::Archive::K_COFF;
+    break;
   case BSD:
     if (Thin)
       fail("only the gnu format has a thin mode");
@@ -1376,6 +1387,7 @@ static int ar_main(int argc, char **argv) {
                        .Case("darwin", DARWIN)
                        .Case("bsd", BSD)
                        .Case("bigarchive", BIGARCHIVE)
+                       .Case("coff", COFF)
                        .Default(Unknown);
       if (FormatType == Unknown)
         fail(std::string("Invalid format ") + Match);

>From db9ea5110d3f383586b3387344e34d720c912a72 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2 at gmail.com>
Date: Wed, 13 Mar 2024 09:00:53 -0400
Subject: [PATCH 25/63] [libc++] Remove
 _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT (#83928)

This was slated for removal in LLVM 19.
---
 libcxx/docs/ReleaseNotes/19.rst               |  2 +-
 libcxx/include/variant                        | 32 +++----------------
 .../variant.variant/variant.assign/T.pass.cpp | 15 ++-------
 .../variant.assign/conv.pass.cpp              |  8 ++---
 .../variant.variant/variant.ctor/T.pass.cpp   | 15 ++-------
 .../variant.ctor/conv.pass.cpp                |  8 ++---
 libcxx/test/support/variant_test_helpers.h    | 10 ------
 7 files changed, 18 insertions(+), 72 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/19.rst b/libcxx/docs/ReleaseNotes/19.rst
index 04f16610f8117e..2b62a36ca8e5cd 100644
--- a/libcxx/docs/ReleaseNotes/19.rst
+++ b/libcxx/docs/ReleaseNotes/19.rst
@@ -65,7 +65,7 @@ Deprecations and Removals
   provided, and such a base template is bound to be incorrect for some types, which could currently cause unexpected behavior
   while going undetected.
 
-- TODO: The ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` macro that changed the behavior for narrowing conversions
+- The ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` macro that changed the behavior for narrowing conversions
   in ``std::variant`` has been removed in LLVM 19.
 
 - TODO: The ``_LIBCPP_ENABLE_CXX20_REMOVED_ALLOCATOR_MEMBERS`` macro has been removed in LLVM 19.
diff --git a/libcxx/include/variant b/libcxx/include/variant
index d1eea52f0a9301..59d7f9b740f3be 100644
--- a/libcxx/include/variant
+++ b/libcxx/include/variant
@@ -347,13 +347,13 @@ inline constexpr size_t variant_npos = static_cast<size_t>(-1);
 
 template <size_t _NumAlternatives>
 _LIBCPP_HIDE_FROM_ABI constexpr auto __choose_index_type() {
-#ifdef _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
+#  ifdef _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
   if constexpr (_NumAlternatives < numeric_limits<unsigned char>::max())
     return static_cast<unsigned char>(0);
   else if constexpr (_NumAlternatives < numeric_limits<unsigned short>::max())
     return static_cast<unsigned short>(0);
   else
-#endif // _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
+#  endif // _LIBCPP_ABI_VARIANT_INDEX_TYPE_OPTIMIZATION
     return static_cast<unsigned int>(0);
 }
 
@@ -1085,13 +1085,9 @@ struct __narrowing_check {
 };
 
 template <class _Dest, class _Source>
-using __check_for_narrowing _LIBCPP_NODEBUG = typename _If<
-#  ifdef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
-    false &&
-#  endif
-        is_arithmetic<_Dest>::value,
-    __narrowing_check,
-    __no_narrowing_check >::template _Apply<_Dest, _Source>;
+using __check_for_narrowing _LIBCPP_NODEBUG =
+    typename _If< is_arithmetic<_Dest>::value, __narrowing_check, __no_narrowing_check >::template _Apply<_Dest,
+                                                                                                          _Source>;
 
 template <class _Tp, size_t _Idx>
 struct __overload {
@@ -1099,24 +1095,6 @@ struct __overload {
   auto operator()(_Tp, _Up&&) const -> __check_for_narrowing<_Tp, _Up>;
 };
 
-// TODO(LLVM-19): Remove all occurrences of this macro.
-#  ifdef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
-template <class _Tp, size_t>
-struct __overload_bool {
-  template <class _Up, class _Ap = __remove_cvref_t<_Up>>
-  auto operator()(bool, _Up&&) const -> enable_if_t<is_same_v<_Ap, bool>, __type_identity<_Tp>>;
-};
-
-template <size_t _Idx>
-struct __overload<bool, _Idx> : __overload_bool<bool, _Idx> {};
-template <size_t _Idx>
-struct __overload<bool const, _Idx> : __overload_bool<bool const, _Idx> {};
-template <size_t _Idx>
-struct __overload<bool volatile, _Idx> : __overload_bool<bool volatile, _Idx> {};
-template <size_t _Idx>
-struct __overload<bool const volatile, _Idx> : __overload_bool<bool const volatile, _Idx> {};
-#  endif
-
 template <class... _Bases>
 struct __all_overloads : _Bases... {
   void operator()() const;
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp
index b3fc2021a6b223..b38b10d89dfd7e 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/T.pass.cpp
@@ -134,8 +134,7 @@ void test_T_assignment_sfinae() {
   }
   {
     using V = std::variant<std::string, float>;
-    static_assert(std::is_assignable<V, int>::value == VariantAllowsNarrowingConversions,
-    "no matching operator=");
+    static_assert(!std::is_assignable<V, int>::value, "no matching operator=");
   }
   {
     using V = std::variant<std::unique_ptr<int>, bool>;
@@ -144,12 +143,8 @@ void test_T_assignment_sfinae() {
     struct X {
       operator void*();
     };
-    static_assert(!std::is_assignable<V, X>::value,
-                  "no boolean conversion in operator=");
-#ifndef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
-    static_assert(std::is_assignable<V, std::false_type>::value,
-                  "converted to bool in operator=");
-#endif
+    static_assert(!std::is_assignable<V, X>::value, "no boolean conversion in operator=");
+    static_assert(std::is_assignable<V, std::false_type>::value, "converted to bool in operator=");
   }
   {
     struct X {};
@@ -188,7 +183,6 @@ void test_T_assignment_basic() {
     assert(v.index() == 1);
     assert(std::get<1>(v) == 43);
   }
-#ifndef TEST_VARIANT_ALLOWS_NARROWING_CONVERSIONS
   {
     std::variant<unsigned, long> v;
     v = 42;
@@ -198,7 +192,6 @@ void test_T_assignment_basic() {
     assert(v.index() == 0);
     assert(std::get<0>(v) == 43);
   }
-#endif
   {
     std::variant<std::string, bool> v = true;
     v = "bar";
@@ -299,13 +292,11 @@ void test_T_assignment_performs_assignment() {
 }
 
 void test_T_assignment_vector_bool() {
-#ifndef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
   std::vector<bool> vec = {true};
   std::variant<bool, int> v;
   v = vec[0];
   assert(v.index() == 0);
   assert(std::get<0>(v) == true);
-#endif
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/conv.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/conv.pass.cpp
index 246309c01b4d76..90e405d577500d 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.assign/conv.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.assign/conv.pass.cpp
@@ -25,18 +25,16 @@ int main(int, char**)
 {
   static_assert(!std::is_assignable<std::variant<int, int>, int>::value, "");
   static_assert(!std::is_assignable<std::variant<long, long long>, int>::value, "");
-  static_assert(std::is_assignable<std::variant<char>, int>::value == VariantAllowsNarrowingConversions, "");
+  static_assert(!std::is_assignable<std::variant<char>, int>::value, "");
 
-  static_assert(std::is_assignable<std::variant<std::string, float>, int>::value == VariantAllowsNarrowingConversions, "");
-  static_assert(std::is_assignable<std::variant<std::string, double>, int>::value == VariantAllowsNarrowingConversions, "");
+  static_assert(!std::is_assignable<std::variant<std::string, float>, int>::value, "");
+  static_assert(!std::is_assignable<std::variant<std::string, double>, int>::value, "");
   static_assert(!std::is_assignable<std::variant<std::string, bool>, int>::value, "");
 
   static_assert(!std::is_assignable<std::variant<int, bool>, decltype("meow")>::value, "");
   static_assert(!std::is_assignable<std::variant<int, const bool>, decltype("meow")>::value, "");
 
-#ifndef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
   static_assert(std::is_assignable<std::variant<bool>, std::true_type>::value, "");
-#endif
   static_assert(!std::is_assignable<std::variant<bool>, std::unique_ptr<char> >::value, "");
   static_assert(!std::is_assignable<std::variant<bool>, decltype(nullptr)>::value, "");
 
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
index 89fd646878eeca..6b7de888884986 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/T.pass.cpp
@@ -68,8 +68,7 @@ void test_T_ctor_sfinae() {
   }
   {
     using V = std::variant<std::string, float>;
-    static_assert(std::is_constructible<V, int>::value == VariantAllowsNarrowingConversions,
-                  "no matching constructor");
+    static_assert(!std::is_constructible<V, int>::value, "no matching constructor");
   }
   {
     using V = std::variant<std::unique_ptr<int>, bool>;
@@ -78,12 +77,8 @@ void test_T_ctor_sfinae() {
     struct X {
       operator void*();
     };
-    static_assert(!std::is_constructible<V, X>::value,
-                  "no boolean conversion in constructor");
-#ifndef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
-    static_assert(std::is_constructible<V, std::false_type>::value,
-                  "converted to bool in constructor");
-#endif
+    static_assert(!std::is_constructible<V, X>::value, "no boolean conversion in constructor");
+    static_assert(std::is_constructible<V, std::false_type>::value, "converted to bool in constructor");
   }
   {
     struct X {};
@@ -128,13 +123,11 @@ void test_T_ctor_basic() {
     static_assert(v.index() == 1, "");
     static_assert(std::get<1>(v) == 42, "");
   }
-#ifndef TEST_VARIANT_ALLOWS_NARROWING_CONVERSIONS
   {
     constexpr std::variant<unsigned, long> v(42);
     static_assert(v.index() == 1, "");
     static_assert(std::get<1>(v) == 42, "");
   }
-#endif
   {
     std::variant<std::string, bool const> v = "foo";
     assert(v.index() == 0);
@@ -202,12 +195,10 @@ void test_construction_with_repeated_types() {
 }
 
 void test_vector_bool() {
-#ifndef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
   std::vector<bool> vec = {true};
   std::variant<bool, int> v = vec[0];
   assert(v.index() == 0);
   assert(std::get<0>(v) == true);
-#endif
 }
 
 int main(int, char**) {
diff --git a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/conv.pass.cpp b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/conv.pass.cpp
index 7fb44ff407653b..0b8eeed1eac86d 100644
--- a/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/conv.pass.cpp
+++ b/libcxx/test/std/utilities/variant/variant.variant/variant.ctor/conv.pass.cpp
@@ -24,18 +24,16 @@ int main(int, char**)
 {
   static_assert(!std::is_constructible<std::variant<int, int>, int>::value, "");
   static_assert(!std::is_constructible<std::variant<long, long long>, int>::value, "");
-  static_assert(std::is_constructible<std::variant<char>, int>::value == VariantAllowsNarrowingConversions, "");
+  static_assert(!std::is_constructible<std::variant<char>, int>::value, "");
 
-  static_assert(std::is_constructible<std::variant<std::string, float>, int>::value == VariantAllowsNarrowingConversions, "");
-  static_assert(std::is_constructible<std::variant<std::string, double>, int>::value == VariantAllowsNarrowingConversions, "");
+  static_assert(!std::is_constructible<std::variant<std::string, float>, int>::value, "");
+  static_assert(!std::is_constructible<std::variant<std::string, double>, int>::value, "");
   static_assert(!std::is_constructible<std::variant<std::string, bool>, int>::value, "");
 
   static_assert(!std::is_constructible<std::variant<int, bool>, decltype("meow")>::value, "");
   static_assert(!std::is_constructible<std::variant<int, const bool>, decltype("meow")>::value, "");
 
-#ifndef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
   static_assert(std::is_constructible<std::variant<bool>, std::true_type>::value, "");
-#endif
   static_assert(!std::is_constructible<std::variant<bool>, std::unique_ptr<char> >::value, "");
   static_assert(!std::is_constructible<std::variant<bool>, decltype(nullptr)>::value, "");
 
diff --git a/libcxx/test/support/variant_test_helpers.h b/libcxx/test/support/variant_test_helpers.h
index c174cba3284019..345e32170e5844 100644
--- a/libcxx/test/support/variant_test_helpers.h
+++ b/libcxx/test/support/variant_test_helpers.h
@@ -24,16 +24,6 @@
 // FIXME: Currently the variant<T&> tests are disabled using this macro.
 #define TEST_VARIANT_HAS_NO_REFERENCES
 
-// TODO(LLVM-19): Remove TEST_VARIANT_ALLOWS_NARROWING_CONVERSIONS
-#ifdef _LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT
-# define TEST_VARIANT_ALLOWS_NARROWING_CONVERSIONS
-#endif
-#ifdef TEST_VARIANT_ALLOWS_NARROWING_CONVERSIONS
-constexpr bool VariantAllowsNarrowingConversions = true;
-#else
-constexpr bool VariantAllowsNarrowingConversions = false;
-#endif
-
 #ifndef TEST_HAS_NO_EXCEPTIONS
 struct CopyThrows {
   CopyThrows() = default;

>From 6aa08baeec14953483f6c7d2d5a827f6840380d8 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 13 Mar 2024 18:34:23 +0530
Subject: [PATCH 26/63] AMDGPU: Don't use table for metadata docs, and fix
 section headers (#85046)

---
 llvm/docs/AMDGPUUsage.rst | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index fd9ad7fac19a95..fe37e85c2a40a6 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1312,24 +1312,30 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
 
    List AMDGPU intrinsics.
 
+.. _amdgpu_metadata:
+
 LLVM IR Metadata
-------------------
+================
+
+The AMDGPU backend implements the following target custom LLVM IR
+metadata.
+
+.. _amdgpu_last_use:
 
-The AMDGPU backend implements the following LLVM IR metadata.
+'``amdgpu.last.use``' Metadata
+------------------------------
+
+Sets TH_LOAD_LU temporal hint on load instructions that support it.
+Takes priority over nontemporal hint (TH_LOAD_NT). This takes no
+arguments.
+
+.. code-block:: llvm
 
-.. list-table:: AMDGPU LLVM IR Metatdata
-  :name: amdgpu-llvm-ir-metadata-table
+  %val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
 
-  * - Metadata Name
-    - Description
-    - Values
-  * - !amdgpu.last.use
-    - Sets TH_LOAD_LU temporal hint on load instructions that support it.
-      Takes priority over nontemporal hint (TH_LOAD_NT).
-    - {}
 
 LLVM IR Attributes
-------------------
+==================
 
 The AMDGPU backend supports the following LLVM IR attributes.
 
@@ -1451,7 +1457,7 @@ The AMDGPU backend supports the following LLVM IR attributes.
      ======================================= ==========================================================
 
 Calling Conventions
--------------------
+===================
 
 The AMDGPU backend supports the following calling conventions:
 

>From 7f7930aaa92874a7393bb8d96c329d6a8091ca02 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Wed, 13 Mar 2024 13:11:45 +0000
Subject: [PATCH 27/63] [AMDGPU] Test new GFX12 opcode name
 buffer_atomic_min_num_f32

The old name buffer_atomic_min_f32 is still tested as part of the alias
tests.
---
 llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
index 08ec5b3f6a5220..efeaf8339f6928 100644
--- a/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_vbuffer_mubuf.s
@@ -3970,70 +3970,70 @@ buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:8388607 dlc
 buffer_atomic_max_u64 v[5:6], off, s[8:11], s3 offset:8388607 glc slc dlc
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v255, off, s[8:11], s3 offset:8388607
+buffer_atomic_min_num_f32 v255, off, s[8:11], s3 offset:8388607
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0xff,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[12:15], s3 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[12:15], s3 offset:8388607
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x18,0x80,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[96:99], s3 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[96:99], s3 offset:8388607
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0xc0,0x80,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s101 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], s101 offset:8388607
 // GFX12: encoding: [0x65,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], m0 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], m0 offset:8388607
 // GFX12: encoding: [0x7d,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], 0 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], 0 offset:8388607
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], -1 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], -1 offset:8388607
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], 0.5 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], 0.5 offset:8388607
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], -4.0 offset:8388607
+buffer_atomic_min_num_f32 v5, off, s[8:11], -4.0 offset:8388607
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, v0, s[8:11], s3 idxen offset:8388607
+buffer_atomic_min_num_f32 v5, v0, s[8:11], s3 idxen offset:8388607
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x80,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, v0, s[8:11], s3 offen offset:8388607
+buffer_atomic_min_num_f32 v5, v0, s[8:11], s3 offen offset:8388607
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x40,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:0
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:0
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x00,0x00,0x00]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:7
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:7
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x80,0x00,0x00,0x07,0x00,0x00]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RETURN
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x90,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RT_RETURN scope:SCOPE_SE
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_RT_RETURN scope:SCOPE_SE
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0x94,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 th:TH_ATOMIC_CASCADE_NT scope:SCOPE_DEV
 // GFX12: encoding: [0x03,0x40,0x14,0xc4,0x05,0x10,0xe8,0x00,0x00,0xff,0xff,0x7f]
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 glc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 glc
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 slc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 slc
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 dlc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 dlc
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
-buffer_atomic_min_f32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc
+buffer_atomic_min_num_f32 v5, off, s[8:11], s3 offset:8388607 glc slc dlc
 // GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
 
 buffer_atomic_min_i32 v5, off, s[8:11], s3 offset:8388607

>From 65a900ccf9175dd18d1f6e7589c808e9ef5f6fd3 Mon Sep 17 00:00:00 2001
From: NagyDonat <donat.nagy at ericsson.com>
Date: Tue, 12 Mar 2024 18:24:26 +0100
Subject: [PATCH 28/63] Reapply "[analyzer] Accept C library functions from the
 `std` namespace"

This reapplies f32b04d4ea91ad1018c25a1d4178cc4392d34968i, after fixing
the use-after-free of ASTUnit in the unittest.
https://github.com/llvm/llvm-project/pull/84469#issuecomment-1992163439

Co-authored-by: Balazs Benics <benicsbalazs at gmail.com>
---
 .../Core/PathSensitive/CallDescription.h      |  8 +-
 .../StaticAnalyzer/Core/CheckerContext.cpp    |  8 +-
 clang/unittests/StaticAnalyzer/CMakeLists.txt |  1 +
 .../StaticAnalyzer/IsCLibraryFunctionTest.cpp | 84 +++++++++++++++++++
 .../clang/unittests/StaticAnalyzer/BUILD.gn   |  1 +
 5 files changed, 93 insertions(+), 9 deletions(-)
 create mode 100644 clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp

diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h
index 3432d2648633c2..b4e1636130ca7c 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/CallDescription.h
@@ -41,12 +41,8 @@ class CallDescription {
     ///  - We also accept calls where the number of arguments or parameters is
     ///    greater than the specified value.
     /// For the exact heuristics, see CheckerContext::isCLibraryFunction().
-    /// Note that functions whose declaration context is not a TU (e.g.
-    /// methods, functions in namespaces) are not accepted as C library
-    /// functions.
-    /// FIXME: If I understand it correctly, this discards calls where C++ code
-    /// refers a C library function through the namespace `std::` via headers
-    /// like <cstdlib>.
+    /// (This mode only matches functions that are declared either directly
+    /// within a TU or in the namespace `std`.)
     CLibrary,
 
     /// Matches "simple" functions that are not methods. (Static methods are
diff --git a/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp b/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp
index d6d4cec9dd3d4d..1a9bff529e9bb1 100644
--- a/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp
+++ b/clang/lib/StaticAnalyzer/Core/CheckerContext.cpp
@@ -87,9 +87,11 @@ bool CheckerContext::isCLibraryFunction(const FunctionDecl *FD,
   if (!II)
     return false;
 
-  // Look through 'extern "C"' and anything similar invented in the future.
-  // If this function is not in TU directly, it is not a C library function.
-  if (!FD->getDeclContext()->getRedeclContext()->isTranslationUnit())
+  // C library functions are either declared directly within a TU (the common
+  // case) or they are accessed through the namespace `std` (when they are used
+  // in C++ via headers like <cstdlib>).
+  const DeclContext *DC = FD->getDeclContext()->getRedeclContext();
+  if (!(DC->isTranslationUnit() || DC->isStdNamespace()))
     return false;
 
   // If this function is not externally visible, it is not a C library function.
diff --git a/clang/unittests/StaticAnalyzer/CMakeLists.txt b/clang/unittests/StaticAnalyzer/CMakeLists.txt
index 775f0f8486b8f9..db56e77331b821 100644
--- a/clang/unittests/StaticAnalyzer/CMakeLists.txt
+++ b/clang/unittests/StaticAnalyzer/CMakeLists.txt
@@ -11,6 +11,7 @@ add_clang_unittest(StaticAnalysisTests
   CallEventTest.cpp
   ConflictingEvalCallsTest.cpp
   FalsePositiveRefutationBRVisitorTest.cpp
+  IsCLibraryFunctionTest.cpp
   NoStateChangeFuncVisitorTest.cpp
   ParamRegionTest.cpp
   RangeSetTest.cpp
diff --git a/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp b/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp
new file mode 100644
index 00000000000000..31ff13f428da36
--- /dev/null
+++ b/clang/unittests/StaticAnalyzer/IsCLibraryFunctionTest.cpp
@@ -0,0 +1,84 @@
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/AnalysisDeclContext.h"
+#include "clang/Frontend/ASTUnit.h"
+#include "clang/StaticAnalyzer/Core/PathSensitive/CheckerContext.h"
+#include "clang/Tooling/Tooling.h"
+#include "gtest/gtest.h"
+
+#include <memory>
+
+using namespace clang;
+using namespace ento;
+using namespace ast_matchers;
+
+class IsCLibraryFunctionTest : public testing::Test {
+public:
+  const FunctionDecl *getFunctionDecl() const { return Result; }
+
+  testing::AssertionResult buildAST(StringRef Code) {
+    ASTUnit = tooling::buildASTFromCode(Code);
+    if (!ASTUnit)
+      return testing::AssertionFailure() << "AST construction failed";
+
+    ASTContext &Context = ASTUnit->getASTContext();
+    if (Context.getDiagnostics().hasErrorOccurred())
+      return testing::AssertionFailure() << "Compilation error";
+
+    auto Matches = ast_matchers::match(functionDecl().bind("fn"), Context);
+    if (Matches.empty())
+      return testing::AssertionFailure() << "No function declaration found";
+
+    if (Matches.size() > 1)
+      return testing::AssertionFailure()
+             << "Multiple function declarations found";
+
+    Result = Matches[0].getNodeAs<FunctionDecl>("fn");
+    return testing::AssertionSuccess();
+  }
+
+private:
+  std::unique_ptr<ASTUnit> ASTUnit;
+  const FunctionDecl *Result = nullptr;
+};
+
+TEST_F(IsCLibraryFunctionTest, AcceptsGlobal) {
+  ASSERT_TRUE(buildAST(R"cpp(void fun();)cpp"));
+  EXPECT_TRUE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, AcceptsExternCGlobal) {
+  ASSERT_TRUE(buildAST(R"cpp(extern "C" { void fun(); })cpp"));
+  EXPECT_TRUE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, RejectsNoInlineNoExternalLinkage) {
+  // Functions that are neither inlined nor externally visible cannot be C library functions.
+  ASSERT_TRUE(buildAST(R"cpp(static void fun();)cpp"));
+  EXPECT_FALSE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, RejectsAnonymousNamespace) {
+  ASSERT_TRUE(buildAST(R"cpp(namespace { void fun(); })cpp"));
+  EXPECT_FALSE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, AcceptsStdNamespace) {
+  ASSERT_TRUE(buildAST(R"cpp(namespace std { void fun(); })cpp"));
+  EXPECT_TRUE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, RejectsOtherNamespaces) {
+  ASSERT_TRUE(buildAST(R"cpp(namespace stdx { void fun(); })cpp"));
+  EXPECT_FALSE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, RejectsClassStatic) {
+  ASSERT_TRUE(buildAST(R"cpp(class A { static void fun(); };)cpp"));
+  EXPECT_FALSE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
+
+TEST_F(IsCLibraryFunctionTest, RejectsClassMember) {
+  ASSERT_TRUE(buildAST(R"cpp(class A { void fun(); };)cpp"));
+  EXPECT_FALSE(CheckerContext::isCLibraryFunction(getFunctionDecl()));
+}
diff --git a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
index 01c2b6ced3366f..9c240cff181634 100644
--- a/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/StaticAnalyzer/BUILD.gn
@@ -19,6 +19,7 @@ unittest("StaticAnalysisTests") {
     "CallEventTest.cpp",
     "ConflictingEvalCallsTest.cpp",
     "FalsePositiveRefutationBRVisitorTest.cpp",
+    "IsCLibraryFunctionTest.cpp",
     "NoStateChangeFuncVisitorTest.cpp",
     "ParamRegionTest.cpp",
     "RangeSetTest.cpp",

>From bd4f4fec444ef72a2b40a901fb0309755f0a2550 Mon Sep 17 00:00:00 2001
From: Yingwei Zheng <dtcxzyw2333 at gmail.com>
Date: Wed, 13 Mar 2024 21:52:40 +0800
Subject: [PATCH 29/63] [InstCombine] Set zero_is_poison for ctlz/cttz if they
 are only used as shift amounts (#85035)

Alive2: https://alive2.llvm.org/ce/z/r-67t9

It would improve the codegen if the target doesn't provide a defined
value for ctlz/cttz with zero.
---
 .../InstCombine/InstCombineCalls.cpp          |  5 +
 .../Transforms/InstCombine/shift-cttz-ctlz.ll | 93 +++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f5f3716d390d77..694b18017babc5 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -504,6 +504,11 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
     return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(II.getType()));
   }
 
+  // If ctlz/cttz is only used as a shift amount, set is_zero_poison to true.
+  if (II.hasOneUse() && match(Op1, m_Zero()) &&
+      match(II.user_back(), m_Shift(m_Value(), m_Specific(&II))))
+    return IC.replaceOperand(II, 1, IC.Builder.getTrue());
+
   Constant *C;
 
   if (IsTZ) {
diff --git a/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
new file mode 100644
index 00000000000000..2b2f820c9a0956
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/shift-cttz-ctlz.ll
@@ -0,0 +1,93 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i32 @shl_cttz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 true), !range [[RNG0:![0-9]+]]
+; CHECK-NEXT:    [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_ctlz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_ctlz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i32 @llvm.ctlz.i32(i32 [[Y]], i1 true), !range [[RNG0]]
+; CHECK-NEXT:    [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call i32 @llvm.ctlz.i32(i32 %y, i1 false)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @lshr_cttz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @lshr_cttz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 true), !range [[RNG0]]
+; CHECK-NEXT:    [[RES:%.*]] = lshr i32 [[X]], [[CTTZ]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = lshr i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @ashr_cttz_false(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @ashr_cttz_false(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 true), !range [[RNG0]]
+; CHECK-NEXT:    [[RES:%.*]] = ashr i32 [[X]], [[CTTZ]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = ashr i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_false_multiuse(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_false_multiuse(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 false), !range [[RNG0]]
+; CHECK-NEXT:    call void @use(i32 [[CTTZ]])
+; CHECK-NEXT:    [[RES:%.*]] = shl i32 [[X]], [[CTTZ]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  call void @use(i32 %cttz)
+  %res = shl i32 %x, %cttz
+  ret i32 %res
+}
+
+define i32 @shl_cttz_as_lhs(i32 %x, i32 %y) {
+; CHECK-LABEL: define i32 @shl_cttz_as_lhs(
+; CHECK-SAME: i32 [[X:%.*]], i32 [[Y:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CTTZ:%.*]] = call i32 @llvm.cttz.i32(i32 [[Y]], i1 false), !range [[RNG0]]
+; CHECK-NEXT:    [[RES:%.*]] = shl i32 [[CTTZ]], [[X]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cttz = call i32 @llvm.cttz.i32(i32 %y, i1 false)
+  %res = shl i32 %cttz, %x
+  ret i32 %res
+}
+
+declare void @use(i32)
+;.
+; CHECK: [[RNG0]] = !{i32 0, i32 33}
+;.

>From 5731c9e80455466c6a08386803e8946773cc9082 Mon Sep 17 00:00:00 2001
From: Jake Egan <Jake.egan at ibm.com>
Date: Wed, 13 Mar 2024 09:54:35 -0400
Subject: [PATCH 30/63] [AIX][ClangRepl] Disable new test on AIX

This new test fails on the AIX bot with error `LLVM ERROR: Incompatible object format!`. Disable for now to investigate.

Same as 86337beca2e6f939127cd3e088ec80c0cf4a0a64.
---
 clang/unittests/Interpreter/InterpreterExtensionsTest.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
index 77fd1b4e19816c..1cf564b5671b0e 100644
--- a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
@@ -56,7 +56,11 @@ class TestCreateResetExecutor : public Interpreter {
   void resetExecutor() { Interpreter::ResetExecutor(); }
 };
 
+ifdef _AIX
+TEST(InterpreterExtensionsTest, DISABLED_ExecutorCreateReset) {
+#else
 TEST(InterpreterExtensionsTest, ExecutorCreateReset) {
+#endif
   // Make sure we can create the executer on the platform.
   if (!HostSupportsJit())
     GTEST_SKIP();

>From 38e5764d5c178e6bdedbd36dc95dce68d5284ed0 Mon Sep 17 00:00:00 2001
From: Jake Egan <Jake.egan at ibm.com>
Date: Wed, 13 Mar 2024 09:57:31 -0400
Subject: [PATCH 31/63] [ClangRepl] Add missing hashtag

Hashtag missing from commit 960b4aa6dab69125778f230c4c94f2d19c96cc87
---
 clang/unittests/Interpreter/InterpreterExtensionsTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
index 1cf564b5671b0e..b7708616fd24d3 100644
--- a/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterExtensionsTest.cpp
@@ -56,7 +56,7 @@ class TestCreateResetExecutor : public Interpreter {
   void resetExecutor() { Interpreter::ResetExecutor(); }
 };
 
-ifdef _AIX
+#ifdef _AIX
 TEST(InterpreterExtensionsTest, DISABLED_ExecutorCreateReset) {
 #else
 TEST(InterpreterExtensionsTest, ExecutorCreateReset) {

>From 479627ca533abe7d44aab50e6d0051edaffc87da Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail at gmail.com>
Date: Wed, 13 Mar 2024 14:59:55 +0100
Subject: [PATCH 32/63] [Clang] [CodeGen] Fix codegen bug in constant
 initialisation in C23 mode (#84981)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Consider the following code:
```c
bool const inf =  (1.0/0.0);
```

When trying to emit the initialiser of this variable in C23, we end up
hitting a code path in codegen in `VarDecl::evaluateValueImpl()` where
we check for `IsConstantInitialization && (Ctx.getLangOpts().CPlusPlus
|| Ctx.getLangOpts().C23)`, and if that is the case and we emitted any
notes, constant evaluation fails, and as a result, codegen issues this
error:
```
<source>:1:12: error: cannot compile this static initializer yet
    1 | bool const inf =  (1.0/0.0);
      |
```

As a fix, only fail in C23 mode if we’re initialising a `constexpr`
variable.

This fixes #84784.
---
 clang/lib/AST/Decl.cpp          | 11 +++++++----
 clang/test/CodeGen/const-init.c |  3 +++
 clang/test/Sema/const-init.c    |  6 ++++++
 3 files changed, 16 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Sema/const-init.c

diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 8626f04012f7d4..95900afdd2c5d8 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -2577,11 +2577,14 @@ APValue *VarDecl::evaluateValueImpl(SmallVectorImpl<PartialDiagnosticAt> &Notes,
   bool Result = Init->EvaluateAsInitializer(Eval->Evaluated, Ctx, this, Notes,
                                             IsConstantInitialization);
 
-  // In C++/C23, this isn't a constant initializer if we produced notes. In that
-  // case, we can't keep the result, because it may only be correct under the
-  // assumption that the initializer is a constant context.
+  // In C++, or in C23 if we're initialising a 'constexpr' variable, this isn't
+  // a constant initializer if we produced notes. In that case, we can't keep
+  // the result, because it may only be correct under the assumption that the
+  // initializer is a constant context.
   if (IsConstantInitialization &&
-      (Ctx.getLangOpts().CPlusPlus || Ctx.getLangOpts().C23) && !Notes.empty())
+      (Ctx.getLangOpts().CPlusPlus ||
+       (isConstexpr() && Ctx.getLangOpts().C23)) &&
+      !Notes.empty())
     Result = false;
 
   // Ensure the computed APValue is cleaned up later if evaluation succeeded,
diff --git a/clang/test/CodeGen/const-init.c b/clang/test/CodeGen/const-init.c
index 0e4fc4ad48af8d..ad3e9551199ac2 100644
--- a/clang/test/CodeGen/const-init.c
+++ b/clang/test/CodeGen/const-init.c
@@ -216,3 +216,6 @@ int PR4517_x2 = PR4517_arrc[PR4517_idx];
 // CHECK: @PR4517_x = global i32 42, align 4
 // CHECK: @PR4517_idx = constant i32 1, align 4
 // CHECK: @PR4517_x2 = global i32 42, align 4
+
+// CHECK: @GH84784_inf = constant i8 1
+_Bool const GH84784_inf = (1.0/0.0);
diff --git a/clang/test/Sema/const-init.c b/clang/test/Sema/const-init.c
new file mode 100644
index 00000000000000..5b07ede747ebc4
--- /dev/null
+++ b/clang/test/Sema/const-init.c
@@ -0,0 +1,6 @@
+// RUN: %clang_cc1 -fsyntax-only -verify -std=c23 %s
+
+// Division by 0 here is an error iff the variable is 'constexpr'.
+const _Bool inf1 =  (1.0/0.0 == __builtin_inf());
+constexpr _Bool inf2 = (1.0/0.0 == __builtin_inf()); // expected-error {{must be initialized by a constant expression}} expected-note {{division by zero}}
+constexpr _Bool inf3 = __builtin_inf() == __builtin_inf();

>From 65664acc4494a3bbf50180cd698208acc20ff0c5 Mon Sep 17 00:00:00 2001
From: mahesh-attarde <145317060+mahesh-attarde at users.noreply.github.com>
Date: Wed, 13 Mar 2024 07:03:15 -0700
Subject: [PATCH 33/63] [CodeGen][Tablegen] Fix uninitialized var and shift
 overflow. (#84896)

Fix uninitialized var and shift overflow.
---
 llvm/include/llvm/CodeGen/AccelTable.h     | 2 +-
 llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 3 ++-
 llvm/utils/TableGen/DecoderEmitter.cpp     | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h
index 6ee817a7124dc3..cff8fcbaf2cd7b 100644
--- a/llvm/include/llvm/CodeGen/AccelTable.h
+++ b/llvm/include/llvm/CodeGen/AccelTable.h
@@ -353,7 +353,7 @@ class DebugNamesAbbrev : public FoldingSetNode {
     dwarf::Index Index;
     dwarf::Form Form;
   };
-  DebugNamesAbbrev(uint32_t DieTag) : DieTag(DieTag) {}
+  DebugNamesAbbrev(uint32_t DieTag) : DieTag(DieTag), Number(0) {}
   /// Add attribute encoding to an abbreviation.
   void addAttribute(const DebugNamesAbbrev::AttributeEncoding &Attr) {
     AttrVect.push_back(Attr);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index 55cdc3c9286420..2e8e7d0a88af03 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -369,7 +369,8 @@ void AppleAccelTableWriter::emit() const {
 DWARF5AccelTableData::DWARF5AccelTableData(const DIE &Die,
                                            const uint32_t UnitID,
                                            const bool IsTU)
-    : OffsetVal(&Die), DieTag(Die.getTag()), IsTU(IsTU), UnitID(UnitID) {}
+    : OffsetVal(&Die), DieTag(Die.getTag()), AbbrevNumber(0), IsTU(IsTU),
+      UnitID(UnitID) {}
 
 void Dwarf5AccelTableWriter::Header::emit(Dwarf5AccelTableWriter &Ctx) {
   assert(CompUnitCount > 0 && "Index must have at least one CU.");
diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp
index dd78dc02159b94..628bff520a12e9 100644
--- a/llvm/utils/TableGen/DecoderEmitter.cpp
+++ b/llvm/utils/TableGen/DecoderEmitter.cpp
@@ -934,7 +934,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
       unsigned Shift = 0;
       do {
         OS << ", " << (unsigned)*I;
-        Value += (*I & 0x7f) << Shift;
+        Value += ((uint64_t)(*I & 0x7f)) << Shift;
         Shift += 7;
       } while (*I++ >= 128);
       if (Value > 127) {
@@ -947,7 +947,7 @@ void DecoderEmitter::emitTable(formatted_raw_ostream &OS, DecoderTable &Table,
       Shift = 0;
       do {
         OS << ", " << (unsigned)*I;
-        Value += (*I & 0x7f) << Shift;
+        Value += ((uint64_t)(*I & 0x7f)) << Shift;
         Shift += 7;
       } while (*I++ >= 128);
       if (Value > 127) {

>From d2d9a14f34c6d6b9b28f470d293cdca80fbb8a8b Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Wed, 13 Mar 2024 14:14:24 +0000
Subject: [PATCH 34/63] [lldb][test] Disable other runlocker test on AArch64
 Linux

Flaky on the bot: https://lab.llvm.org/buildbot/#/builders/96/builds/54435
---
 lldb/test/API/python_api/run_locker/TestRunLocker.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/test/API/python_api/run_locker/TestRunLocker.py b/lldb/test/API/python_api/run_locker/TestRunLocker.py
index 10832840ac0955..4e0dd26bff70d2 100644
--- a/lldb/test/API/python_api/run_locker/TestRunLocker.py
+++ b/lldb/test/API/python_api/run_locker/TestRunLocker.py
@@ -15,6 +15,8 @@ class TestRunLocker(TestBase):
     NO_DEBUG_INFO_TESTCASE = True
 
     @expectedFailureAll(oslist=["windows"])
+    # Is flaky on Linux AArch64 buildbot.
+    @skipIf(oslist=["linux"], archs=["aarch64"])
     def test_run_locker(self):
         """Test that the run locker is set correctly when we launch"""
         self.build()

>From 46c2627926c0e8b135afbe0d7f5749c653cd15f6 Mon Sep 17 00:00:00 2001
From: Zepp <luzepu678 at gmail.com>
Date: Wed, 13 Mar 2024 22:25:29 +0800
Subject: [PATCH 35/63] [Clang] [Docs] Add reference to documentation of
 `SysVABIAttr` (#85022)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We for some reason already had documentation for this attribute, but just weren’t linking to it.
---
 clang/include/clang/Basic/Attr.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 63efd85dcd4e58..67d87eca16ede8 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -2934,7 +2934,7 @@ def Suppress : DeclOrStmtAttr {
 def SysVABI : DeclOrTypeAttr {
   let Spellings = [GCC<"sysv_abi">];
 //  let Subjects = [Function, ObjCMethod];
-  let Documentation = [Undocumented];
+  let Documentation = [SysVABIDocs];
 }
 
 def ThisCall : DeclOrTypeAttr {

>From b4ae8df8bf3d3f0bce71d8e9d89651b3056b9fa7 Mon Sep 17 00:00:00 2001
From: Zaara Syeda <95926691+syzaara at users.noreply.github.com>
Date: Wed, 13 Mar 2024 10:26:31 -0400
Subject: [PATCH 36/63] [AIX][TOC] Add -mtocdata/-mno-tocdata options on AIX
 (#67999)

This patch enables support that the XL compiler had for AIX under
-qdatalocal/-qdataimported.
---
 clang/docs/UsersManual.rst                    |  61 ++++++++++
 clang/include/clang/Basic/CodeGenOptions.h    |   9 ++
 .../clang/Basic/DiagnosticDriverKinds.td      |   3 +
 .../clang/Basic/DiagnosticFrontendKinds.td    |   2 +
 clang/include/clang/Driver/Options.td         |  21 ++++
 clang/lib/CodeGen/CGDecl.cpp                  |   1 +
 clang/lib/CodeGen/CodeGenModule.cpp           |  26 +++++
 clang/lib/CodeGen/Targets/PPC.cpp             |  59 ++++++++++
 clang/lib/Driver/ToolChains/AIX.cpp           |  87 ++++++++++++++
 clang/lib/Frontend/CompilerInstance.cpp       |   5 +
 .../test/CodeGen/PowerPC/toc-data-attribute.c |  50 ++++++++
 .../CodeGen/PowerPC/toc-data-attribute.cpp    |  39 +++++++
 .../CodeGen/PowerPC/toc-data-diagnostics.c    |  68 +++++++++++
 .../PowerPC/toc-data-structs-arrays.cpp       |  65 +++++++++++
 clang/test/Driver/toc-conf.c                  |  30 +++++
 clang/test/Driver/tocdata-cc1.c               |  16 +++
 llvm/include/llvm/ADT/STLExtras.h             |  13 +++
 llvm/lib/MC/MCSectionXCOFF.cpp                |   3 +-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     |   2 +
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp   |  34 ------
 llvm/lib/Target/PowerPC/PPCSubtarget.cpp      |  22 ++++
 llvm/lib/Target/PowerPC/PPCSubtarget.h        |   2 +
 .../CodeGen/PowerPC/toc-data-large-array.ll   |  16 +++
 .../CodeGen/PowerPC/toc-data-large-array2.ll  |   8 ++
 .../CodeGen/PowerPC/toc-data-struct-array.ll  | 110 ++++++++++++++++++
 25 files changed, 716 insertions(+), 36 deletions(-)
 create mode 100644 clang/test/CodeGen/PowerPC/toc-data-attribute.c
 create mode 100644 clang/test/CodeGen/PowerPC/toc-data-attribute.cpp
 create mode 100644 clang/test/CodeGen/PowerPC/toc-data-diagnostics.c
 create mode 100644 clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp
 create mode 100644 clang/test/Driver/toc-conf.c
 create mode 100644 clang/test/Driver/tocdata-cc1.c
 create mode 100644 llvm/test/CodeGen/PowerPC/toc-data-large-array.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll
 create mode 100644 llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll

diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst
index 7391e4cf3a9aeb..7a63d720241a7a 100644
--- a/clang/docs/UsersManual.rst
+++ b/clang/docs/UsersManual.rst
@@ -4227,7 +4227,68 @@ Clang expects the GCC executable "gcc.exe" compiled for
 
 AIX
 ^^^
+TOC Data Transformation
+"""""""""""""""""""""""
+TOC data transformation is off by default (``-mno-tocdata``).
+When ``-mtocdata`` is specified, the TOC data transformation will be applied to
+all suitable variables with static storage duration, including static data
+members of classes and block-scope static variables (if not marked as exceptions,
+see further below).
 
+Suitable variables must:
+
+-  have complete types
+-  be independently generated (i.e., not placed in a pool)
+-  be at most as large as a pointer
+-  not be aligned more strictly than a pointer
+-  not be structs containing flexible array members
+-  not have internal linkage
+-  not have aliases
+-  not have section attributes
+-  not be thread local storage
+
+The TOC data transformation results in the variable, not its address,
+being placed in the TOC. This eliminates the need to load the address of the
+variable from the TOC.
+
+Note:
+If the TOC data transformation is applied to a variable whose definition
+is imported, the linker will generate fixup code for reading or writing to the
+variable.
+
+When multiple toc-data options are used, the last option used has the affect.
+For example: -mno-tocdata=g5,g1 -mtocdata=g1,g2 -mno-tocdata=g2 -mtocdata=g3,g4
+results in -mtocdata=g1,g3,g4
+
+Names of variables not having external linkage will be ignored.
+
+**Options:**
+
+.. option:: -mno-tocdata
+
+  This is the default behaviour. Only variables explicitly specified with
+  ``-mtocdata=`` will have the TOC data transformation applied.
+
+.. option:: -mtocdata
+
+  Apply the TOC data transformation to all suitable variables with static
+  storage duration (including static data members of classes and block-scope
+  static variables) that are not explicitly specified with ``-mno-tocdata=``.
+
+.. option:: -mno-tocdata=
+
+  Can be used in conjunction with ``-mtocdata`` to mark the comma-separated
+  list of external linkage variables, specified using their mangled names, as
+  exceptions to ``-mtocdata``.
+
+.. option:: -mtocdata=
+
+  Apply the TOC data transformation to the comma-separated list of external
+  linkage variables, specified using their mangled names, if they are suitable.
+  Emit diagnostics for all unsuitable variables specified.
+
+Default Visibility Export Mapping
+"""""""""""""""""""""""""""""""""
 The ``-mdefault-visibility-export-mapping=`` option can be used to control
 mapping of default visibility to an explicit shared object export
 (i.e. XCOFF exported visibility). Three values are provided for the option:
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index 3f8fe385fef3df..cf29e576ef3209 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -404,6 +404,15 @@ class CodeGenOptions : public CodeGenOptionsBase {
   /// List of pass builder callbacks.
   std::vector<std::function<void(llvm::PassBuilder &)>> PassBuilderCallbacks;
 
+  /// List of global variables explicitly specified by the user as toc-data.
+  std::vector<std::string> TocDataVarsUserSpecified;
+
+  /// List of global variables that over-ride the toc-data default.
+  std::vector<std::string> NoTocDataVars;
+
+  /// Flag for all global variables to be treated as toc-data.
+  bool AllTocData;
+
   /// Path to allowlist file specifying which objects
   /// (files, functions) should exclusively be instrumented
   /// by sanitizer coverage pass.
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 1bc9885849d54b..e33a1f4c45b949 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -587,6 +587,9 @@ def warn_drv_unsupported_gpopt : Warning<
   "ignoring '-mgpopt' option as it cannot be used with %select{|the implicit"
   " usage of }0-mabicalls">,
   InGroup<UnsupportedGPOpt>;
+def warn_drv_unsupported_tocdata: Warning<
+  "ignoring '-mtocdata' as it is only supported for -mcmodel=small">,
+  InGroup<OptionIgnored>;
 def warn_drv_unsupported_sdata : Warning<
   "ignoring '-msmall-data-limit=' with -mcmodel=large for -fpic or RV64">,
   InGroup<OptionIgnored>;
diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
index dcd2c19fb7ee36..794a0a82be6d74 100644
--- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td
+++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td
@@ -94,6 +94,8 @@ def err_fe_backend_error_attr :
 def warn_fe_backend_warning_attr :
   Warning<"call to '%0' declared with 'warning' attribute: %1">, BackendInfo,
   InGroup<BackendWarningAttributes>;
+def warn_toc_unsupported_type : Warning<"-mtocdata option is ignored "
+  "for %0 because %1">, InGroup<BackendWarningAttributes>;
 
 def err_fe_invalid_code_complete_file : Error<
     "cannot locate code-completion file %0">, DefaultFatal;
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index aca8c9b0d5487a..1fac7b6f0093d8 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -3609,6 +3609,27 @@ def fpass_plugin_EQ : Joined<["-"], "fpass-plugin=">,
   MetaVarName<"<dsopath>">,
   HelpText<"Load pass plugin from a dynamic shared object file (only with new pass manager).">,
   MarshallingInfoStringVector<CodeGenOpts<"PassPlugins">>;
+defm tocdata : BoolOption<"m","tocdata",
+  CodeGenOpts<"AllTocData">, DefaultFalse,
+  PosFlag<SetTrue, [], [ClangOption, CC1Option],
+  "All suitable variables will have the TOC data transformation applied">,
+  NegFlag<SetFalse, [], [ClangOption, CC1Option],
+  "This is the default. TOC data transformation is not applied to any"
+  "variables. Only variables specified explicitly in -mtocdata= will"
+  "have the TOC data transformation.">,
+  BothFlags<[TargetSpecific], [ClangOption, CLOption]>>, Group<m_Group>;
+def mtocdata_EQ : CommaJoined<["-"], "mtocdata=">,
+  Visibility<[ClangOption, CC1Option]>,
+  Flags<[TargetSpecific]>, Group<m_Group>,
+  HelpText<"Specifies a list of variables to which the TOC data transformation"
+           "will be applied.">,
+  MarshallingInfoStringVector<CodeGenOpts<"TocDataVarsUserSpecified">>;
+def mno_tocdata_EQ : CommaJoined<["-"], "mno-tocdata=">,
+  Visibility<[ClangOption, CC1Option]>,
+  Flags<[TargetSpecific]>, Group<m_Group>,
+  HelpText<"Specifies a list of variables to be exempt from the TOC data"
+           "transformation.">,
+  MarshallingInfoStringVector<CodeGenOpts<"NoTocDataVars">>;
 defm preserve_as_comments : BoolFOption<"preserve-as-comments",
   CodeGenOpts<"PreserveAsmComments">, DefaultTrue,
   NegFlag<SetFalse, [], [ClangOption, CC1Option],
diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp
index bbe14ef4c17244..dc42faf8dbb9fd 100644
--- a/clang/lib/CodeGen/CGDecl.cpp
+++ b/clang/lib/CodeGen/CGDecl.cpp
@@ -284,6 +284,7 @@ llvm::Constant *CodeGenModule::getOrCreateStaticVarDecl(
     setTLSMode(GV, D);
 
   setGVProperties(GV, &D);
+  getTargetCodeGenInfo().setTargetAttributes(cast<Decl>(&D), GV, *this);
 
   // Make sure the result is of the correct type.
   LangAS ExpectedAS = Ty.getAddressSpace();
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 967319bdfc4571..8ceecff28cbc63 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -626,6 +626,26 @@ static bool checkAliasedGlobal(
   return true;
 }
 
+// Emit a warning if toc-data attribute is requested for global variables that
+// have aliases and remove the toc-data attribute.
+static void checkAliasForTocData(llvm::GlobalVariable *GVar,
+                                 const CodeGenOptions &CodeGenOpts,
+                                 DiagnosticsEngine &Diags,
+                                 SourceLocation Location) {
+  if (GVar->hasAttribute("toc-data")) {
+    auto GVId = GVar->getName();
+    // Is this a global variable specified by the user as local?
+    if ((llvm::binary_search(CodeGenOpts.TocDataVarsUserSpecified, GVId))) {
+      Diags.Report(Location, diag::warn_toc_unsupported_type)
+          << GVId << "the variable has an alias";
+    }
+    llvm::AttributeSet CurrAttributes = GVar->getAttributes();
+    llvm::AttributeSet NewAttributes =
+        CurrAttributes.removeAttribute(GVar->getContext(), "toc-data");
+    GVar->setAttributes(NewAttributes);
+  }
+}
+
 void CodeGenModule::checkAliases() {
   // Check if the constructed aliases are well formed. It is really unfortunate
   // that we have to do this in CodeGen, but we only construct mangled names
@@ -652,6 +672,12 @@ void CodeGenModule::checkAliases() {
       continue;
     }
 
+    if (getContext().getTargetInfo().getTriple().isOSAIX())
+      if (const llvm::GlobalVariable *GVar =
+              dyn_cast<const llvm::GlobalVariable>(GV))
+        checkAliasForTocData(const_cast<llvm::GlobalVariable *>(GVar),
+                             getCodeGenOpts(), Diags, Location);
+
     llvm::Constant *Aliasee =
         IsIFunc ? cast<llvm::GlobalIFunc>(Alias)->getResolver()
                 : cast<llvm::GlobalAlias>(Alias)->getAliasee();
diff --git a/clang/lib/CodeGen/Targets/PPC.cpp b/clang/lib/CodeGen/Targets/PPC.cpp
index 40dddde508c177..00b04723f17dd2 100644
--- a/clang/lib/CodeGen/Targets/PPC.cpp
+++ b/clang/lib/CodeGen/Targets/PPC.cpp
@@ -8,6 +8,7 @@
 
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
+#include "clang/Basic/DiagnosticFrontend.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -145,6 +146,9 @@ class AIXTargetCodeGenInfo : public TargetCodeGenInfo {
 
   bool initDwarfEHRegSizeTable(CodeGen::CodeGenFunction &CGF,
                                llvm::Value *Address) const override;
+
+  void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV,
+                           CodeGen::CodeGenModule &M) const override;
 };
 } // namespace
 
@@ -265,6 +269,61 @@ bool AIXTargetCodeGenInfo::initDwarfEHRegSizeTable(
   return PPC_initDwarfEHRegSizeTable(CGF, Address, Is64Bit, /*IsAIX*/ true);
 }
 
+void AIXTargetCodeGenInfo::setTargetAttributes(
+    const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const {
+  if (!isa<llvm::GlobalVariable>(GV))
+    return;
+
+  auto *GVar = dyn_cast<llvm::GlobalVariable>(GV);
+  auto GVId = GV->getName();
+
+  // Is this a global variable specified by the user as toc-data?
+  bool UserSpecifiedTOC =
+      llvm::binary_search(M.getCodeGenOpts().TocDataVarsUserSpecified, GVId);
+  // Assumes the same variable cannot be in both TocVarsUserSpecified and
+  // NoTocVars.
+  if (UserSpecifiedTOC ||
+      ((M.getCodeGenOpts().AllTocData) &&
+       !llvm::binary_search(M.getCodeGenOpts().NoTocDataVars, GVId))) {
+    const unsigned long PointerSize =
+        GV->getParent()->getDataLayout().getPointerSizeInBits() / 8;
+    auto *VarD = dyn_cast<VarDecl>(D);
+    assert(VarD && "Invalid declaration of global variable.");
+
+    ASTContext &Context = D->getASTContext();
+    unsigned Alignment = Context.toBits(Context.getDeclAlign(D)) / 8;
+    const auto *Ty = VarD->getType().getTypePtr();
+    const RecordDecl *RDecl =
+        Ty->isRecordType() ? Ty->getAs<RecordType>()->getDecl() : nullptr;
+
+    bool EmitDiagnostic = UserSpecifiedTOC && GV->hasExternalLinkage();
+    auto reportUnsupportedWarning = [&](bool ShouldEmitWarning, StringRef Msg) {
+      if (ShouldEmitWarning)
+        M.getDiags().Report(D->getLocation(), diag::warn_toc_unsupported_type)
+            << GVId << Msg;
+    };
+    if (!Ty || Ty->isIncompleteType())
+      reportUnsupportedWarning(EmitDiagnostic, "of incomplete type");
+    else if (RDecl && RDecl->hasFlexibleArrayMember())
+      reportUnsupportedWarning(EmitDiagnostic,
+                               "it contains a flexible array member");
+    else if (VarD->getTLSKind() != VarDecl::TLS_None)
+      reportUnsupportedWarning(EmitDiagnostic, "of thread local storage");
+    else if (PointerSize < Context.getTypeInfo(VarD->getType()).Width / 8)
+      reportUnsupportedWarning(EmitDiagnostic,
+                               "variable is larger than a pointer");
+    else if (PointerSize < Alignment)
+      reportUnsupportedWarning(EmitDiagnostic,
+                               "variable is aligned wider than a pointer");
+    else if (D->hasAttr<SectionAttr>())
+      reportUnsupportedWarning(EmitDiagnostic,
+                               "variable has a section attribute");
+    else if (GV->hasExternalLinkage() ||
+             (M.getCodeGenOpts().AllTocData && !GV->hasLocalLinkage()))
+      GVar->addAttribute("toc-data");
+  }
+}
+
 // PowerPC-32
 namespace {
 /// PPC32_SVR4_ABIInfo - The 32-bit PowerPC ELF (SVR4) ABI information.
diff --git a/clang/lib/Driver/ToolChains/AIX.cpp b/clang/lib/Driver/ToolChains/AIX.cpp
index 3c7049a99982d0..6e089903a3158a 100644
--- a/clang/lib/Driver/ToolChains/AIX.cpp
+++ b/clang/lib/Driver/ToolChains/AIX.cpp
@@ -433,6 +433,88 @@ void AIX::AddCXXStdlibLibArgs(const llvm::opt::ArgList &Args,
   llvm_unreachable("Unexpected C++ library type; only libc++ is supported.");
 }
 
+// This function processes all the mtocdata options to build the final
+// simplified toc data options to pass to CC1.
+static void addTocDataOptions(const llvm::opt::ArgList &Args,
+                              llvm::opt::ArgStringList &CC1Args,
+                              const Driver &D) {
+
+  // Check the global toc-data setting. The default is -mno-tocdata.
+  // To enable toc-data globally, -mtocdata must be specified.
+  // Additionally, it must be last to take effect.
+  const bool TOCDataGloballyinEffect = [&Args]() {
+    if (const Arg *LastArg =
+            Args.getLastArg(options::OPT_mtocdata, options::OPT_mno_tocdata))
+      return LastArg->getOption().matches(options::OPT_mtocdata);
+    else
+      return false;
+  }();
+
+  // Currently only supported for small code model.
+  if (TOCDataGloballyinEffect &&
+      (Args.getLastArgValue(options::OPT_mcmodel_EQ).equals("large") ||
+       Args.getLastArgValue(options::OPT_mcmodel_EQ).equals("medium"))) {
+    D.Diag(clang::diag::warn_drv_unsupported_tocdata);
+    return;
+  }
+
+  enum TOCDataSetting {
+    AddressInTOC = 0, // Address of the symbol stored in the TOC.
+    DataInTOC = 1     // Symbol defined in the TOC.
+  };
+
+  const TOCDataSetting DefaultTocDataSetting =
+      TOCDataGloballyinEffect ? DataInTOC : AddressInTOC;
+
+  // Process the list of variables in the explicitly specified options
+  // -mtocdata= and -mno-tocdata= to see which variables are opposite to
+  // the global setting of tocdata in TOCDataGloballyinEffect.
+  // Those that have the opposite setting to TOCDataGloballyinEffect, are added
+  // to ExplicitlySpecifiedGlobals.
+  llvm::StringSet<> ExplicitlySpecifiedGlobals;
+  for (const auto Arg :
+       Args.filtered(options::OPT_mtocdata_EQ, options::OPT_mno_tocdata_EQ)) {
+    TOCDataSetting ArgTocDataSetting =
+        Arg->getOption().matches(options::OPT_mtocdata_EQ) ? DataInTOC
+                                                           : AddressInTOC;
+
+    if (ArgTocDataSetting != DefaultTocDataSetting)
+      for (const char *Val : Arg->getValues())
+        ExplicitlySpecifiedGlobals.insert(Val);
+    else
+      for (const char *Val : Arg->getValues())
+        ExplicitlySpecifiedGlobals.erase(Val);
+  }
+
+  auto buildExceptionList = [](const llvm::StringSet<> &ExplicitValues,
+                               const char *OptionSpelling) {
+    std::string Option(OptionSpelling);
+    bool IsFirst = true;
+    for (const auto &E : ExplicitValues) {
+      if (!IsFirst)
+        Option += ",";
+
+      IsFirst = false;
+      Option += E.first();
+    }
+    return Option;
+  };
+
+  // Pass the final tocdata options to CC1 consisting of the default
+  // tocdata option (-mtocdata/-mno-tocdata) along with the list
+  // option (-mno-tocdata=/-mtocdata=) if there are any explicitly specified
+  // variables which would be exceptions to the default setting.
+  const char *TocDataGlobalOption =
+      TOCDataGloballyinEffect ? "-mtocdata" : "-mno-tocdata";
+  CC1Args.push_back(TocDataGlobalOption);
+
+  const char *TocDataListOption =
+      TOCDataGloballyinEffect ? "-mno-tocdata=" : "-mtocdata=";
+  if (!ExplicitlySpecifiedGlobals.empty())
+    CC1Args.push_back(Args.MakeArgString(llvm::Twine(
+        buildExceptionList(ExplicitlySpecifiedGlobals, TocDataListOption))));
+}
+
 void AIX::addClangTargetOptions(
     const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CC1Args,
     Action::OffloadKind DeviceOffloadingKind) const {
@@ -440,6 +522,11 @@ void AIX::addClangTargetOptions(
   Args.AddLastArg(CC1Args, options::OPT_mdefault_visibility_export_mapping_EQ);
   Args.addOptInFlag(CC1Args, options::OPT_mxcoff_roptr, options::OPT_mno_xcoff_roptr);
 
+  // Forward last mtocdata/mno_tocdata options to -cc1.
+  if (Args.hasArg(options::OPT_mtocdata_EQ, options::OPT_mno_tocdata_EQ,
+                  options::OPT_mtocdata))
+    addTocDataOptions(Args, CC1Args, getDriver());
+
   if (Args.hasFlag(options::OPT_fxl_pragma_pack,
                    options::OPT_fno_xl_pragma_pack, true))
     CC1Args.push_back("-fxl-pragma-pack");
diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp
index ec4e68209d657d..019f847ccbaad0 100644
--- a/clang/lib/Frontend/CompilerInstance.cpp
+++ b/clang/lib/Frontend/CompilerInstance.cpp
@@ -1047,6 +1047,11 @@ bool CompilerInstance::ExecuteAction(FrontendAction &Act) {
   if (getFrontendOpts().ShowStats || !getFrontendOpts().StatsFile.empty())
     llvm::EnableStatistics(false);
 
+  // Sort vectors containing toc data and no toc data variables to facilitate
+  // binary search later.
+  llvm::sort(getCodeGenOpts().TocDataVarsUserSpecified);
+  llvm::sort(getCodeGenOpts().NoTocDataVars);
+
   for (const FrontendInputFile &FIF : getFrontendOpts().Inputs) {
     // Reset the ID tables if we are reusing the SourceManager and parsing
     // regular files.
diff --git a/clang/test/CodeGen/PowerPC/toc-data-attribute.c b/clang/test/CodeGen/PowerPC/toc-data-attribute.c
new file mode 100644
index 00000000000000..db23d74759eeff
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/toc-data-attribute.c
@@ -0,0 +1,50 @@
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata=f,g,h,i,j,k,l,m,n,o,p -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK32 --match-full-lines
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK32 --match-full-lines
+
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata=f,g,h,i,j,k,l,m,n,o,p -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK64 --match-full-lines
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,CHECK64 --match-full-lines
+
+extern int f;
+long long g = 5;
+const char *h = "h";
+int *i;
+int __attribute__((aligned(128))) j = 0;
+float k = 100.00;
+double l = 2.5;
+int m __attribute__((section("foo"))) = 10;
+__thread int n;
+
+extern int p[];
+
+struct SomeStruct;
+extern struct SomeStruct o;
+
+static int func_a() {
+  return g+(int)h[0]+*i+j+k+l+m+n+p[0];
+}
+
+int func_b() {
+  f = 1;
+  return func_a();
+}
+
+struct SomeStruct* getAddress(void) {
+  return &o;
+}
+
+// CHECK32: @g = global i64 5, align 8
+// CHECK64: @g = global i64 5, align 8 #0
+// COMMON: {{.*}}  = private unnamed_addr constant [2 x i8] c"h\00", align 1
+// COMMON: @h = global {{...*}} #0
+// COMMON: @j = global i32 0, align 128
+// COMMON: @k = global float 1.000000e+02, align 4 #0
+// CHECK32: @l = global double 2.500000e+00, align 8
+// CHECK64: @l = global double 2.500000e+00, align 8 #0
+// COMMON: @m = global i32 10, section "foo", align 4
+// COMMON: @f = external global i32, align 4 #0
+// COMMON: @o = external global %struct.SomeStruct, align 1
+// CHECK32: @i = global ptr null, align 4 #0
+// CHECK64: @i = global ptr null, align 8 #0
+// COMMON: @n = thread_local global i32 0, align 4
+// COMMON: @p = external global [0 x i32], align 4
+// COMMON: attributes #0 = { "toc-data" }
diff --git a/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp b/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp
new file mode 100644
index 00000000000000..8183e3b727e7b3
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/toc-data-attribute.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,ALLTOC
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata=n,_ZN11MyNamespace10myVariableE,_ZL1s,_ZZ4testvE7counter -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,TOCLIST
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,ALLTOC
+// RUN: %clang_cc1 %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata=n,_ZN11MyNamespace10myVariableE,_ZL1s,_ZZ4testvE7counter -emit-llvm -o - 2>&1 | FileCheck %s -check-prefixes=COMMON,TOCLIST
+
+extern int n;
+static int s = 100;
+
+inline int test() {
+    static int counter = 0;
+    counter++;
+    return counter;
+}
+
+int a () {
+    n = test();
+    return 0;
+}
+
+namespace MyNamespace {
+    int myVariable = 10;
+}
+
+int b(int x) {
+    using namespace MyNamespace;
+    return x + myVariable;
+}
+
+int c(int x) {
+  s += x;
+  return s;
+}
+
+// COMMON: @n = external global i32, align 4 #0
+// COMMON: @_ZN11MyNamespace10myVariableE = global i32 10, align 4 #0
+// COMMON-NOT: @_ZL1s = internal global i32 100, align 4 #0
+// ALLTOC: @_ZZ4testvE7counter = linkonce_odr global i32 0, align 4 #0
+// TOCLIST-NOT: @_ZZ4testvE7counter = linkonce_odr global i32 0, align 4 #0
+// COMMON: attributes #0 = { "toc-data" }
diff --git a/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c b/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c
new file mode 100644
index 00000000000000..ba8955530e4670
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/toc-data-diagnostics.c
@@ -0,0 +1,68 @@
+// RUN: %clang_cc1 %s -triple=powerpc-ibm-aix-xcoff -S -mtocdata=h,g,f,e,d,c,b,a,globalOneWithAlias,globalTwoWithAlias,ll,t3 -verify -emit-llvm -o - | FileCheck %s -check-prefix=CHECK --match-full-lines
+// RUN: %clang_cc1 %s -triple=powerpc-ibm-aix-xcoff -S -mtocdata -verify=none -emit-llvm -o - | FileCheck %s -check-prefix=CHECK --match-full-lines
+
+// none-no-diagnostics
+
+struct large_struct {
+  int x;
+  short y;
+  short z;
+  char c;
+};
+
+struct large_struct a;                      // expected-warning {{-mtocdata option is ignored for a because variable is larger than a pointer}}
+long long b = 5;                            // expected-warning {{-mtocdata option is ignored for b because variable is larger than a pointer}}
+int __attribute__((aligned(128))) c = 0;    // expected-warning {{-mtocdata option is ignored for c because variable is aligned wider than a pointer}}
+double d = 2.5;                             // expected-warning {{-mtocdata option is ignored for d because variable is larger than a pointer}}
+int e __attribute__((section("foo"))) = 10; // expected-warning {{-mtocdata option is ignored for e because variable has a section attribute}}
+__thread int f;                             // expected-warning {{-mtocdata option is ignored for f because of thread local storage}}
+
+struct SomeStruct;
+extern struct SomeStruct g;                 // expected-warning {{-mtocdata option is ignored for g because of incomplete type}}
+
+extern int h[];                             // expected-warning {{-mtocdata option is ignored for h because of incomplete type}}
+
+struct ty3 {
+  int A;
+  char C[];
+};
+struct ty3 t3 = { 4, "fo" }; // expected-warning {{-mtocdata option is ignored for t3 because it contains a flexible array member}}
+
+int globalOneWithAlias = 10;
+__attribute__((__alias__("globalOneWithAlias"))) extern int aliasOne; // expected-warning {{-mtocdata option is ignored for globalOneWithAlias because the variable has an alias}}
+__attribute__((__alias__("globalTwoWithAlias"))) extern int aliasTwo; // expected-warning {{-mtocdata option is ignored for globalTwoWithAlias because the variable has an alias}}
+int globalTwoWithAlias = 20;
+
+
+int func() {
+  return a.x+b+c+d+e+f+h[0];
+}
+
+struct SomeStruct* getAddress(void) {
+  return &g;
+}
+
+int test() {
+  return globalOneWithAlias + globalTwoWithAlias + aliasOne + aliasTwo;
+}
+
+long long test2() {
+  static long long ll = 5;
+  ll++;
+  return ll;
+}
+
+// CHECK: @b = global i64 5, align 8
+// CHECK: @c = global i32 0, align 128
+// CHECK: @d = global double 2.500000e+00, align 8
+// CHECK: @e = global i32 10, section "foo", align 4
+// CHECK: @globalOneWithAlias = global i32 10, align 4
+// CHECK: @globalTwoWithAlias = global i32 20, align 4
+// CHECK: @a = global %struct.large_struct zeroinitializer, align 4
+// CHECK: @f = thread_local global i32 0, align 4
+// CHECK: @h = external global [0 x i32], align 4
+// CHECK: @g = external global %struct.SomeStruct, align 1 
+// CHECK: @test2.ll = internal global i64 5, align 8
+// CHECK: @aliasOne = alias i32, ptr @globalOneWithAlias
+// CHECK: @aliasTwo = alias i32, ptr @globalTwoWithAlias
+// CHECK-NOT: attributes #0 = { "toc-data" }
diff --git a/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp b/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp
new file mode 100644
index 00000000000000..a717995cdceb19
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/toc-data-structs-arrays.cpp
@@ -0,0 +1,65 @@
+// RUN: %clang_cc1 %s -triple powerpc-ibm-aix-xcoff -S -mtocdata=a4,a5,a8,a9,b,c,d,e,v -emit-llvm -o - 2>&1 \
+// RUN:          | FileCheck %s -check-prefixes=CHECK32 --match-full-lines
+// RUN: %clang_cc1  %s -triple powerpc-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 \
+// RUN:          | FileCheck %s -check-prefixes=CHECK32 --match-full-lines
+
+// RUN: %clang_cc1  %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata=a4,a5,a8,a9,b,c,d,e,v -emit-llvm -o - 2>&1 \
+// RUN:          | FileCheck %s -check-prefixes=CHECK64 --match-full-lines
+// RUN: %clang_cc1  %s -triple powerpc64-ibm-aix-xcoff -S -mtocdata -emit-llvm -o - 2>&1 \
+// RUN:          | FileCheck %s -check-prefixes=CHECK64 --match-full-lines
+
+struct size4_struct {
+  int x;
+};
+
+struct size5_struct {
+  int x;
+  char c;
+};
+
+struct size8_struct {
+  int x;
+  short y;
+  short z;
+};
+
+struct size9_struct {
+  int x;
+  short y;
+  short z;
+  char c;
+};
+
+struct size4_struct a4;
+struct size5_struct a5;
+struct size8_struct a8;
+struct size9_struct a9;
+
+short b[2];
+short c[3];
+short d[4];
+short e[5];
+
+int func_a() {
+  return a4.x+a5.x+a8.x+a9.x+b[0]+c[0]+d[0]+e[0];
+}
+
+// CHECK32: @a4 = global %struct.size4_struct zeroinitializer, align 4 #0
+// CHECK32: @a5 = global %struct.size5_struct zeroinitializer, align 4
+// CHECK32: @a8 = global %struct.size8_struct zeroinitializer, align 4
+// CHECK32: @a9 = global %struct.size9_struct zeroinitializer, align 4
+// CHECK32: @b = global [2 x i16] zeroinitializer, align 2 #0
+// CHECK32: @c = global [3 x i16] zeroinitializer, align 2
+// CHECK32: @d = global [4 x i16] zeroinitializer, align 2
+// CHECK32: @e = global [5 x i16] zeroinitializer, align 2
+// CHECK32: attributes #0 = { "toc-data" }
+
+// CHECK64: @a4 = global %struct.size4_struct zeroinitializer, align 4 #0
+// CHECK64: @a5 = global %struct.size5_struct zeroinitializer, align 4 #0
+// CHECK64: @a8 = global %struct.size8_struct zeroinitializer, align 4 #0
+// CHECK64: @a9 = global %struct.size9_struct zeroinitializer, align 4
+// CHECK64: @b = global [2 x i16] zeroinitializer, align 2 #0
+// CHECK64: @c = global [3 x i16] zeroinitializer, align 2 #0
+// CHECK64: @d = global [4 x i16] zeroinitializer, align 2 #0
+// CHECK64: @e = global [5 x i16] zeroinitializer, align 2
+// CHECK64: attributes #0 = { "toc-data" }
diff --git a/clang/test/Driver/toc-conf.c b/clang/test/Driver/toc-conf.c
new file mode 100644
index 00000000000000..80d92ee1a90b4b
--- /dev/null
+++ b/clang/test/Driver/toc-conf.c
@@ -0,0 +1,30 @@
+// RUN: %clang %s --target=powerpc-unknown-aix -mno-tocdata -mtocdata -mno-tocdata -### 2>&1 | FileCheck %s -check-prefix=CHECK-FLAG1
+// RUN: %clang %s --target=powerpc-unknown-aix -mno-tocdata -mtocdata -mno-tocdata -mtocdata -### 2>&1 | FileCheck %s -check-prefix=CHECK-FLAG2
+// RUN: %clang %s --target=powerpc-unknown-aix -mtocdata=g1,g2 -mno-tocdata=g2 -mtocdata=g3,g4 -mno-tocdata=g5,g1 -### 2>&1 | FileCheck %s -check-prefix=CHECK-EQCONF
+// RUN: %clang %s --target=powerpc-unknown-aix -mtocdata=g1 -mtocdata -mno-tocdata -mtocdata=g2,g3 -mno-tocdata=g4,g5,g3 -### 2>&1 | FileCheck %s -check-prefix=CHECK-CONF1
+// RUN: %clang %s --target=powerpc-unknown-aix -mno-tocdata=g1 -mno-tocdata -mtocdata -### 2>&1 | FileCheck %s -check-prefix=CHECK-CONF2
+
+int g1, g4, g5;
+extern int g2;
+int g3 = 0;
+void func() {
+  g2 = 0;
+}
+
+// CHECK-FLAG1-NOT: warning:
+// CHECK-FLAG1: "-cc1"{{.*}}" "-mno-tocdata"
+
+// CHECK-FLAG2-NOT: warning:
+// CHECK-FLAG2: "-cc1"{{.*}}" "-mtocdata"
+
+// CHECK-EQCONF-NOT: warning:
+// CHECK-EQCONF: "-cc1"{{.*}}" "-mno-tocdata"
+// CHECK-EQCONF: "-mtocdata=g3,g4"
+
+// CHECK-CONF1-NOT: warning:
+// CHECK-CONF1: "-cc1"{{.*}}" "-mno-tocdata"
+// CHECK-CONF1: "-mtocdata=g2,g1"
+
+// CHECK-CONF2-NOT: warning:
+// CHECK-CONF2: "-cc1"{{.*}}" "-mtocdata"
+// CHECK-CONF2: "-mno-tocdata=g1"
diff --git a/clang/test/Driver/tocdata-cc1.c b/clang/test/Driver/tocdata-cc1.c
new file mode 100644
index 00000000000000..fe0d97ea02db1f
--- /dev/null
+++ b/clang/test/Driver/tocdata-cc1.c
@@ -0,0 +1,16 @@
+// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mcmodel=medium -mtocdata %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mcmodel=large -mtocdata %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: %clang -### --target=powerpc-ibm-aix-xcoff -mtocdata %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-TOC %s
+// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mcmodel=medium -mtocdata %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mcmodel=large -mtocdata %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-NOTOC %s
+// RUN: %clang -### --target=powerpc64-ibm-aix-xcoff -mtocdata %s 2>&1 \
+// RUN:   | FileCheck -check-prefix=CHECK-TOC %s
+// CHECK-NOTOC: warning: ignoring '-mtocdata' as it is only supported for -mcmodel=small
+// CHECK-NOTOC-NOT: "-cc1"{{.*}}" "-mtocdata"
+// CHECK-TOC: "-cc1"{{.*}}" "-mtocdata"
+// CHECK-TOC-NOT: warning: ignoring '-mtocdata' as it is only supported for -mcmodel=small
diff --git a/llvm/include/llvm/ADT/STLExtras.h b/llvm/include/llvm/ADT/STLExtras.h
index 5ac549c4475605..02a3074ae1f0d7 100644
--- a/llvm/include/llvm/ADT/STLExtras.h
+++ b/llvm/include/llvm/ADT/STLExtras.h
@@ -1945,6 +1945,19 @@ auto partition(R &&Range, UnaryPredicate P) {
   return std::partition(adl_begin(Range), adl_end(Range), P);
 }
 
+/// Provide wrappers to std::binary_search which take ranges instead of having
+/// to pass begin/end explicitly.
+template <typename R, typename T> auto binary_search(R &&Range, T &&Value) {
+  return std::binary_search(adl_begin(Range), adl_end(Range),
+                            std::forward<T>(Value));
+}
+
+template <typename R, typename T, typename Compare>
+auto binary_search(R &&Range, T &&Value, Compare C) {
+  return std::binary_search(adl_begin(Range), adl_end(Range),
+                            std::forward<T>(Value), C);
+}
+
 /// Provide wrappers to std::lower_bound which take ranges instead of having to
 /// pass begin/end explicitly.
 template <typename R, typename T> auto lower_bound(R &&Range, T &&Value) {
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 95d32e3580e3f5..609ef096293042 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -87,8 +87,7 @@ void MCSectionXCOFF::printSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
     if (getKind().isCommon() && !getKind().isBSSLocal())
       return;
 
-    assert((getKind().isBSSExtern() || getKind().isBSSLocal()) &&
-           "Unexepected section kind for toc-data");
+    assert(getKind().isBSS() && "Unexpected section kind for toc-data");
     printCsectDirective(OS);
     return;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 9396ca22dacf86..6f33b16f045a87 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -2659,6 +2659,8 @@ void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
   // If the Global Variable has the toc-data attribute, it needs to be emitted
   // when we emit the .toc section.
   if (GV->hasAttribute("toc-data")) {
+    unsigned PointerSize = GV->getParent()->getDataLayout().getPointerSize();
+    Subtarget->tocDataChecks(PointerSize, GV);
     TOCDataGlobalVars.push_back(GV);
     return;
   }
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 9e5f0b36616d1b..2462cbb1928235 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -521,40 +521,6 @@ static bool hasTocDataAttr(SDValue Val, unsigned PointerSize) {
 
   if (!GV->hasAttribute("toc-data"))
     return false;
-
-  // TODO: These asserts should be updated as more support for the toc data
-  // transformation is added (struct support, etc.).
-
-  assert(
-      PointerSize >= GV->getAlign().valueOrOne().value() &&
-      "GlobalVariables with an alignment requirement stricter than TOC entry "
-      "size not supported by the toc data transformation.");
-
-  Type *GVType = GV->getValueType();
-
-  assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
-                              "supported by the toc data transformation.");
-
-  if (GVType->isVectorTy())
-    report_fatal_error("A GlobalVariable of Vector type is not currently "
-                       "supported by the toc data transformation.");
-
-  if (GVType->isArrayTy())
-    report_fatal_error("A GlobalVariable of Array type is not currently "
-                       "supported by the toc data transformation.");
-
-  if (GVType->isStructTy())
-    report_fatal_error("A GlobalVariable of Struct type is not currently "
-                       "supported by the toc data transformation.");
-
-  assert(GVType->getPrimitiveSizeInBits() <= PointerSize * 8 &&
-         "A GlobalVariable with size larger than a TOC entry is not currently "
-         "supported by the toc data transformation.");
-
-  if (GV->hasPrivateLinkage())
-    report_fatal_error("A GlobalVariable with private linkage is not "
-                       "currently supported by the toc data transformation.");
-
   return true;
 }
 
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 5380ec1c4c0d9c..884f2f5c57b258 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -185,6 +185,28 @@ bool PPCSubtarget::enableSubRegLiveness() const {
   return UseSubRegLiveness;
 }
 
+void PPCSubtarget::tocDataChecks(unsigned PointerSize,
+                                 const GlobalVariable *GV) const {
+  // TODO: These asserts should be updated as more support for the toc data
+  // transformation is added (struct support, etc.).
+  assert(
+      PointerSize >= GV->getAlign().valueOrOne().value() &&
+      "GlobalVariables with an alignment requirement stricter than TOC entry "
+      "size not supported by the toc data transformation.");
+
+  Type *GVType = GV->getValueType();
+  assert(GVType->isSized() && "A GlobalVariable's size must be known to be "
+                              "supported by the toc data transformation.");
+  if (GV->getParent()->getDataLayout().getTypeSizeInBits(GVType) >
+      PointerSize * 8)
+    report_fatal_error(
+        "A GlobalVariable with size larger than a TOC entry is not currently "
+        "supported by the toc data transformation.");
+  if (GV->hasPrivateLinkage())
+    report_fatal_error("A GlobalVariable with private linkage is not "
+                       "currently supported by the toc data transformation.");
+}
+
 bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
   // Large code model always uses the TOC even for local symbols.
   if (TM.getCodeModel() == CodeModel::Large)
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 306a52dca8362b..d913f22bd5ba9d 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -245,6 +245,8 @@ class PPCSubtarget : public PPCGenSubtargetInfo {
   /// True if the GV will be accessed via an indirect symbol.
   bool isGVIndirectSymbol(const GlobalValue *GV) const;
 
+  void tocDataChecks(unsigned PointerSize, const GlobalVariable *GV) const;
+
   /// True if the ABI is descriptor based.
   bool usesFunctionDescriptors() const {
     // Both 32-bit and 64-bit AIX are descriptor based. For ELF only the 64-bit
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll b/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll
new file mode 100644
index 00000000000000..90f40d9f0fec95
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-large-array.ll
@@ -0,0 +1,16 @@
+; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+
+ at a = global [5 x i16] zeroinitializer, align 2 #0
+
+; Function Attrs: noinline
+define i16 @foo() #1 {
+entry:
+  %0 = load i16, ptr @a, align 2
+  ret i16 %0
+}
+
+attributes #0 = { "toc-data" }
+attributes #1 = { noinline }
+
+; CHECK-ERROR: LLVM ERROR: A GlobalVariable with size larger than a TOC entry is not currently supported by the toc data transformation.
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll b/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll
new file mode 100644
index 00000000000000..f870e996a40192
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-large-array2.ll
@@ -0,0 +1,8 @@
+; RUN: not --crash llc -mtriple powerpc-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+; RUN: not --crash llc -mtriple powerpc64-ibm-aix-xcoff < %s 2>&1 | FileCheck %s --check-prefix CHECK-ERROR
+
+ at a = global [5 x i16] zeroinitializer, align 2 #0
+
+attributes #0 = { "toc-data" }
+
+; CHECK-ERROR: LLVM ERROR: A GlobalVariable with size larger than a TOC entry is not currently supported by the toc data transformation.
diff --git a/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll b/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll
new file mode 100644
index 00000000000000..a5c9a8b909d1cf
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/toc-data-struct-array.ll
@@ -0,0 +1,110 @@
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff < %s | FileCheck %s --check-prefix CHECK
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff < %s | FileCheck %s --check-prefix CHECK
+
+; RUN: llc -filetype=obj -mtriple powerpc-ibm-aix-xcoff < %s -o %t32.o
+; RUN: llvm-readobj %t32.o --syms | FileCheck %s --check-prefix=OBJ32
+; RUN: llc -filetype=obj -mtriple powerpc64-ibm-aix-xcoff < %s -o %t64.o
+; RUN: llvm-readobj %t64.o --syms | FileCheck %s --check-prefix=OBJ64
+
+%struct.small_struct = type { i16 }
+
+ at a = global %struct.small_struct zeroinitializer, align 2 #0
+ at b = global [2 x i16] zeroinitializer, align 2 #0
+
+; Function Attrs: noinline
+define i16 @foo() #1 {
+entry:
+  %0 = load i16, ptr @a, align 2
+  %1 = load i16, ptr @b, align 2
+  %add = add nsw i16 %0, %1
+  ret i16 %add
+}
+
+attributes #0 = { "toc-data" }
+attributes #1 = { noinline }
+
+; CHECK:      .toc
+; CHECK-NEXT: .csect a[TD],2
+; CHECK-NEXT: .globl    a[TD]                           # @a
+; CHECK-NEXT: .align    1
+; CHECK-NEXT: .space    2
+; CHECK-NEXT: .csect b[TD],2
+; CHECK-NEXT: .globl    b[TD]                           # @b
+; CHECK-NEXT: .align    1
+; CHECK-NEXT: .space    4
+
+; OBJ32:  Symbol {
+; OBJ32:    Name: a
+; OBJ32-NEXT:    Value (RelocatableAddress): 0x3C
+; OBJ32-NEXT:    Section: .data
+; OBJ32-NEXT:    Type: 0x0
+; OBJ32-NEXT:    StorageClass: C_EXT (0x2)
+; OBJ32-NEXT:    NumberOfAuxEntries: 1
+; OBJ32-NEXT:    CSECT Auxiliary Entry {
+; OBJ32-NEXT:      Index: {{[0-9]+}}
+; OBJ32-NEXT:      SectionLen: 2
+; OBJ32-NEXT:      ParameterHashIndex: 0x0
+; OBJ32-NEXT:      TypeChkSectNum: 0x0
+; OBJ32-NEXT:      SymbolAlignmentLog2: 2
+; OBJ32-NEXT:      SymbolType: XTY_SD (0x1)
+; OBJ32-NEXT:      StorageMappingClass: XMC_TD (0x10)
+; OBJ32-NEXT:      StabInfoIndex: 0x0
+; OBJ32-NEXT:      StabSectNum: 0x0
+; OBJ32-NEXT:    }
+; OBJ32-NEXT:  }
+; OBJ32-NEXT:  Symbol {
+; OBJ32:    Name: b
+; OBJ32-NEXT:    Value (RelocatableAddress): 0x40
+; OBJ32-NEXT:    Section: .data
+; OBJ32-NEXT:    Type: 0x0
+; OBJ32-NEXT:    StorageClass: C_EXT (0x2)
+; OBJ32-NEXT:    NumberOfAuxEntries: 1
+; OBJ32-NEXT:    CSECT Auxiliary Entry {
+; OBJ32-NEXT:      Index: {{[0-9]+}}
+; OBJ32-NEXT:      SectionLen: 4
+; OBJ32-NEXT:      ParameterHashIndex: 0x0
+; OBJ32-NEXT:      TypeChkSectNum: 0x0
+; OBJ32-NEXT:      SymbolAlignmentLog2: 2
+; OBJ32-NEXT:      SymbolType: XTY_SD (0x1)
+; OBJ32-NEXT:      StorageMappingClass: XMC_TD (0x10)
+; OBJ32-NEXT:      StabInfoIndex: 0x0
+; OBJ32-NEXT:      StabSectNum: 0x0
+; OBJ32-NEXT:    }
+; OBJ32-NEXT:  }
+
+; OBJ64:  Symbol {
+; OBJ64:    Name: a
+; OBJ64-NEXT:    Value (RelocatableAddress): 0x48
+; OBJ64-NEXT:    Section: .data
+; OBJ64-NEXT:    Type: 0x0
+; OBJ64-NEXT:    StorageClass: C_EXT (0x2)
+; OBJ64-NEXT:    NumberOfAuxEntries: 1
+; OBJ64-NEXT:    CSECT Auxiliary Entry {
+; OBJ64-NEXT:      Index: {{[0-9]+}}
+; OBJ64-NEXT:      SectionLen: 2
+; OBJ64-NEXT:      ParameterHashIndex: 0x0
+; OBJ64-NEXT:      TypeChkSectNum: 0x0
+; OBJ64-NEXT:      SymbolAlignmentLog2: 2
+; OBJ64-NEXT:      SymbolType: XTY_SD (0x1)
+; OBJ64-NEXT:      StorageMappingClass: XMC_TD (0x10)
+; OBJ64-NEXT:      Auxiliary Type: AUX_CSECT (0xFB)
+; OBJ64-NEXT:    }
+; OBJ64-NEXT:  }
+; OBJ64-NEXT:  Symbol {
+; OBJ64:    Name: b
+; OBJ64-NEXT:    Value (RelocatableAddress): 0x4C
+; OBJ64-NEXT:    Section: .data
+; OBJ64-NEXT:    Type: 0x0
+; OBJ64-NEXT:    StorageClass: C_EXT (0x2)
+; OBJ64-NEXT:    NumberOfAuxEntries: 1
+; OBJ64-NEXT:    CSECT Auxiliary Entry {
+; OBJ64-NEXT:      Index: {{[0-9]+}}
+; OBJ64-NEXT:      SectionLen: 4
+; OBJ64-NEXT:      ParameterHashIndex: 0x0
+; OBJ64-NEXT:      TypeChkSectNum: 0x0
+; OBJ64-NEXT:      SymbolAlignmentLog2: 2
+; OBJ64-NEXT:      SymbolType: XTY_SD (0x1)
+; OBJ64-NEXT:      StorageMappingClass: XMC_TD (0x10)
+; OBJ64-NEXT:      Auxiliary Type: AUX_CSECT (0xFB)
+; OBJ64-NEXT:    }
+; OBJ64-NEXT:  }

>From 9f9c3baf30af8502c772cd1e44f08b5052ca2e71 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 13 Mar 2024 14:29:59 +0000
Subject: [PATCH 37/63] [VPlan] Use VPBuilder to create BranchOnCond in
 VPHCFGBuilder.

This simplifies the code to create the recipe slightly as well as
properly retaining the debug location of the input IR.
---
 .../Transforms/Vectorize/VPlanHCFGBuilder.cpp |  3 +-
 .../LoopVectorize/dbg-outer-loop-vect.ll      | 40 +++++++++----------
 2 files changed, 21 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index 6474a9697dce89..877b5d438115de 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -296,8 +296,7 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       // recipes.
       if (Br->isConditional()) {
         VPValue *Cond = getOrCreateVPOperand(Br->getCondition());
-        VPBB->appendRecipe(
-            new VPInstruction(VPInstruction::BranchOnCond, {Cond}));
+        VPIRBuilder.createNaryOp(VPInstruction::BranchOnCond, {Cond}, Inst);
       }
 
       // Skip the rest of the Instruction processing for Branch instructions.
diff --git a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
index 7f209634fe053d..2c665a417ab599 100644
--- a/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
+++ b/llvm/test/Transforms/LoopVectorize/dbg-outer-loop-vect.ll
@@ -7,7 +7,7 @@ define void @foo(ptr %h) !dbg !4 {
 ; CHECK-LABEL: define void @foo(
 ; CHECK-SAME: ptr [[H:%.*]]) !dbg [[DBG4:![0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 0, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG20:![0-9]+]]
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i64 0, metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG20:![0-9]+]]
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]], !dbg [[DBG21:![0-9]+]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]], !dbg [[DBG21]]
@@ -27,15 +27,15 @@ define void @foo(ptr %h) !dbg !4 {
 ; CHECK-NEXT:    call void @llvm.masked.scatter.v4i32.v4p0(<4 x i32> <i32 3, i32 3, i32 3, i32 3>, <4 x ptr> [[TMP3]], i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>), !dbg [[DBG22]]
 ; CHECK-NEXT:    [[TMP4]] = add nuw nsw <4 x i64> [[VEC_PHI]], <i64 1, i64 1, i64 1, i64 1>, !dbg [[DBG24:![0-9]+]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq <4 x i64> [[TMP4]], <i64 5, i64 5, i64 5, i64 5>, !dbg [[DBG25:![0-9]+]]
-; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0
-; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <4 x i1> [[TMP5]], i32 0, !dbg [[DBG26:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP6]], label [[FOR_COND_CLEANUP32]], label [[FOR_COND5_PREHEADER1]], !dbg [[DBG26]]
 ; CHECK:       for.cond.cleanup32:
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>, !dbg [[DBG26:![0-9]+]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 23, i64 23, i64 23, i64 23>, !dbg [[DBG27:![0-9]+]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw <4 x i64> [[VEC_IND]], <i64 1, i64 1, i64 1, i64 1>, !dbg [[DBG27:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq <4 x i64> [[TMP7]], <i64 23, i64 23, i64 23, i64 23>, !dbg [[DBG28:![0-9]+]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP29:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]], !dbg [[DBG21]]
 ; CHECK:       scalar.ph:
@@ -43,8 +43,8 @@ define void @foo(ptr %h) !dbg !4 {
 ; CHECK-NEXT:    br label [[FOR_COND1_PREHEADER:%.*]], !dbg [[DBG21]]
 ; CHECK:       for.cond1.preheader:
 ; CHECK-NEXT:    [[I_023:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC13:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[I_023]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
-; CHECK-NEXT:    br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG32:![0-9]+]]
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i64 [[I_023]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
+; CHECK-NEXT:    br label [[FOR_COND5_PREHEADER:%.*]], !dbg [[DBG26]]
 ; CHECK:       for.cond5.preheader:
 ; CHECK-NEXT:    [[L_022:%.*]] = phi i64 [ 0, [[FOR_COND1_PREHEADER]] ], [ [[INC10:%.*]], [[FOR_COND5_PREHEADER]] ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[H]], i64 [[L_022]]
@@ -57,11 +57,11 @@ define void @foo(ptr %h) !dbg !4 {
 ; CHECK-NEXT:    store i32 3, ptr [[ARRAYIDX_3]], align 4, !dbg [[DBG22]]
 ; CHECK-NEXT:    [[INC10]] = add nuw nsw i64 [[L_022]], 1, !dbg [[DBG24]]
 ; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC10]], 5, !dbg [[DBG25]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG32]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_COND5_PREHEADER]], !dbg [[DBG26]]
 ; CHECK:       for.cond.cleanup3:
-; CHECK-NEXT:    [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG26]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i64 [[INC13]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
-; CHECK-NEXT:    [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG27]]
+; CHECK-NEXT:    [[INC13]] = add nuw nsw i64 [[I_023]], 1, !dbg [[DBG27]]
+; CHECK-NEXT:    tail call void @llvm.dbg.value(metadata i64 [[INC13]], metadata [[META11]], metadata !DIExpression()), !dbg [[DBG20]]
+; CHECK-NEXT:    [[EXITCOND24_NOT:%.*]] = icmp eq i64 [[INC13]], 23, !dbg [[DBG28]]
 ; CHECK-NEXT:    br i1 [[EXITCOND24_NOT]], label [[EXIT]], label [[FOR_COND1_PREHEADER]], !dbg [[DBG21]], !llvm.loop [[LOOP34:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void, !dbg [[DBG35:![0-9]+]]
@@ -163,14 +163,14 @@ declare void @llvm.dbg.value(metadata, metadata, metadata)
 ; CHECK: [[META23]] = distinct !DILexicalBlock(scope: [[META18]], file: [[META1]], line: 12, column: 7)
 ; CHECK: [[DBG24]] = !DILocation(line: 11, column: 32, scope: [[META19]])
 ; CHECK: [[DBG25]] = !DILocation(line: 11, column: 26, scope: [[META19]])
-; CHECK: [[DBG26]] = !DILocation(line: 10, column: 30, scope: [[META16]])
-; CHECK: [[DBG27]] = !DILocation(line: 10, column: 24, scope: [[META16]])
-; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[DBG21]], [[META29:![0-9]+]], [[META30:![0-9]+]], [[META31:![0-9]+]]}
-; CHECK: [[META29]] = !DILocation(line: 13, column: 13, scope: [[META12]])
-; CHECK: [[META30]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META31]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[DBG32]] = !DILocation(line: 11, column: 5, scope: [[META15]])
+; CHECK: [[DBG26]] = !DILocation(line: 11, column: 5, scope: [[META15]])
+; CHECK: [[DBG27]] = !DILocation(line: 10, column: 30, scope: [[META16]])
+; CHECK: [[DBG28]] = !DILocation(line: 10, column: 24, scope: [[META16]])
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[DBG21]], [[META30:![0-9]+]], [[META31:![0-9]+]], [[META32:![0-9]+]]}
+; CHECK: [[META30]] = !DILocation(line: 13, column: 13, scope: [[META12]])
+; CHECK: [[META31]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META32]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[DBG33]] = !DILocation(line: 13, column: 2, scope: [[META23]])
-; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[DBG21]], [[META29]], [[META30]]}
+; CHECK: [[LOOP34]] = distinct !{[[LOOP34]], [[DBG21]], [[META30]], [[META31]]}
 ; CHECK: [[DBG35]] = !DILocation(line: 14, column: 1, scope: [[DBG4]])
 ;.

>From 764117ec2b6e3caf11b0dd380cfb4479387a7012 Mon Sep 17 00:00:00 2001
From: David Spickett <david.spickett at linaro.org>
Date: Wed, 13 Mar 2024 14:29:43 +0000
Subject: [PATCH 38/63] [lldb][Test] Disable ConcurrentVFork tests on
 Arm/AArch64 Linux

They are either flaky, or not cleaning up after themselves.
See https://github.com/llvm/llvm-project/issues/85084.
---
 .../fork/concurrent_vfork/TestConcurrentVFork.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
index 2dcbb728549fb4..1790bd497f4e6b 100644
--- a/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
+++ b/lldb/test/API/functionalities/fork/concurrent_vfork/TestConcurrentVFork.py
@@ -48,6 +48,8 @@ def follow_child_helper(self, use_fork, call_exec):
         self.expect("continue", patterns=[r"exited with status = 1[0-4]"])
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_vfork_no_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -56,6 +58,8 @@ def test_follow_parent_vfork_no_exec(self):
         self.follow_parent_helper(use_fork=False, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_fork_no_exec(self):
         """
         Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-parent.
@@ -64,6 +68,8 @@ def test_follow_parent_fork_no_exec(self):
         self.follow_parent_helper(use_fork=True, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_vfork_call_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -72,6 +78,8 @@ def test_follow_parent_vfork_call_exec(self):
         self.follow_parent_helper(use_fork=False, call_exec=True)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_parent_fork_call_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-parent.
@@ -80,6 +88,8 @@ def test_follow_parent_fork_call_exec(self):
         self.follow_parent_helper(use_fork=True, call_exec=True)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_vfork_no_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -88,6 +98,8 @@ def test_follow_child_vfork_no_exec(self):
         self.follow_child_helper(use_fork=False, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_fork_no_exec(self):
         """
         Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.
@@ -96,6 +108,8 @@ def test_follow_child_fork_no_exec(self):
         self.follow_child_helper(use_fork=True, call_exec=False)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_vfork_call_exec(self):
         """
         Make sure that debugging concurrent vfork() from multiple threads won't crash lldb during follow-child.
@@ -104,6 +118,8 @@ def test_follow_child_vfork_call_exec(self):
         self.follow_child_helper(use_fork=False, call_exec=True)
 
     @skipUnlessPlatform(["linux"])
+    # See https://github.com/llvm/llvm-project/issues/85084.
+    @skipIf(oslist=["linux"], archs=["aarch64", "arm"])
     def test_follow_child_fork_call_exec(self):
         """
         Make sure that debugging concurrent fork() from multiple threads won't crash lldb during follow-child.

>From 6296d658ff410f26f1e408cda1365f88d4da59d9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 13 Mar 2024 14:02:46 +0000
Subject: [PATCH 39/63] [DAG] Use SelectionDAG::getNOT helper where possible.
 NFC.

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 87033e824aa593..40b078a201ace6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -2799,8 +2799,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
         // Limit this to after legalization if the add has wrap flags
         (Level >= AfterLegalizeDAG || (!N->getFlags().hasNoUnsignedWrap() &&
                                        !N->getFlags().hasNoSignedWrap()))) {
-      SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
-                                DAG.getAllOnesConstant(DL, VT));
+      SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
       return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(1), Not);
     }
   }
@@ -3025,8 +3024,7 @@ SDValue DAGCombiner::visitADDLikeCommutative(SDValue N0, SDValue N1,
       // Limit this to after legalization if the add has wrap flags
       (Level >= AfterLegalizeDAG || (!N0->getFlags().hasNoUnsignedWrap() &&
                                      !N0->getFlags().hasNoSignedWrap()))) {
-    SDValue Not = DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
-                              DAG.getAllOnesConstant(DL, VT));
+    SDValue Not = DAG.getNOT(DL, N0.getOperand(0), VT);
     return DAG.getNode(ISD::SUB, DL, VT, N1, Not);
   }
 

>From f0e0086aead8b3579eb2a6286f11eadb9ce72d06 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 13 Mar 2024 14:47:23 +0000
Subject: [PATCH 40/63] [DAG] isKnownToBeAPowerOfTwo - use sd_match to match
 both commutations of `x & -x` pattern`. NFC.

Allows us to remove some tricky commutation matching
---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 26 ++++++++-----------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c24303592769a0..b8c7d08da3e2da 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -37,6 +37,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SDPatternMatch.h"
 #include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/SelectionDAGTargetInfo.h"
@@ -81,6 +82,7 @@
 #include <vector>
 
 using namespace llvm;
+using namespace llvm::SDPatternMatch;
 
 /// makeVTList - Return an instance of the SDVTList struct initialized with the
 /// specified members.
@@ -4290,21 +4292,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val, unsigned Depth) const {
     return isKnownToBeAPowerOfTwo(Val.getOperand(2), Depth + 1) &&
            isKnownToBeAPowerOfTwo(Val.getOperand(1), Depth + 1);
 
-  if (Val.getOpcode() == ISD::AND) {
-    // Looking for `x & -x` pattern:
-    // If x == 0:
-    //    x & -x -> 0
-    // If x != 0:
-    //    x & -x -> non-zero pow2
-    // so if we find the pattern return whether we know `x` is non-zero.
-    for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) {
-      SDValue NegOp = Val.getOperand(OpIdx);
-      if (NegOp.getOpcode() == ISD::SUB &&
-          NegOp.getOperand(1) == Val.getOperand(1 - OpIdx) &&
-          isNullOrNullSplat(NegOp.getOperand(0)))
-        return isKnownNeverZero(Val.getOperand(1 - OpIdx), Depth);
-    }
-  }
+  // Looking for `x & -x` pattern:
+  // If x == 0:
+  //    x & -x -> 0
+  // If x != 0:
+  //    x & -x -> non-zero pow2
+  // so if we find the pattern return whether we know `x` is non-zero.
+  SDValue X;
+  if (sd_match(Val, m_And(m_Value(X), m_Sub(m_Zero(), m_Deferred(X)))))
+    return isKnownNeverZero(X, Depth);
 
   if (Val.getOpcode() == ISD::ZERO_EXTEND)
     return isKnownToBeAPowerOfTwo(Val.getOperand(0), Depth + 1);

>From 366e9a22a2b84c6c9d0c900a5932eae127f91bac Mon Sep 17 00:00:00 2001
From: Tom Eccles <tom.eccles at arm.com>
Date: Wed, 13 Mar 2024 14:51:09 +0000
Subject: [PATCH 41/63] [flang][OpenMP][OMPIRBuilder][mlir] Optionally pass
 reduction vars by ref (#84304)

Previously reduction variables were always passed by value into and out
of the initialization and combiner regions of the OpenMP reduction
declare operation.

This worked well for reductions of primitive types (and might perform
better than passing by reference). But passing by reference will be
useful for array and derived type reductions (e.g. to move allocation
inside of the init region).

Passing reductions by reference requires different LLVM-IR generation
when lowering from MLIR because some of the loads/stores/allocations
will now be moved inside of the init and combiner regions. This
alternate code generation is requested using a new attribute to
omp.wsloop and omp.parallel.

Existing lowerings from mlir are unaffected (these will continue to use
the by-value argument passing.

Flang will continue to pass by-value argument passing for trivial types
unless a (hidden) command line argument is supplied. Non-trivial types
will always use the by-ref lowering.

Array reductions are not ready yet (but are coming very soon). In the
meantime, this is tested by forcing existing reductions to use by-ref.

Commit series for by-ref OpenMP reductions 3/3

---------

Co-authored-by: Mats Petersson <mats.petersson at arm.com>
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  18 +-
 flang/lib/Lower/OpenMP/ReductionProcessor.cpp | 137 ++++--
 flang/lib/Lower/OpenMP/ReductionProcessor.h   |  18 +-
 .../FIR/parallel-reduction-add-byref.f90      | 117 +++++
 .../OpenMP/FIR/wsloop-reduction-add-byref.f90 | 392 ++++++++++++++++
 .../FIR/wsloop-reduction-iand-byref.f90       |  46 ++
 .../FIR/wsloop-reduction-ieor-byref.f90       |  45 ++
 .../OpenMP/FIR/wsloop-reduction-ior-byref.f90 |  45 ++
 .../wsloop-reduction-logical-eqv-byref.f90    | 187 ++++++++
 .../wsloop-reduction-logical-neqv-byref.f90   | 189 ++++++++
 .../OpenMP/FIR/wsloop-reduction-max-byref.f90 |  90 ++++
 .../OpenMP/FIR/wsloop-reduction-min-byref.f90 |  91 ++++
 .../Lower/OpenMP/default-clause-byref.f90     | 385 ++++++++++++++++
 .../delayed-privatization-reduction-byref.f90 |  30 ++
 .../OpenMP/parallel-reduction-add-byref.f90   | 125 +++++
 .../Lower/OpenMP/parallel-reduction-byref.f90 |  44 ++
 .../parallel-wsloop-reduction-byref.f90       |  16 +
 .../OpenMP/wsloop-reduction-add-byref.f90     | 433 ++++++++++++++++++
 .../wsloop-reduction-add-hlfir-byref.f90      |  58 +++
 .../OpenMP/wsloop-reduction-iand-byref.f90    |  64 +++
 .../OpenMP/wsloop-reduction-ieor-byref.f90    |  55 +++
 .../OpenMP/wsloop-reduction-ior-byref.f90     |  64 +++
 .../wsloop-reduction-logical-and-byref.f90    | 206 +++++++++
 .../wsloop-reduction-logical-eqv-byref.f90    | 202 ++++++++
 .../wsloop-reduction-logical-neqv-byref.f90   | 207 +++++++++
 .../wsloop-reduction-logical-or-byref.f90     | 204 +++++++++
 .../OpenMP/wsloop-reduction-max-2-byref.f90   |  20 +
 .../OpenMP/wsloop-reduction-max-byref.f90     | 152 ++++++
 .../wsloop-reduction-max-hlfir-byref.f90      |  62 +++
 .../OpenMP/wsloop-reduction-min-byref.f90     | 154 +++++++
 .../Lower/OpenMP/wsloop-reduction-min2.f90    |  41 ++
 .../OpenMP/wsloop-reduction-mul-byref.f90     | 414 +++++++++++++++++
 .../llvm/Frontend/OpenMP/OMPIRBuilder.h       |   4 +-
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  30 +-
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td |  12 +-
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |   5 +-
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      |  99 ++--
 .../Target/LLVMIR/openmp-reduction-byref.mlir |  66 +++
 38 files changed, 4451 insertions(+), 76 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/default-clause-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-reduction-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
 create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90
 create mode 100644 mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 4f0bb80cd7fdf6..1016c8389c6e6a 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -601,6 +601,10 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
     return reductionSymbols;
   };
 
+  mlir::UnitAttr byrefAttr;
+  if (ReductionProcessor::doReductionByRef(reductionVars))
+    byrefAttr = converter.getFirOpBuilder().getUnitAttr();
+
   OpWithBodyGenInfo genInfo =
       OpWithBodyGenInfo(converter, semaCtx, currentLocation, eval)
           .setGenNested(genNested)
@@ -620,7 +624,7 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
             : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
                                    reductionDeclSymbols),
         procBindKindAttr, /*private_vars=*/llvm::SmallVector<mlir::Value>{},
-        /*privatizers=*/nullptr);
+        /*privatizers=*/nullptr, byrefAttr);
   }
 
   bool privatize = !outerCombined;
@@ -684,7 +688,8 @@ genParallelOp(Fortran::lower::AbstractConverter &converter,
       delayedPrivatizationInfo.privatizers.empty()
           ? nullptr
           : mlir::ArrayAttr::get(converter.getFirOpBuilder().getContext(),
-                                 privatizers));
+                                 privatizers),
+      byrefAttr);
 }
 
 static mlir::omp::SectionOp
@@ -1583,7 +1588,7 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
   llvm::SmallVector<const Fortran::semantics::Symbol *> reductionSymbols;
   mlir::omp::ClauseOrderKindAttr orderClauseOperand;
   mlir::omp::ClauseScheduleKindAttr scheduleValClauseOperand;
-  mlir::UnitAttr nowaitClauseOperand, scheduleSimdClauseOperand;
+  mlir::UnitAttr nowaitClauseOperand, byrefOperand, scheduleSimdClauseOperand;
   mlir::IntegerAttr orderedClauseOperand;
   mlir::omp::ScheduleModifierAttr scheduleModClauseOperand;
   std::size_t loopVarTypeSize;
@@ -1600,6 +1605,9 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
   convertLoopBounds(converter, loc, lowerBound, upperBound, step,
                     loopVarTypeSize);
 
+  if (ReductionProcessor::doReductionByRef(reductionVars))
+    byrefOperand = firOpBuilder.getUnitAttr();
+
   auto wsLoopOp = firOpBuilder.create<mlir::omp::WsLoopOp>(
       loc, lowerBound, upperBound, step, linearVars, linearStepVars,
       reductionVars,
@@ -1609,8 +1617,8 @@ static void createWsLoop(Fortran::lower::AbstractConverter &converter,
                                  reductionDeclSymbols),
       scheduleValClauseOperand, scheduleChunkClauseOperand,
       /*schedule_modifiers=*/nullptr,
-      /*simd_modifier=*/nullptr, nowaitClauseOperand, orderedClauseOperand,
-      orderClauseOperand,
+      /*simd_modifier=*/nullptr, nowaitClauseOperand, byrefOperand,
+      orderedClauseOperand, orderClauseOperand,
       /*inclusive=*/firOpBuilder.getUnitAttr());
 
   // Handle attribute based clauses.
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
index a8b98f3f567249..e6a63dd4b939ce 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.cpp
@@ -14,9 +14,16 @@
 
 #include "flang/Lower/AbstractConverter.h"
 #include "flang/Optimizer/Builder/Todo.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Parser/tools.h"
 #include "mlir/Dialect/OpenMP/OpenMPDialect.h"
+#include "llvm/Support/CommandLine.h"
+
+static llvm::cl::opt<bool> forceByrefReduction(
+    "force-byref-reduction",
+    llvm::cl::desc("Pass all reduction arguments by reference"),
+    llvm::cl::Hidden);
 
 namespace Fortran {
 namespace lower {
@@ -76,16 +83,24 @@ bool ReductionProcessor::supportedIntrinsicProcReduction(
 }
 
 std::string ReductionProcessor::getReductionName(llvm::StringRef name,
-                                                 mlir::Type ty) {
+                                                 mlir::Type ty, bool isByRef) {
+  ty = fir::unwrapRefType(ty);
+
+  // extra string to distinguish reduction functions for variables passed by
+  // reference
+  llvm::StringRef byrefAddition{""};
+  if (isByRef)
+    byrefAddition = "_byref";
+
   return (llvm::Twine(name) +
           (ty.isIntOrIndex() ? llvm::Twine("_i_") : llvm::Twine("_f_")) +
-          llvm::Twine(ty.getIntOrFloatBitWidth()))
+          llvm::Twine(ty.getIntOrFloatBitWidth()) + byrefAddition)
       .str();
 }
 
 std::string ReductionProcessor::getReductionName(
     Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-    mlir::Type ty) {
+    mlir::Type ty, bool isByRef) {
   std::string reductionName;
 
   switch (intrinsicOp) {
@@ -108,13 +123,14 @@ std::string ReductionProcessor::getReductionName(
     break;
   }
 
-  return getReductionName(reductionName, ty);
+  return getReductionName(reductionName, ty, isByRef);
 }
 
 mlir::Value
 ReductionProcessor::getReductionInitValue(mlir::Location loc, mlir::Type type,
                                           ReductionIdentifier redId,
                                           fir::FirOpBuilder &builder) {
+  type = fir::unwrapRefType(type);
   assert((fir::isa_integer(type) || fir::isa_real(type) ||
           type.isa<fir::LogicalType>()) &&
          "only integer, logical and real types are currently supported");
@@ -188,6 +204,7 @@ mlir::Value ReductionProcessor::createScalarCombiner(
     fir::FirOpBuilder &builder, mlir::Location loc, ReductionIdentifier redId,
     mlir::Type type, mlir::Value op1, mlir::Value op2) {
   mlir::Value reductionOp;
+  type = fir::unwrapRefType(type);
   switch (redId) {
   case ReductionIdentifier::MAX:
     reductionOp =
@@ -268,7 +285,8 @@ mlir::Value ReductionProcessor::createScalarCombiner(
 
 mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
     fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
-    const ReductionIdentifier redId, mlir::Type type, mlir::Location loc) {
+    const ReductionIdentifier redId, mlir::Type type, mlir::Location loc,
+    bool isByRef) {
   mlir::OpBuilder::InsertionGuard guard(builder);
   mlir::ModuleOp module = builder.getModule();
 
@@ -278,14 +296,24 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
     return decl;
 
   mlir::OpBuilder modBuilder(module.getBodyRegion());
+  mlir::Type valTy = fir::unwrapRefType(type);
+  if (!isByRef)
+    type = valTy;
 
   decl = modBuilder.create<mlir::omp::ReductionDeclareOp>(loc, reductionOpName,
                                                           type);
   builder.createBlock(&decl.getInitializerRegion(),
                       decl.getInitializerRegion().end(), {type}, {loc});
   builder.setInsertionPointToEnd(&decl.getInitializerRegion().back());
+
   mlir::Value init = getReductionInitValue(loc, type, redId, builder);
-  builder.create<mlir::omp::YieldOp>(loc, init);
+  if (isByRef) {
+    mlir::Value alloca = builder.create<fir::AllocaOp>(loc, valTy);
+    builder.createStoreWithConvert(loc, init, alloca);
+    builder.create<mlir::omp::YieldOp>(loc, alloca);
+  } else {
+    builder.create<mlir::omp::YieldOp>(loc, init);
+  }
 
   builder.createBlock(&decl.getReductionRegion(),
                       decl.getReductionRegion().end(), {type, type},
@@ -294,14 +322,45 @@ mlir::omp::ReductionDeclareOp ReductionProcessor::createReductionDecl(
   builder.setInsertionPointToEnd(&decl.getReductionRegion().back());
   mlir::Value op1 = decl.getReductionRegion().front().getArgument(0);
   mlir::Value op2 = decl.getReductionRegion().front().getArgument(1);
+  mlir::Value outAddr = op1;
+
+  op1 = builder.loadIfRef(loc, op1);
+  op2 = builder.loadIfRef(loc, op2);
 
   mlir::Value reductionOp =
       createScalarCombiner(builder, loc, redId, type, op1, op2);
-  builder.create<mlir::omp::YieldOp>(loc, reductionOp);
+  if (isByRef) {
+    builder.create<fir::StoreOp>(loc, reductionOp, outAddr);
+    builder.create<mlir::omp::YieldOp>(loc, outAddr);
+  } else {
+    builder.create<mlir::omp::YieldOp>(loc, reductionOp);
+  }
 
   return decl;
 }
 
+// TODO: By-ref vs by-val reductions are currently toggled for the whole
+//       operation (possibly effecting multiple reduction variables).
+//       This could cause a problem with openmp target reductions because
+//       by-ref trivial types may not be supported.
+bool ReductionProcessor::doReductionByRef(
+    const llvm::SmallVectorImpl<mlir::Value> &reductionVars) {
+  if (reductionVars.empty())
+    return false;
+  if (forceByrefReduction)
+    return true;
+
+  for (mlir::Value reductionVar : reductionVars) {
+    if (auto declare =
+            mlir::dyn_cast<hlfir::DeclareOp>(reductionVar.getDefiningOp()))
+      reductionVar = declare.getMemref();
+
+    if (!fir::isa_trivial(fir::unwrapRefType(reductionVar.getType())))
+      return true;
+  }
+  return false;
+}
+
 void ReductionProcessor::addReductionDecl(
     mlir::Location currentLocation,
     Fortran::lower::AbstractConverter &converter,
@@ -315,6 +374,37 @@ void ReductionProcessor::addReductionDecl(
   const auto &redOperator{
       std::get<Fortran::parser::OmpReductionOperator>(reduction.t)};
   const auto &objectList{std::get<Fortran::parser::OmpObjectList>(reduction.t)};
+
+  if (!std::holds_alternative<Fortran::parser::DefinedOperator>(
+          redOperator.u)) {
+    if (const auto *reductionIntrinsic =
+            std::get_if<Fortran::parser::ProcedureDesignator>(&redOperator.u)) {
+      if (!ReductionProcessor::supportedIntrinsicProcReduction(
+              *reductionIntrinsic)) {
+        return;
+      }
+    } else {
+      return;
+    }
+  }
+
+  // initial pass to collect all recuction vars so we can figure out if this
+  // should happen byref
+  for (const Fortran::parser::OmpObject &ompObject : objectList.v) {
+    if (const auto *name{
+            Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
+      if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
+        if (reductionSymbols)
+          reductionSymbols->push_back(symbol);
+        mlir::Value symVal = converter.getSymbolAddress(*symbol);
+        if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
+          symVal = declOp.getBase();
+        reductionVars.push_back(symVal);
+      }
+    }
+  }
+  const bool isByRef = doReductionByRef(reductionVars);
+
   if (const auto &redDefinedOp =
           std::get_if<Fortran::parser::DefinedOperator>(&redOperator.u)) {
     const auto &intrinsicOp{
@@ -338,23 +428,20 @@ void ReductionProcessor::addReductionDecl(
       if (const auto *name{
               Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
         if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-          if (reductionSymbols)
-            reductionSymbols->push_back(symbol);
           mlir::Value symVal = converter.getSymbolAddress(*symbol);
           if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
             symVal = declOp.getBase();
-          mlir::Type redType =
-              symVal.getType().cast<fir::ReferenceType>().getEleTy();
-          reductionVars.push_back(symVal);
-          if (redType.isa<fir::LogicalType>())
+          auto redType = symVal.getType().cast<fir::ReferenceType>();
+          if (redType.getEleTy().isa<fir::LogicalType>())
             decl = createReductionDecl(
                 firOpBuilder,
-                getReductionName(intrinsicOp, firOpBuilder.getI1Type()), redId,
-                redType, currentLocation);
-          else if (redType.isIntOrIndexOrFloat()) {
-            decl = createReductionDecl(firOpBuilder,
-                                       getReductionName(intrinsicOp, redType),
-                                       redId, redType, currentLocation);
+                getReductionName(intrinsicOp, firOpBuilder.getI1Type(),
+                                 isByRef),
+                redId, redType, currentLocation, isByRef);
+          else if (redType.getEleTy().isIntOrIndexOrFloat()) {
+            decl = createReductionDecl(
+                firOpBuilder, getReductionName(intrinsicOp, redType, isByRef),
+                redId, redType, currentLocation, isByRef);
           } else {
             TODO(currentLocation, "Reduction of some types is not supported");
           }
@@ -374,21 +461,17 @@ void ReductionProcessor::addReductionDecl(
         if (const auto *name{
                 Fortran::parser::Unwrap<Fortran::parser::Name>(ompObject)}) {
           if (const Fortran::semantics::Symbol * symbol{name->symbol}) {
-            if (reductionSymbols)
-              reductionSymbols->push_back(symbol);
             mlir::Value symVal = converter.getSymbolAddress(*symbol);
             if (auto declOp = symVal.getDefiningOp<hlfir::DeclareOp>())
               symVal = declOp.getBase();
-            mlir::Type redType =
-                symVal.getType().cast<fir::ReferenceType>().getEleTy();
-            reductionVars.push_back(symVal);
-            assert(redType.isIntOrIndexOrFloat() &&
+            auto redType = symVal.getType().cast<fir::ReferenceType>();
+            assert(redType.getEleTy().isIntOrIndexOrFloat() &&
                    "Unsupported reduction type");
             decl = createReductionDecl(
                 firOpBuilder,
                 getReductionName(getRealName(*reductionIntrinsic).ToString(),
-                                 redType),
-                redId, redType, currentLocation);
+                                 redType, isByRef),
+                redId, redType, currentLocation, isByRef);
             reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get(
                 firOpBuilder.getContext(), decl.getSymName()));
           }
diff --git a/flang/lib/Lower/OpenMP/ReductionProcessor.h b/flang/lib/Lower/OpenMP/ReductionProcessor.h
index 00770fe81d1ef6..679580f2a3cac7 100644
--- a/flang/lib/Lower/OpenMP/ReductionProcessor.h
+++ b/flang/lib/Lower/OpenMP/ReductionProcessor.h
@@ -14,6 +14,7 @@
 #define FORTRAN_LOWER_REDUCTIONPROCESSOR_H
 
 #include "flang/Optimizer/Builder/FIRBuilder.h"
+#include "flang/Optimizer/Dialect/FIRType.h"
 #include "flang/Parser/parse-tree.h"
 #include "flang/Semantics/symbol.h"
 #include "flang/Semantics/type.h"
@@ -71,11 +72,15 @@ class ReductionProcessor {
   static const Fortran::semantics::SourceName
   getRealName(const Fortran::parser::ProcedureDesignator &pd);
 
-  static std::string getReductionName(llvm::StringRef name, mlir::Type ty);
+  static bool
+  doReductionByRef(const llvm::SmallVectorImpl<mlir::Value> &reductionVars);
+
+  static std::string getReductionName(llvm::StringRef name, mlir::Type ty,
+                                      bool isByRef);
 
   static std::string getReductionName(
       Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp,
-      mlir::Type ty);
+      mlir::Type ty, bool isByRef);
 
   /// This function returns the identity value of the operator \p
   /// reductionOpName. For example:
@@ -103,9 +108,11 @@ class ReductionProcessor {
   /// symbol table. The declaration has a constant initializer with the neutral
   /// value `initValue`, and the reduction combiner carried over from `reduce`.
   /// TODO: Generalize this for non-integer types, add atomic region.
-  static mlir::omp::ReductionDeclareOp createReductionDecl(
-      fir::FirOpBuilder &builder, llvm::StringRef reductionOpName,
-      const ReductionIdentifier redId, mlir::Type type, mlir::Location loc);
+  static mlir::omp::ReductionDeclareOp
+  createReductionDecl(fir::FirOpBuilder &builder,
+                      llvm::StringRef reductionOpName,
+                      const ReductionIdentifier redId, mlir::Type type,
+                      mlir::Location loc, bool isByRef);
 
   /// Creates a reduction declaration and associates it with an OpenMP block
   /// directive.
@@ -124,6 +131,7 @@ mlir::Value
 ReductionProcessor::getReductionOperation(fir::FirOpBuilder &builder,
                                           mlir::Type type, mlir::Location loc,
                                           mlir::Value op1, mlir::Value op2) {
+  type = fir::unwrapRefType(type);
   assert(type.isIntOrIndexOrFloat() &&
          "only integer and float types are currently supported");
   if (type.isIntOrIndex())
diff --git a/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
new file mode 100644
index 00000000000000..ca432662b77c44
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/parallel-reduction-add-byref.f90
@@ -0,0 +1,117 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  %[[REF:.*]] = fir.alloca f32
+!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:  %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<f32>)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:  %[[REF:.*]] = fir.alloca i32
+!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:  %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<i32>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV:.+]] : !fir.ref<i32>) {
+!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.+]] = arith.constant 1 : i32
+!CHECK:    %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]]
+!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_int_add
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+    i = i + 1
+    !$omp end parallel
+
+    print *, i
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_add
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
+!CHECK:  omp.parallel byref reduction(@[[RED_F32_NAME]] %[[RREF]] -> %[[PRV:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK:    %[[R_INCR:.+]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[RES]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
+!CHECK:    fir.store %[[RES]] to %[[PRV]] : !fir.ref<f32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_real_add
+    real :: r
+    r = 0.0
+
+    !$omp parallel reduction(+:r)
+    r = r + 1.5
+    !$omp end parallel
+
+    print *, r
+end subroutine
+
+!CHECK-LABEL: func.func @_QPint_real_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  fir.store %[[R_START]] to %[[RREF]] : !fir.ref<f32>
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  fir.store %[[I_START]] to %[[IREF]] : !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[IREF]] -> %[[PRV0:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[RREF]] -> %[[PRV1:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[LPRV1:.+]] = fir.load %[[PRV1]] : !fir.ref<f32>
+!CHECK:    %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[LPRV1]] {{.*}} : f32
+!CHECK:    fir.store %[[RES1]] to %[[PRV1]]
+!CHECK:    %[[LPRV0:.+]] = fir.load %[[PRV0]] : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
+!CHECK:    %[[RES0:.+]] = arith.addi %[[LPRV0]], %[[I_INCR]]
+!CHECK:    fir.store %[[RES0]] to %[[PRV0]]
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine int_real_add
+    real :: r
+    integer :: i
+
+    r = 0.0
+    i = 0
+
+    !$omp parallel reduction(+:i,r)
+    r = 1.5 + r
+    i = i + 3
+    !$omp end parallel
+
+    print *, r
+    print *, i
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
new file mode 100644
index 00000000000000..d4d3452d3e8682
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-add-byref.f90
@@ -0,0 +1,392 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_64_byref : !fir.ref<f64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:            %[[REF:.*]] = fir.alloca f64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_64_byref : !fir.ref<i64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i64
+! CHECK:            %[[REF:.*]] = fir.alloca i64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_32_byref : !fir.ref<f32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:            %[[REF:.*]] = fir.alloca f32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.convert %[[VAL_10]] : (i32) -> f32
+! CHECK:               %[[VAL_12:.*]] = arith.addf %[[VAL_9]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<i32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = arith.addi %[[VAL_9]], %[[VAL_10]] : i32
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_2:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_2]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_3:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_5:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_1]] -> %[[VAL_7:.*]] : !fir.ref<f32>)  for  (%[[VAL_8:.*]]) : i32 = (%[[VAL_4]]) to (%[[VAL_5]]) inclusive step (%[[VAL_6]]) {
+! CHECK:               fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_9:.*]] = fir.load %[[VAL_3]] : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i32) -> f32
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_12]] to %[[VAL_7]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<i32>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<i32>)  for  (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_16]] : i32
+! CHECK:               fir.store %[[VAL_17]] to %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_12]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               fir.store %[[VAL_20]] to %[[VAL_12]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_23:.*]] = arith.addi %[[VAL_21]], %[[VAL_22]] : i32
+! CHECK:               fir.store %[[VAL_23]] to %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 0
+  y = 0
+  z = 0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_1]] : !fir.ref<f32>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<f32>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_1]] -> %[[VAL_11:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_2]] -> %[[VAL_12:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_3]] -> %[[VAL_13:.*]] : !fir.ref<f32>)  for  (%[[VAL_14:.*]]) : i32 = (%[[VAL_8]]) to (%[[VAL_9]]) inclusive step (%[[VAL_10]]) {
+! CHECK:               fir.store %[[VAL_14]] to %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_11]] : !fir.ref<f32>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i32) -> f32
+! CHECK:               %[[VAL_18:.*]] = arith.addf %[[VAL_15]], %[[VAL_17]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<f32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_12]] : !fir.ref<f32>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> f32
+! CHECK:               %[[VAL_22:.*]] = arith.addf %[[VAL_19]], %[[VAL_21]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_22]] to %[[VAL_12]] : !fir.ref<f32>
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_13]] : !fir.ref<f32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_7]] : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
+! CHECK:               %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_13]] : !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 0.0
+  y = 0.0
+  z = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
+! CHECK:           %[[VAL_1:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+! CHECK:           %[[VAL_5:.*]] = arith.constant 0 : i32
+! CHECK:           fir.store %[[VAL_5]] to %[[VAL_2]] : !fir.ref<i32>
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i64
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_3]] : !fir.ref<i64>
+! CHECK:           %[[VAL_7:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           fir.store %[[VAL_7]] to %[[VAL_4]] : !fir.ref<f32>
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:           fir.store %[[VAL_8]] to %[[VAL_1]] : !fir.ref<f64>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_9:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_2]] -> %[[VAL_13:.*]] : !fir.ref<i32>, @add_reduction_i_64_byref %[[VAL_3]] -> %[[VAL_14:.*]] : !fir.ref<i64>, @add_reduction_f_32_byref %[[VAL_4]] -> %[[VAL_15:.*]] : !fir.ref<f32>, @add_reduction_f_64_byref %[[VAL_1]] -> %[[VAL_16:.*]] : !fir.ref<f64>)  for  (%[[VAL_17:.*]]) : i32 = (%[[VAL_10]]) to (%[[VAL_11]]) inclusive step (%[[VAL_12]]) {
+! CHECK:               fir.store %[[VAL_17]] to %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               fir.store %[[VAL_20]] to %[[VAL_13]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_14]] : !fir.ref<i64>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_22]] : (i32) -> i64
+! CHECK:               %[[VAL_24:.*]] = arith.addi %[[VAL_21]], %[[VAL_23]] : i64
+! CHECK:               fir.store %[[VAL_24]] to %[[VAL_14]] : !fir.ref<i64>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_15]] : !fir.ref<f32>
+! CHECK:               %[[VAL_26:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_26]] : (i32) -> f32
+! CHECK:               %[[VAL_28:.*]] = arith.addf %[[VAL_25]], %[[VAL_27]] fastmath<contract> : f32
+! CHECK:               fir.store %[[VAL_28]] to %[[VAL_15]] : !fir.ref<f32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_16]] : !fir.ref<f64>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_9]] : !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = fir.convert %[[VAL_30]] : (i32) -> f64
+! CHECK:               %[[VAL_32:.*]] = arith.addf %[[VAL_29]], %[[VAL_31]] fastmath<contract> : f64
+! CHECK:               fir.store %[[VAL_32]] to %[[VAL_16]] : !fir.ref<f64>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 0
+  y = 0
+  z = 0.0
+  w = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z,w)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+    w = w + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
new file mode 100644
index 00000000000000..00dfad69248bc5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-iand-byref.f90
@@ -0,0 +1,46 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+!CHECK-LABEL:   omp.reduction.declare @iand_i_32_byref : !fir.ref<i32>
+!CHECK-SAME:    init {
+!CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+!CHECK:            %[[C0_1:.*]] = arith.constant -1 : i32
+!CHECK:            %[[REF:.*]] = fir.alloca i32
+!CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+!CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+!CHECK-LABEL:   } combiner {
+!CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:           %[[RES:.*]] = arith.andi %[[LD0]], %[[LD1]] : i32
+!CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+!CHECK:         }
+
+
+!CHECK-LABEL: @_QPreduction_iand
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
+!CHECK: omp.parallel
+!CHECK: omp.wsloop byref reduction(@iand_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.andi %[[LPRV]], %[[Y_I]] : i32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_iand(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(iand:x)
+  do i=1, 100
+    x = iand(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
new file mode 100644
index 00000000000000..fb35c405ca1d54
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ieor-byref.f90
@@ -0,0 +1,45 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @ieor_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.xori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+!CHECK-LABEL: @_QPreduction_ieor
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
+!CHECK: omp.parallel
+!CHECK: omp.wsloop byref reduction(@ieor_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ieor(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ieor:x)
+  do i=1, 100
+    x = ieor(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
new file mode 100644
index 00000000000000..0365eff3fd8136
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-ior-byref.f90
@@ -0,0 +1,45 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @ior_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.ori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+!CHECK-LABEL: @_QPreduction_ior
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>> 
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
+!CHECK: omp.parallel
+!CHECK: omp.wsloop byref reduction(@ior_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for 
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK: %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.ori %[[LPRV]], %[[Y_I]] : i32
+!CHECK: fir.store %[[RES]] to %[[PRV]] : !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ior(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ior:x)
+  do i=1, 100
+    x = ior(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
new file mode 100644
index 00000000000000..84219b34f964f4
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-eqv-byref.f90
@@ -0,0 +1,187 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @eqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant true
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
+! CHECK:               %[[VAL_14:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
+! CHECK:               %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+    x = x .eqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
+! CHECK:               %[[VAL_13:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
+! CHECK:               %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi eq, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+  x = y(i) .eqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_5:.*]] = arith.constant true
+! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_7:.*]] = arith.constant true
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_9:.*]] = arith.constant true
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
+! CHECK:               %[[VAL_22:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:               %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_28:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_27]] : i1
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
+! CHECK:               %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_37]], %[[VAL_38]] : i1
+! CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
+! CHECK:               %[[VAL_44:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
+! CHECK:               %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_50:.*]] = arith.cmpi eq, %[[VAL_48]], %[[VAL_49]] : i1
+! CHECK:               %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x,y,z)
+  do i=1, 100
+  x = x .eqv. w(i)
+  y = y .eqv. w(i)
+  z = z .eqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
new file mode 100644
index 00000000000000..ec4d8500850b48
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-logical-neqv-byref.f90
@@ -0,0 +1,189 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+
+! CHECK-LABEL:   omp.reduction.declare @neqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant false
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_12:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64
+! CHECK:               %[[VAL_14:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_15:.*]] = arith.subi %[[VAL_13]], %[[VAL_14]] : i64
+! CHECK:               %[[VAL_16:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_15]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_11]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+    x = x .neqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]] = arith.constant true
+! CHECK:           %[[VAL_4:.*]] = fir.convert %[[VAL_3]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_4]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_9:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_10:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_10]] to %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_11:.*]] = fir.load %[[VAL_5]] : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]] = fir.convert %[[VAL_11]] : (i32) -> i64
+! CHECK:               %[[VAL_13:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_14:.*]] = arith.subi %[[VAL_12]], %[[VAL_13]] : i64
+! CHECK:               %[[VAL_15:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_14]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_16:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_17:.*]] = fir.load %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_18:.*]] = fir.convert %[[VAL_16]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_17]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_19]] : i1
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_21]] to %[[VAL_9]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+  x = y(i) .neqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_4:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_5:.*]] = arith.constant true
+! CHECK:           %[[VAL_6:.*]] = fir.convert %[[VAL_5]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_6]] to %[[VAL_2]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_7:.*]] = arith.constant true
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_7]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_8]] to %[[VAL_3]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_9:.*]] = arith.constant true
+! CHECK:           %[[VAL_10:.*]] = fir.convert %[[VAL_9]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_10]] to %[[VAL_4]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_2]] -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_3]] -> %[[VAL_16:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_4]] -> %[[VAL_17:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_18:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_18]] to %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_20:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_21:.*]] = fir.convert %[[VAL_20]] : (i32) -> i64
+! CHECK:               %[[VAL_22:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_23:.*]] = arith.subi %[[VAL_21]], %[[VAL_22]] : i64
+! CHECK:               %[[VAL_24:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_23]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_25:.*]] = fir.load %[[VAL_24]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_19]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_27:.*]] = fir.convert %[[VAL_25]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_28:.*]] = arith.cmpi ne, %[[VAL_26]], %[[VAL_27]] : i1
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_29]] to %[[VAL_15]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_34:.*]] = arith.subi %[[VAL_32]], %[[VAL_33]] : i64
+! CHECK:               %[[VAL_35:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_34]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_35]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_36]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_39:.*]] = arith.cmpi ne, %[[VAL_37]], %[[VAL_38]] : i1
+! CHECK:               %[[VAL_40:.*]] = fir.convert %[[VAL_39]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_40]] to %[[VAL_16]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_11]] : !fir.ref<i32>
+! CHECK:               %[[VAL_43:.*]] = fir.convert %[[VAL_42]] : (i32) -> i64
+! CHECK:               %[[VAL_44:.*]] = arith.constant 1 : i64
+! CHECK:               %[[VAL_45:.*]] = arith.subi %[[VAL_43]], %[[VAL_44]] : i64
+! CHECK:               %[[VAL_46:.*]] = fir.coordinate_of %[[VAL_0]], %[[VAL_45]] : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.convert %[[VAL_41]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_49:.*]] = fir.convert %[[VAL_47]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_50:.*]] = arith.cmpi ne, %[[VAL_48]], %[[VAL_49]] : i1
+! CHECK:               %[[VAL_51:.*]] = fir.convert %[[VAL_50]] : (i1) -> !fir.logical<4>
+! CHECK:               fir.store %[[VAL_51]] to %[[VAL_17]] : !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x,y,z)
+  do i=1, 100
+  x = x .neqv. w(i)
+  y = y .neqv. w(i)
+  z = z .neqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
new file mode 100644
index 00000000000000..ddda24a17a8311
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-max-byref.f90
@@ -0,0 +1,90 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @max_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.maximumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @max_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+!CHECK-LABEL: @_QPreduction_max_int
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@max_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK:       %[[RES:.+]] = arith.cmpi sgt, %[[LPRV]], %[[Y_I]] : i32
+!CHECK:       %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
+!CHECK:       fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
+!CHECK:     omp.terminator
+
+!CHECK-LABEL: @_QPreduction_max_real
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@max_f_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
+!CHECK:       %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_max_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    !CHECK-NOT: omp.reduction
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
new file mode 100644
index 00000000000000..d767c54cdbc5f8
--- /dev/null
+++ b/flang/test/Lower/OpenMP/FIR/wsloop-reduction-min-byref.f90
@@ -0,0 +1,91 @@
+! RUN: bbc -emit-fir -hlfir=false -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-fir -flang-deprecated-no-hlfir -mmlir --force-byref-reduction -fopenmp -o - %s 2>&1 | FileCheck %s
+
+!CHECK: omp.reduction.declare @min_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.minimumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @min_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.minsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+!CHECK-LABEL: @_QPreduction_min_int
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@min_i_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<i32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK:       %[[RES:.+]] = arith.cmpi slt, %[[LPRV]], %[[Y_I]] : i32
+!CHECK:       %[[SEL:.+]] = arith.select %[[RES]], %[[LPRV]], %[[Y_I]]
+!CHECK:       fir.store %[[SEL]] to %[[PRV]] : !fir.ref<i32>
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+!CHECK-LABEL: @_QPreduction_min_real
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xf32>>
+!CHECK:   %[[X_REF:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
+!CHECK:   omp.parallel
+!CHECK:     omp.wsloop byref reduction(@min_f_32_byref %[[X_REF]] -> %[[PRV:.+]] : !fir.ref<f32>) for
+!CHECK:       %[[LPRV:.+]] = fir.load %[[PRV]] : !fir.ref<f32>
+!CHECK:       %[[Y_I_REF:.*]] = fir.coordinate_of %[[Y_BOX]]
+!CHECK:       %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<f32>
+!CHECK:       %[[RES:.+]] = arith.cmpf ogt, %[[Y_I]], %[[LPRV]] {{.*}} : f32
+!CHECK:       omp.yield
+!CHECK:     omp.terminator
+
+subroutine reduction_min_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_min_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+  !CHECK-NOT: omp.reduction
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/default-clause-byref.f90 b/flang/test/Lower/OpenMP/default-clause-byref.f90
new file mode 100644
index 00000000000000..5d9538e53069d6
--- /dev/null
+++ b/flang/test/Lower/OpenMP/default-clause-byref.f90
@@ -0,0 +1,385 @@
+! This test checks lowering of OpenMP parallel directive
+! with `DEFAULT` clause present.
+
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+! RUN: bbc -fopenmp -emit-hlfir --force-byref-reduction %s -o - | FileCheck %s
+
+
+!CHECK: func @_QQmain() attributes {fir.bindc_name = "default_clause_lowering"} {
+!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFEw"}
+!CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFEy"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFEz"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[CONST:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[CONST]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[CONST:.*]] = arith.constant 2 : i32
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[RESULT:.*]] = arith.muli %[[CONST]], %[[TEMP]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_W_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 45 : i32
+!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+program default_clause_lowering
+    integer :: x, y, z, w
+
+    !$omp parallel default(private) firstprivate(x) shared(z)
+        x = y * 2
+        z = w + 45
+    !$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(shared)
+        x = y
+    !$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(none) private(x, y)
+        x = y
+    !$omp end parallel
+
+!CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(firstprivate) firstprivate(y)
+        x = y
+    !$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[W_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_W_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 2 : i32
+!CHECK: %[[RESULT:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = arith.muli %[[CONST]], %[[RESULT]] : i32
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_W_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 45 : i32
+!CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+
+    !$omp parallel default(firstprivate) private(x) shared(z)
+        x = y * 2
+        z = w + 45
+    !$omp end parallel
+
+!CHECK: omp.parallel   {
+!CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[W_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_W_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+    !$omp parallel
+        !$omp parallel default(private)
+            x = y
+        !$omp end parallel
+
+        !$omp parallel default(firstprivate)
+            w = x
+        !$omp end parallel
+    !$omp end parallel
+
+end program default_clause_lowering
+
+subroutine nested_default_clause_tests
+    integer :: x, y, z, w, k, a
+!CHECK: %[[K:.*]] = fir.alloca i32 {bindc_name = "k", uniq_name = "_QFnested_default_clause_testsEk"}
+!CHECK: %[[K_DECL:.*]]:2 = hlfir.declare %[[K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[W:.*]] = fir.alloca i32 {bindc_name = "w", uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[W_DECL:.*]]:2 = hlfir.declare %[[W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[X:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Z:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[Z_DECL:.*]]:2 = hlfir.declare %[[Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel   {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
+!CHECK: %[[PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[CONST:.*]] = arith.constant 20 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 10 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel   {
+!CHECK: %[[INNER_PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[INNER_PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[INNER_PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[INNER_PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Z_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_K:.*]] = fir.alloca i32 {bindc_name = "k", pinned, uniq_name = "_QFnested_default_clause_testsEk"}
+!CHECK: %[[INNER_PRIVATE_K_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_K]] {uniq_name = "_QFnested_default_clause_testsEk"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_K_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_K_DECL]]#0 temporary_lhs : i32, !fir.ref<i32> 
+!CHECK: %[[CONST:.*]] = arith.constant 30 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[PRIVATE_Y_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 40 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 50 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_Z_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: %[[CONST:.*]] = arith.constant 40 : i32
+!CHECK: hlfir.assign %[[CONST]] to %[[INNER_PRIVATE_K_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+    !$omp parallel  firstprivate(x) private(y) shared(w) default(private)  
+        !$omp parallel default(private)
+           y = 20
+           x = 10 
+        !$omp end parallel 
+
+        !$omp parallel default(firstprivate) shared(y) private(w) 
+            y = 30
+            w = 40 
+            z = 50
+            k = 40
+        !$omp end parallel
+    !$omp end parallel
+    
+    
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_INNER_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[INNER_PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_INNER_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_INNER_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[PRIVATE_INNER_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_INNER_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_INNER_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_INNER_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP_1:.*]] = fir.load %[[PRIVATE_INNER_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP_2:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[RESULT:.*]] = arith.addi %{{.*}}, %{{.*}} : i32
+!CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_INNER_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+    !$omp parallel default(private)
+        !$omp parallel default(firstprivate)
+            x = y
+        !$omp end parallel
+
+        !$omp parallel default(private) shared(z)
+            w = x + z
+        !$omp end parallel
+    !$omp end parallel    
+    
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_W:.*]] = fir.alloca i32 {bindc_name = "w", pinned, uniq_name = "_QFnested_default_clause_testsEw"}
+!CHECK: %[[PRIVATE_W_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_W]] {uniq_name = "_QFnested_default_clause_testsEw"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[PRIVATE_Z:.*]] = fir.alloca i32 {bindc_name = "z", pinned, uniq_name = "_QFnested_default_clause_testsEz"}
+!CHECK: %[[PRIVATE_Z_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Z]] {uniq_name = "_QFnested_default_clause_testsEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.parallel {
+!CHECK: %[[INNER_PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[INNER_PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[INNER_PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[INNER_PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[INNER_PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[TEMP:.*]] = fir.load %[[INNER_PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[INNER_PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.parallel {
+!CHECK: %[[TEMP_1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP_2:.*]] = fir.load %[[PRIVATE_Z_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[TEMP_3:.*]] = arith.addi %[[TEMP_1]], %[[TEMP_2]] : i32
+!CHECK: hlfir.assign %[[TEMP_3]] to %[[PRIVATE_W_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: }
+    !$omp parallel default(private)
+		!$omp parallel default(firstprivate)
+			x = y
+		!$omp end parallel
+
+		!$omp parallel default(shared)
+			w = x + z
+		!$omp end parallel
+	!$omp end parallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFnested_default_clause_testsEx"}
+!CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFnested_default_clause_testsEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: %[[PRIVATE_Y:.*]] = fir.alloca i32 {bindc_name = "y", pinned, uniq_name = "_QFnested_default_clause_testsEy"}
+!CHECK: %[[PRIVATE_Y_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_Y]] {uniq_name = "_QFnested_default_clause_testsEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[TEMP:.*]] = fir.load %[[Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_Y_DECL]]#0 temporary_lhs : i32, !fir.ref<i32>
+!CHECK: omp.single {
+!CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_Y_DECL]]#0 : !fir.ref<i32>
+!CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+!CHECK: return
+!CHECK: } 
+	!$omp parallel default(firstprivate)
+		!$omp single
+			x = y
+		!$omp end single
+	!$omp end parallel
+end subroutine
+
+!CHECK: func.func @_QPskipped_default_clause_checks() {
+!CHECK: %[[TYPE_ADDR:.*]] = fir.address_of(@_QFskipped_default_clause_checksE.n.i1) : !fir.ref<!fir.char<1,2>>
+!CHECK: %[[VAL_CONST_2:.*]] = arith.constant 2 : index
+!CHECK: %[[VAL_I1_DECLARE:.*]]:2 = hlfir.declare %[[TYPE_ADDR]] typeparams %[[VAL_CONST_2]] {{.*}}
+!CHECK: %[[TYPE_ADDR_IT:.*]] = fir.address_of(@_QFskipped_default_clause_checksE.n.it) : !fir.ref<!fir.char<1,2>>
+!CHECK: %[[VAL_CONST_2_0:.*]] = arith.constant 2 : index
+!CHECK: %[[VAL_IT_DECLARE:.*]]:2 = hlfir.declare %[[TYPE_ADDR_IT]] typeparams %[[VAL_CONST_2_0]] {{.*}}
+!CHECK: %[[VAL_I_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFskipped_default_clause_checksEi"}
+!CHECK: %[[VAL_I_DECLARE:.*]]:2 = hlfir.declare %[[VAL_I_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_III_ALLOCA:.*]] = fir.alloca !fir.type<_QFskipped_default_clause_checksTit{i1:i32}> {bindc_name = "iii", uniq_name = "_QFskipped_default_clause_checksEiii"}
+!CHECK: %[[VAL_III_DECLARE:.*]]:2 = hlfir.declare %[[VAL_III_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_X_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFskipped_default_clause_checksEx"}
+!CHECK: %[[VAL_X_DECLARE:.*]]:2 = hlfir.declare %[[VAL_X_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_Y_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFskipped_default_clause_checksEy"}
+!CHECK: %[[VAL_Y_DECLARE:.*]]:2 = hlfir.declare %[[VAL_Y_ALLOCA]] {{.*}}
+!CHECK: %[[VAL_Z_ALLOCA:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFskipped_default_clause_checksEz"}
+!CHECK: %[[VAL_Z_DECLARE:.*]]:2 = hlfir.declare %[[VAL_Z_ALLOCA]] {{.*}}
+subroutine skipped_default_clause_checks()
+       integer :: x,y,z
+       type it
+         integer::i1
+       end type
+       type(it)::iii
+
+!CHECK: omp.parallel {
+!CHECK: omp.wsloop byref reduction(@min_i_32_byref %[[VAL_Z_DECLARE]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) for (%[[ARG:.*]]) {{.*}} {
+!CHECK: omp.yield
+!CHECK: }
+!CHECK: omp.terminator
+!CHECK: }
+       !$omp parallel do default(private) REDUCTION(MIN:z)
+         do i = 1, 10
+           x = x + MIN(y,x)
+         enddo
+       !$omp end parallel do
+
+!CHECK: omp.parallel {
+!CHECK: omp.terminator
+!CHECK: }
+       namelist /nam/i
+       !$omp parallel default(private)
+          write(1,nam )
+       !$omp endparallel
+
+!CHECK: omp.parallel {
+!CHECK: %[[PRIVATE_III_ALLOCA:.*]] = fir.alloca !fir.type<_QFskipped_default_clause_checksTit{i1:i32}> {{.*}}
+!CHECK: %[[PRIVATE_III_DECLARE:.*]]:2 = hlfir.declare %[[PRIVATE_III_ALLOCA]] {{.*}}
+!CHECK: %[[PRIVATE_ADDR:.*]] = fir.address_of(@_QQro._QFskipped_default_clause_checksTit.0) : !fir.ref<!fir.type<_QFskipped_default_clause_checksTit{i1:i32}>>
+!CHECK: %[[PRIVATE_PARAM:.*]]:2 = hlfir.declare %[[PRIVATE_ADDR]] {{.*}}
+!CHECK: hlfir.assign %[[PRIVATE_PARAM]]#0 to %[[PRIVATE_III_DECLARE]]#0 {{.*}}
+!CHECK: omp.terminator
+!CHECK: }
+       !$omp parallel default(private)
+          iii=it(11)
+       !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90 b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
new file mode 100644
index 00000000000000..067a71340b8dda
--- /dev/null
+++ b/flang/test/Lower/OpenMP/delayed-privatization-reduction-byref.f90
@@ -0,0 +1,30 @@
+! Test that reductions and delayed privatization work properly togehter. Since
+! both types of clauses add block arguments to the OpenMP region, we make sure
+! that the block arguments are added in the proper order (reductions first and
+! then delayed privatization.
+
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction --openmp-enable-delayed-privatization -o - %s 2>&1 | FileCheck %s
+
+subroutine red_and_delayed_private
+    integer :: red
+    integer :: prv
+
+    red = 0
+    prv = 10
+
+    !$omp parallel reduction(+:red) private(prv)
+    red = red + 1
+    prv = 20
+    !$omp end parallel
+end subroutine
+
+! CHECK-LABEL: omp.private {type = private}
+! CHECK-SAME: @[[PRIVATIZER_SYM:.*]] : !fir.ref<i32> alloc {
+
+! CHECK-LABEL: omp.reduction.declare
+! CHECK-SAME: @[[REDUCTION_SYM:.*]] : !fir.ref<i32> init
+
+! CHECK-LABEL: _QPred_and_delayed_private
+! CHECK: omp.parallel
+! CHECK-SAME: reduction(@[[REDUCTION_SYM]] %{{.*}} -> %arg0 : !fir.ref<i32>)
+! CHECK-SAME: private(@[[PRIVATIZER_SYM]] %{{.*}} -> %arg1 : !fir.ref<i32>) {
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90
new file mode 100644
index 00000000000000..c4a4695b8d9fc5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-add-byref.f90
@@ -0,0 +1,125 @@
+! RUN: bbc -emit-hlfir --force-byref-reduction -fopenmp -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_F32_NAME:.*]] : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<f32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  %[[REF:.*]] = fir.alloca f32
+!CHECKL  fir.store [[%C0_1]] to %[[REF]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:  %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<f32>)
+!CHECK: }
+
+!CHECK-LABEL: omp.reduction.declare
+!CHECK-SAME: @[[RED_I32_NAME:.*]] : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<i32>):
+!CHECK:  %[[C0_1:.*]] = arith.constant 0 : i32
+!CHECK:  %[[REF:.*]] = fir.alloca i32
+!CHECK:  fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: } combiner {
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:  %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:  %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:  %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+!CHECK:  fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:  omp.yield(%[[ARG0]] : !fir.ref<i32>)
+!CHECK: }
+
+!CHECK-LABEL: func.func @_QPsimple_int_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_addEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFsimple_int_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[I_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) {
+!CHECK:    %[[P_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[LPRV:.+]] = fir.load %[[P_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 1 : i32
+!CHECK:    %[[RES:.+]] = arith.addi %[[LPRV]], %[[I_INCR]] : i32
+!CHECK:    hlfir.assign %[[RES]] to %[[P_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_int_add
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+    i = i + 1
+    !$omp end parallel
+
+    print *, i
+end subroutine
+
+!CHECK-LABEL: func.func @_QPsimple_real_add
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFsimple_real_addEr"}
+!CHECK:  %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFsimple_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  omp.parallel byref reduction(@[[RED_F32_NAME]] %[[R_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[P_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:    %[[LPRV:.+]] = fir.load %[[P_DECL]]#0 : !fir.ref<f32>
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[RES:.+]] = arith.addf %[[LPRV]], %[[R_INCR]] {{.*}} : f32
+!CHECK:    hlfir.assign %[[RES]] to %[[P_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine simple_real_add
+    real :: r
+    r = 0.0
+
+    !$omp parallel reduction(+:r)
+    r = r + 1.5
+    !$omp end parallel
+
+    print *, r
+end subroutine
+
+!CHECK-LABEL: func.func @_QPint_real_add
+!CHECK:  %[[IREF:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFint_real_addEi"}
+!CHECK:  %[[I_DECL:.*]]:2 = hlfir.declare %[[IREF]] {uniq_name = "_QFint_real_addEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:  %[[RREF:.*]] = fir.alloca f32 {bindc_name = "r", uniq_name = "_QFint_real_addEr"}
+!CHECK:  %[[R_DECL:.*]]:2 = hlfir.declare %[[RREF]] {uniq_name = "_QFint_real_addEr"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:  %[[R_START:.*]] = arith.constant 0.000000e+00 : f32
+!CHECK:  hlfir.assign %[[R_START]] to %[[R_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:  %[[I_START:.*]] = arith.constant 0 : i32
+!CHECK:  hlfir.assign %[[I_START]] to %[[I_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:  omp.parallel byref reduction(@[[RED_I32_NAME]] %[[I_DECL]]#0 -> %[[IPRV:.+]] : !fir.ref<i32>, @[[RED_F32_NAME]] %[[R_DECL]]#0 -> %[[RPRV:.+]] : !fir.ref<f32>) {
+!CHECK:    %[[IP_DECL:.+]]:2 = hlfir.declare %[[IPRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[RP_DECL:.+]]:2 = hlfir.declare %[[RPRV]] {{.*}} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+!CHECK:    %[[R_INCR:.*]] = arith.constant 1.500000e+00 : f32
+!CHECK:    %[[R_LPRV:.+]] = fir.load %[[RP_DECL]]#0 : !fir.ref<f32>
+!CHECK:    %[[RES1:.+]] = arith.addf %[[R_INCR]], %[[R_LPRV]] {{.*}} : f32
+!CHECK:    hlfir.assign %[[RES1]] to %[[RP_DECL]]#0 : f32, !fir.ref<f32>
+!CHECK:    %[[I_LPRV:.+]] = fir.load %[[IP_DECL]]#0 : !fir.ref<i32>
+!CHECK:    %[[I_INCR:.*]] = arith.constant 3 : i32
+!CHECK:    %[[RES0:.+]] = arith.addi %[[I_LPRV]], %[[I_INCR]] : i32
+!CHECK:    hlfir.assign %[[RES0]] to %[[IP_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.terminator
+!CHECK:  }
+!CHECK: return
+subroutine int_real_add
+    real :: r
+    integer :: i
+
+    r = 0.0
+    i = 0
+
+    !$omp parallel reduction(+:i,r)
+    r = 1.5 + r
+    i = i + 3
+    !$omp end parallel
+
+    print *, r
+    print *, i
+end subroutine
diff --git a/flang/test/Lower/OpenMP/parallel-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90
new file mode 100644
index 00000000000000..a7c77c52674ee9
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-reduction-byref.f90
@@ -0,0 +1,44 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+!CHECK:  omp.reduction.declare @[[REDUCTION_DECLARE:[_a-z0-9]+]] : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:  ^bb0(%{{.*}}: !fir.ref<i32>):
+!CHECK:    %[[I0:[_a-z0-9]+]] = arith.constant 0 : i32
+!CHECK:  %[[REF:.*]] = fir.alloca i32
+!CHECKL  fir.store [[%I0]] to %[[REF]] : !fir.ref<i32>
+!CHECK:    omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK:  } combiner {
+!CHECK:  ^bb0(%[[C0:[_a-z0-9]+]]: !fir.ref<i32>, %[[C1:[_a-z0-9]+]]: !fir.ref<i32>):
+!CHECK:    %[[LD0:.*]] = fir.load %[[C0]] : !fir.ref<i32>
+!CHECK:    %[[LD1:.*]] = fir.load %[[C1]] : !fir.ref<i32>
+!CHECK:    %[[CR:[_a-z0-9]+]] = arith.addi %[[LD0]], %[[LD1]] : i32
+!CHECK:    fir.store %[[CR]] to %[[C0]] : !fir.ref<i32>
+!CHECK:    omp.yield(%[[C0]] : !fir.ref<i32>)
+!CHECK:  }
+!CHECK:  func.func @_QQmain() attributes {fir.bindc_name = "mn"} {
+!CHECK:    %[[RED_ACCUM_REF:[_a-z0-9]+]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFEi"}
+!CHECK:    %[[RED_ACCUM_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[RED_ACCUM_REF]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:    %[[C0:[_a-z0-9]+]] = arith.constant 0 : i32
+!CHECK:    hlfir.assign %[[C0]] to %[[RED_ACCUM_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:    omp.parallel byref reduction(@[[REDUCTION_DECLARE]] %[[RED_ACCUM_DECL]]#0 -> %[[PRIVATE_RED:[a-z0-9]+]] : !fir.ref<i32>) {
+!CHECK:      %[[PRIVATE_DECL:[_a-z0-9]+]]:2 = hlfir.declare %[[PRIVATE_RED]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK:      %[[C1:[_a-z0-9]+]] = arith.constant 1 : i32
+!CHECK:      hlfir.assign %[[C1]] to %[[PRIVATE_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK:      omp.terminator
+!CHECK:    }
+!CHECK:    %[[RED_ACCUM_VAL:[_a-z0-9]+]] = fir.load %[[RED_ACCUM_DECL]]#0 : !fir.ref<i32>
+!CHECK:    {{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[RED_ACCUM_VAL]]) fastmath<contract> : (!fir.ref<i8>, i32) -> i1
+!CHECK:    return
+!CHECK:  }
+
+program mn
+    integer :: i
+    i = 0
+
+    !$omp parallel reduction(+:i)
+      i = 1
+    !$omp end parallel
+
+    print *, i
+end program
diff --git a/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90 b/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90
new file mode 100644
index 00000000000000..8492a69fed5825
--- /dev/null
+++ b/flang/test/Lower/OpenMP/parallel-wsloop-reduction-byref.f90
@@ -0,0 +1,16 @@
+! Check that for parallel do, reduction is only processed for the loop
+
+! RUN: bbc -fopenmp --force-byref-reduction -emit-hlfir %s -o - | FileCheck %s
+! RUN: flang-new -fc1 -fopenmp -mmlir --force-byref-reduction -emit-hlfir %s -o - | FileCheck %s
+
+! CHECK: omp.parallel {
+! CHECK: omp.wsloop byref reduction(@add_reduction_i_32
+subroutine sb
+  integer :: x
+  x = 0
+  !$omp parallel do reduction(+:x)
+  do i=1,100
+    x = x + 1
+  end do
+  !$omp end parallel do
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90
new file mode 100644
index 00000000000000..e8a04c23f4ea04
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-byref.f90
@@ -0,0 +1,433 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_64_byref : !fir.ref<f64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:            %[[REF:.*]] = fir.alloca f64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_64_byref : !fir.ref<i64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i64
+! CHECK:            %[[REF:.*]] = fir.alloca i64
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i64>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_f_32_byref : !fir.ref<f32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:            %[[REF:.*]] = fir.alloca f32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+! CHECK:           %[[RES:.*]] = arith.addf %[[LD0]], %[[LD1]] fastmath<contract> : f32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> f32
+! CHECK:               %[[VAL_16:.*]] = arith.addf %[[VAL_13]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f32
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_16:.*]] = arith.addf %[[VAL_14]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = i + x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<i32>, @add_reduction_i_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<i32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = arith.addi %[[VAL_23]], %[[VAL_24]] : i32
+! CHECK:               hlfir.assign %[[VAL_25]] to %[[VAL_20]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_26:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_28:.*]] = arith.addi %[[VAL_26]], %[[VAL_27]] : i32
+! CHECK:               hlfir.assign %[[VAL_28]] to %[[VAL_21]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = arith.addi %[[VAL_29]], %[[VAL_30]] : i32
+! CHECK:               hlfir.assign %[[VAL_31]] to %[[VAL_22]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 0
+  y = 0
+  z = 0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<f32>, @add_reduction_f_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<f32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
+! CHECK:               %[[VAL_26:.*]] = arith.addf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_20]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i32) -> f32
+! CHECK:               %[[VAL_30:.*]] = arith.addf %[[VAL_27]], %[[VAL_29]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_21]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> f32
+! CHECK:               %[[VAL_34:.*]] = arith.addf %[[VAL_31]], %[[VAL_33]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_22]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 0.0
+  y = 0.0
+  z = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_10:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_11:.*]] = arith.constant 0 : i64
+! CHECK:           hlfir.assign %[[VAL_11]] to %[[VAL_7]]#0 : i64, !fir.ref<i64>
+! CHECK:           %[[VAL_12:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_12]] to %[[VAL_9]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_13:.*]] = arith.constant 0.000000e+00 : f64
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref<f64>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_14:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_16:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_17:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_18:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_19:.*]] : !fir.ref<i32>, @add_reduction_i_64_byref %[[VAL_7]]#0 -> %[[VAL_20:.*]] : !fir.ref<i64>, @add_reduction_f_32_byref %[[VAL_9]]#0 -> %[[VAL_21:.*]] : !fir.ref<f32>, @add_reduction_f_64_byref %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref<f64>)  for  (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) {
+! CHECK:               fir.store %[[VAL_23]] to %[[VAL_15]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:               %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_22]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_24]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = arith.addi %[[VAL_28]], %[[VAL_29]] : i32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_24]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_25]]#0 : !fir.ref<i64>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> i64
+! CHECK:               %[[VAL_34:.*]] = arith.addi %[[VAL_31]], %[[VAL_33]] : i64
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_25]]#0 : i64, !fir.ref<i64>
+! CHECK:               %[[VAL_35:.*]] = fir.load %[[VAL_26]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i32) -> f32
+! CHECK:               %[[VAL_38:.*]] = arith.addf %[[VAL_35]], %[[VAL_37]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_26]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<f64>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> f64
+! CHECK:               %[[VAL_42:.*]] = arith.addf %[[VAL_39]], %[[VAL_41]] fastmath<contract> : f64
+! CHECK:               hlfir.assign %[[VAL_42]] to %[[VAL_27]]#0 : f64, !fir.ref<f64>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           return
+! CHECK:         }
+
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 0
+  y = 0
+  z = 0.0
+  w = 0.0
+  !$omp parallel
+  !$omp do reduction(+:x,y,z,w)
+  do i=1, 100
+    x = x + i
+    y = y + i
+    z = z + i
+    w = w + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90
new file mode 100644
index 00000000000000..3739b3ae36ea13
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-add-hlfir-byref.f90
@@ -0,0 +1,58 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @add_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.addi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction()
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@add_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]])
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 0
+  !$omp parallel
+  !$omp do reduction(+:x)
+  do i=1, 100
+    x = x + i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
new file mode 100644
index 00000000000000..15a6dde046fee1
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-iand-byref.f90
@@ -0,0 +1,64 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @iand_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant -1 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.andi %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPreduction_iand(
+! CHECK-SAME:                                 %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_iandEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iandEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iandEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iandEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@iand_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iandEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.andi %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_20]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_iand(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(iand:x)
+  do i=1, 100
+    x = iand(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
new file mode 100644
index 00000000000000..4e0957219fa5db
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ieor-byref.f90
@@ -0,0 +1,55 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! CHECK-LABEL:   omp.reduction.declare @ieor_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.xori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+!CHECK-LABEL: @_QPreduction_ieor
+!CHECK-SAME: %[[Y_BOX:.*]]: !fir.box<!fir.array<?xi32>>
+!CHECK: %[[X_REF:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_ieorEx"}
+!CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {uniq_name = "_QFreduction_ieorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[Y_DECL:.*]]:2 = hlfir.declare %[[Y_BOX]] {uniq_name = "_QFreduction_ieorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+
+
+!CHECK: omp.parallel
+!CHECK: %[[I_REF:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+!CHECK: %[[I_DECL:.*]]:2 = hlfir.declare %[[I_REF]] {uniq_name = "_QFreduction_ieorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: omp.wsloop byref reduction(@ieor_i_32_byref %[[X_DECL]]#0 -> %[[PRV:.+]] : !fir.ref<i32>) for
+!CHECK: fir.store %{{.*}} to %[[I_DECL]]#1 : !fir.ref<i32>
+!CHECK: %[[PRV_DECL:.+]]:2 = hlfir.declare %[[PRV]] {{.*}} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+!CHECK: %[[I_32:.*]] = fir.load %[[I_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[I_64:.*]] = fir.convert %[[I_32]] : (i32) -> i64
+!CHECK: %[[Y_I_REF:.*]] = hlfir.designate %[[Y_DECL]]#0 (%[[I_64]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+!CHECK: %[[LPRV:.+]] = fir.load %[[PRV_DECL]]#0 : !fir.ref<i32>
+!CHECK: %[[Y_I:.*]] = fir.load %[[Y_I_REF]] : !fir.ref<i32>
+!CHECK: %[[RES:.+]] = arith.xori %[[LPRV]], %[[Y_I]] : i32
+!CHECK: hlfir.assign %[[RES]] to %[[PRV_DECL]]#0 : i32, !fir.ref<i32>
+!CHECK: omp.yield
+!CHECK: omp.terminator
+
+subroutine reduction_ieor(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ieor:x)
+  do i=1, 100
+    x = ieor(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
new file mode 100644
index 00000000000000..712edaf95575da
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-ior-byref.f90
@@ -0,0 +1,64 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @ior_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:            %[[C0_1:.*]] = arith.constant 0 : i32
+! CHECK:            %[[REF:.*]] = fir.alloca i32
+! CHECK:            fir.store %[[C0_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:            omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.ori %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPreduction_ior(
+! CHECK-SAME:                                %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_iorEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_iorEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_iorEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_iorEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@ior_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]])
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_iorEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.ori %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_20]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_ior(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(ior:x)
+  do i=1, 100
+    x = ior(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
new file mode 100644
index 00000000000000..4162626f926acc
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-and-byref.f90
@@ -0,0 +1,206 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @and_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant true
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.andi %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.andi %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x)
+  do i=1, 100
+     x = x .and. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine simple_reduction
+
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@and_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.andi %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x)
+  do i=1, 100
+  x = y(i) .and. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@and_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @and_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @and_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.andi %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.andi %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.andi %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.and.:x,y,z)
+  do i=1, 100
+  x = x .and. w(i)
+  y = y .and. w(i)
+  z = z .and. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
new file mode 100644
index 00000000000000..4c159f3a69f9fd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-eqv-byref.f90
@@ -0,0 +1,202 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @eqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant true
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi eq, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+    x = x .eqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi eq, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x)
+  do i=1, 100
+  x = y(i) .eqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@eqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @eqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.cmpi eq, %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.cmpi eq, %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.cmpi eq, %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.eqv.:x,y,z)
+  do i=1, 100
+  x = x .eqv. w(i)
+  y = y .eqv. w(i)
+  z = z .eqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
new file mode 100644
index 00000000000000..cfa8e47d3ca7ba
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-neqv-byref.f90
@@ -0,0 +1,207 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @neqv_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant false
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.cmpi ne, %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi ne, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+    x = x .neqv. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.cmpi ne, %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x)
+  do i=1, 100
+  x = y(i) .neqv. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@neqv_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @neqv_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.cmpi ne, %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.cmpi ne, %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.cmpi ne, %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+! CHECK:         }
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.neqv.:x,y,z)
+  do i=1, 100
+  x = x .neqv. w(i)
+  y = y .neqv. w(i)
+  z = z .neqv. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
new file mode 100644
index 00000000000000..c71ea02f9373fd
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-logical-or-byref.f90
@@ -0,0 +1,204 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @or_reduction : !fir.ref<!fir.logical<4>>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant false
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[VAL_1]] : (i1) -> !fir.logical<4>
+! CHECK:           %[[REF:.*]] = fir.alloca !fir.logical<4>
+! CHECK:           fir.store %[[VAL_2]] to %[[REF]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<!fir.logical<4>>, %[[ARG1:.*]]: !fir.ref<!fir.logical<4>>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_2:.*]] = fir.convert %[[LD0]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[VAL_3:.*]] = fir.convert %[[LD1]] : (!fir.logical<4>) -> i1
+! CHECK:           %[[RES:.*]] = arith.ori %[[VAL_2]], %[[VAL_3]] : i1
+! CHECK:           %[[VAL_5:.*]] = fir.convert %[[RES]] : (i1) -> !fir.logical<4>
+! CHECK:           fir.store %[[VAL_5]] to %[[ARG0]] : !fir.ref<!fir.logical<4>>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<!fir.logical<4>>)
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction(
+! CHECK-SAME:                                   %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reductionEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reductionEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reductionEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reductionEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = fir.convert %[[VAL_19]] : (i32) -> i64
+! CHECK:               %[[VAL_21:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_20]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_21]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_18]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.ori %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x)
+  do i=1, 100
+    x = x .or. y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_reduction_switch_order(
+! CHECK-SAME:                                                %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFsimple_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_5:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_6]]) {uniq_name = "_QFsimple_reduction_switch_orderEy"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant true
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_8]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_4]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_10:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_12:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_13:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@or_reduction %[[VAL_4]]#0 -> %[[VAL_15:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_16:.*]]) : i32 = (%[[VAL_12]]) to (%[[VAL_13]]) inclusive step (%[[VAL_14]]) {
+! CHECK:               fir.store %[[VAL_16]] to %[[VAL_11]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_17:.*]]:2 = hlfir.declare %[[VAL_15]] {uniq_name = "_QFsimple_reduction_switch_orderEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_11]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.convert %[[VAL_18]] : (i32) -> i64
+! CHECK:               %[[VAL_20:.*]] = hlfir.designate %[[VAL_7]]#0 (%[[VAL_19]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_21:.*]] = fir.load %[[VAL_20]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_22:.*]] = fir.load %[[VAL_17]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_23:.*]] = fir.convert %[[VAL_21]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_24:.*]] = fir.convert %[[VAL_22]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_25:.*]] = arith.ori %[[VAL_23]], %[[VAL_24]] : i1
+! CHECK:               %[[VAL_26:.*]] = fir.convert %[[VAL_25]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_17]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_reduction_switch_order(y)
+  logical :: x, y(100)
+  x = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x)
+  do i=1, 100
+  x = y(i) .or. x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions(
+! CHECK-SAME:                                      %[[VAL_0:.*]]: !fir.ref<!fir.array<100x!fir.logical<4>>> {fir.bindc_name = "w"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductionsEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = arith.constant 100 : index
+! CHECK:           %[[VAL_4:.*]] = fir.shape %[[VAL_3]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]](%[[VAL_4]]) {uniq_name = "_QFmultiple_reductionsEw"} : (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.shape<1>) -> (!fir.ref<!fir.array<100x!fir.logical<4>>>, !fir.ref<!fir.array<100x!fir.logical<4>>>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca !fir.logical<4> {bindc_name = "x", uniq_name = "_QFmultiple_reductionsEx"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca !fir.logical<4> {bindc_name = "y", uniq_name = "_QFmultiple_reductionsEy"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_10:.*]] = fir.alloca !fir.logical<4> {bindc_name = "z", uniq_name = "_QFmultiple_reductionsEz"}
+! CHECK:           %[[VAL_11:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:           %[[VAL_12:.*]] = arith.constant true
+! CHECK:           %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_7]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_14:.*]] = arith.constant true
+! CHECK:           %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_15]] to %[[VAL_9]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           %[[VAL_16:.*]] = arith.constant true
+! CHECK:           %[[VAL_17:.*]] = fir.convert %[[VAL_16]] : (i1) -> !fir.logical<4>
+! CHECK:           hlfir.assign %[[VAL_17]] to %[[VAL_11]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_18:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_19:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_reductionsEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_20:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_21:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_22:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@or_reduction %[[VAL_7]]#0 -> %[[VAL_23:.*]] : !fir.ref<!fir.logical<4>>, @or_reduction %[[VAL_9]]#0 -> %[[VAL_24:.*]] : !fir.ref<!fir.logical<4>>, @or_reduction %[[VAL_11]]#0 -> %[[VAL_25:.*]] : !fir.ref<!fir.logical<4>>)  for  (%[[VAL_26:.*]]) : i32 = (%[[VAL_20]]) to (%[[VAL_21]]) inclusive step (%[[VAL_22]]) {
+! CHECK:               fir.store %[[VAL_26]] to %[[VAL_19]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_23]] {uniq_name = "_QFmultiple_reductionsEx"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_28:.*]]:2 = hlfir.declare %[[VAL_24]] {uniq_name = "_QFmultiple_reductionsEy"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_29:.*]]:2 = hlfir.declare %[[VAL_25]] {uniq_name = "_QFmultiple_reductionsEz"} : (!fir.ref<!fir.logical<4>>) -> (!fir.ref<!fir.logical<4>>, !fir.ref<!fir.logical<4>>)
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_32:.*]] = fir.convert %[[VAL_31]] : (i32) -> i64
+! CHECK:               %[[VAL_33:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_32]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_34:.*]] = fir.load %[[VAL_33]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_35:.*]] = fir.convert %[[VAL_30]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_36:.*]] = fir.convert %[[VAL_34]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_37:.*]] = arith.ori %[[VAL_35]], %[[VAL_36]] : i1
+! CHECK:               %[[VAL_38:.*]] = fir.convert %[[VAL_37]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_27]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_28]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> i64
+! CHECK:               %[[VAL_42:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_41]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_43:.*]] = fir.load %[[VAL_42]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_44:.*]] = fir.convert %[[VAL_39]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_45:.*]] = fir.convert %[[VAL_43]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_46:.*]] = arith.ori %[[VAL_44]], %[[VAL_45]] : i1
+! CHECK:               %[[VAL_47:.*]] = fir.convert %[[VAL_46]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_47]] to %[[VAL_28]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_48:.*]] = fir.load %[[VAL_29]]#0 : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_49:.*]] = fir.load %[[VAL_19]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_50:.*]] = fir.convert %[[VAL_49]] : (i32) -> i64
+! CHECK:               %[[VAL_51:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_50]])  : (!fir.ref<!fir.array<100x!fir.logical<4>>>, i64) -> !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_52:.*]] = fir.load %[[VAL_51]] : !fir.ref<!fir.logical<4>>
+! CHECK:               %[[VAL_53:.*]] = fir.convert %[[VAL_48]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_54:.*]] = fir.convert %[[VAL_52]] : (!fir.logical<4>) -> i1
+! CHECK:               %[[VAL_55:.*]] = arith.ori %[[VAL_53]], %[[VAL_54]] : i1
+! CHECK:               %[[VAL_56:.*]] = fir.convert %[[VAL_55]] : (i1) -> !fir.logical<4>
+! CHECK:               hlfir.assign %[[VAL_56]] to %[[VAL_29]]#0 : !fir.logical<4>, !fir.ref<!fir.logical<4>>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+
+subroutine multiple_reductions(w)
+  logical :: x,y,z,w(100)
+  x = .true.
+  y = .true.
+  z = .true.
+  !$omp parallel
+  !$omp do reduction(.or.:x,y,z)
+  do i=1, 100
+  x = x .or. w(i)
+  y = y .or. w(i)
+  z = z .or. w(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90
new file mode 100644
index 00000000000000..360cd34df2d110
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-2-byref.f90
@@ -0,0 +1,20 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! CHECK: omp.wsloop byref reduction(@max_i_32
+! CHECK: arith.cmpi sgt
+! CHECK: arith.select
+
+module m1
+  intrinsic max
+end module m1
+program main
+  use m1, ren=>max
+  n=0
+  !$omp parallel do reduction(ren:n)
+  do i=1,100
+     n=max(n,i)
+  end do
+  if (n/=100) print *,101
+  print *,'pass'
+end program main
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
new file mode 100644
index 00000000000000..f9bffc269bb8f5
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-byref.f90
@@ -0,0 +1,152 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+!CHECK: omp.reduction.declare @max_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.maximumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @max_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
+!CHECK:   %[[REF:.*]] = fir.alloca i32
+!CHECK:   fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   func.func @_QPreduction_max_int(
+! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_intEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+! CHECK-LABEL:   func.func @_QPreduction_max_real(
+! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_realEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_max_realEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_f_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<f32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<f32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpf ogt, %[[VAL_18]], %[[VAL_19]] fastmath<contract> : f32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : f32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_30:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_max_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_32:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_33:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_34:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_f_32_byref %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref<f32>)  for  (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) {
+! CHECK:               fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_max_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (i32) -> i64
+! CHECK:               %[[VAL_40:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_39]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<f32>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_43:.*]] = arith.cmpf ogt, %[[VAL_41]], %[[VAL_42]] fastmath<contract> : f32
+! CHECK:               fir.if %[[VAL_43]] {
+! CHECK:                 %[[VAL_44:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_45:.*]] = fir.convert %[[VAL_44]] : (i32) -> i64
+! CHECK:                 %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<f32>
+! CHECK:                 hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref<f32>
+! CHECK:               } else {
+! CHECK:               }
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_max_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
new file mode 100644
index 00000000000000..a296ce47f20f25
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-hlfir-byref.f90
@@ -0,0 +1,62 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:  omp.reduction.declare @max_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:   init {
+! CHECK:        ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:          %[[MINIMUM_VAL:.*]] = arith.constant -2147483648 : i32
+! CHECK:          %[[REF:.*]] = fir.alloca i32
+! CHECK:          fir.store %[[MINIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+! CHECK:          omp.yield(%[[REF]] : !fir.ref<i32>)
+! CHECK:        combiner
+! CHECK:        ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:          %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:          %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:          %[[RES:.*]] = arith.maxsi %[[LD0]], %[[LD1]] : i32
+! CHECK:          fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:          omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   func.func @_QPreduction_max_int(
+! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_max_intEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_max_intEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_max_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_max_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@max_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_max_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi sgt, %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+subroutine reduction_max_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(max:x)
+  do i=1, 100
+    x = max(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
new file mode 100644
index 00000000000000..da9e686362b668
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min-byref.f90
@@ -0,0 +1,154 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction -o - %s 2>&1 | FileCheck %s
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+!CHECK: omp.reduction.declare @min_f_32_byref : !fir.ref<f32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 3.40282347E+38 : f32
+!CHECK:   %[[REF:.*]] = fir.alloca f32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<f32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+!CHECK:   %[[RES:.*]] = arith.minimumf %[[LD0]], %[[LD1]] {{.*}}: f32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<f32>)
+
+!CHECK-LABEL: omp.reduction.declare @min_i_32_byref : !fir.ref<i32>
+!CHECK-SAME: init {
+!CHECK:   %[[MAXIMUM_VAL:.*]] = arith.constant 2147483647 : i32
+!CHECK:   %[[REF:.*]] = fir.alloca i32
+!CHECK:   fir.store %[[MAXIMUM_VAL]] to %[[REF]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[REF]] : !fir.ref<i32>)
+!CHECK: combiner
+!CHECK: ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+!CHECK:   %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+!CHECK:   %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+!CHECK:   %[[RES:.*]] = arith.minsi %[[LD0]], %[[LD1]] : i32
+!CHECK:   fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+!CHECK:   omp.yield(%[[ARG0]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   func.func @_QPreduction_min_int(
+! CHECK-SAME:                                    %[[VAL_0:.*]]: !fir.box<!fir.array<?xi32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_min_intEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFreduction_min_intEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_intEy"} : (!fir.box<!fir.array<?xi32>>) -> (!fir.box<!fir.array<?xi32>>, !fir.box<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_intEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@min_i_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<i32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_intEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xi32>>, i64) -> !fir.ref<i32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_17]] : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : i32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+! CHECK-LABEL:   func.func @_QPreduction_min_real(
+! CHECK-SAME:                                     %[[VAL_0:.*]]: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "y"}) {
+! CHECK:           %[[VAL_1:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFreduction_min_realEi"}
+! CHECK:           %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_3:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFreduction_min_realEx"}
+! CHECK:           %[[VAL_4:.*]]:2 = hlfir.declare %[[VAL_3]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFreduction_min_realEy"} : (!fir.box<!fir.array<?xf32>>) -> (!fir.box<!fir.array<?xf32>>, !fir.box<!fir.array<?xf32>>)
+! CHECK:           %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_6]] to %[[VAL_4]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_8:.*]]:2 = hlfir.declare %[[VAL_7]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_10:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_11:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@min_f_32_byref %[[VAL_4]]#0 -> %[[VAL_12:.*]] : !fir.ref<f32>)  for  (%[[VAL_13:.*]]) : i32 = (%[[VAL_9]]) to (%[[VAL_10]]) inclusive step (%[[VAL_11]]) {
+! CHECK:               fir.store %[[VAL_13]] to %[[VAL_8]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_12]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_8]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_16:.*]] = fir.convert %[[VAL_15]] : (i32) -> i64
+! CHECK:               %[[VAL_17:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_16]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_18:.*]] = fir.load %[[VAL_17]] : !fir.ref<f32>
+! CHECK:               %[[VAL_19:.*]] = fir.load %[[VAL_14]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_20:.*]] = arith.cmpf olt, %[[VAL_18]], %[[VAL_19]] fastmath<contract> : f32
+! CHECK:               %[[VAL_21:.*]] = arith.select %[[VAL_20]], %[[VAL_18]], %[[VAL_19]] : f32
+! CHECK:               hlfir.assign %[[VAL_21]] to %[[VAL_14]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_30:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_31:.*]]:2 = hlfir.declare %[[VAL_30]] {uniq_name = "_QFreduction_min_realEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_32:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_33:.*]] = arith.constant 100 : i32
+! CHECK:             %[[VAL_34:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@min_f_32_byref %[[VAL_4]]#0 -> %[[VAL_35:.*]] : !fir.ref<f32>)  for  (%[[VAL_36:.*]]) : i32 = (%[[VAL_32]]) to (%[[VAL_33]]) inclusive step (%[[VAL_34]]) {
+! CHECK:               fir.store %[[VAL_36]] to %[[VAL_31]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]]:2 = hlfir.declare %[[VAL_35]] {uniq_name = "_QFreduction_min_realEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_38:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_39:.*]] = fir.convert %[[VAL_38]] : (i32) -> i64
+! CHECK:               %[[VAL_40:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_39]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:               %[[VAL_41:.*]] = fir.load %[[VAL_40]] : !fir.ref<f32>
+! CHECK:               %[[VAL_42:.*]] = fir.load %[[VAL_37]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_43:.*]] = arith.cmpf ogt, %[[VAL_41]], %[[VAL_42]] fastmath<contract> : f32
+! CHECK:               fir.if %[[VAL_43]] {
+! CHECK:                 %[[VAL_44:.*]] = fir.load %[[VAL_31]]#0 : !fir.ref<i32>
+! CHECK:                 %[[VAL_45:.*]] = fir.convert %[[VAL_44]] : (i32) -> i64
+! CHECK:                 %[[VAL_46:.*]] = hlfir.designate %[[VAL_5]]#0 (%[[VAL_45]])  : (!fir.box<!fir.array<?xf32>>, i64) -> !fir.ref<f32>
+! CHECK:                 %[[VAL_47:.*]] = fir.load %[[VAL_46]] : !fir.ref<f32>
+! CHECK:                 hlfir.assign %[[VAL_47]] to %[[VAL_37]]#0 : f32, !fir.ref<f32>
+! CHECK:               } else {
+! CHECK:               }
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+
+
+
+subroutine reduction_min_int(y)
+  integer :: x, y(:)
+  x = 0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(x, y(i))
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
+
+subroutine reduction_min_real(y)
+  real :: x, y(:)
+  x = 0.0
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    x = min(y(i), x)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+
+  !$omp parallel
+  !$omp do reduction(min:x)
+  do i=1, 100
+    if (y(i) .gt. x) x = y(i)
+  end do
+  !$omp end do
+  !$omp end parallel
+  print *, x
+end subroutine
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
new file mode 100644
index 00000000000000..9289973bae2029
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-min2.f90
@@ -0,0 +1,41 @@
+! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s | FileCheck %s
+
+! regression test for crash
+
+program reduce
+integer :: i = 0
+integer :: r = 0
+
+!$omp parallel do reduction(min:r)
+do i=0,10
+   r = i
+enddo
+!$omp end parallel do
+
+print *,r
+
+end program
+
+! TODO: the reduction is not curently lowered correctly. This test is checking
+! that we do not crash and we still produce the same broken IR as before.
+
+! CHECK-LABEL:   func.func @_QQmain() attributes {fir.bindc_name = "reduce"} {
+! CHECK:           %[[VAL_0:.*]] = fir.address_of(@_QFEi) : !fir.ref<i32>
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.address_of(@_QFEr) : !fir.ref<i32>
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFEr"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_4:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_6:.*]] = arith.constant 0 : i32
+! CHECK:             %[[VAL_7:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop  for  (%[[VAL_9:.*]]) : i32 = (%[[VAL_6]]) to (%[[VAL_7]]) inclusive step (%[[VAL_8]]) {
+! CHECK:               fir.store %[[VAL_9]] to %[[VAL_5]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_10:.*]] = fir.load %[[VAL_5]]#0 : !fir.ref<i32>
+! CHECK:               hlfir.assign %[[VAL_10]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             }
+! CHECK:             omp.terminator
+! CHECK:           }
diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90
new file mode 100644
index 00000000000000..00854281b87eac
--- /dev/null
+++ b/flang/test/Lower/OpenMP/wsloop-reduction-mul-byref.f90
@@ -0,0 +1,414 @@
+! RUN: bbc -emit-hlfir -fopenmp --force-byref-reduction %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-hlfir -fopenmp -mmlir --force-byref-reduction %s -o - | FileCheck %s
+
+
+! NOTE: Assertions have been autogenerated by utils/generate-test-checks.py
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_f_64_byref : !fir.ref<f64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f64>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f64
+! CHECK:           %[[REF:.*]] = fir.alloca f64
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f64>, %[[ARG1:.*]]: !fir.ref<f64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f64>
+! CHECK:           %[[RES:.*]] = arith.mulf %[[LD0]], %[[LD1]] fastmath<contract> : f64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_i_64_byref : !fir.ref<i64>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i64>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i64
+! CHECK:           %[[REF:.*]] = fir.alloca i64
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i64>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:          ^bb0(%[[ARG0:.*]]: !fir.ref<i64>, %[[ARG1:.*]]: !fir.ref<i64>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i64>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i64>
+! CHECK:           %[[RES:.*]] = arith.muli %[[LD0]], %[[LD1]] : i64
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i64>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i64>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_f_32_byref : !fir.ref<f32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<f32>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           %[[REF:.*]] = fir.alloca f32
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<f32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<f32>, %[[ARG1:.*]]: !fir.ref<f32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<f32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<f32>
+! CHECK:           %[[RES:.*]] = arith.mulf %[[LD0]], %[[LD1]] fastmath<contract> : f32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<f32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<f32>)
+! CHECK:         }
+
+! CHECK-LABEL:   omp.reduction.declare @multiply_reduction_i_32_byref : !fir.ref<i32>
+! CHECK-SAME:    init {
+! CHECK:         ^bb0(%[[VAL_0:.*]]: !fir.ref<i32>):
+! CHECK:           %[[VAL_1:.*]] = arith.constant 1 : i32
+! CHECK:           %[[REF:.*]] = fir.alloca i32
+! CHECK:           fir.store %[[VAL_1]] to %[[REF]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[REF]] : !fir.ref<i32>)
+
+! CHECK-LABEL:   } combiner {
+! CHECK:         ^bb0(%[[ARG0:.*]]: !fir.ref<i32>, %[[ARG1:.*]]: !fir.ref<i32>):
+! CHECK:           %[[LD0:.*]] = fir.load %[[ARG0]] : !fir.ref<i32>
+! CHECK:           %[[LD1:.*]] = fir.load %[[ARG1]] : !fir.ref<i32>
+! CHECK:           %[[RES:.*]] = arith.muli %[[LD0]], %[[LD1]] : i32
+! CHECK:           fir.store %[[RES]] to %[[ARG0]] : !fir.ref<i32>
+! CHECK:           omp.yield(%[[ARG0]] : !fir.ref<i32>)
+! CHECK:         }
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reductionEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.muli %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_int_reduction
+  integer :: x
+  x = 1
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+    x = x * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reductionEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reductionEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reductionEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reductionEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = fir.convert %[[VAL_14]] : (i32) -> f32
+! CHECK:               %[[VAL_16:.*]] = arith.mulf %[[VAL_13]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_real_reduction
+  real :: x
+  x = 1.0
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+    x = x * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_int_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_int_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFsimple_int_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_int_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<i32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_int_reduction_switch_orderEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_15:.*]] = arith.muli %[[VAL_13]], %[[VAL_14]] : i32
+! CHECK:               hlfir.assign %[[VAL_15]] to %[[VAL_12]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_int_reduction_switch_order
+  integer :: x
+  x = 1
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+  x = i * x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPsimple_real_reduction_switch_order() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsimple_real_reduction_switch_orderEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFsimple_real_reduction_switch_orderEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_4]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_5:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_6:.*]]:2 = hlfir.declare %[[VAL_5]] {uniq_name = "_QFsimple_real_reduction_switch_orderEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_7:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_8:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_10:.*]] : !fir.ref<f32>)  for  (%[[VAL_11:.*]]) : i32 = (%[[VAL_7]]) to (%[[VAL_8]]) inclusive step (%[[VAL_9]]) {
+! CHECK:               fir.store %[[VAL_11]] to %[[VAL_6]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_10]] {uniq_name = "_QFsimple_real_reduction_switch_orderEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_13:.*]] = fir.load %[[VAL_6]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i32) -> f32
+! CHECK:               %[[VAL_15:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_16:.*]] = arith.mulf %[[VAL_14]], %[[VAL_15]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_16]] to %[[VAL_12]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine simple_real_reduction_switch_order
+  real :: x
+  x = 1.0
+  !$omp parallel
+  !$omp do reduction(*:x)
+  do i=1, 10
+  x = i * x
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_int_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_int_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_int_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "y", uniq_name = "_QFmultiple_int_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i32 {bindc_name = "z", uniq_name = "_QFmultiple_int_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : i32, !fir.ref<i32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_int_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<i32>, @multiply_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<i32>, @multiply_reduction_i_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<i32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_int_reductions_same_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_int_reductions_same_typeEy"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_int_reductions_same_typeEz"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = arith.muli %[[VAL_23]], %[[VAL_24]] : i32
+! CHECK:               hlfir.assign %[[VAL_25]] to %[[VAL_20]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_26:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_28:.*]] = arith.muli %[[VAL_26]], %[[VAL_27]] : i32
+! CHECK:               hlfir.assign %[[VAL_28]] to %[[VAL_21]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = arith.muli %[[VAL_29]], %[[VAL_30]] : i32
+! CHECK:               hlfir.assign %[[VAL_31]] to %[[VAL_22]]#0 : i32, !fir.ref<i32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_int_reductions_same_type
+  integer :: x,y,z
+  x = 1
+  y = 1
+  z = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z)
+  do i=1, 10
+  x = x * i
+  y = y * i
+  z = z * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_real_reductions_same_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_real_reductions_same_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f32 {bindc_name = "x", uniq_name = "_QFmultiple_real_reductions_same_typeEx"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca f32 {bindc_name = "y", uniq_name = "_QFmultiple_real_reductions_same_typeEy"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_real_reductions_same_typeEz"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_8:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_8]] to %[[VAL_3]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_9:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_9]] to %[[VAL_5]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_7]]#0 : f32, !fir.ref<f32>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_11:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_12:.*]]:2 = hlfir.declare %[[VAL_11]] {uniq_name = "_QFmultiple_real_reductions_same_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_13:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_14:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_15:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_f_32_byref %[[VAL_3]]#0 -> %[[VAL_16:.*]] : !fir.ref<f32>, @multiply_reduction_f_32_byref %[[VAL_5]]#0 -> %[[VAL_17:.*]] : !fir.ref<f32>, @multiply_reduction_f_32_byref %[[VAL_7]]#0 -> %[[VAL_18:.*]] : !fir.ref<f32>)  for  (%[[VAL_19:.*]]) : i32 = (%[[VAL_13]]) to (%[[VAL_14]]) inclusive step (%[[VAL_15]]) {
+! CHECK:               fir.store %[[VAL_19]] to %[[VAL_12]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_20:.*]]:2 = hlfir.declare %[[VAL_16]] {uniq_name = "_QFmultiple_real_reductions_same_typeEx"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_21:.*]]:2 = hlfir.declare %[[VAL_17]] {uniq_name = "_QFmultiple_real_reductions_same_typeEy"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_22:.*]]:2 = hlfir.declare %[[VAL_18]] {uniq_name = "_QFmultiple_real_reductions_same_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_23:.*]] = fir.load %[[VAL_20]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_24:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_25:.*]] = fir.convert %[[VAL_24]] : (i32) -> f32
+! CHECK:               %[[VAL_26:.*]] = arith.mulf %[[VAL_23]], %[[VAL_25]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_26]] to %[[VAL_20]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_27:.*]] = fir.load %[[VAL_21]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.convert %[[VAL_28]] : (i32) -> f32
+! CHECK:               %[[VAL_30:.*]] = arith.mulf %[[VAL_27]], %[[VAL_29]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_21]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_22]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_12]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> f32
+! CHECK:               %[[VAL_34:.*]] = arith.mulf %[[VAL_31]], %[[VAL_33]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_22]]#0 : f32, !fir.ref<f32>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+subroutine multiple_real_reductions_same_type
+  real :: x,y,z
+  x = 1
+  y = 1
+  z = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z)
+  do i=1, 10
+    x = x * i
+    y = y * i
+    z = z * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
+
+! CHECK-LABEL:   func.func @_QPmultiple_reductions_different_type() {
+! CHECK:           %[[VAL_0:.*]] = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFmultiple_reductions_different_typeEi"}
+! CHECK:           %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_2:.*]] = fir.alloca f64 {bindc_name = "w", uniq_name = "_QFmultiple_reductions_different_typeEw"}
+! CHECK:           %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_2]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:           %[[VAL_4:.*]] = fir.alloca i32 {bindc_name = "x", uniq_name = "_QFmultiple_reductions_different_typeEx"}
+! CHECK:           %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_4]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:           %[[VAL_6:.*]] = fir.alloca i64 {bindc_name = "y", uniq_name = "_QFmultiple_reductions_different_typeEy"}
+! CHECK:           %[[VAL_7:.*]]:2 = hlfir.declare %[[VAL_6]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:           %[[VAL_8:.*]] = fir.alloca f32 {bindc_name = "z", uniq_name = "_QFmultiple_reductions_different_typeEz"}
+! CHECK:           %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_8]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:           %[[VAL_10:.*]] = arith.constant 1 : i32
+! CHECK:           hlfir.assign %[[VAL_10]] to %[[VAL_5]]#0 : i32, !fir.ref<i32>
+! CHECK:           %[[VAL_11:.*]] = arith.constant 1 : i64
+! CHECK:           hlfir.assign %[[VAL_11]] to %[[VAL_7]]#0 : i64, !fir.ref<i64>
+! CHECK:           %[[VAL_12:.*]] = arith.constant 1.000000e+00 : f32
+! CHECK:           hlfir.assign %[[VAL_12]] to %[[VAL_9]]#0 : f32, !fir.ref<f32>
+! CHECK:           %[[VAL_13:.*]] = arith.constant 1.000000e+00 : f64
+! CHECK:           hlfir.assign %[[VAL_13]] to %[[VAL_3]]#0 : f64, !fir.ref<f64>
+! CHECK:           omp.parallel {
+! CHECK:             %[[VAL_14:.*]] = fir.alloca i32 {adapt.valuebyref, pinned}
+! CHECK:             %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_14]] {uniq_name = "_QFmultiple_reductions_different_typeEi"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:             %[[VAL_16:.*]] = arith.constant 1 : i32
+! CHECK:             %[[VAL_17:.*]] = arith.constant 10 : i32
+! CHECK:             %[[VAL_18:.*]] = arith.constant 1 : i32
+! CHECK:             omp.wsloop byref reduction(@multiply_reduction_i_32_byref %[[VAL_5]]#0 -> %[[VAL_19:.*]] : !fir.ref<i32>, @multiply_reduction_i_64_byref %[[VAL_7]]#0 -> %[[VAL_20:.*]] : !fir.ref<i64>, @multiply_reduction_f_32_byref %[[VAL_9]]#0 -> %[[VAL_21:.*]] : !fir.ref<f32>, @multiply_reduction_f_64_byref %[[VAL_3]]#0 -> %[[VAL_22:.*]] : !fir.ref<f64>)  for  (%[[VAL_23:.*]]) : i32 = (%[[VAL_16]]) to (%[[VAL_17]]) inclusive step (%[[VAL_18]]) {
+! CHECK:               fir.store %[[VAL_23]] to %[[VAL_15]]#1 : !fir.ref<i32>
+! CHECK:               %[[VAL_24:.*]]:2 = hlfir.declare %[[VAL_19]] {uniq_name = "_QFmultiple_reductions_different_typeEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+! CHECK:               %[[VAL_25:.*]]:2 = hlfir.declare %[[VAL_20]] {uniq_name = "_QFmultiple_reductions_different_typeEy"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
+! CHECK:               %[[VAL_26:.*]]:2 = hlfir.declare %[[VAL_21]] {uniq_name = "_QFmultiple_reductions_different_typeEz"} : (!fir.ref<f32>) -> (!fir.ref<f32>, !fir.ref<f32>)
+! CHECK:               %[[VAL_27:.*]]:2 = hlfir.declare %[[VAL_22]] {uniq_name = "_QFmultiple_reductions_different_typeEw"} : (!fir.ref<f64>) -> (!fir.ref<f64>, !fir.ref<f64>)
+! CHECK:               %[[VAL_28:.*]] = fir.load %[[VAL_24]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_29:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_30:.*]] = arith.muli %[[VAL_28]], %[[VAL_29]] : i32
+! CHECK:               hlfir.assign %[[VAL_30]] to %[[VAL_24]]#0 : i32, !fir.ref<i32>
+! CHECK:               %[[VAL_31:.*]] = fir.load %[[VAL_25]]#0 : !fir.ref<i64>
+! CHECK:               %[[VAL_32:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_33:.*]] = fir.convert %[[VAL_32]] : (i32) -> i64
+! CHECK:               %[[VAL_34:.*]] = arith.muli %[[VAL_31]], %[[VAL_33]] : i64
+! CHECK:               hlfir.assign %[[VAL_34]] to %[[VAL_25]]#0 : i64, !fir.ref<i64>
+! CHECK:               %[[VAL_35:.*]] = fir.load %[[VAL_26]]#0 : !fir.ref<f32>
+! CHECK:               %[[VAL_36:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_37:.*]] = fir.convert %[[VAL_36]] : (i32) -> f32
+! CHECK:               %[[VAL_38:.*]] = arith.mulf %[[VAL_35]], %[[VAL_37]] fastmath<contract> : f32
+! CHECK:               hlfir.assign %[[VAL_38]] to %[[VAL_26]]#0 : f32, !fir.ref<f32>
+! CHECK:               %[[VAL_39:.*]] = fir.load %[[VAL_27]]#0 : !fir.ref<f64>
+! CHECK:               %[[VAL_40:.*]] = fir.load %[[VAL_15]]#0 : !fir.ref<i32>
+! CHECK:               %[[VAL_41:.*]] = fir.convert %[[VAL_40]] : (i32) -> f64
+! CHECK:               %[[VAL_42:.*]] = arith.mulf %[[VAL_39]], %[[VAL_41]] fastmath<contract> : f64
+! CHECK:               hlfir.assign %[[VAL_42]] to %[[VAL_27]]#0 : f64, !fir.ref<f64>
+! CHECK:               omp.yield
+! CHECK:             omp.terminator
+! CHECK:           return
+
+
+subroutine multiple_reductions_different_type
+  integer :: x
+  integer(kind=8) :: y
+  real :: z
+  real(kind=8) :: w
+  x = 1
+  y = 1
+  z = 1
+  w = 1
+  !$omp parallel
+  !$omp do reduction(*:x,y,z,w)
+  do i=1, 10
+    x = x * i
+    y = y * i
+    z = z * i
+    w = w * i
+  end do
+  !$omp end do
+  !$omp end parallel
+end subroutine
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 5bbaa8c208b8cd..c9ee0c25194c23 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -1339,10 +1339,12 @@ class OpenMPIRBuilder {
   ///                           in reductions.
   /// \param ReductionInfos     A list of info on each reduction variable.
   /// \param IsNoWait           A flag set if the reduction is marked as nowait.
+  /// \param IsByRef            A flag set if the reduction is using reference
+  /// or direct value.
   InsertPointTy createReductions(const LocationDescription &Loc,
                                  InsertPointTy AllocaIP,
                                  ArrayRef<ReductionInfo> ReductionInfos,
-                                 bool IsNoWait = false);
+                                 bool IsNoWait = false, bool IsByRef = false);
 
   ///}
 
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index d65ed8c11d86cc..c74c898e1e882d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2110,7 +2110,7 @@ Function *getFreshReductionFunc(Module &M) {
 
 OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     const LocationDescription &Loc, InsertPointTy AllocaIP,
-    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait) {
+    ArrayRef<ReductionInfo> ReductionInfos, bool IsNoWait, bool IsByRef) {
   for (const ReductionInfo &RI : ReductionInfos) {
     (void)RI;
     assert(RI.Variable && "expected non-null variable");
@@ -2197,17 +2197,29 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   for (auto En : enumerate(ReductionInfos)) {
     const ReductionInfo &RI = En.value();
     Type *ValueType = RI.ElementType;
-    Value *RedValue = Builder.CreateLoad(ValueType, RI.Variable,
-                                         "red.value." + Twine(En.index()));
+    // We have one less load for by-ref case because that load is now inside of
+    // the reduction region
+    Value *RedValue = nullptr;
+    if (!IsByRef) {
+      RedValue = Builder.CreateLoad(ValueType, RI.Variable,
+                                    "red.value." + Twine(En.index()));
+    }
     Value *PrivateRedValue =
         Builder.CreateLoad(ValueType, RI.PrivateVariable,
                            "red.private.value." + Twine(En.index()));
     Value *Reduced;
-    Builder.restoreIP(
-        RI.ReductionGen(Builder.saveIP(), RedValue, PrivateRedValue, Reduced));
+    if (IsByRef) {
+      Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RI.Variable,
+                                        PrivateRedValue, Reduced));
+    } else {
+      Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), RedValue,
+                                        PrivateRedValue, Reduced));
+    }
     if (!Builder.GetInsertBlock())
       return InsertPointTy();
-    Builder.CreateStore(Reduced, RI.Variable);
+    // for by-ref case, the load is inside of the reduction region
+    if (!IsByRef)
+      Builder.CreateStore(Reduced, RI.Variable);
   }
   Function *EndReduceFunc = getOrCreateRuntimeFunctionPtr(
       IsNoWait ? RuntimeFunction::OMPRTL___kmpc_end_reduce_nowait
@@ -2219,7 +2231,7 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
   // function. There are no loads/stores here because they will be happening
   // inside the atomic elementwise reduction.
   Builder.SetInsertPoint(AtomicRedBlock);
-  if (CanGenerateAtomic) {
+  if (CanGenerateAtomic && !IsByRef) {
     for (const ReductionInfo &RI : ReductionInfos) {
       Builder.restoreIP(RI.AtomicReductionGen(Builder.saveIP(), RI.ElementType,
                                               RI.Variable, RI.PrivateVariable));
@@ -2257,7 +2269,9 @@ OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createReductions(
     Builder.restoreIP(RI.ReductionGen(Builder.saveIP(), LHS, RHS, Reduced));
     if (!Builder.GetInsertBlock())
       return InsertPointTy();
-    Builder.CreateStore(Reduced, LHSPtr);
+    // store is inside of the reduction region when using by-ref
+    if (!IsByRef)
+      Builder.CreateStore(Reduced, LHSPtr);
   }
   Builder.CreateRetVoid();
 
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 39ff49f0ccf541..0bd402d626dc42 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -268,6 +268,9 @@ def ParallelOp : OpenMP_Op<"parallel", [
 
     The optional $proc_bind_val attribute controls the thread affinity for the execution
     of the parallel region.
+
+    The optional byref attribute controls whether reduction arguments are passed by
+    reference or by value.
   }];
 
   let arguments = (ins Optional<I1>:$if_expr_var,
@@ -278,7 +281,8 @@ def ParallelOp : OpenMP_Op<"parallel", [
              OptionalAttr<SymbolRefArrayAttr>:$reductions,
              OptionalAttr<ProcBindKindAttr>:$proc_bind_val,
              Variadic<AnyType>:$private_vars,
-             OptionalAttr<SymbolRefArrayAttr>:$privatizers);
+             OptionalAttr<SymbolRefArrayAttr>:$privatizers,
+             UnitAttr:$byref);
 
   let regions = (region AnyRegion:$region);
 
@@ -299,6 +303,7 @@ def ParallelOp : OpenMP_Op<"parallel", [
                 $allocators_vars, type($allocators_vars)
               ) `)`
           | `proc_bind` `(` custom<ClauseAttr>($proc_bind_val) `)`
+          | `byref` $byref
     ) custom<ParallelRegion>($region, $reduction_vars, type($reduction_vars),
                              $reductions, $private_vars, type($private_vars),
                              $privatizers) attr-dict
@@ -570,6 +575,9 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
     The optional `order` attribute specifies which order the iterations of the
     associate loops are executed in. Currently the only option for this
     attribute is "concurrent".
+    
+    The optional `byref` attribute indicates that reduction arguments should be
+    passed by reference.
   }];
 
   let arguments = (ins Variadic<IntLikeType>:$lowerBound,
@@ -584,6 +592,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
              OptionalAttr<ScheduleModifierAttr>:$schedule_modifier,
              UnitAttr:$simd_modifier,
              UnitAttr:$nowait,
+             UnitAttr:$byref,
              ConfinedAttr<OptionalAttr<I64Attr>, [IntMinValue<0>]>:$ordered_val,
              OptionalAttr<OrderKindAttr>:$order_val,
              UnitAttr:$inclusive);
@@ -613,6 +622,7 @@ def WsLoopOp : OpenMP_Op<"wsloop", [AttrSizedOperandSegments,
                 $schedule_val, $schedule_modifier, $simd_modifier,
                 $schedule_chunk_var, type($schedule_chunk_var)) `)`
           |`nowait` $nowait
+          |`byref` $byref
           |`ordered` `(` $ordered_val `)`
           |`order` `(` custom<ClauseAttr>($order_val) `)`
     ) custom<WsLoop>($region, $lowerBound, $upperBound, $step,
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 8a6980e2c6a2d9..e7b899aec4afbf 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1209,7 +1209,7 @@ void ParallelOp::build(OpBuilder &builder, OperationState &state,
       /*allocate_vars=*/ValueRange(), /*allocators_vars=*/ValueRange(),
       /*reduction_vars=*/ValueRange(), /*reductions=*/nullptr,
       /*proc_bind_val=*/nullptr, /*private_vars=*/ValueRange(),
-      /*privatizers=*/nullptr);
+      /*privatizers=*/nullptr, /*byref=*/false);
   state.addAttributes(attributes);
 }
 
@@ -1674,7 +1674,8 @@ void WsLoopOp::build(OpBuilder &builder, OperationState &state,
         /*linear_step_vars=*/ValueRange(), /*reduction_vars=*/ValueRange(),
         /*reductions=*/nullptr, /*schedule_val=*/nullptr,
         /*schedule_chunk_var=*/nullptr, /*schedule_modifier=*/nullptr,
-        /*simd_modifier=*/false, /*nowait=*/false, /*ordered_val=*/nullptr,
+        /*simd_modifier=*/false, /*nowait=*/false, /*byref=*/false,
+        /*ordered_val=*/nullptr,
         /*order_val=*/nullptr, /*inclusive=*/false);
   state.addAttributes(attributes);
 }
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index bef227f2c583ed..5027f2afe92154 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -805,12 +805,12 @@ convertOmpTaskgroupOp(omp::TaskGroupOp tgOp, llvm::IRBuilderBase &builder,
 /// Allocate space for privatized reduction variables.
 template <typename T>
 static void
-allocReductionVars(T loop, llvm::IRBuilderBase &builder,
-                   LLVM::ModuleTranslation &moduleTranslation,
-                   llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
-                   SmallVector<omp::ReductionDeclareOp> &reductionDecls,
-                   SmallVector<llvm::Value *> &privateReductionVariables,
-                   DenseMap<Value, llvm::Value *> &reductionVariableMap) {
+allocByValReductionVars(T loop, llvm::IRBuilderBase &builder,
+                        LLVM::ModuleTranslation &moduleTranslation,
+                        llvm::OpenMPIRBuilder::InsertPointTy &allocaIP,
+                        SmallVector<omp::ReductionDeclareOp> &reductionDecls,
+                        SmallVector<llvm::Value *> &privateReductionVariables,
+                        DenseMap<Value, llvm::Value *> &reductionVariableMap) {
   llvm::IRBuilderBase::InsertPointGuard guard(builder);
   builder.restoreIP(allocaIP);
   auto args =
@@ -863,6 +863,7 @@ static LogicalResult
 convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
                  LLVM::ModuleTranslation &moduleTranslation) {
   auto loop = cast<omp::WsLoopOp>(opInst);
+  const bool isByRef = loop.getByref();
   // TODO: this should be in the op verifier instead.
   if (loop.getLowerBound().empty())
     return failure();
@@ -888,18 +889,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
 
   SmallVector<llvm::Value *> privateReductionVariables;
   DenseMap<Value, llvm::Value *> reductionVariableMap;
-  allocReductionVars(loop, builder, moduleTranslation, allocaIP, reductionDecls,
-                     privateReductionVariables, reductionVariableMap);
-
-  // Store the mapping between reduction variables and their private copies on
-  // ModuleTranslation stack. It can be then recovered when translating
-  // omp.reduce operations in a separate call.
-  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
-      moduleTranslation, reductionVariableMap);
+  if (!isByRef) {
+    allocByValReductionVars(loop, builder, moduleTranslation, allocaIP,
+                            reductionDecls, privateReductionVariables,
+                            reductionVariableMap);
+  }
 
   // Before the loop, store the initial values of reductions into reduction
   // variables. Although this could be done after allocas, we don't want to mess
   // up with the alloca insertion point.
+  MutableArrayRef<BlockArgument> reductionArgs =
+      loop.getRegion().getArguments().take_back(loop.getNumReductionVars());
   for (unsigned i = 0; i < loop.getNumReductionVars(); ++i) {
     SmallVector<llvm::Value *> phis;
     if (failed(inlineConvertOmpRegions(reductionDecls[i].getInitializerRegion(),
@@ -908,9 +908,31 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
       return failure();
     assert(phis.size() == 1 && "expected one value to be yielded from the "
                                "reduction neutral element declaration region");
-    builder.CreateStore(phis[0], privateReductionVariables[i]);
+    if (isByRef) {
+      // Allocate reduction variable (which is a pointer to the real reduction
+      // variable allocated in the inlined region)
+      llvm::Value *var = builder.CreateAlloca(
+          moduleTranslation.convertType(reductionDecls[i].getType()));
+      // Store the result of the inlined region to the allocated reduction var
+      // ptr
+      builder.CreateStore(phis[0], var);
+
+      privateReductionVariables.push_back(var);
+      moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+      reductionVariableMap.try_emplace(loop.getReductionVars()[i], phis[0]);
+    } else {
+      // for by-ref case the store is inside of the reduction region
+      builder.CreateStore(phis[0], privateReductionVariables[i]);
+      // the rest was handled in allocByValReductionVars
+    }
   }
 
+  // Store the mapping between reduction variables and their private copies on
+  // ModuleTranslation stack. It can be then recovered when translating
+  // omp.reduce operations in a separate call.
+  LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+      moduleTranslation, reductionVariableMap);
+
   // Set up the source location value for OpenMP runtime.
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
 
@@ -1014,7 +1036,7 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
   builder.SetInsertPoint(tempTerminator);
   llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
       ompBuilder->createReductions(builder.saveIP(), allocaIP, reductionInfos,
-                                   loop.getNowait());
+                                   loop.getNowait(), isByRef);
   if (!contInsertPoint.getBlock())
     return loop->emitOpError() << "failed to convert reductions";
   auto nextInsertionPoint =
@@ -1068,6 +1090,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
                    LLVM::ModuleTranslation &moduleTranslation) {
   using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy;
   OmpParallelOpConversionManager raii(opInst);
+  const bool isByRef = opInst.getByref();
 
   // TODO: support error propagation in OpenMPIRBuilder and use it instead of
   // relying on captured variables.
@@ -1082,18 +1105,17 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
     // Allocate reduction vars
     SmallVector<llvm::Value *> privateReductionVariables;
     DenseMap<Value, llvm::Value *> reductionVariableMap;
-    allocReductionVars(opInst, builder, moduleTranslation, allocaIP,
-                       reductionDecls, privateReductionVariables,
-                       reductionVariableMap);
-
-    // Store the mapping between reduction variables and their private copies on
-    // ModuleTranslation stack. It can be then recovered when translating
-    // omp.reduce operations in a separate call.
-    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
-        moduleTranslation, reductionVariableMap);
+    if (!isByRef) {
+      allocByValReductionVars(opInst, builder, moduleTranslation, allocaIP,
+                              reductionDecls, privateReductionVariables,
+                              reductionVariableMap);
+    }
 
     // Initialize reduction vars
     builder.restoreIP(allocaIP);
+    MutableArrayRef<BlockArgument> reductionArgs =
+        opInst.getRegion().getArguments().take_back(
+            opInst.getNumReductionVars());
     for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) {
       SmallVector<llvm::Value *> phis;
       if (failed(inlineConvertOmpRegions(
@@ -1104,9 +1126,32 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
              "expected one value to be yielded from the "
              "reduction neutral element declaration region");
       builder.restoreIP(allocaIP);
-      builder.CreateStore(phis[0], privateReductionVariables[i]);
+
+      if (isByRef) {
+        // Allocate reduction variable (which is a pointer to the real reduciton
+        // variable allocated in the inlined region)
+        llvm::Value *var = builder.CreateAlloca(
+            moduleTranslation.convertType(reductionDecls[i].getType()));
+        // Store the result of the inlined region to the allocated reduction var
+        // ptr
+        builder.CreateStore(phis[0], var);
+
+        privateReductionVariables.push_back(var);
+        moduleTranslation.mapValue(reductionArgs[i], phis[0]);
+        reductionVariableMap.try_emplace(opInst.getReductionVars()[i], phis[0]);
+      } else {
+        // for by-ref case the store is inside of the reduction init region
+        builder.CreateStore(phis[0], privateReductionVariables[i]);
+        // the rest is done in allocByValReductionVars
+      }
     }
 
+    // Store the mapping between reduction variables and their private copies on
+    // ModuleTranslation stack. It can be then recovered when translating
+    // omp.reduce operations in a separate call.
+    LLVM::ModuleTranslation::SaveStack<OpenMPVarMappingStackFrame> mappingGuard(
+        moduleTranslation, reductionVariableMap);
+
     // Save the alloca insertion point on ModuleTranslation stack for use in
     // nested regions.
     LLVM::ModuleTranslation::SaveStack<OpenMPAllocaStackFrame> frame(
@@ -1137,7 +1182,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder,
 
       llvm::OpenMPIRBuilder::InsertPointTy contInsertPoint =
           ompBuilder->createReductions(builder.saveIP(), allocaIP,
-                                       reductionInfos, false);
+                                       reductionInfos, false, isByRef);
       if (!contInsertPoint.getBlock()) {
         bodyGenStatus = opInst->emitOpError() << "failed to convert reductions";
         return;
diff --git a/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
new file mode 100644
index 00000000000000..4ac1ebd43e1ee0
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/openmp-reduction-byref.mlir
@@ -0,0 +1,66 @@
+// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s
+
+  omp.reduction.declare @add_reduction_i_32 : !llvm.ptr init {
+  ^bb0(%arg0: !llvm.ptr):
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    %1 = llvm.mlir.constant(1 : i64) : i64
+    %2 = llvm.alloca %1 x i32 : (i64) -> !llvm.ptr
+    llvm.store %0, %2 : i32, !llvm.ptr
+    omp.yield(%2 : !llvm.ptr)
+  } combiner {
+  ^bb0(%arg0: !llvm.ptr, %arg1: !llvm.ptr):
+    %0 = llvm.load %arg0 : !llvm.ptr -> i32
+    %1 = llvm.load %arg1 : !llvm.ptr -> i32
+    %2 = llvm.add %0, %1  : i32
+    llvm.store %2, %arg0 : i32, !llvm.ptr
+    omp.yield(%arg0 : !llvm.ptr)
+  } 
+
+  // CHECK-LABEL: @main
+  llvm.func @main()  {
+    %0 = llvm.mlir.constant(-1 : i32) : i32
+    %1 = llvm.mlir.addressof @i : !llvm.ptr
+    omp.parallel byref reduction(@add_reduction_i_32 %1 -> %arg0 : !llvm.ptr) {
+      llvm.store %0, %arg0 : i32, !llvm.ptr
+      omp.terminator
+    }
+    llvm.return
+  }
+  llvm.mlir.global internal @i() {addr_space = 0 : i32} : i32 {
+    %0 = llvm.mlir.constant(0 : i32) : i32
+    llvm.return %0 : i32
+  }
+
+// CHECK: %{{.+}} = 
+// Call to the outlined function.
+// CHECK: call void {{.*}} @__kmpc_fork_call
+// CHECK-SAME: @[[OUTLINED:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+// Outlined function.
+// CHECK: define internal void @[[OUTLINED]]
+
+// Private reduction variable and its initialization.
+// CHECK: %tid.addr.local = alloca i32
+// CHECK: %[[PRIVATE:.+]] = alloca i32
+// CHECK: store i32 0, ptr %[[PRIVATE]]
+// CHECK: store ptr %[[PRIVATE]], ptr %[[PRIV_PTR:.+]],
+
+// Call to the reduction function.
+// CHECK: call i32 @__kmpc_reduce
+// CHECK-SAME: @[[REDFUNC:[A-Za-z_.][A-Za-z0-9_.]*]]
+
+
+// Non-atomic reduction:
+// CHECK: %[[PRIV_VAL_PTR:.+]] = load ptr, ptr %[[PRIV_PTR]]
+// CHECK: %[[LOAD:.+]] = load i32, ptr @i
+// CHECK: %[[PRIV_VAL:.+]] = load i32, ptr %[[PRIV_VAL_PTR]]
+// CHECK: %[[SUM:.+]] = add i32 %[[LOAD]], %[[PRIV_VAL]]
+// CHECK: store i32 %[[SUM]], ptr @i
+// CHECK: call void @__kmpc_end_reduce
+// CHECK: br label %[[FINALIZE:.+]]
+
+// CHECK: [[FINALIZE]]:
+
+// Reduction function.
+// CHECK: define internal void @[[REDFUNC]]
+// CHECK: add i32

>From 440a3bbd5a56b50a2cf471df44e5d568a0ae8eca Mon Sep 17 00:00:00 2001
From: Jon Roelofs <jonathan_roelofs at apple.com>
Date: Wed, 13 Mar 2024 08:02:22 -0700
Subject: [PATCH 42/63] [cmake] Silence a duplicate libraries warning from
 Apple's linker (#85012)

ld: warning: ignoring duplicate libraries:

This triggers quite frequently in llvm's build because CMake's library
depends mechanism doesn't de-duplicate libraries on the link line.
Duplication is necessary for ELF platforms, but means something subtly
different on Darwin platforms, hence the warning. Since we don't have
much control over that from CMake, just disable the warning wholesale
whenever the linker is detected to support it.
---
 llvm/cmake/modules/AddLLVM.cmake | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake
index 374f5e085d9118..eb9e6101bdce71 100644
--- a/llvm/cmake/modules/AddLLVM.cmake
+++ b/llvm/cmake/modules/AddLLVM.cmake
@@ -257,6 +257,16 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32)
       message(STATUS "Linker detection: unknown")
     endif()
   endif()
+
+  # Apple's linker complains about duplicate libraries, which CMake likes to do
+  # to support ELF platforms. To silence that warning, we can use
+  # -no_warn_duplicate_libraries, but only in versions of the linker that
+  # support that flag.
+  if(NOT LLVM_USE_LINKER AND ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
+    check_linker_flag(C "-Wl,-no_warn_duplicate_libraries" LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES)
+  else()
+    set(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES OFF CACHE INTERNAL "")
+  endif()
 endif()
 
 function(add_link_opts target_name)
@@ -310,6 +320,11 @@ function(add_link_opts target_name)
     endif()
   endif()
 
+  if(LLVM_LINKER_SUPPORTS_NO_WARN_DUPLICATE_LIBRARIES)
+    set_property(TARGET ${target_name} APPEND_STRING PROPERTY
+                 LINK_FLAGS " -Wl,-no_warn_duplicate_libraries")
+  endif()
+
   if(ARG_SUPPORT_PLUGINS AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX")
     set_property(TARGET ${target_name} APPEND_STRING PROPERTY
                  LINK_FLAGS " -Wl,-brtl")

>From 8e0ccce81fd4937cccc7efa20a75683fd2d849c7 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Wed, 13 Mar 2024 16:09:25 +0100
Subject: [PATCH 43/63] [InstCombine] Don't generate crash dialog for fixpoint
 verification failure (NFC)

Fixpoint verification failures outside our tests are usually not
indicative of a bug -- don't be pushy about having people report them.
---
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 1688005de2104d..c9bbe437b368b3 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -5202,7 +5202,8 @@ static bool combineInstructionsOverFunction(
     if (Iteration > Opts.MaxIterations) {
       report_fatal_error(
           "Instruction Combining did not reach a fixpoint after " +
-          Twine(Opts.MaxIterations) + " iterations");
+              Twine(Opts.MaxIterations) + " iterations",
+          /*GenCrashDiag=*/false);
     }
   }
 

>From 8bfc5e021590fa2c00b4fd5db000715ce08c5537 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev at outlook.com>
Date: Wed, 13 Mar 2024 07:45:14 -0700
Subject: [PATCH 44/63] [SLP]Fix PR85082: PHI node has multiple entries.

Need to record casted extractelement for the externally used scalar, not
original extract instruction.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 10 +--
 .../X86/same-scalar-in-same-phi-extract.ll    | 75 +++++++++++++++++++
 2 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index b8b67609d755fd..5e0f5b7efadc40 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -12592,6 +12592,11 @@ Value *BoUpSLP::vectorizeTree(
           } else {
             Ex = Builder.CreateExtractElement(Vec, Lane);
           }
+          // If necessary, sign-extend or zero-extend ScalarRoot
+          // to the larger type.
+          if (Scalar->getType() != Ex->getType())
+            Ex = Builder.CreateIntCast(Ex, Scalar->getType(),
+                                       MinBWs.find(E)->second.second);
           if (auto *I = dyn_cast<Instruction>(Ex))
             ScalarToEEs[Scalar].try_emplace(Builder.GetInsertBlock(), I);
         }
@@ -12601,11 +12606,6 @@ Value *BoUpSLP::vectorizeTree(
           GatherShuffleExtractSeq.insert(ExI);
           CSEBlocks.insert(ExI->getParent());
         }
-        // If necessary, sign-extend or zero-extend ScalarRoot
-        // to the larger type.
-        if (Scalar->getType() != Ex->getType())
-          return Builder.CreateIntCast(Ex, Scalar->getType(),
-                                       MinBWs.find(E)->second.second);
         return Ex;
       }
       assert(isa<FixedVectorType>(Scalar->getType()) &&
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
new file mode 100644
index 00000000000000..35f2f9e052e749
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll
@@ -0,0 +1,75 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @test(i32 %arg) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> <i32 poison, i32 0>, i32 [[ARG]], i32 0
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    switch i32 0, label [[BB10:%.*]] [
+; CHECK-NEXT:      i32 0, label [[BB9:%.*]]
+; CHECK-NEXT:      i32 11, label [[BB9]]
+; CHECK-NEXT:      i32 1, label [[BB4:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT:    switch i32 0, label [[BB10]] [
+; CHECK-NEXT:      i32 18, label [[BB7:%.*]]
+; CHECK-NEXT:      i32 1, label [[BB7]]
+; CHECK-NEXT:      i32 0, label [[BB10]]
+; CHECK-NEXT:    ]
+; CHECK:       bb4:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi <2 x i32> [ [[TMP0]], [[BB2]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext <2 x i32> [[TMP3]] to <2 x i64>
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0
+; CHECK-NEXT:    [[GETELEMENTPTR:%.*]] = getelementptr i32, ptr null, i64 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i64> [[TMP4]], i32 1
+; CHECK-NEXT:    [[GETELEMENTPTR6:%.*]] = getelementptr i32, ptr null, i64 [[TMP6]]
+; CHECK-NEXT:    ret void
+; CHECK:       bb7:
+; CHECK-NEXT:    [[PHI8:%.*]] = phi i64 [ [[TMP2]], [[BB3:%.*]] ], [ [[TMP2]], [[BB3]] ]
+; CHECK-NEXT:    br label [[BB9]]
+; CHECK:       bb9:
+; CHECK-NEXT:    ret void
+; CHECK:       bb10:
+; CHECK-NEXT:    ret void
+;
+bb:
+  %zext = zext i32 %arg to i64
+  %zext1 = zext i32 0 to i64
+  br label %bb2
+
+bb2:
+  switch i32 0, label %bb10 [
+  i32 0, label %bb9
+  i32 11, label %bb9
+  i32 1, label %bb4
+  ]
+
+bb3:
+  switch i32 0, label %bb10 [
+  i32 18, label %bb7
+  i32 1, label %bb7
+  i32 0, label %bb10
+  ]
+
+bb4:
+  %phi = phi i64 [ %zext, %bb2 ]
+  %phi5 = phi i64 [ %zext1, %bb2 ]
+  %getelementptr = getelementptr i32, ptr null, i64 %phi
+  %getelementptr6 = getelementptr i32, ptr null, i64 %phi5
+  ret void
+
+bb7:
+  %phi8 = phi i64 [ %zext, %bb3 ], [ %zext, %bb3 ]
+  br label %bb9
+
+bb9:
+  ret void
+
+bb10:
+  ret void
+}

>From 134a13899ef65c435f8a4e3c2819e60eaf8a978c Mon Sep 17 00:00:00 2001
From: agozillon <Andrew.Gozillon at amd.com>
Date: Wed, 13 Mar 2024 16:18:21 +0100
Subject: [PATCH 45/63] [Flang][OpenMP] Implement "promotion" of use_device_ptr
 non-cptr arguments to use_device_addr (#82834)

This effectively implements some now deprecated OpenMP functionality
that some applications (most notably at the moment GenASiS)
unfortunately depend on (deprecated in specification version 5.2):

"If a list item in a use_device_ptr clause is not of type C_PTR, the
behavior is as if the list item appeared in a use_device_addr clause.
Support for such list items in a use_device_ptr clause is deprecated."

This PR downgrades the hard-error to a deprecated warning and "promotes"
the above cases by simply moving the offending operands from the
use_device_ptr value list to the back of the use_device_addr list (and
moves the related symbols, locs and types that form the BlockArgs
correspondingly) and then the generation of the target data construct
proceeds as normal.
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 66 +++++++++++++++++
 flang/lib/Semantics/check-omp-structure.cpp   |  2 +-
 .../use-device-ptr-to-use-device-addr.f90     | 72 +++++++++++++++++++
 .../test/Semantics/OpenMP/use_device_ptr1.f90 |  2 +-
 4 files changed, 140 insertions(+), 2 deletions(-)
 create mode 100644 flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 1016c8389c6e6a..25bb4d9cff5d16 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -798,6 +798,58 @@ genTaskGroupOp(Fortran::lower::AbstractConverter &converter,
       /*task_reductions=*/nullptr, allocateOperands, allocatorOperands);
 }
 
+// This helper function implements the functionality of "promoting"
+// non-CPTR arguments of use_device_ptr to use_device_addr
+// arguments (automagic conversion of use_device_ptr ->
+// use_device_addr in these cases). The way we do so currently is
+// through the shuffling of operands from the devicePtrOperands to
+// deviceAddrOperands where neccesary and re-organizing the types,
+// locations and symbols to maintain the correct ordering of ptr/addr
+// input -> BlockArg.
+//
+// This effectively implements some deprecated OpenMP functionality
+// that some legacy applications unfortunately depend on
+// (deprecated in specification version 5.2):
+//
+// "If a list item in a use_device_ptr clause is not of type C_PTR,
+//  the behavior is as if the list item appeared in a use_device_addr
+//  clause. Support for such list items in a use_device_ptr clause
+//  is deprecated."
+static void promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(
+    llvm::SmallVector<mlir::Value> &devicePtrOperands,
+    llvm::SmallVector<mlir::Value> &deviceAddrOperands,
+    llvm::SmallVector<mlir::Type> &useDeviceTypes,
+    llvm::SmallVector<mlir::Location> &useDeviceLocs,
+    llvm::SmallVector<const Fortran::semantics::Symbol *> &useDeviceSymbols) {
+  auto moveElementToBack = [](size_t idx, auto &vector) {
+    auto *iter = std::next(vector.begin(), idx);
+    vector.push_back(*iter);
+    vector.erase(iter);
+  };
+
+  // Iterate over our use_device_ptr list and shift all non-cptr arguments into
+  // use_device_addr.
+  for (auto *it = devicePtrOperands.begin(); it != devicePtrOperands.end();) {
+    if (!fir::isa_builtin_cptr_type(fir::unwrapRefType(it->getType()))) {
+      deviceAddrOperands.push_back(*it);
+      // We have to shuffle the symbols around as well, to maintain
+      // the correct Input -> BlockArg for use_device_ptr/use_device_addr.
+      // NOTE: However, as map's do not seem to be included currently
+      // this isn't as pertinent, but we must try to maintain for
+      // future alterations. I believe the reason they are not currently
+      // is that the BlockArg assign/lowering needs to be extended
+      // to a greater set of types.
+      auto idx = std::distance(devicePtrOperands.begin(), it);
+      moveElementToBack(idx, useDeviceTypes);
+      moveElementToBack(idx, useDeviceLocs);
+      moveElementToBack(idx, useDeviceSymbols);
+      it = devicePtrOperands.erase(it);
+      continue;
+    }
+    ++it;
+  }
+}
+
 static mlir::omp::DataOp
 genDataOp(Fortran::lower::AbstractConverter &converter,
           Fortran::semantics::SemanticsContext &semaCtx,
@@ -820,6 +872,20 @@ genDataOp(Fortran::lower::AbstractConverter &converter,
                          useDeviceSymbols);
   cp.processUseDeviceAddr(deviceAddrOperands, useDeviceTypes, useDeviceLocs,
                           useDeviceSymbols);
+  // This function implements the deprecated functionality of use_device_ptr
+  // that allows users to provide non-CPTR arguments to it with the caveat
+  // that the compiler will treat them as use_device_addr. A lot of legacy
+  // code may still depend on this functionality, so we should support it
+  // in some manner. We do so currently by simply shifting non-cptr operands
+  // from the use_device_ptr list into the front of the use_device_addr list
+  // whilst maintaining the ordering of useDeviceLocs, useDeviceSymbols and
+  // useDeviceTypes to use_device_ptr/use_device_addr input for BlockArg
+  // ordering.
+  // TODO: Perhaps create a user provideable compiler option that will
+  // re-introduce a hard-error rather than a warning in these cases.
+  promoteNonCPtrUseDevicePtrArgsToUseDeviceAddr(
+      devicePtrOperands, deviceAddrOperands, useDeviceTypes, useDeviceLocs,
+      useDeviceSymbols);
   cp.processMap(currentLocation, llvm::omp::Directive::OMPD_target_data,
                 stmtCtx, mapOperands);
 
diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 54101ab8a42bbf..bf4debee1df34c 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2948,7 +2948,7 @@ void OmpStructureChecker::Enter(const parser::OmpClause::UseDevicePtr &x) {
         if (name->symbol) {
           if (!(IsBuiltinCPtr(*(name->symbol)))) {
             context_.Say(itr->second->source,
-                "'%s' in USE_DEVICE_PTR clause must be of type C_PTR"_err_en_US,
+                "Use of non-C_PTR type '%s' in USE_DEVICE_PTR is deprecated, use USE_DEVICE_ADDR instead"_warn_en_US,
                 name->ToString());
           } else {
             useDevicePtrNameList.push_back(*name);
diff --git a/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90 b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
new file mode 100644
index 00000000000000..33b5971656010a
--- /dev/null
+++ b/flang/test/Lower/OpenMP/use-device-ptr-to-use-device-addr.f90
@@ -0,0 +1,72 @@
+!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s
+!RUN: bbc -emit-hlfir -fopenmp %s -o - | FileCheck %s
+
+! This tests primary goal is to check the promotion of 
+! non-CPTR arguments from use_device_ptr to 
+! use_device_addr works, without breaking any 
+! functionality 
+
+!CHECK: func.func @{{.*}}only_use_device_ptr()
+!CHECK: omp.target_data use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+subroutine only_use_device_ptr 
+    use iso_c_binding
+    integer, pointer, dimension(:) :: array
+    real, pointer :: pa(:)
+    type(c_ptr) :: cptr
+
+   !$omp target data use_device_ptr(pa, cptr, array)
+   !$omp end target data 
+end subroutine
+
+!CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr()
+!CHECK: omp.target_data use_device_ptr({{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
+subroutine mix_use_device_ptr_and_addr 
+    use iso_c_binding
+    integer, pointer, dimension(:) :: array
+    real, pointer :: pa(:)
+    type(c_ptr) :: cptr
+
+   !$omp target data use_device_ptr(pa, cptr) use_device_addr(array)
+   !$omp end target data 
+end subroutine
+
+!CHECK: func.func @{{.*}}only_use_device_addr()
+!CHECK: omp.target_data use_device_addr(%{{.*}}, %{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, %{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>):
+subroutine only_use_device_addr 
+    use iso_c_binding
+    integer, pointer, dimension(:) :: array
+    real, pointer :: pa(:)
+    type(c_ptr) :: cptr
+
+   !$omp target data use_device_addr(pa, cptr, array)
+   !$omp end target data 
+end subroutine
+
+!CHECK: func.func @{{.*}}mix_use_device_ptr_and_addr_and_map()
+!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}} : !fir.ref<i32>, !fir.ref<i32>) use_device_ptr(%{{.*}} : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>) use_device_addr(%{{.*}}, %{{.*}} : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>) {
+!CHECK: ^bb0(%{{.*}}: !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>, %{{.*}}: !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>):
+subroutine mix_use_device_ptr_and_addr_and_map
+    use iso_c_binding
+    integer :: i, j
+    integer, pointer, dimension(:) :: array
+    real, pointer :: pa(:)
+    type(c_ptr) :: cptr
+
+   !$omp target data use_device_ptr(pa, cptr) use_device_addr(array) map(tofrom: i, j)
+   !$omp end target data 
+end subroutine
+
+!CHECK: func.func @{{.*}}only_use_map()
+!CHECK: omp.target_data map_entries(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : !fir.llvm_ptr<!fir.ref<!fir.array<?xf32>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>, !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>>, !fir.llvm_ptr<!fir.ref<!fir.array<?xi32>>>, !fir.ref<!fir.box<!fir.ptr<!fir.array<?xi32>>>>) {
+subroutine only_use_map
+    use iso_c_binding
+    integer, pointer, dimension(:) :: array
+    real, pointer :: pa(:)
+    type(c_ptr) :: cptr
+
+   !$omp target data map(pa, cptr, array)
+   !$omp end target data 
+end subroutine
diff --git a/flang/test/Semantics/OpenMP/use_device_ptr1.f90 b/flang/test/Semantics/OpenMP/use_device_ptr1.f90
index af89698a5c5a99..176fb5f35a8490 100644
--- a/flang/test/Semantics/OpenMP/use_device_ptr1.f90
+++ b/flang/test/Semantics/OpenMP/use_device_ptr1.f90
@@ -27,7 +27,7 @@ subroutine omp_target_data
       a = arrayB
    !$omp end target data
 
-   !ERROR: 'a' in USE_DEVICE_PTR clause must be of type C_PTR
+   !WARNING: Use of non-C_PTR type 'a' in USE_DEVICE_PTR is deprecated, use USE_DEVICE_ADDR instead
    !$omp target data map(tofrom: a) use_device_ptr(a)
       a = 2
    !$omp end target data

>From 1446b82ac3ad46103649a83d40588df441be71bf Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 13 Mar 2024 08:23:10 -0700
Subject: [PATCH 46/63] [RFC][mlir] Add profitability callback to the Inliner.
 (#84258)

Discussion at https://discourse.llvm.org/t/inliner-cost-model/2992

This change adds a callback that reports whether inlining
of the particular call site (communicated via ResolvedCall argument)
is profitable or not. The default MLIR inliner pass behavior
is unchanged, i.e. the callback always returns true.
This callback may be used to customize the inliner behavior
based on the target specifics (like target instructions costs),
profitability of the inlining for further optimizations
(e.g. if inlining may enable loop optimizations or scalar optimizations
due to object shape propagation), optimization levels (e.g. -Os inlining
may be quite different from -Ofast inlining), etc.

One of the questions is whether the ResolvedCall entity represents
enough of the context for the custom inlining models to come up with
the profitability decision. I think we can start with this and
extend it as necessary.

---------

Co-authored-by: Mehdi Amini <joker.eph at gmail.com>
---
 mlir/include/mlir/Transforms/Inliner.h        | 43 +++++++++++--------
 mlir/include/mlir/Transforms/Passes.td        |  7 +++
 mlir/lib/Transforms/InlinerPass.cpp           | 38 +++++++++++++++-
 mlir/lib/Transforms/Utils/Inliner.cpp         |  3 ++
 .../inlining-dump-default-pipeline.mlir       |  2 +-
 mlir/test/Transforms/inlining-threshold.mlir  | 18 ++++++++
 6 files changed, 92 insertions(+), 19 deletions(-)
 create mode 100644 mlir/test/Transforms/inlining-threshold.mlir

diff --git a/mlir/include/mlir/Transforms/Inliner.h b/mlir/include/mlir/Transforms/Inliner.h
index 1fe61fb4bbe7d9..073b83f6f844c5 100644
--- a/mlir/include/mlir/Transforms/Inliner.h
+++ b/mlir/include/mlir/Transforms/Inliner.h
@@ -69,19 +69,6 @@ class InlinerConfig {
 /// of inlining decisions from the leafs to the roots of the callgraph.
 class Inliner {
 public:
-  using RunPipelineHelperTy = std::function<LogicalResult(
-      Pass &pass, OpPassManager &pipeline, Operation *op)>;
-
-  Inliner(Operation *op, CallGraph &cg, Pass &pass, AnalysisManager am,
-          RunPipelineHelperTy runPipelineHelper, const InlinerConfig &config)
-      : op(op), cg(cg), pass(pass), am(am),
-        runPipelineHelper(std::move(runPipelineHelper)), config(config) {}
-  Inliner(Inliner &) = delete;
-  void operator=(const Inliner &) = delete;
-
-  /// Perform inlining on a OpTrait::SymbolTable operation.
-  LogicalResult doInlining();
-
   /// This struct represents a resolved call to a given callgraph node. Given
   /// that the call does not actually contain a direct reference to the
   /// Region(CallGraphNode) that it is dispatching to, we need to resolve them
@@ -94,7 +81,29 @@ class Inliner {
     CallGraphNode *sourceNode, *targetNode;
   };
 
-protected:
+  using RunPipelineHelperTy = std::function<LogicalResult(
+      Pass &pass, OpPassManager &pipeline, Operation *op)>;
+
+  /// Type of the callback answering if it is profitable
+  /// to inline a callable operation at a call site.
+  /// It might be the case that the ResolvedCall does not provide
+  /// enough context to make the profitability decision, so
+  /// this hook's interface might need to be extended in future.
+  using ProfitabilityCallbackTy = std::function<bool(const ResolvedCall &)>;
+
+  Inliner(Operation *op, CallGraph &cg, Pass &pass, AnalysisManager am,
+          RunPipelineHelperTy runPipelineHelper, const InlinerConfig &config,
+          ProfitabilityCallbackTy isProfitableToInline)
+      : op(op), cg(cg), pass(pass), am(am),
+        runPipelineHelper(std::move(runPipelineHelper)), config(config),
+        isProfitableToInline(std::move(isProfitableToInline)) {}
+  Inliner(Inliner &) = delete;
+  void operator=(const Inliner &) = delete;
+
+  /// Perform inlining on a OpTrait::SymbolTable operation.
+  LogicalResult doInlining();
+
+private:
   /// An OpTrait::SymbolTable operation to run the inlining on.
   Operation *op;
   /// A CallGraph analysis for the given operation.
@@ -108,12 +117,12 @@ class Inliner {
   const RunPipelineHelperTy runPipelineHelper;
   /// The inliner configuration parameters.
   const InlinerConfig &config;
+  /// Returns true, if it is profitable to inline the callable operation
+  /// at the call site.
+  ProfitabilityCallbackTy isProfitableToInline;
 
-private:
   /// Forward declaration of the class providing the actual implementation.
   class Impl;
-
-public:
 };
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Transforms/Passes.td b/mlir/include/mlir/Transforms/Passes.td
index b8fdf7a580476e..51b2a27da639d6 100644
--- a/mlir/include/mlir/Transforms/Passes.td
+++ b/mlir/include/mlir/Transforms/Passes.td
@@ -278,6 +278,13 @@ def Inliner : Pass<"inline"> {
     Option<"maxInliningIterations", "max-iterations", "unsigned",
            /*default=*/"4",
            "Maximum number of iterations when inlining within an SCC">,
+    Option<"inliningThreshold", "inlining-threshold", "unsigned",
+           /*default=*/"-1U",
+           "If the ratio between the number of the operations "
+           "in the callee and the number of the operations "
+           "in the caller exceeds this value (in percentage), "
+           "then the callee is not inlined even if it is legal "
+           "to inline it">,
   ];
 }
 
diff --git a/mlir/lib/Transforms/InlinerPass.cpp b/mlir/lib/Transforms/InlinerPass.cpp
index c058e8050cd199..08d8dbf73a6a1d 100644
--- a/mlir/lib/Transforms/InlinerPass.cpp
+++ b/mlir/lib/Transforms/InlinerPass.cpp
@@ -24,6 +24,8 @@ namespace mlir {
 #include "mlir/Transforms/Passes.h.inc"
 } // namespace mlir
 
+#define DEBUG_TYPE "inliner-pass"
+
 using namespace mlir;
 
 /// This function implements the inliner optimization pipeline.
@@ -88,6 +90,35 @@ InlinerPass::InlinerPass(std::function<void(OpPassManager &)> defaultPipeline,
   config.setOpPipelines(std::move(opPipelines));
 }
 
+// Return true if the inlining ratio does not exceed the threshold.
+static bool isProfitableToInline(const Inliner::ResolvedCall &resolvedCall,
+                                 unsigned inliningThreshold) {
+  Region *callerRegion = resolvedCall.sourceNode->getCallableRegion();
+  Region *calleeRegion = resolvedCall.targetNode->getCallableRegion();
+
+  // We should not get external nodes here, but just return true
+  // for now to preserve the original behavior of the inliner pass.
+  if (!calleeRegion || !calleeRegion)
+    return true;
+
+  auto countOps = [](Region *region) {
+    unsigned count = 0;
+    region->walk([&](Operation *) { ++count; });
+    return count;
+  };
+
+  unsigned callerOps = countOps(callerRegion);
+
+  // Always inline empty callees (if it is possible at all).
+  if (callerOps == 0)
+    return true;
+
+  unsigned ratio = countOps(calleeRegion) * 100 / callerOps;
+  LLVM_DEBUG(llvm::dbgs() << "Callee / caller operation ratio (max: "
+                          << inliningThreshold << "%): " << ratio << "%\n");
+  return ratio <= inliningThreshold;
+}
+
 void InlinerPass::runOnOperation() {
   CallGraph &cg = getAnalysis<CallGraph>();
 
@@ -100,9 +131,14 @@ void InlinerPass::runOnOperation() {
     return signalPassFailure();
   }
 
+  // By default, assume that any inlining is profitable.
+  auto profitabilityCb = [=](const Inliner::ResolvedCall &call) {
+    return isProfitableToInline(call, inliningThreshold);
+  };
+
   // Get an instance of the inliner.
   Inliner inliner(op, cg, *this, getAnalysisManager(), runPipelineHelper,
-                  config);
+                  config, profitabilityCb);
 
   // Run the inlining.
   if (failed(inliner.doInlining()))
diff --git a/mlir/lib/Transforms/Utils/Inliner.cpp b/mlir/lib/Transforms/Utils/Inliner.cpp
index f227cedb269d81..8acfc96d2b611b 100644
--- a/mlir/lib/Transforms/Utils/Inliner.cpp
+++ b/mlir/lib/Transforms/Utils/Inliner.cpp
@@ -741,6 +741,9 @@ bool Inliner::Impl::shouldInline(ResolvedCall &resolvedCall) {
   if (calleeHasMultipleBlocks && !callerRegionSupportsMultipleBlocks())
     return false;
 
+  if (!inliner.isProfitableToInline(resolvedCall))
+    return false;
+
   // Otherwise, inline.
   return true;
 }
diff --git a/mlir/test/Transforms/inlining-dump-default-pipeline.mlir b/mlir/test/Transforms/inlining-dump-default-pipeline.mlir
index e2c31867a8e045..4f8638054206e8 100644
--- a/mlir/test/Transforms/inlining-dump-default-pipeline.mlir
+++ b/mlir/test/Transforms/inlining-dump-default-pipeline.mlir
@@ -1,2 +1,2 @@
 // RUN: mlir-opt %s -pass-pipeline="builtin.module(inline)" -dump-pass-pipeline 2>&1 | FileCheck %s
-// CHECK: builtin.module(inline{default-pipeline=canonicalize max-iterations=4 })
+// CHECK: builtin.module(inline{default-pipeline=canonicalize inlining-threshold=4294967295 max-iterations=4 })
diff --git a/mlir/test/Transforms/inlining-threshold.mlir b/mlir/test/Transforms/inlining-threshold.mlir
new file mode 100644
index 00000000000000..b94115d8f26416
--- /dev/null
+++ b/mlir/test/Transforms/inlining-threshold.mlir
@@ -0,0 +1,18 @@
+// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline='' inlining-threshold=100' -debug-only=inliner-pass 2>&1 | FileCheck %s
+
+// Check that inlining does not happen when the threshold is exceeded.
+func.func @callee1(%arg : i32) -> i32 {
+  %v1 = arith.addi %arg, %arg : i32
+  %v2 = arith.addi %v1, %arg : i32
+  %v3 = arith.addi %v2, %arg : i32
+  return %v3 : i32
+}
+
+// CHECK-LABEL: func @caller1
+func.func @caller1(%arg0 : i32) -> i32 {
+  // CHECK-NEXT: call @callee1
+  // CHECK-NEXT: return
+
+  %0 = call @callee1(%arg0) : (i32) -> i32
+  return %0 : i32
+}

>From 9753ac4b51989596c299e38433b14fc3be5a8450 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 13 Mar 2024 08:26:33 -0700
Subject: [PATCH 47/63] [flang] Moved REAL(16) RANDOM_NUMBER to Float128Math
 library. (#85002)

---
 .../Optimizer/Builder/Runtime/Intrinsics.cpp  | 29 ++++++-
 flang/runtime/Float128Math/CMakeLists.txt     |  1 +
 flang/runtime/Float128Math/random.cpp         | 23 +++++
 flang/runtime/random-templates.h              | 87 +++++++++++++++++++
 flang/runtime/random.cpp                      | 81 ++---------------
 .../Lower/Intrinsics/random_number_real16.f90 | 16 ++++
 6 files changed, 160 insertions(+), 77 deletions(-)
 create mode 100644 flang/runtime/Float128Math/random.cpp
 create mode 100644 flang/runtime/random-templates.h
 create mode 100644 flang/test/Lower/Intrinsics/random_number_real16.f90

diff --git a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
index 638bfd60a246a6..57c47da0f3f85c 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Intrinsics.cpp
@@ -27,6 +27,24 @@
 
 using namespace Fortran::runtime;
 
+namespace {
+/// Placeholder for real*16 version of RandomNumber Intrinsic
+struct ForcedRandomNumberReal16 {
+  static constexpr const char *name = ExpandAndQuoteKey(RTNAME(RandomNumber16));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto strTy = fir::runtime::getModel<const char *>()(ctx);
+      auto intTy = fir::runtime::getModel<int>()(ctx);
+      ;
+      return mlir::FunctionType::get(ctx, {boxTy, strTy, intTy},
+                                     mlir::NoneType::get(ctx));
+    };
+  }
+};
+} // namespace
+
 mlir::Value fir::runtime::genAssociated(fir::FirOpBuilder &builder,
                                         mlir::Location loc, mlir::Value pointer,
                                         mlir::Value target) {
@@ -100,8 +118,15 @@ void fir::runtime::genRandomInit(fir::FirOpBuilder &builder, mlir::Location loc,
 
 void fir::runtime::genRandomNumber(fir::FirOpBuilder &builder,
                                    mlir::Location loc, mlir::Value harvest) {
-  mlir::func::FuncOp func =
-      fir::runtime::getRuntimeFunc<mkRTKey(RandomNumber)>(loc, builder);
+  mlir::func::FuncOp func;
+  auto boxEleTy = fir::dyn_cast_ptrOrBoxEleTy(harvest.getType());
+  auto eleTy = fir::unwrapSequenceType(boxEleTy);
+  if (eleTy.isF128()) {
+    func = fir::runtime::getRuntimeFunc<ForcedRandomNumberReal16>(loc, builder);
+  } else {
+    func = fir::runtime::getRuntimeFunc<mkRTKey(RandomNumber)>(loc, builder);
+  }
+
   mlir::FunctionType funcTy = func.getFunctionType();
   mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
   mlir::Value sourceLine =
diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
index 980356131b680e..33f73a9c54451b 100644
--- a/flang/runtime/Float128Math/CMakeLists.txt
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -48,6 +48,7 @@ set(sources
   nearest.cpp
   norm2.cpp
   pow.cpp
+  random.cpp
   round.cpp
   rrspacing.cpp
   scale.cpp
diff --git a/flang/runtime/Float128Math/random.cpp b/flang/runtime/Float128Math/random.cpp
new file mode 100644
index 00000000000000..cda962b416144e
--- /dev/null
+++ b/flang/runtime/Float128Math/random.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/random.cpp -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+#include "numeric-template-specs.h"
+#include "random-templates.h"
+
+using namespace Fortran::runtime::random;
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+void RTDEF(RandomNumber16)(
+    const Descriptor &harvest, const char *source, int line) {
+  return Generate<CppTypeFor<TypeCategory::Real, 16>, 113>(harvest);
+}
+#endif
+
+} // extern "C"
diff --git a/flang/runtime/random-templates.h b/flang/runtime/random-templates.h
new file mode 100644
index 00000000000000..ce64a94901a281
--- /dev/null
+++ b/flang/runtime/random-templates.h
@@ -0,0 +1,87 @@
+//===-- runtime/random-templates.h ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef FORTRAN_RUNTIME_RANDOM_TEMPLATES_H_
+#define FORTRAN_RUNTIME_RANDOM_TEMPLATES_H_
+
+#include "lock.h"
+#include "numeric-templates.h"
+#include "flang/Runtime/descriptor.h"
+#include <algorithm>
+#include <random>
+
+namespace Fortran::runtime::random {
+
+// Newer "Minimum standard", recommended by Park, Miller, and Stockmeyer in
+// 1993. Same as C++17 std::minstd_rand, but explicitly instantiated for
+// permanence.
+using Generator =
+    std::linear_congruential_engine<std::uint_fast32_t, 48271, 0, 2147483647>;
+
+using GeneratedWord = typename Generator::result_type;
+static constexpr std::uint64_t range{
+    static_cast<std::uint64_t>(Generator::max() - Generator::min() + 1)};
+static constexpr bool rangeIsPowerOfTwo{(range & (range - 1)) == 0};
+static constexpr int rangeBits{
+    64 - common::LeadingZeroBitCount(range) - !rangeIsPowerOfTwo};
+
+extern Lock lock;
+extern Generator generator;
+extern std::optional<GeneratedWord> nextValue;
+
+// Call only with lock held
+static GeneratedWord GetNextValue() {
+  GeneratedWord result;
+  if (nextValue.has_value()) {
+    result = *nextValue;
+    nextValue.reset();
+  } else {
+    result = generator();
+  }
+  return result;
+}
+
+template <typename REAL, int PREC>
+inline void Generate(const Descriptor &harvest) {
+  static constexpr std::size_t minBits{
+      std::max<std::size_t>(PREC, 8 * sizeof(GeneratedWord))};
+  using Int = common::HostUnsignedIntType<minBits>;
+  static constexpr std::size_t words{
+      static_cast<std::size_t>(PREC + rangeBits - 1) / rangeBits};
+  std::size_t elements{harvest.Elements()};
+  SubscriptValue at[maxRank];
+  harvest.GetLowerBounds(at);
+  {
+    CriticalSection critical{lock};
+    for (std::size_t j{0}; j < elements; ++j) {
+      while (true) {
+        Int fraction{GetNextValue()};
+        if constexpr (words > 1) {
+          for (std::size_t k{1}; k < words; ++k) {
+            static constexpr auto rangeMask{
+                (GeneratedWord{1} << rangeBits) - 1};
+            GeneratedWord word{(GetNextValue() - generator.min()) & rangeMask};
+            fraction = (fraction << rangeBits) | word;
+          }
+        }
+        fraction >>= words * rangeBits - PREC;
+        REAL next{
+            LDEXPTy<REAL>::compute(static_cast<REAL>(fraction), -(PREC + 1))};
+        if (next >= 0.0 && next < 1.0) {
+          *harvest.Element<REAL>(at) = next;
+          break;
+        }
+      }
+      harvest.IncrementSubscripts(at);
+    }
+  }
+}
+
+} // namespace Fortran::runtime::random
+
+#endif // FORTRAN_RUNTIME_RANDOM_TEMPLATES_H_
diff --git a/flang/runtime/random.cpp b/flang/runtime/random.cpp
index 642091a06aff55..13bed1f0abe10c 100644
--- a/flang/runtime/random.cpp
+++ b/flang/runtime/random.cpp
@@ -11,85 +11,24 @@
 
 #include "flang/Runtime/random.h"
 #include "lock.h"
+#include "random-templates.h"
 #include "terminator.h"
 #include "flang/Common/float128.h"
 #include "flang/Common/leading-zero-bit-count.h"
 #include "flang/Common/uint128.h"
 #include "flang/Runtime/cpp-type.h"
 #include "flang/Runtime/descriptor.h"
-#include <algorithm>
 #include <cmath>
 #include <cstdint>
 #include <limits>
 #include <memory>
-#include <random>
 #include <time.h>
 
-namespace Fortran::runtime {
+namespace Fortran::runtime::random {
 
-// Newer "Minimum standard", recommended by Park, Miller, and Stockmeyer in
-// 1993. Same as C++17 std::minstd_rand, but explicitly instantiated for
-// permanence.
-using Generator =
-    std::linear_congruential_engine<std::uint_fast32_t, 48271, 0, 2147483647>;
-
-using GeneratedWord = typename Generator::result_type;
-static constexpr std::uint64_t range{
-    static_cast<std::uint64_t>(Generator::max() - Generator::min() + 1)};
-static constexpr bool rangeIsPowerOfTwo{(range & (range - 1)) == 0};
-static constexpr int rangeBits{
-    64 - common::LeadingZeroBitCount(range) - !rangeIsPowerOfTwo};
-
-static Lock lock;
-static Generator generator;
-static std::optional<GeneratedWord> nextValue;
-
-// Call only with lock held
-static GeneratedWord GetNextValue() {
-  GeneratedWord result;
-  if (nextValue.has_value()) {
-    result = *nextValue;
-    nextValue.reset();
-  } else {
-    result = generator();
-  }
-  return result;
-}
-
-template <typename REAL, int PREC>
-inline void Generate(const Descriptor &harvest) {
-  static constexpr std::size_t minBits{
-      std::max<std::size_t>(PREC, 8 * sizeof(GeneratedWord))};
-  using Int = common::HostUnsignedIntType<minBits>;
-  static constexpr std::size_t words{
-      static_cast<std::size_t>(PREC + rangeBits - 1) / rangeBits};
-  std::size_t elements{harvest.Elements()};
-  SubscriptValue at[maxRank];
-  harvest.GetLowerBounds(at);
-  {
-    CriticalSection critical{lock};
-    for (std::size_t j{0}; j < elements; ++j) {
-      while (true) {
-        Int fraction{GetNextValue()};
-        if constexpr (words > 1) {
-          for (std::size_t k{1}; k < words; ++k) {
-            static constexpr auto rangeMask{
-                (GeneratedWord{1} << rangeBits) - 1};
-            GeneratedWord word{(GetNextValue() - generator.min()) & rangeMask};
-            fraction = (fraction << rangeBits) | word;
-          }
-        }
-        fraction >>= words * rangeBits - PREC;
-        REAL next{std::ldexp(static_cast<REAL>(fraction), -(PREC + 1))};
-        if (next >= 0.0 && next < 1.0) {
-          *harvest.Element<REAL>(at) = next;
-          break;
-        }
-      }
-      harvest.IncrementSubscripts(at);
-    }
-  }
-}
+Lock lock;
+Generator generator;
+std::optional<GeneratedWord> nextValue;
 
 extern "C" {
 
@@ -130,14 +69,6 @@ void RTNAME(RandomNumber)(
 #if LDBL_MANT_DIG == 64
       Generate<CppTypeFor<TypeCategory::Real, 10>, 64>(harvest);
       return;
-#endif
-    }
-    break;
-  case 16:
-    if constexpr (HasCppTypeFor<TypeCategory::Real, 16>) {
-#if LDBL_MANT_DIG == 113
-      Generate<CppTypeFor<TypeCategory::Real, 16>, 113>(harvest);
-      return;
 #endif
     }
     break;
@@ -263,4 +194,4 @@ void RTNAME(RandomSeed)(const Descriptor *size, const Descriptor *put,
 }
 
 } // extern "C"
-} // namespace Fortran::runtime
+} // namespace Fortran::runtime::random
diff --git a/flang/test/Lower/Intrinsics/random_number_real16.f90 b/flang/test/Lower/Intrinsics/random_number_real16.f90
new file mode 100644
index 00000000000000..76fed258d8afc8
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/random_number_real16.f90
@@ -0,0 +1,16 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK-LABEL: func @_QPtest_scalar
+! CHECK: fir.call @_FortranARandomNumber16({{.*}}){{.*}}: (!fir.box<none>, !fir.ref<i8>, i32) -> none
+subroutine test_scalar
+  real(16) :: r
+  call random_number(r)
+end
+
+! CHECK-LABEL: func @_QPtest_array
+! CHECK: fir.call @_FortranARandomNumber16({{.*}}){{.*}}: (!fir.box<none>, !fir.ref<i8>, i32) -> none
+subroutine test_array(r)
+  real(16) :: r(:)
+  call random_number(r)
+end

>From 07c57acb42caa45b3bf8e068a2fb0cc25b62aa2a Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 13 Mar 2024 08:26:49 -0700
Subject: [PATCH 48/63] [flang] Enable REAL(16) MODULO lowering. (#85005)

The lowering currently relies on the trivial operations,
so we should just lower it for REAL(16) the same way we do this
for other trivial operations.
---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |  5 +-
 flang/test/Lower/Intrinsics/modulo.f90        | 82 +++++++++++--------
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ca5ab6fcea342a..21d253624a1ac9 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -5208,6 +5208,8 @@ mlir::Value IntrinsicLibrary::genMod(mlir::Type resultType,
 // MODULO
 mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType,
                                         llvm::ArrayRef<mlir::Value> args) {
+  // TODO: we'd better generate a runtime call here, when runtime error
+  // checking is needed (to detect 0 divisor) or when precise math is requested.
   assert(args.size() == 2);
   // No floored modulo op in LLVM/MLIR yet. TODO: add one to MLIR.
   // In the meantime, use a simple inlined implementation based on truncated
@@ -5233,10 +5235,7 @@ mlir::Value IntrinsicLibrary::genModulo(mlir::Type resultType,
     return builder.create<mlir::arith::SelectOp>(loc, mustAddP, remPlusP,
                                                  remainder);
   }
-  // Real case
-  if (resultType == mlir::FloatType::getF128(builder.getContext()))
 
-    TODO(loc, "REAL(KIND=16): in MODULO intrinsic");
   auto remainder = builder.create<mlir::arith::RemFOp>(loc, args[0], args[1]);
   mlir::Value zero = builder.createRealZeroConstant(loc, remainder.getType());
   auto remainderIsNotZero = builder.create<mlir::arith::CmpFOp>(
diff --git a/flang/test/Lower/Intrinsics/modulo.f90 b/flang/test/Lower/Intrinsics/modulo.f90
index 64a6607a09ccd5..001e307aa077a3 100644
--- a/flang/test/Lower/Intrinsics/modulo.f90
+++ b/flang/test/Lower/Intrinsics/modulo.f90
@@ -3,36 +3,54 @@
 ! CHECK-LABEL: func @_QPmodulo_testr(
 ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f64>{{.*}}, %[[arg1:.*]]: !fir.ref<f64>{{.*}}, %[[arg2:.*]]: !fir.ref<f64>{{.*}}) {
 subroutine modulo_testr(r, a, p)
-    real(8) :: r, a, p
-    ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<f64>
-    ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<f64>
-    ! CHECK-DAG: %[[rem:.*]] = arith.remf %[[a]], %[[p]] {{.*}}: f64
-    ! CHECK-DAG: %[[zero:.*]] = arith.constant 0.000000e+00 : f64
-    ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpf une, %[[rem]], %[[zero]] {{.*}} : f64
-    ! CHECK-DAG: %[[aNeg:.*]] = arith.cmpf olt, %[[a]], %[[zero]] {{.*}} : f64
-    ! CHECK-DAG: %[[pNeg:.*]] = arith.cmpf olt, %[[p]], %[[zero]] {{.*}} : f64
-    ! CHECK-DAG: %[[signDifferent:.*]] = arith.xori %[[aNeg]], %[[pNeg]] : i1
-    ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
-    ! CHECK-DAG: %[[remPlusP:.*]] = arith.addf %[[rem]], %[[p]] {{.*}}: f64
-    ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : f64
-    ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<f64>
-    r = modulo(a, p)
-  end subroutine
-  
-  ! CHECK-LABEL: func @_QPmodulo_testi(
-  ! CHECK-SAME: %[[arg0:.*]]: !fir.ref<i64>{{.*}}, %[[arg1:.*]]: !fir.ref<i64>{{.*}}, %[[arg2:.*]]: !fir.ref<i64>{{.*}}) {
-  subroutine modulo_testi(r, a, p)
-    integer(8) :: r, a, p
-    ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<i64>
-    ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<i64>
-    ! CHECK-DAG: %[[rem:.*]] = arith.remsi %[[a]], %[[p]] : i64
-    ! CHECK-DAG: %[[argXor:.*]] = arith.xori %[[a]], %[[p]] : i64
-    ! CHECK-DAG: %[[signDifferent:.*]] = arith.cmpi slt, %[[argXor]], %c0{{.*}} : i64
-    ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpi ne, %[[rem]], %c0{{.*}} : i64
-    ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
-    ! CHECK-DAG: %[[remPlusP:.*]] = arith.addi %[[rem]], %[[p]] : i64
-    ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : i64
-    ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<i64>
-    r = modulo(a, p)
-  end subroutine
+  real(8) :: r, a, p
+  ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<f64>
+  ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<f64>
+  ! CHECK-DAG: %[[rem:.*]] = arith.remf %[[a]], %[[p]] {{.*}}: f64
+  ! CHECK-DAG: %[[zero:.*]] = arith.constant 0.000000e+00 : f64
+  ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpf une, %[[rem]], %[[zero]] {{.*}} : f64
+  ! CHECK-DAG: %[[aNeg:.*]] = arith.cmpf olt, %[[a]], %[[zero]] {{.*}} : f64
+  ! CHECK-DAG: %[[pNeg:.*]] = arith.cmpf olt, %[[p]], %[[zero]] {{.*}} : f64
+  ! CHECK-DAG: %[[signDifferent:.*]] = arith.xori %[[aNeg]], %[[pNeg]] : i1
+  ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
+  ! CHECK-DAG: %[[remPlusP:.*]] = arith.addf %[[rem]], %[[p]] {{.*}}: f64
+  ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : f64
+  ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<f64>
+  r = modulo(a, p)
+end subroutine
   
+! CHECK-LABEL: func @_QPmodulo_testi(
+! CHECK-SAME: %[[arg0:.*]]: !fir.ref<i64>{{.*}}, %[[arg1:.*]]: !fir.ref<i64>{{.*}}, %[[arg2:.*]]: !fir.ref<i64>{{.*}}) {
+subroutine modulo_testi(r, a, p)
+  integer(8) :: r, a, p
+  ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<i64>
+  ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<i64>
+  ! CHECK-DAG: %[[rem:.*]] = arith.remsi %[[a]], %[[p]] : i64
+  ! CHECK-DAG: %[[argXor:.*]] = arith.xori %[[a]], %[[p]] : i64
+  ! CHECK-DAG: %[[signDifferent:.*]] = arith.cmpi slt, %[[argXor]], %c0{{.*}} : i64
+  ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpi ne, %[[rem]], %c0{{.*}} : i64
+  ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
+  ! CHECK-DAG: %[[remPlusP:.*]] = arith.addi %[[rem]], %[[p]] : i64
+  ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : i64
+  ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<i64>
+  r = modulo(a, p)
+end subroutine
+
+! CHECK-LABEL: func @_QPmodulo_testr16(
+! CHECK-SAME: %[[arg0:.*]]: !fir.ref<f128>{{.*}}, %[[arg1:.*]]: !fir.ref<f128>{{.*}}, %[[arg2:.*]]: !fir.ref<f128>{{.*}}) {
+subroutine modulo_testr16(r, a, p)
+  real(16) :: r, a, p
+  ! CHECK-DAG: %[[a:.*]] = fir.load %[[arg1]] : !fir.ref<f128>
+  ! CHECK-DAG: %[[p:.*]] = fir.load %[[arg2]] : !fir.ref<f128>
+  ! CHECK-DAG: %[[rem:.*]] = arith.remf %[[a]], %[[p]] {{.*}}: f128
+  ! CHECK-DAG: %[[zero:.*]] = arith.constant 0.000000e+00 : f128
+  ! CHECK-DAG: %[[remNotZero:.*]] = arith.cmpf une, %[[rem]], %[[zero]] {{.*}} : f128
+  ! CHECK-DAG: %[[aNeg:.*]] = arith.cmpf olt, %[[a]], %[[zero]] {{.*}} : f128
+  ! CHECK-DAG: %[[pNeg:.*]] = arith.cmpf olt, %[[p]], %[[zero]] {{.*}} : f128
+  ! CHECK-DAG: %[[signDifferent:.*]] = arith.xori %[[aNeg]], %[[pNeg]] : i1
+  ! CHECK-DAG: %[[mustAddP:.*]] = arith.andi %[[remNotZero]], %[[signDifferent]] : i1
+  ! CHECK-DAG: %[[remPlusP:.*]] = arith.addf %[[rem]], %[[p]] {{.*}}: f128
+  ! CHECK: %[[res:.*]] = arith.select %[[mustAddP]], %[[remPlusP]], %[[rem]] : f128
+  ! CHECK: fir.store %[[res]] to %[[arg0]] : !fir.ref<f128>
+  r = modulo(a, p)
+end subroutine

>From 6a097cadacaa3a78508ecbf586de9473d4505f66 Mon Sep 17 00:00:00 2001
From: Slava Zakharin <szakharin at nvidia.com>
Date: Wed, 13 Mar 2024 08:27:15 -0700
Subject: [PATCH 49/63] [flang][runtime] Added lowering and runtime for
 REAL(16) IEEE_FMA. (#85017)

---
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |  4 ++++
 flang/runtime/Float128Math/CMakeLists.txt     |  1 +
 flang/runtime/Float128Math/fma.cpp            | 23 +++++++++++++++++++
 flang/runtime/Float128Math/math-entries.h     |  3 +++
 flang/test/Lower/Intrinsics/fma_real16.f90    |  9 ++++++++
 5 files changed, 40 insertions(+)
 create mode 100644 flang/runtime/Float128Math/fma.cpp
 create mode 100644 flang/test/Lower/Intrinsics/fma_real16.f90

diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 21d253624a1ac9..94fcfa3503114e 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -922,6 +922,8 @@ mlir::Value genComplexMathOp(fir::FirOpBuilder &builder, mlir::Location loc,
 constexpr auto FuncTypeReal16Real16 = genFuncType<Ty::Real<16>, Ty::Real<16>>;
 constexpr auto FuncTypeReal16Real16Real16 =
     genFuncType<Ty::Real<16>, Ty::Real<16>, Ty::Real<16>>;
+constexpr auto FuncTypeReal16Real16Real16Real16 =
+    genFuncType<Ty::Real<16>, Ty::Real<16>, Ty::Real<16>, Ty::Real<16>>;
 constexpr auto FuncTypeReal16Integer4Real16 =
     genFuncType<Ty::Real<16>, Ty::Integer<4>, Ty::Real<16>>;
 constexpr auto FuncTypeInteger4Real16 =
@@ -1143,6 +1145,8 @@ static constexpr MathOperation mathOperations[] = {
     {"fma", "llvm.fma.f64",
      genFuncType<Ty::Real<8>, Ty::Real<8>, Ty::Real<8>, Ty::Real<8>>,
      genMathOp<mlir::math::FmaOp>},
+    {"fma", RTNAME_STRING(FmaF128), FuncTypeReal16Real16Real16Real16,
+     genLibF128Call},
     {"gamma", "tgammaf", genFuncType<Ty::Real<4>, Ty::Real<4>>, genLibCall},
     {"gamma", "tgamma", genFuncType<Ty::Real<8>, Ty::Real<8>>, genLibCall},
     {"gamma", RTNAME_STRING(TgammaF128), FuncTypeReal16Real16, genLibF128Call},
diff --git a/flang/runtime/Float128Math/CMakeLists.txt b/flang/runtime/Float128Math/CMakeLists.txt
index 33f73a9c54451b..a5f5bec1e7e4b8 100644
--- a/flang/runtime/Float128Math/CMakeLists.txt
+++ b/flang/runtime/Float128Math/CMakeLists.txt
@@ -33,6 +33,7 @@ set(sources
   exp.cpp
   exponent.cpp
   floor.cpp
+  fma.cpp
   fraction.cpp
   hypot.cpp
   j0.cpp
diff --git a/flang/runtime/Float128Math/fma.cpp b/flang/runtime/Float128Math/fma.cpp
new file mode 100644
index 00000000000000..ec67e8e6fba22b
--- /dev/null
+++ b/flang/runtime/Float128Math/fma.cpp
@@ -0,0 +1,23 @@
+//===-- runtime/Float128Math/fma.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "math-entries.h"
+
+namespace Fortran::runtime {
+extern "C" {
+
+#if LDBL_MANT_DIG == 113 || HAS_FLOAT128
+CppTypeFor<TypeCategory::Real, 16> RTDEF(FmaF128)(
+    CppTypeFor<TypeCategory::Real, 16> x, CppTypeFor<TypeCategory::Real, 16> y,
+    CppTypeFor<TypeCategory::Real, 16> z) {
+  return Fma<true>::invoke(x, y, z);
+}
+#endif
+
+} // extern "C"
+} // namespace Fortran::runtime
diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h
index 1eab7c86f2ed76..13fdab26470038 100644
--- a/flang/runtime/Float128Math/math-entries.h
+++ b/flang/runtime/Float128Math/math-entries.h
@@ -77,6 +77,7 @@ DEFINE_FALLBACK_F128(Erf)
 DEFINE_FALLBACK_F128(Erfc)
 DEFINE_FALLBACK_F128(Exp)
 DEFINE_FALLBACK_F128(Floor)
+DEFINE_FALLBACK_F128(Fma)
 DEFINE_FALLBACK_F128(Frexp)
 DEFINE_FALLBACK_F128(Hypot)
 DEFINE_FALLBACK_I32(Ilogb)
@@ -124,6 +125,7 @@ DEFINE_SIMPLE_ALIAS(Erf, erfq)
 DEFINE_SIMPLE_ALIAS(Erfc, erfcq)
 DEFINE_SIMPLE_ALIAS(Exp, expq)
 DEFINE_SIMPLE_ALIAS(Floor, floorq)
+DEFINE_SIMPLE_ALIAS(Fma, fmaq)
 DEFINE_SIMPLE_ALIAS(Frexp, frexpq)
 DEFINE_SIMPLE_ALIAS(Hypot, hypotq)
 DEFINE_SIMPLE_ALIAS(Ilogb, ilogbq)
@@ -177,6 +179,7 @@ DEFINE_SIMPLE_ALIAS(Erf, std::erf)
 DEFINE_SIMPLE_ALIAS(Erfc, std::erfc)
 DEFINE_SIMPLE_ALIAS(Exp, std::exp)
 DEFINE_SIMPLE_ALIAS(Floor, std::floor)
+DEFINE_SIMPLE_ALIAS(Fma, std::fma)
 DEFINE_SIMPLE_ALIAS(Frexp, std::frexp)
 DEFINE_SIMPLE_ALIAS(Hypot, std::hypot)
 DEFINE_SIMPLE_ALIAS(Ilogb, std::ilogb)
diff --git a/flang/test/Lower/Intrinsics/fma_real16.f90 b/flang/test/Lower/Intrinsics/fma_real16.f90
new file mode 100644
index 00000000000000..62cf2fbcefbf19
--- /dev/null
+++ b/flang/test/Lower/Intrinsics/fma_real16.f90
@@ -0,0 +1,9 @@
+! RUN: bbc -emit-fir %s -o - | FileCheck %s
+! RUN: bbc --math-runtime=precise -emit-fir %s -o - | FileCheck %s
+! RUN: %flang_fc1 -emit-fir %s -o - | FileCheck %s
+
+! CHECK: fir.call @_FortranAFmaF128({{.*}}){{.*}}: (f128, f128, f128) -> f128
+  use ieee_arithmetic, only: ieee_fma
+  real(16) :: x, y, z
+  x = ieee_fma(x, y, z)
+end

>From c358a23d1a0f0096386e5536fafcada1bc755e7c Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nickdesaulniers at users.noreply.github.com>
Date: Wed, 13 Mar 2024 08:36:49 -0700
Subject: [PATCH 50/63] [libc] roll out rest of stdbit.h entrypoints to
 gpu,linux,baremetal (#84938)

---
 libc/config/gpu/entrypoints.txt           | 30 +++++++++++++++++++++++
 libc/config/linux/aarch64/entrypoints.txt | 30 +++++++++++++++++++++++
 libc/config/linux/arm/entrypoints.txt     | 30 +++++++++++++++++++++++
 libc/config/linux/riscv/entrypoints.txt   | 30 +++++++++++++++++++++++
 4 files changed, 120 insertions(+)

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index fca5315fc4f0a8..4fb87cb9f5a33e 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -106,6 +106,36 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_first_trailing_one_ui
     libc.src.stdbit.stdc_first_trailing_one_ul
     libc.src.stdbit.stdc_first_trailing_one_ull
+    libc.src.stdbit.stdc_count_zeros_uc
+    libc.src.stdbit.stdc_count_zeros_us
+    libc.src.stdbit.stdc_count_zeros_ui
+    libc.src.stdbit.stdc_count_zeros_ul
+    libc.src.stdbit.stdc_count_zeros_ull
+    libc.src.stdbit.stdc_count_ones_uc
+    libc.src.stdbit.stdc_count_ones_us
+    libc.src.stdbit.stdc_count_ones_ui
+    libc.src.stdbit.stdc_count_ones_ul
+    libc.src.stdbit.stdc_count_ones_ull
+    libc.src.stdbit.stdc_has_single_bit_uc
+    libc.src.stdbit.stdc_has_single_bit_us
+    libc.src.stdbit.stdc_has_single_bit_ui
+    libc.src.stdbit.stdc_has_single_bit_ul
+    libc.src.stdbit.stdc_has_single_bit_ull
+    libc.src.stdbit.stdc_bit_width_uc
+    libc.src.stdbit.stdc_bit_width_us
+    libc.src.stdbit.stdc_bit_width_ui
+    libc.src.stdbit.stdc_bit_width_ul
+    libc.src.stdbit.stdc_bit_width_ull
+    libc.src.stdbit.stdc_bit_floor_uc
+    libc.src.stdbit.stdc_bit_floor_us
+    libc.src.stdbit.stdc_bit_floor_ui
+    libc.src.stdbit.stdc_bit_floor_ul
+    libc.src.stdbit.stdc_bit_floor_ull
+    libc.src.stdbit.stdc_bit_ceil_uc
+    libc.src.stdbit.stdc_bit_ceil_us
+    libc.src.stdbit.stdc_bit_ceil_ui
+    libc.src.stdbit.stdc_bit_ceil_ul
+    libc.src.stdbit.stdc_bit_ceil_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index abd1f83794ed0b..7d69099e3cb9db 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -131,6 +131,36 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_first_trailing_one_ui
     libc.src.stdbit.stdc_first_trailing_one_ul
     libc.src.stdbit.stdc_first_trailing_one_ull
+    libc.src.stdbit.stdc_count_zeros_uc
+    libc.src.stdbit.stdc_count_zeros_us
+    libc.src.stdbit.stdc_count_zeros_ui
+    libc.src.stdbit.stdc_count_zeros_ul
+    libc.src.stdbit.stdc_count_zeros_ull
+    libc.src.stdbit.stdc_count_ones_uc
+    libc.src.stdbit.stdc_count_ones_us
+    libc.src.stdbit.stdc_count_ones_ui
+    libc.src.stdbit.stdc_count_ones_ul
+    libc.src.stdbit.stdc_count_ones_ull
+    libc.src.stdbit.stdc_has_single_bit_uc
+    libc.src.stdbit.stdc_has_single_bit_us
+    libc.src.stdbit.stdc_has_single_bit_ui
+    libc.src.stdbit.stdc_has_single_bit_ul
+    libc.src.stdbit.stdc_has_single_bit_ull
+    libc.src.stdbit.stdc_bit_width_uc
+    libc.src.stdbit.stdc_bit_width_us
+    libc.src.stdbit.stdc_bit_width_ui
+    libc.src.stdbit.stdc_bit_width_ul
+    libc.src.stdbit.stdc_bit_width_ull
+    libc.src.stdbit.stdc_bit_floor_uc
+    libc.src.stdbit.stdc_bit_floor_us
+    libc.src.stdbit.stdc_bit_floor_ui
+    libc.src.stdbit.stdc_bit_floor_ul
+    libc.src.stdbit.stdc_bit_floor_ull
+    libc.src.stdbit.stdc_bit_ceil_uc
+    libc.src.stdbit.stdc_bit_ceil_us
+    libc.src.stdbit.stdc_bit_ceil_ui
+    libc.src.stdbit.stdc_bit_ceil_ul
+    libc.src.stdbit.stdc_bit_ceil_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 2fca96c5601b08..bf1559b2f02369 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -108,6 +108,36 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_first_trailing_one_ui
     libc.src.stdbit.stdc_first_trailing_one_ul
     libc.src.stdbit.stdc_first_trailing_one_ull
+    libc.src.stdbit.stdc_count_zeros_uc
+    libc.src.stdbit.stdc_count_zeros_us
+    libc.src.stdbit.stdc_count_zeros_ui
+    libc.src.stdbit.stdc_count_zeros_ul
+    libc.src.stdbit.stdc_count_zeros_ull
+    libc.src.stdbit.stdc_count_ones_uc
+    libc.src.stdbit.stdc_count_ones_us
+    libc.src.stdbit.stdc_count_ones_ui
+    libc.src.stdbit.stdc_count_ones_ul
+    libc.src.stdbit.stdc_count_ones_ull
+    libc.src.stdbit.stdc_has_single_bit_uc
+    libc.src.stdbit.stdc_has_single_bit_us
+    libc.src.stdbit.stdc_has_single_bit_ui
+    libc.src.stdbit.stdc_has_single_bit_ul
+    libc.src.stdbit.stdc_has_single_bit_ull
+    libc.src.stdbit.stdc_bit_width_uc
+    libc.src.stdbit.stdc_bit_width_us
+    libc.src.stdbit.stdc_bit_width_ui
+    libc.src.stdbit.stdc_bit_width_ul
+    libc.src.stdbit.stdc_bit_width_ull
+    libc.src.stdbit.stdc_bit_floor_uc
+    libc.src.stdbit.stdc_bit_floor_us
+    libc.src.stdbit.stdc_bit_floor_ui
+    libc.src.stdbit.stdc_bit_floor_ul
+    libc.src.stdbit.stdc_bit_floor_ull
+    libc.src.stdbit.stdc_bit_ceil_uc
+    libc.src.stdbit.stdc_bit_ceil_us
+    libc.src.stdbit.stdc_bit_ceil_ui
+    libc.src.stdbit.stdc_bit_ceil_ul
+    libc.src.stdbit.stdc_bit_ceil_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 006aa787ea6afd..b1c9dd0428eea5 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -132,6 +132,36 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.stdbit.stdc_first_trailing_one_ui
     libc.src.stdbit.stdc_first_trailing_one_ul
     libc.src.stdbit.stdc_first_trailing_one_ull
+    libc.src.stdbit.stdc_count_zeros_uc
+    libc.src.stdbit.stdc_count_zeros_us
+    libc.src.stdbit.stdc_count_zeros_ui
+    libc.src.stdbit.stdc_count_zeros_ul
+    libc.src.stdbit.stdc_count_zeros_ull
+    libc.src.stdbit.stdc_count_ones_uc
+    libc.src.stdbit.stdc_count_ones_us
+    libc.src.stdbit.stdc_count_ones_ui
+    libc.src.stdbit.stdc_count_ones_ul
+    libc.src.stdbit.stdc_count_ones_ull
+    libc.src.stdbit.stdc_has_single_bit_uc
+    libc.src.stdbit.stdc_has_single_bit_us
+    libc.src.stdbit.stdc_has_single_bit_ui
+    libc.src.stdbit.stdc_has_single_bit_ul
+    libc.src.stdbit.stdc_has_single_bit_ull
+    libc.src.stdbit.stdc_bit_width_uc
+    libc.src.stdbit.stdc_bit_width_us
+    libc.src.stdbit.stdc_bit_width_ui
+    libc.src.stdbit.stdc_bit_width_ul
+    libc.src.stdbit.stdc_bit_width_ull
+    libc.src.stdbit.stdc_bit_floor_uc
+    libc.src.stdbit.stdc_bit_floor_us
+    libc.src.stdbit.stdc_bit_floor_ui
+    libc.src.stdbit.stdc_bit_floor_ul
+    libc.src.stdbit.stdc_bit_floor_ull
+    libc.src.stdbit.stdc_bit_ceil_uc
+    libc.src.stdbit.stdc_bit_ceil_us
+    libc.src.stdbit.stdc_bit_ceil_ui
+    libc.src.stdbit.stdc_bit_ceil_ul
+    libc.src.stdbit.stdc_bit_ceil_ull
 
     # stdlib.h entrypoints
     libc.src.stdlib.abs

>From d4b919bbaf559cfaf54e6a1a0a13c5d7a7d09379 Mon Sep 17 00:00:00 2001
From: Han-Chung Wang <hanhan0912 at gmail.com>
Date: Wed, 13 Mar 2024 08:52:05 -0700
Subject: [PATCH 51/63] [mlir][tensor] Make getMixedPadImpl return static
 values when possible. (#85016)

If low and high are constants (i.e., not attributes), users still prefer
attributes. Otherwise, there could be failures in type inference. A
failure is introduced by
https://github.com/llvm/llvm-project/commit/60e562d11aeca8020de8d50ded7f0ba9e10e8843,
see the drop_known_unit_constant_low_high test for more details.
---
 .../mlir/Dialect/Tensor/IR/TensorOps.td       |  2 +-
 .../TensorToLinalg/tensor-ops-to-linalg.mlir  |  3 +--
 .../Dialect/Linalg/drop-unit-extent-dims.mlir | 20 +++++++++++++++++++
 .../Dialect/Linalg/generalize-pad-tensor.mlir |  3 +--
 4 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index 670202fe4372e6..cf7f3e89079c1c 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1364,7 +1364,7 @@ def Tensor_PadOp : Tensor_Op<"pad", [
       unsigned count = staticAttrs.size();
       for (unsigned idx = 0; idx < count; ++idx) {
         if (ShapedType::isDynamic(staticAttrs[idx]))
-          res.push_back(values[numDynamic++]);
+          res.push_back(getAsOpFoldResult(values[numDynamic++]));
         else
           res.push_back(builder.getI64IntegerAttr(staticAttrs[idx]));
       }
diff --git a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir
index 238c0c51312a6b..a0a676edceb745 100644
--- a/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir
+++ b/mlir/test/Conversion/TensorToLinalg/tensor-ops-to-linalg.mlir
@@ -22,7 +22,6 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK-LABEL:   func @generalize_pad_tensor_dynamic_shape(
 // CHECK-SAME:                                              %[[IN:.*]]: tensor<4x?x2x?xf32>,
 // CHECK-SAME:                                              %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> {
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
 // CHECK:           %[[DIM1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32>
@@ -33,7 +32,7 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK:           %[[OUT_DIM3:.*]] = arith.addi %[[DIM3]], %[[OFFSET]] : index
 // CHECK:           %[[INIT:.*]] = tensor.empty(%[[DIM1]], %[[OUT_DIM2]], %[[OUT_DIM3]]) : tensor<4x?x?x?xf32>
 // CHECK:           %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32>
-// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
+// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1]], 2, %[[DIM3]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
 // CHECK:           return %[[PADDED]] : tensor<4x?x?x?xf32>
 // CHECK:         }
 func.func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> {
diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
index f2c490b832076f..c140b6abcc37a2 100644
--- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
+++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir
@@ -1033,3 +1033,23 @@ func.func @do_not_drop_non_constant_padding(%arg0: tensor<1x1x3x1x1xf32>, %pad:
 // CHECK-SLICES-LABEL: func @do_not_drop_non_constant_padding
 //       CHECK-SLICES:   tensor.pad %{{.*}} low[0, 1, 0, %c0, 0] high[0, 0, 0, %c0, 2]
 //       CHECK-SLICES:   } : tensor<1x1x3x1x1xf32> to tensor<1x2x3x1x3xf32>
+
+// -----
+
+func.func @drop_known_unit_constant_low_high(%arg0: tensor<1x383x128xf32>) -> tensor<1x384x128xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant 0.000000e+00 : f32
+  %padded = tensor.pad %arg0 low[%c0, %c1, %c0] high[%c0, %c0, %c0] {
+  ^bb0(%arg1: index, %arg2: index, %arg3: index):
+    tensor.yield %cst : f32
+  } : tensor<1x383x128xf32> to tensor<1x384x128xf32>
+  return %padded : tensor<1x384x128xf32>
+}
+// CHECK-LABEL: func @drop_known_unit_constant_low_high
+//       CHECK:   %[[COLLAPSE:.+]] = tensor.collapse_shape
+//  CHECK-SAME:     {{\[}}[0, 1], [2]] : tensor<1x383x128xf32> into tensor<383x128xf32>
+//       CHECK:   %[[PADDED:.+]] = tensor.pad %[[COLLAPSE]] low[1, 0] high[0, 0]
+//       CHECK:   } : tensor<383x128xf32> to tensor<384x128xf32>
+//       CHECK:   tensor.expand_shape %[[PADDED]]
+//  CHECK-SAME:     {{\[}}[0, 1], [2]] : tensor<384x128xf32> into tensor<1x384x128xf32>
diff --git a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
index ac0eb48fb37940..2beab31b613d54 100644
--- a/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
+++ b/mlir/test/Dialect/Linalg/generalize-pad-tensor.mlir
@@ -19,7 +19,6 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK-LABEL:   func @generalize_pad_tensor_dynamic_shape(
 // CHECK-SAME:                                              %[[IN:.*]]: tensor<4x?x2x?xf32>,
 // CHECK-SAME:                                              %[[OFFSET:.*]]: index) -> tensor<4x?x?x?xf32> {
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[CST:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:       %[[C2:.*]] = arith.constant 2 : index
 // CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
@@ -32,7 +31,7 @@ func.func @generalize_pad_tensor_static_shape(%arg0: tensor<1x28x28x1xf32>) -> t
 // CHECK:           %[[FILL:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[INIT]] : tensor<4x?x?x?xf32>) -> tensor<4x?x?x?xf32>
 // CHECK:           %[[DIM1_1:.*]] = tensor.dim %[[IN]], %[[C1]] : tensor<4x?x2x?xf32>
 // CHECK:           %[[DIM3_1:.*]] = tensor.dim %[[IN]], %[[C3]] : tensor<4x?x2x?xf32>
-// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]]{{\[}}%[[C0]], %[[C0]], %[[OFFSET]], %[[C0]]] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
+// CHECK:           %[[PADDED:.*]] = tensor.insert_slice %[[IN]] into %[[FILL]][0, 0, %[[OFFSET]], 0] [4, %[[DIM1_1]], 2, %[[DIM3_1]]] [1, 1, 1, 1] : tensor<4x?x2x?xf32> into tensor<4x?x?x?xf32>
 // CHECK:           return %[[PADDED]] : tensor<4x?x?x?xf32>
 // CHECK:         }
 func.func @generalize_pad_tensor_dynamic_shape(%arg0: tensor<4x?x2x?xf32>, %arg1: index) -> tensor<4x?x?x?xf32> {

>From 1578b576b535e490a3ab592783e61009d51d0f77 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <adrian-prantl at users.noreply.github.com>
Date: Wed, 13 Mar 2024 08:53:13 -0700
Subject: [PATCH 52/63] Avoid a potential exit(1) in LLVMContext::diagnose() 
 (#84992)

by handling *all* errors in IRExecDiagnosticHandler. The function that
call this handles all unhandled errors with an `exit(1)`.

rdar://124459751

I don't really have a testcase for this, since the crash report I got
for this involved the Swift language plugin.
---
 lldb/source/Expression/IRExecutionUnit.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Expression/IRExecutionUnit.cpp b/lldb/source/Expression/IRExecutionUnit.cpp
index e4e131d70d4319..cb9bee8733e15d 100644
--- a/lldb/source/Expression/IRExecutionUnit.cpp
+++ b/lldb/source/Expression/IRExecutionUnit.cpp
@@ -212,18 +212,17 @@ struct IRExecDiagnosticHandler : public llvm::DiagnosticHandler {
   Status *err;
   IRExecDiagnosticHandler(Status *err) : err(err) {}
   bool handleDiagnostics(const llvm::DiagnosticInfo &DI) override {
-    if (DI.getKind() == llvm::DK_SrcMgr) {
+    if (DI.getSeverity() == llvm::DS_Error) {
       const auto &DISM = llvm::cast<llvm::DiagnosticInfoSrcMgr>(DI);
       if (err && err->Success()) {
         err->SetErrorToGenericError();
         err->SetErrorStringWithFormat(
-            "Inline assembly error: %s",
+            "IRExecution error: %s",
             DISM.getSMDiag().getMessage().str().c_str());
       }
-      return true;
     }
 
-    return false;
+    return true;
   }
 };
 } // namespace

>From a23a13b5a3acbbfbd276090f0380da11eb13576d Mon Sep 17 00:00:00 2001
From: Zaara Syeda <syzaara at ca.ibm.com>
Date: Wed, 13 Mar 2024 11:57:07 -0400
Subject: [PATCH 53/63] [PowerPC][NFC] Rename ADDItocL to match the 64-bit
 naming convention (#85099)

In preparation of adding a similar instruction for large code model on
AIX for 32-bit, rename the exisitng ADDItocL 64-instruction to ADDItocL8
to match the naming convention of other instructions with 32-bit and
64-bit variants.
---
 .../Target/PowerPC/GISel/PPCInstructionSelector.cpp  |  4 ++--
 llvm/lib/Target/PowerPC/P10InstrResources.td         |  2 +-
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp            |  6 +++---
 llvm/lib/Target/PowerPC/PPCBack2BackFusion.def       |  4 ++--
 llvm/lib/Target/PowerPC/PPCFastISel.cpp              | 10 ++++++----
 llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp          | 10 +++++-----
 llvm/lib/Target/PowerPC/PPCInstr64Bit.td             |  4 ++--
 llvm/lib/Target/PowerPC/PPCInstrInfo.cpp             | 12 ++++++------
 llvm/lib/Target/PowerPC/PPCMacroFusion.def           |  6 +++---
 llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp            |  3 +--
 .../CodeGen/PowerPC/remove-copy-crunsetcrbit.mir     |  2 +-
 11 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
index 3fd7a1ad9efa59..98cd3a82a6e05d 100644
--- a/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
+++ b/llvm/lib/Target/PowerPC/GISel/PPCInstructionSelector.cpp
@@ -695,8 +695,8 @@ bool PPCInstructionSelector::selectConstantPool(
                .addReg(HaAddrReg)
                .addMemOperand(MMO);
     else
-      // For medium code model, generate ADDItocL(CPI, ADDIStocHA8(X2, CPI))
-      MI = BuildMI(MBB, I, DbgLoc, TII.get(PPC::ADDItocL), DstReg)
+      // For medium code model, generate ADDItocL8(CPI, ADDIStocHA8(X2, CPI))
+      MI = BuildMI(MBB, I, DbgLoc, TII.get(PPC::ADDItocL8), DstReg)
                .addReg(HaAddrReg)
                .addConstantPoolIndex(CPI);
   }
diff --git a/llvm/lib/Target/PowerPC/P10InstrResources.td b/llvm/lib/Target/PowerPC/P10InstrResources.td
index 3bbc5a63ca7abe..5015ba887d0b65 100644
--- a/llvm/lib/Target/PowerPC/P10InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P10InstrResources.td
@@ -881,7 +881,7 @@ def : InstRW<[P10W_FX_3C, P10W_DISP_ANY],
 // 3 Cycles ALU operations, 1 input operands
 def : InstRW<[P10W_FX_3C, P10W_DISP_ANY, P10FX_Read],
       (instrs
-    ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL, LI, LI8,
+    ADDI, ADDI8, ADDIdtprelL32, ADDItlsldLADDR32, ADDItocL8, LI, LI8,
     ADDIC, ADDIC8,
     ADDIS, ADDIS8, ADDISdtprelHA32, ADDIStocHA, ADDIStocHA8, LIS, LIS8,
     ADDME, ADDME8,
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 6f33b16f045a87..542854ec9b99fd 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -1236,8 +1236,8 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     EmitToStreamer(*OutStreamer, TmpInst);
     return;
   }
-  case PPC::ADDItocL: {
-    // Transform %xd = ADDItocL %xs, @sym
+  case PPC::ADDItocL8: {
+    // Transform %xd = ADDItocL8 %xs, @sym
     LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
 
     // Change the opcode to ADDI8. If the global address is external, then
@@ -1246,7 +1246,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
     TmpInst.setOpcode(PPC::ADDI8);
 
     const MachineOperand &MO = MI->getOperand(2);
-    assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL.");
+    assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL8.");
 
     LLVM_DEBUG(assert(
         !(MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal())) &&
diff --git a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
index 8bbe315a2bb9a7..6bb66bcc6c2112 100644
--- a/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCBack2BackFusion.def
@@ -29,7 +29,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     ADDIStocHA8,
     ADDIdtprelL32,
     ADDItlsldLADDR32,
-    ADDItocL,
+    ADDItocL8,
     ADDME,
     ADDME8,
     ADDME8O,
@@ -518,7 +518,7 @@ FUSION_FEATURE(GeneralBack2Back, hasBack2BackFusion, -1,
     ADDIStocHA8,
     ADDIdtprelL32,
     ADDItlsldLADDR32,
-    ADDItocL,
+    ADDItocL8,
     ADDME,
     ADDME8,
     ADDME8O,
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index 56af80f9cedee8..6e31cdae84767a 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -2094,7 +2094,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
     // for large code model, we generate:
     //       LDtocL(GV, ADDIStocHA8(%x2, GV))
     // Otherwise we generate:
-    //       ADDItocL(ADDIStocHA8(%x2, GV), GV)
+    //       ADDItocL8(ADDIStocHA8(%x2, GV), GV)
     // Either way, start with the ADDIStocHA8:
     Register HighPartReg = createResultReg(RC);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::ADDIStocHA8),
@@ -2104,9 +2104,11 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::LDtocL),
               DestReg).addGlobalAddress(GV).addReg(HighPartReg);
     } else {
-      // Otherwise generate the ADDItocL.
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::ADDItocL),
-              DestReg).addReg(HighPartReg).addGlobalAddress(GV);
+      // Otherwise generate the ADDItocL8.
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(PPC::ADDItocL8),
+              DestReg)
+          .addReg(HighPartReg)
+          .addGlobalAddress(GV);
     }
   }
 
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 2462cbb1928235..0c25accd1d6ce6 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -6134,7 +6134,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     // [64-bit ELF/AIX]
     //   LDtocL(@sym, ADDIStocHA8(%x2, @sym))
     // Otherwise we generate:
-    //   ADDItocL(ADDIStocHA8(%x2, @sym), @sym)
+    //   ADDItocL8(ADDIStocHA8(%x2, @sym), @sym)
     SDValue GA = N->getOperand(0);
     SDValue TOCbase = N->getOperand(1);
 
@@ -6154,7 +6154,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
     }
 
     // Build the address relative to the TOC-pointer.
-    ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64,
+    ReplaceNode(N, CurDAG->getMachineNode(PPC::ADDItocL8, dl, MVT::i64,
                                           SDValue(Tmp, 0), GA));
     return;
   }
@@ -7707,7 +7707,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
     // target flags on the immediate operand when we fold it into the
     // load instruction.
     //
-    // For something like ADDItocL, the relocation information is
+    // For something like ADDItocL8, the relocation information is
     // inferred from the opcode; when we process it in the AsmPrinter,
     // we add the necessary relocation there.  A load, though, can receive
     // relocation from various flavors of ADDIxxx, so we need to carry
@@ -7728,7 +7728,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
     case PPC::ADDItlsldL:
       Flags = PPCII::MO_TLSLD_LO;
       break;
-    case PPC::ADDItocL:
+    case PPC::ADDItocL8:
       Flags = PPCII::MO_TOC_LO;
       break;
     }
@@ -7755,7 +7755,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
         // If we have a addi(toc at l)/addis(toc at ha) pair, and the addis has only
         // one use, then we can do this for any offset, we just need to also
         // update the offset (i.e. the symbol addend) on the addis also.
-        if (Base.getMachineOpcode() != PPC::ADDItocL)
+        if (Base.getMachineOpcode() != PPC::ADDItocL8)
           continue;
 
         if (!HBase.isMachineOpcode() ||
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 2949d58ab66479..a9359794a641c9 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1480,8 +1480,8 @@ let hasSideEffects = 0 in {
 let isReMaterializable = 1 in {
 def ADDIStocHA8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
                        "#ADDIStocHA8", []>, isPPC64;
-def ADDItocL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
-                     "#ADDItocL", []>, isPPC64;
+def ADDItocL8: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, tocentry:$disp),
+                     "#ADDItocL8", []>, isPPC64;
 }
 
 // Local Data Transform
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 5d37e929f8755e..5f5eb31a5a85fa 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -1077,7 +1077,7 @@ bool PPCInstrInfo::isReallyTriviallyReMaterializable(
   case PPC::LIS8:
   case PPC::ADDIStocHA:
   case PPC::ADDIStocHA8:
-  case PPC::ADDItocL:
+  case PPC::ADDItocL8:
   case PPC::LOAD_STACK_GUARD:
   case PPC::PPCLdFixedAddr:
   case PPC::XXLXORz:
@@ -3453,7 +3453,7 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
             break;
           case PPC::LI:
           case PPC::LI8:
-          case PPC::ADDItocL:
+          case PPC::ADDItocL8:
           case PPC::ADDI:
           case PPC::ADDI8:
             OpNoForForwarding = i;
@@ -4420,7 +4420,7 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
                                                MachineOperand *&ImmMO,
                                                MachineOperand *&RegMO) const {
   unsigned Opc = DefMI.getOpcode();
-  if (Opc != PPC::ADDItocL && Opc != PPC::ADDI && Opc != PPC::ADDI8)
+  if (Opc != PPC::ADDItocL8 && Opc != PPC::ADDI && Opc != PPC::ADDI8)
     return false;
 
   assert(DefMI.getNumOperands() >= 3 &&
@@ -4485,8 +4485,8 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
                                              int64_t &Imm,
                                              int64_t BaseImm) const {
   assert(isAnImmediateOperand(ImmMO) && "ImmMO is NOT an immediate");
-  if (DefMI.getOpcode() == PPC::ADDItocL) {
-    // The operand for ADDItocL is CPI, which isn't imm at compiling time,
+  if (DefMI.getOpcode() == PPC::ADDItocL8) {
+    // The operand for ADDItocL8 is CPI, which isn't imm at compiling time,
     // However, we know that, it is 16-bit width, and has the alignment of 4.
     // Check if the instruction met the requirement.
     if (III.ImmMustBeMultipleOf > 4 ||
@@ -4899,7 +4899,7 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(
     // register with ImmMO.
     // Before that, we need to fixup the target flags for imm.
     // For some reason, we miss to set the flag for the ImmMO if it is CPI.
-    if (DefMI.getOpcode() == PPC::ADDItocL)
+    if (DefMI.getOpcode() == PPC::ADDItocL8)
       ImmMO->setTargetFlags(PPCII::MO_TOC_LO);
 
     // MI didn't have the interface such as MI.setOperand(i) though
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
index 6b8ad22639c80f..fb6e656edb8b9b 100644
--- a/llvm/lib/Target/PowerPC/PPCMacroFusion.def
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
@@ -32,7 +32,7 @@
 // {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx,
 // lvewx, lvx, lxsdx}
 FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \
-               FUSION_OP_SET(ADDI, ADDI8, ADDItocL), \
+               FUSION_OP_SET(ADDI, ADDI8, ADDItocL8), \
                FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \
                              LVX, LXSDX))
 
@@ -135,11 +135,11 @@ FUSION_FEATURE(XorisXori, hasWideImmFusion, 1, FUSION_OP_SET(XORIS, XORIS8),
 // addis rx,ra,si - addi rt,rx,SI, SI >= 0
 FUSION_FEATURE(AddisAddi, hasWideImmFusion, 1,
                FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8),
-               FUSION_OP_SET(ADDI, ADDI8, ADDItocL))
+               FUSION_OP_SET(ADDI, ADDI8, ADDItocL8))
 
 // addi rx,ra,si - addis rt,rx,SI, ra > 0, SI >= 2
 FUSION_FEATURE(AddiAddis, hasWideImmFusion, 1,
-               FUSION_OP_SET(ADDI, ADDI8, ADDItocL),
+               FUSION_OP_SET(ADDI, ADDI8, ADDItocL8),
                FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8))
 
 // mtctr - { bcctr,bcctrl }
diff --git a/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp b/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
index 81f078ab246e6a..0527991b58ba81 100644
--- a/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTOCRegDeps.cpp
@@ -94,8 +94,7 @@ namespace {
 
 protected:
     bool hasTOCLoReloc(const MachineInstr &MI) {
-      if (MI.getOpcode() == PPC::LDtocL ||
-          MI.getOpcode() == PPC::ADDItocL ||
+      if (MI.getOpcode() == PPC::LDtocL || MI.getOpcode() == PPC::ADDItocL8 ||
           MI.getOpcode() == PPC::LWZtocL)
         return true;
 
diff --git a/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir b/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir
index 3a312d2f4a8bac..f3ef95bbb79a47 100644
--- a/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir
+++ b/llvm/test/CodeGen/PowerPC/remove-copy-crunsetcrbit.mir
@@ -130,7 +130,7 @@ body:             |
     %22:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @c
     %10:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @e
     %13:g8rc_and_g8rc_nox0 = ADDIStocHA8 $x2, @a
-    %14:g8rc_and_g8rc_nox0 = ADDItocL killed %13, @a, implicit $x2
+    %14:g8rc_and_g8rc_nox0 = ADDItocL8 killed %13, @a, implicit $x2
 
   bb.2.while.body:
     successors: %bb.4(0x30000000), %bb.3(0x50000000)

>From 1576cdb1d14734c9d211e27226f804f9f216bad0 Mon Sep 17 00:00:00 2001
From: David Blaikie <dblaikie at gmail.com>
Date: Wed, 13 Mar 2024 15:55:18 +0000
Subject: [PATCH 54/63] Remove use of reference lifetime extension introduced
 in cdde0d9

Rather than dealing with which is more readable, the named variable
doesn't seem to add value here - so omit it.
---
 clang/lib/AST/Interp/Interp.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index bb220657c2dadc..db80e2d59753f8 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -846,8 +846,7 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) {
       CmpInfo->getValueInfo(CmpInfo->makeWeakResult(CmpResult));
   assert(CmpValueInfo);
   assert(CmpValueInfo->hasValidIntValue());
-  const APSInt &IntValue = CmpValueInfo->getIntValue();
-  return SetThreeWayComparisonField(S, OpPC, P, IntValue);
+  return SetThreeWayComparisonField(S, OpPC, P, CmpValueInfo->getIntValue());
 }
 
 template <PrimType Name, class T = typename PrimConv<Name>::T>

>From eadc54d609cf8e3094ba46bd81b49b6dbd85bf58 Mon Sep 17 00:00:00 2001
From: Usman Nadeem <mnadeem at quicinc.com>
Date: Wed, 13 Mar 2024 09:05:55 -0700
Subject: [PATCH 55/63] [AArch64] Improve lowering of truncating uzp1 (#82457)

There were two existing patterns:
    `concat_vectors(trunc(x), trunc(y)) -> uzp1(x, y)`
`concat_vectors(assertzext(trunc(x)), assertzext(trunc(y))) -> uzp1(x,
y)`

Move them into a class and add the following `assertsext` pattern to it:
`concat_vectors(assertsext(trunc(x)), assertsext(trunc(y))) -> uzp1(x,
y)`

Add the following transform for v8i8 and v4i16 result types to help with
pattern matching:
  `truncating uzp1(x, y) -> trunc(concat(x, y))`
And a pattern to go with it:
  `trunc(concat_vectors(x, y)) -> uzp1 (x, y)`

Add another isel pattern for v8i8 and v4i16 result vector types, similar
to
the existing concat pattern, but with a trunc node in the begining:
`trunc(concat_vectors(assertext_trunc(x), assertext_trunc(y))) ->
xtn(uzp1(x, y))`
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  39 +--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  53 ++--
 .../CodeGen/AArch64/arm64-convert-v4f64.ll    |  21 +-
 llvm/test/CodeGen/AArch64/extbinopload.ll     |  31 ++-
 .../CodeGen/AArch64/fp-conversion-to-tbl.ll   |   5 +-
 llvm/test/CodeGen/AArch64/fptoi.ll            | 256 ++++++------------
 llvm/test/CodeGen/AArch64/neon-truncstore.ll  |   5 +-
 llvm/test/CodeGen/AArch64/sadd_sat_vec.ll     |   2 +-
 llvm/test/CodeGen/AArch64/shuffle-tbl34.ll    |  14 +-
 llvm/test/CodeGen/AArch64/ssub_sat_vec.ll     |   2 +-
 llvm/test/CodeGen/AArch64/tbl-loops.ll        |   4 +-
 llvm/test/CodeGen/AArch64/trunc-to-tbl.ll     |  28 +-
 llvm/test/CodeGen/AArch64/uadd_sat_vec.ll     |   2 +-
 llvm/test/CodeGen/AArch64/usub_sat_vec.ll     |   2 +-
 llvm/test/CodeGen/AArch64/vcvt-oversize.ll    |   5 +-
 .../vec-combine-compare-truncate-store.ll     |   2 +-
 .../AArch64/vec3-loads-ext-trunc-stores.ll    |  22 +-
 17 files changed, 209 insertions(+), 284 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5b7a36d2eba76f..9665ae5ceb903f 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -21423,12 +21423,8 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
     }
   }
 
-  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
-  // Only implemented on little-endian subtargets.
-  bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
-
-  // This optimization only works on little endian.
-  if (!IsLittleEndian)
+  // These optimizations only work on little endian.
+  if (!DAG.getDataLayout().isLittleEndian())
     return SDValue();
 
   // uzp1(bitcast(x), bitcast(y)) -> uzp1(x, y)
@@ -21447,21 +21443,28 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG,
   if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
     return SDValue();
 
-  auto getSourceOp = [](SDValue Operand) -> SDValue {
-    const unsigned Opcode = Operand.getOpcode();
-    if (Opcode == ISD::TRUNCATE)
-      return Operand->getOperand(0);
-    if (Opcode == ISD::BITCAST &&
-        Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
-      return Operand->getOperand(0)->getOperand(0);
-    return SDValue();
-  };
+  SDValue SourceOp0 = peekThroughBitcasts(Op0);
+  SDValue SourceOp1 = peekThroughBitcasts(Op1);
 
-  SDValue SourceOp0 = getSourceOp(Op0);
-  SDValue SourceOp1 = getSourceOp(Op1);
+  // truncating uzp1(x, y) -> xtn(concat (x, y))
+  if (SourceOp0.getValueType() == SourceOp1.getValueType()) {
+    EVT Op0Ty = SourceOp0.getValueType();
+    if ((ResVT == MVT::v4i16 && Op0Ty == MVT::v2i32) ||
+        (ResVT == MVT::v8i8 && Op0Ty == MVT::v4i16)) {
+      SDValue Concat =
+          DAG.getNode(ISD::CONCAT_VECTORS, DL,
+                      Op0Ty.getDoubleNumVectorElementsVT(*DAG.getContext()),
+                      SourceOp0, SourceOp1);
+      return DAG.getNode(ISD::TRUNCATE, DL, ResVT, Concat);
+    }
+  }
 
-  if (!SourceOp0 || !SourceOp1)
+  // uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
+  if (SourceOp0.getOpcode() != ISD::TRUNCATE ||
+      SourceOp1.getOpcode() != ISD::TRUNCATE)
     return SDValue();
+  SourceOp0 = SourceOp0.getOperand(0);
+  SourceOp1 = SourceOp1.getOperand(0);
 
   if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
       !SourceOp0.getValueType().isSimple())
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6254e68326f79d..b4b975cce007ac 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6153,26 +6153,39 @@ defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
 defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
 defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
 
-def : Pat<(v16i8 (concat_vectors (v8i8 (trunc (v8i16 V128:$Vn))),
-                                 (v8i8 (trunc (v8i16 V128:$Vm))))),
-          (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (trunc (v4i32 V128:$Vn))),
-                                 (v4i16 (trunc (v4i32 V128:$Vm))))),
-          (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (trunc (v2i64 V128:$Vn))),
-                                 (v2i32 (trunc (v2i64 V128:$Vm))))),
-          (UZP1v4i32 V128:$Vn, V128:$Vm)>;
-// These are the same as above, with an optional assertzext node that can be
-// generated from fptoi lowering.
-def : Pat<(v16i8 (concat_vectors (v8i8 (assertzext (trunc (v8i16 V128:$Vn)))),
-                                 (v8i8 (assertzext (trunc (v8i16 V128:$Vm)))))),
-          (UZP1v16i8 V128:$Vn, V128:$Vm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 (assertzext (trunc (v4i32 V128:$Vn)))),
-                                 (v4i16 (assertzext (trunc (v4i32 V128:$Vm)))))),
-          (UZP1v8i16 V128:$Vn, V128:$Vm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 (assertzext (trunc (v2i64 V128:$Vn)))),
-                                 (v2i32 (assertzext (trunc (v2i64 V128:$Vm)))))),
-          (UZP1v4i32 V128:$Vn, V128:$Vm)>;
+def trunc_optional_assert_ext : PatFrags<(ops node:$op0),
+                                         [(trunc node:$op0),
+                                          (assertzext (trunc node:$op0)),
+                                          (assertsext (trunc node:$op0))]>;
+
+// concat_vectors(trunc(x), trunc(y)) -> uzp1(x, y)
+// concat_vectors(assertzext(trunc(x)), assertzext(trunc(y))) -> uzp1(x, y)
+// concat_vectors(assertsext(trunc(x)), assertsext(trunc(y))) -> uzp1(x, y)
+class concat_trunc_to_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy>
+  : Pat<(ConcatTy (concat_vectors (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
+                                  (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))),
+        (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm)>;
+def : concat_trunc_to_uzp1_pat<v8i16, v8i8, v16i8>;
+def : concat_trunc_to_uzp1_pat<v4i32, v4i16, v8i16>;
+def : concat_trunc_to_uzp1_pat<v2i64, v2i32, v4i32>;
+
+// trunc(concat_vectors(trunc(x), trunc(y))) -> xtn(uzp1(x, y))
+// trunc(concat_vectors(assertzext(trunc(x)), assertzext(trunc(y)))) -> xtn(uzp1(x, y))
+// trunc(concat_vectors(assertsext(trunc(x)), assertsext(trunc(y)))) -> xtn(uzp1(x, y))
+class trunc_concat_trunc_to_xtn_uzp1_pat<ValueType SrcTy, ValueType TruncTy, ValueType ConcatTy,
+                                         ValueType Ty>
+  : Pat<(Ty (trunc_optional_assert_ext
+                    (ConcatTy (concat_vectors
+                                  (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vn))),
+                                  (TruncTy (trunc_optional_assert_ext (SrcTy V128:$Vm))))))),
+        (!cast<Instruction>("XTN"#Ty) (!cast<Instruction>("UZP1"#ConcatTy) V128:$Vn, V128:$Vm))>;
+def : trunc_concat_trunc_to_xtn_uzp1_pat<v4i32, v4i16, v8i16, v8i8>;
+def : trunc_concat_trunc_to_xtn_uzp1_pat<v2i64, v2i32, v4i32, v4i16>;
+
+def : Pat<(v8i8 (trunc (concat_vectors (v4i16 V64:$Vn), (v4i16 V64:$Vm)))),
+          (UZP1v8i8 V64:$Vn, V64:$Vm)>;
+def : Pat<(v4i16 (trunc (concat_vectors (v2i32 V64:$Vn), (v2i32 V64:$Vm)))),
+          (UZP1v4i16 V64:$Vn, V64:$Vm)>;
 
 def : Pat<(v16i8 (concat_vectors
                  (v8i8 (trunc (AArch64vlshr (v8i16 V128:$Vn), (i32 8)))),
diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
index 49325299f74a12..3007e7ce771e62 100644
--- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
@@ -8,9 +8,8 @@ define <4 x i16> @fptosi_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptosi <4 x double> %tmp1 to <4 x i16>
@@ -26,13 +25,10 @@ define <8 x i8> @fptosi_v4f64_to_v4i8(ptr %ptr) {
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v3.2d, v3.2d
 ; CHECK-NEXT:    fcvtzs v2.2d, v2.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEXT:    uzp1 v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    uzp1 v1.4h, v2.4h, v3.4h
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    uzp1 v1.4s, v2.4s, v3.4s
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %tmp1 = load <8 x double>, ptr %ptr
   %tmp2 = fptosi <8 x double> %tmp1 to <8 x i8>
@@ -96,9 +92,8 @@ define <4 x i16> @fptoui_v4f64_to_v4i16(ptr %ptr) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEXT:    xtn v0.2s, v0.2d
-; CHECK-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-NEXT:    ret
   %tmp1 = load <4 x double>, ptr %ptr
   %tmp2 = fptoui <4 x double> %tmp1 to <4 x i16>
diff --git a/llvm/test/CodeGen/AArch64/extbinopload.ll b/llvm/test/CodeGen/AArch64/extbinopload.ll
index 1f68c77611e10d..dff4831330deb0 100644
--- a/llvm/test/CodeGen/AArch64/extbinopload.ll
+++ b/llvm/test/CodeGen/AArch64/extbinopload.ll
@@ -650,7 +650,7 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-NEXT:    add x11, x3, #12
 ; CHECK-NEXT:    str s1, [x4]
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    ldp s0, s5, [x2]
+; CHECK-NEXT:    ldp s0, s4, [x2]
 ; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
 ; CHECK-NEXT:    umov w9, v2.h[0]
 ; CHECK-NEXT:    umov w10, v2.h[1]
@@ -662,24 +662,25 @@ define <16 x i32> @extrause_load(ptr %p, ptr %q, ptr %r, ptr %s, ptr %z) {
 ; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
 ; CHECK-NEXT:    mov v0.b[10], w9
 ; CHECK-NEXT:    add x9, x1, #4
-; CHECK-NEXT:    uzp1 v1.8b, v1.8b, v2.8b
+; CHECK-NEXT:    mov v1.d[1], v2.d[0]
 ; CHECK-NEXT:    mov v0.b[11], w10
 ; CHECK-NEXT:    add x10, x1, #12
+; CHECK-NEXT:    bic v1.8h, #255, lsl #8
 ; CHECK-NEXT:    ld1 { v0.s }[3], [x3], #4
-; CHECK-NEXT:    ldr s4, [x0, #12]
-; CHECK-NEXT:    ldp s3, s16, [x0, #4]
-; CHECK-NEXT:    ld1 { v5.s }[1], [x3]
-; CHECK-NEXT:    ldp s6, s7, [x2, #8]
-; CHECK-NEXT:    ld1 { v4.s }[1], [x10]
-; CHECK-NEXT:    ld1 { v3.s }[1], [x9]
-; CHECK-NEXT:    ld1 { v6.s }[1], [x8]
-; CHECK-NEXT:    ld1 { v7.s }[1], [x11]
+; CHECK-NEXT:    ldr s3, [x0, #12]
+; CHECK-NEXT:    ldp s2, s7, [x0, #4]
+; CHECK-NEXT:    ld1 { v4.s }[1], [x3]
+; CHECK-NEXT:    ldp s5, s6, [x2, #8]
+; CHECK-NEXT:    ld1 { v3.s }[1], [x10]
+; CHECK-NEXT:    ld1 { v2.s }[1], [x9]
+; CHECK-NEXT:    ld1 { v5.s }[1], [x8]
+; CHECK-NEXT:    ld1 { v6.s }[1], [x11]
 ; CHECK-NEXT:    add x8, x1, #8
-; CHECK-NEXT:    ld1 { v16.s }[1], [x8]
-; CHECK-NEXT:    uaddl v2.8h, v3.8b, v4.8b
-; CHECK-NEXT:    ushll v3.8h, v6.8b, #0
-; CHECK-NEXT:    uaddl v4.8h, v5.8b, v7.8b
-; CHECK-NEXT:    uaddl v1.8h, v1.8b, v16.8b
+; CHECK-NEXT:    ld1 { v7.s }[1], [x8]
+; CHECK-NEXT:    uaddl v2.8h, v2.8b, v3.8b
+; CHECK-NEXT:    ushll v3.8h, v5.8b, #0
+; CHECK-NEXT:    uaddl v4.8h, v4.8b, v6.8b
+; CHECK-NEXT:    uaddw v1.8h, v1.8h, v7.8b
 ; CHECK-NEXT:    uaddw2 v5.8h, v3.8h, v0.16b
 ; CHECK-NEXT:    ushll v0.4s, v2.4h, #3
 ; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #3
diff --git a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
index 1ea87bb6b04b51..0a3b9a070c2b32 100644
--- a/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/fp-conversion-to-tbl.ll
@@ -73,9 +73,8 @@ define void @fptoui_v8f32_to_v8i8_no_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:    ldp q0, q1, [x0]
 ; CHECK-NEXT:    fcvtzs.4s v1, v1
 ; CHECK-NEXT:    fcvtzs.4s v0, v0
-; CHECK-NEXT:    xtn.4h v1, v1
-; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    uzp1.8b v0, v0, v1
+; CHECK-NEXT:    uzp1.8h v0, v0, v1
+; CHECK-NEXT:    xtn.8b v0, v0
 ; CHECK-NEXT:    str d0, [x1]
 ; CHECK-NEXT:    ret
 entry:
diff --git a/llvm/test/CodeGen/AArch64/fptoi.ll b/llvm/test/CodeGen/AArch64/fptoi.ll
index 67190e8596c46c..7af01b53dae7e8 100644
--- a/llvm/test/CodeGen/AArch64/fptoi.ll
+++ b/llvm/test/CodeGen/AArch64/fptoi.ll
@@ -1096,30 +1096,17 @@ entry:
 }
 
 define <3 x i16> @fptos_v3f64_v3i16(<3 x double> %a) {
-; CHECK-SD-LABEL: fptos_v3f64_v3i16:
-; CHECK-SD:       // %bb.0: // %entry
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fptos_v3f64_v3i16:
-; CHECK-GI:       // %bb.0: // %entry
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    // kill: def $d2 killed $d2 def $q2
-; CHECK-GI-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-GI-NEXT:    fcvtzs v1.2d, v2.2d
-; CHECK-GI-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-GI-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
-; CHECK-GI-NEXT:    xtn v0.4h, v0.4s
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fptos_v3f64_v3i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    fcvtzs v1.2d, v2.2d
+; CHECK-NEXT:    fcvtzs v0.2d, v0.2d
+; CHECK-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    xtn v0.4h, v0.4s
+; CHECK-NEXT:    ret
 entry:
   %c = fptosi <3 x double> %a to <3 x i16>
   ret <3 x i16> %c
@@ -1134,9 +1121,8 @@ define <3 x i16> @fptou_v3f64_v3i16(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v3f64_v3i16:
@@ -1160,9 +1146,8 @@ define <4 x i16> @fptos_v4f64_v4i16(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v4f64_v4i16:
@@ -1182,9 +1167,8 @@ define <4 x i16> @fptou_v4f64_v4i16(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v4f64_v4i16:
@@ -1600,9 +1584,8 @@ define <3 x i8> @fptos_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    umov w0, v0.h[0]
 ; CHECK-SD-NEXT:    umov w1, v0.h[1]
 ; CHECK-SD-NEXT:    umov w2, v0.h[2]
@@ -1638,9 +1621,8 @@ define <3 x i8> @fptou_v3f64_v3i8(<3 x double> %a) {
 ; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    umov w0, v0.h[0]
 ; CHECK-SD-NEXT:    umov w1, v0.h[1]
 ; CHECK-SD-NEXT:    umov w2, v0.h[2]
@@ -1672,9 +1654,8 @@ define <4 x i8> @fptos_v4f64_v4i8(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v4f64_v4i8:
@@ -1694,9 +1675,8 @@ define <4 x i8> @fptou_v4f64_v4i8(<4 x double> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v4f64_v4i8:
@@ -1718,13 +1698,10 @@ define <8 x i8> @fptos_v8f64_v8i8(<8 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f64_v8i8:
@@ -1750,13 +1727,10 @@ define <8 x i8> @fptou_v8f64_v8i8(<8 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v2.8b
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f64_v8i8:
@@ -1786,21 +1760,13 @@ define <16 x i8> @fptos_v16f64_v16i8(<16 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v16f64_v16i8:
@@ -1837,21 +1803,13 @@ define <16 x i8> @fptou_v16f64_v16i8(<16 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.2d, v2.2d
 ; CHECK-SD-NEXT:    fcvtzs v1.2d, v1.2d
 ; CHECK-SD-NEXT:    fcvtzs v0.2d, v0.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v1.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v16f64_v16i8:
@@ -1900,36 +1858,20 @@ define <32 x i8> @fptos_v32f64_v32i8(<32 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
 ; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v23.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v19.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v16.2s, v16.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT:    uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT:    uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT:    uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v7.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f64_v32i8:
@@ -1997,36 +1939,20 @@ define <32 x i8> @fptou_v32f64_v32i8(<32 x double> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v18.2d, v18.2d
 ; CHECK-SD-NEXT:    fcvtzs v17.2d, v17.2d
 ; CHECK-SD-NEXT:    fcvtzs v16.2d, v16.2d
-; CHECK-SD-NEXT:    xtn v7.2s, v7.2d
-; CHECK-SD-NEXT:    xtn v6.2s, v6.2d
-; CHECK-SD-NEXT:    xtn v5.2s, v5.2d
-; CHECK-SD-NEXT:    xtn v4.2s, v4.2d
-; CHECK-SD-NEXT:    xtn v3.2s, v3.2d
-; CHECK-SD-NEXT:    xtn v2.2s, v2.2d
-; CHECK-SD-NEXT:    xtn v1.2s, v1.2d
-; CHECK-SD-NEXT:    xtn v0.2s, v0.2d
-; CHECK-SD-NEXT:    xtn v23.2s, v23.2d
-; CHECK-SD-NEXT:    xtn v22.2s, v22.2d
-; CHECK-SD-NEXT:    xtn v21.2s, v21.2d
-; CHECK-SD-NEXT:    xtn v20.2s, v20.2d
-; CHECK-SD-NEXT:    xtn v19.2s, v19.2d
-; CHECK-SD-NEXT:    xtn v18.2s, v18.2d
-; CHECK-SD-NEXT:    xtn v17.2s, v17.2d
-; CHECK-SD-NEXT:    xtn v16.2s, v16.2d
-; CHECK-SD-NEXT:    uzp1 v6.4h, v6.4h, v7.4h
-; CHECK-SD-NEXT:    uzp1 v4.4h, v4.4h, v5.4h
-; CHECK-SD-NEXT:    uzp1 v2.4h, v2.4h, v3.4h
-; CHECK-SD-NEXT:    uzp1 v0.4h, v0.4h, v1.4h
-; CHECK-SD-NEXT:    uzp1 v1.4h, v22.4h, v23.4h
-; CHECK-SD-NEXT:    uzp1 v3.4h, v20.4h, v21.4h
-; CHECK-SD-NEXT:    uzp1 v5.4h, v18.4h, v19.4h
-; CHECK-SD-NEXT:    uzp1 v7.4h, v16.4h, v17.4h
-; CHECK-SD-NEXT:    mov v4.d[1], v6.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-SD-NEXT:    mov v3.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v7.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v6.4s, v6.4s, v7.4s
+; CHECK-SD-NEXT:    uzp1 v4.4s, v4.4s, v5.4s
+; CHECK-SD-NEXT:    uzp1 v2.4s, v2.4s, v3.4s
+; CHECK-SD-NEXT:    uzp1 v0.4s, v0.4s, v1.4s
+; CHECK-SD-NEXT:    uzp1 v3.4s, v20.4s, v21.4s
+; CHECK-SD-NEXT:    uzp1 v1.4s, v22.4s, v23.4s
+; CHECK-SD-NEXT:    uzp1 v5.4s, v18.4s, v19.4s
+; CHECK-SD-NEXT:    uzp1 v7.4s, v16.4s, v17.4s
+; CHECK-SD-NEXT:    uzp1 v4.8h, v4.8h, v6.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v2.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v3.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v2.8h, v7.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v4.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v7.16b, v3.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v2.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v32f64_v32i8:
@@ -3026,9 +2952,8 @@ define <8 x i8> @fptos_v8f32_v8i8(<8 x float> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v8f32_v8i8:
@@ -3048,9 +2973,8 @@ define <8 x i8> @fptou_v8f32_v8i8(<8 x float> %a) {
 ; CHECK-SD:       // %bb.0: // %entry
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptou_v8f32_v8i8:
@@ -3072,12 +2996,8 @@ define <16 x i8> @fptos_v16f32_v16i8(<16 x float> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-SD-NEXT:    fcvtzs v1.4s, v1.4s
 ; CHECK-SD-NEXT:    fcvtzs v0.4s, v0.4s
-; CHECK-SD-NEXT:    xtn v3.4h, v3.4s
-; CHECK-SD-NEXT:    xtn v2.4h, v2.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-SD-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
 ; CHECK-SD-NEXT:    ret
 ;
@@ -3134,20 +3054,12 @@ define <32 x i8> @fptos_v32f32_v32i8(<32 x float> %a) {
 ; CHECK-SD-NEXT:    fcvtzs v6.4s, v6.4s
 ; CHECK-SD-NEXT:    fcvtzs v5.4s, v5.4s
 ; CHECK-SD-NEXT:    fcvtzs v4.4s, v4.4s
-; CHECK-SD-NEXT:    xtn v3.4h, v3.4s
-; CHECK-SD-NEXT:    xtn v2.4h, v2.4s
-; CHECK-SD-NEXT:    xtn v1.4h, v1.4s
-; CHECK-SD-NEXT:    xtn v0.4h, v0.4s
-; CHECK-SD-NEXT:    xtn v7.4h, v7.4s
-; CHECK-SD-NEXT:    xtn v6.4h, v6.4s
-; CHECK-SD-NEXT:    xtn v5.4h, v5.4s
-; CHECK-SD-NEXT:    xtn v4.4h, v4.4s
-; CHECK-SD-NEXT:    mov v2.d[1], v3.d[0]
-; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
-; CHECK-SD-NEXT:    mov v6.d[1], v7.d[0]
-; CHECK-SD-NEXT:    mov v4.d[1], v5.d[0]
+; CHECK-SD-NEXT:    uzp1 v2.8h, v2.8h, v3.8h
+; CHECK-SD-NEXT:    uzp1 v0.8h, v0.8h, v1.8h
+; CHECK-SD-NEXT:    uzp1 v1.8h, v6.8h, v7.8h
+; CHECK-SD-NEXT:    uzp1 v3.8h, v4.8h, v5.8h
 ; CHECK-SD-NEXT:    uzp1 v0.16b, v0.16b, v2.16b
-; CHECK-SD-NEXT:    uzp1 v1.16b, v4.16b, v6.16b
+; CHECK-SD-NEXT:    uzp1 v1.16b, v3.16b, v1.16b
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: fptos_v32f32_v32i8:
diff --git a/llvm/test/CodeGen/AArch64/neon-truncstore.ll b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
index b677d077b98c14..5d78ad24eb333d 100644
--- a/llvm/test/CodeGen/AArch64/neon-truncstore.ll
+++ b/llvm/test/CodeGen/AArch64/neon-truncstore.ll
@@ -104,7 +104,7 @@ define void @v4i32_v4i8(<4 x i32> %a, ptr %result) {
 ; CHECK-LABEL: v4i32_v4i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
   %b = trunc <4 x i32> %a to <4 x i8>
@@ -170,8 +170,7 @@ define void @v2i16_v2i8(<2 x i16> %a, ptr %result) {
 define void @v4i16_v4i8(<4 x i16> %a, ptr %result) {
 ; CHECK-LABEL: v4i16_v4i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
   %b = trunc <4 x i16> %a to <4 x i8>
diff --git a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
index 5f905d94e3573c..6f1ae023bf25a1 100644
--- a/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/sadd_sat_vec.ll
@@ -145,7 +145,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sqadd v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
index 0ef64789ad9724..fb571eff39fe50 100644
--- a/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
+++ b/llvm/test/CodeGen/AArch64/shuffle-tbl34.ll
@@ -353,13 +353,17 @@ define <8 x i8> @shuffle4_v8i8_v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x
 define <8 x i16> @shuffle4_v4i8_zext(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
 ; CHECK-LABEL: shuffle4_v4i8_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v1.8b
-; CHECK-NEXT:    uzp1 v1.8b, v2.8b, v3.8b
+; CHECK-NEXT:    fmov d5, d2
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d3 killed $d3 def $q3
 ; CHECK-NEXT:    adrp x8, .LCPI8_0
-; CHECK-NEXT:    ushll v2.8h, v0.8b, #0
+; CHECK-NEXT:    fmov d4, d0
 ; CHECK-NEXT:    ldr q0, [x8, :lo12:.LCPI8_0]
-; CHECK-NEXT:    ushll v3.8h, v1.8b, #0
-; CHECK-NEXT:    tbl v0.16b, { v2.16b, v3.16b }, v0.16b
+; CHECK-NEXT:    mov v4.d[1], v1.d[0]
+; CHECK-NEXT:    mov v5.d[1], v3.d[0]
+; CHECK-NEXT:    bic v4.8h, #255, lsl #8
+; CHECK-NEXT:    bic v5.8h, #255, lsl #8
+; CHECK-NEXT:    tbl v0.16b, { v4.16b, v5.16b }, v0.16b
 ; CHECK-NEXT:    ret
   %x = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %y = shufflevector <4 x i8> %c, <4 x i8> %d, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
index acec3e74d3e93f..d1f843a09f749d 100644
--- a/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/ssub_sat_vec.ll
@@ -146,7 +146,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-NEXT:    shl v0.4h, v0.4h, #8
 ; CHECK-NEXT:    sqsub v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    sshr v0.4h, v0.4h, #8
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 4f8a4f7aede3eb..0ad99008655184 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -41,8 +41,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
 ; CHECK-NEXT:    fcvtzs v2.4s, v2.4s
 ; CHECK-NEXT:    xtn v1.4h, v1.4s
 ; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    xtn v2.8b, v2.8h
+; CHECK-NEXT:    uzp1 v1.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v2.8b, v2.8b, v0.8b
 ; CHECK-NEXT:    mov v1.s[1], v2.s[0]
 ; CHECK-NEXT:    stur d1, [x12, #-4]
 ; CHECK-NEXT:    add x12, x12, #8
diff --git a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
index ba367b0dbfde34..18cd4cc2111a42 100644
--- a/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
+++ b/llvm/test/CodeGen/AArch64/trunc-to-tbl.ll
@@ -710,23 +710,23 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-NEXT:  LBB6_1: ; %loop
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    ldp q4, q1, [x0, #48]
-; CHECK-NEXT:    add x9, x1, #8
-; CHECK-NEXT:    ldp q3, q2, [x0]
-; CHECK-NEXT:    subs x8, x8, #1
+; CHECK-NEXT:    add x9, x1, #10
 ; CHECK-NEXT:    ldr d0, [x0, #80]
+; CHECK-NEXT:    ldp q3, q2, [x0]
 ; CHECK-NEXT:    ldr q5, [x0, #32]
+; CHECK-NEXT:    subs x8, x8, #1
 ; CHECK-NEXT:    add x0, x0, #128
-; CHECK-NEXT:    uzp1.4s v4, v5, v4
-; CHECK-NEXT:    uzp1.4s v2, v3, v2
 ; CHECK-NEXT:    uzp1.4s v0, v1, v0
-; CHECK-NEXT:    uzp1.8h v1, v2, v4
+; CHECK-NEXT:    uzp1.4s v1, v5, v4
+; CHECK-NEXT:    uzp1.4s v2, v3, v2
 ; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    uzp1.16b v1, v1, v0
-; CHECK-NEXT:    xtn.8b v0, v0
-; CHECK-NEXT:    st1.h { v1 }[4], [x9]
-; CHECK-NEXT:    add x9, x1, #10
-; CHECK-NEXT:    st1.b { v0 }[2], [x9]
-; CHECK-NEXT:    str d1, [x1], #16
+; CHECK-NEXT:    uzp1.8h v1, v2, v1
+; CHECK-NEXT:    uzp1.8b v2, v0, v0
+; CHECK-NEXT:    uzp1.16b v0, v1, v0
+; CHECK-NEXT:    st1.b { v2 }[2], [x9]
+; CHECK-NEXT:    add x9, x1, #8
+; CHECK-NEXT:    st1.h { v0 }[4], [x9]
+; CHECK-NEXT:    str d0, [x1], #16
 ; CHECK-NEXT:    b.eq LBB6_1
 ; CHECK-NEXT:  ; %bb.2: ; %exit
 ; CHECK-NEXT:    ret
@@ -755,7 +755,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-BE-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-BE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
 ; CHECK-BE-NEXT:    uzp1 v1.16b, v1.16b, v0.16b
-; CHECK-BE-NEXT:    xtn v0.8b, v0.8h
+; CHECK-BE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-BE-NEXT:    rev16 v2.16b, v1.16b
 ; CHECK-BE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECK-BE-NEXT:    st1 { v0.b }[2], [x9]
@@ -790,7 +790,7 @@ define void @trunc_v11i64_to_v11i8_in_loop(ptr %A, ptr %dst) {
 ; CHECK-DISABLE-NEXT:    xtn v0.4h, v0.4s
 ; CHECK-DISABLE-NEXT:    uzp1 v1.8h, v1.8h, v2.8h
 ; CHECK-DISABLE-NEXT:    uzp1 v1.16b, v1.16b, v0.16b
-; CHECK-DISABLE-NEXT:    xtn v0.8b, v0.8h
+; CHECK-DISABLE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-DISABLE-NEXT:    rev16 v2.16b, v1.16b
 ; CHECK-DISABLE-NEXT:    rev64 v1.16b, v1.16b
 ; CHECK-DISABLE-NEXT:    st1 { v0.b }[2], [x9]
diff --git a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
index e05c65daf50aae..f0bbed59405e3f 100644
--- a/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/uadd_sat_vec.ll
@@ -142,7 +142,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-NEXT:    movi d0, #0xff00ff00ff00ff
 ; CHECK-NEXT:    uaddl v1.8h, v1.8b, v2.8b
 ; CHECK-NEXT:    umin v0.4h, v1.4h, v0.4h
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
index 05f43e7d8427b7..82c0327219f504 100644
--- a/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
+++ b/llvm/test/CodeGen/AArch64/usub_sat_vec.ll
@@ -143,7 +143,7 @@ define void @v4i8(ptr %px, ptr %py, ptr %pz) nounwind {
 ; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
 ; CHECK-NEXT:    uqsub v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    xtn v0.8b, v0.8h
+; CHECK-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; CHECK-NEXT:    str s0, [x2]
 ; CHECK-NEXT:    ret
   %x = load <4 x i8>, ptr %px
diff --git a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
index 380bdbcc7f7408..611940546bc1a2 100644
--- a/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
+++ b/llvm/test/CodeGen/AArch64/vcvt-oversize.ll
@@ -9,9 +9,8 @@ define <8 x i8> @float_to_i8(ptr %in) {
 ; CHECK-NEXT:    fadd v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v0.4s, v0.4s
 ; CHECK-NEXT:    fcvtzs v1.4s, v1.4s
-; CHECK-NEXT:    xtn v0.4h, v0.4s
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    uzp1 v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    uzp1 v0.8h, v1.8h, v0.8h
+; CHECK-NEXT:    xtn v0.8b, v0.8h
 ; CHECK-NEXT:    ret
   %l = load <8 x float>, ptr %in
   %scale = fmul <8 x float> %l, <float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0, float 2.0>
diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
index 9c6ab8da0fa756..dd7a9c6d7768bc 100644
--- a/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
+++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-truncate-store.ll
@@ -210,7 +210,7 @@ define void @no_combine_for_non_bool_truncate(<4 x i32> %vec, ptr %out) {
 ; CHECK-LABEL: no_combine_for_non_bool_truncate:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    xtn.4h v0, v0
-; CHECK-NEXT:    xtn.8b v0, v0
+; CHECK-NEXT:    uzp1.8b v0, v0, v0
 ; CHECK-NEXT:    str s0, [x0]
 ; CHECK-NEXT:    ret
 
diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 90328f73f86b51..71d55df6651768 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -410,7 +410,7 @@ define void @store_trunc_from_64bits(ptr %src, ptr %dst) {
 ; BE-NEXT:    ldrh w8, [x0, #4]
 ; BE-NEXT:    rev32 v0.4h, v0.4h
 ; BE-NEXT:    mov v0.h[2], w8
-; BE-NEXT:    xtn v0.8b, v0.8h
+; BE-NEXT:    uzp1 v0.8b, v0.8b, v0.8b
 ; BE-NEXT:    rev32 v0.16b, v0.16b
 ; BE-NEXT:    str s0, [sp, #12]
 ; BE-NEXT:    ldrh w9, [sp, #12]
@@ -456,7 +456,7 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
 ; BE-NEXT:    add x8, x8, :lo12:.LCPI11_0
 ; BE-NEXT:    ld1 { v1.4h }, [x8]
 ; BE-NEXT:    add v0.4h, v0.4h, v1.4h
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -638,7 +638,7 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
 ; BE-NEXT:    .cfi_def_cfa_offset 16
 ; BE-NEXT:    ld1 { v0.4s }, [x0]
 ; BE-NEXT:    shrn v0.4h, v0.4s, #16
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -672,7 +672,7 @@ define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
 ; BE-NEXT:    .cfi_def_cfa_offset 16
 ; BE-NEXT:    ld1 { v0.4s }, [x0]
 ; BE-NEXT:    shrn v0.4h, v0.4s, #16
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -706,7 +706,7 @@ define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
 ; BE-NEXT:    .cfi_def_cfa_offset 16
 ; BE-NEXT:    ld1 { v0.4s }, [x0]
 ; BE-NEXT:    shrn v0.4h, v0.4s, #16
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -741,7 +741,7 @@ define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
 ; BE-NEXT:    .cfi_def_cfa_offset 16
 ; BE-NEXT:    ld1 { v0.4s }, [x0]
 ; BE-NEXT:    shrn v0.4h, v0.4s, #16
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -777,7 +777,7 @@ define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) {
 ; BE-NEXT:    .cfi_def_cfa_offset 16
 ; BE-NEXT:    ld1 { v0.4s }, [x0]
 ; BE-NEXT:    shrn v0.4h, v0.4s, #16
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -801,7 +801,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    ldr q0, [x0]
 ; CHECK-NEXT:    shrn.4h v0, v0, #16
-; CHECK-NEXT:    xtn.8b v1, v0
+; CHECK-NEXT:    uzp1.8b v1, v0, v0
 ; CHECK-NEXT:    umov.h w8, v0[2]
 ; CHECK-NEXT:    str s1, [sp, #12]
 ; CHECK-NEXT:    ldrh w9, [sp, #12]
@@ -816,7 +816,7 @@ define void @shift_trunc_volatile_store(ptr %src, ptr %dst) {
 ; BE-NEXT:    .cfi_def_cfa_offset 16
 ; BE-NEXT:    ld1 { v0.4s }, [x0]
 ; BE-NEXT:    shrn v0.4h, v0.4s, #16
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #12]
@@ -868,7 +868,7 @@ define void @load_v3i8_zext_to_3xi32_add_trunc_store(ptr %src) {
 ; BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; BE-NEXT:    ld1 { v0.b }[4], [x9]
 ; BE-NEXT:    add v0.4h, v0.4h, v1.4h
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #8]
@@ -921,7 +921,7 @@ define void @load_v3i8_sext_to_3xi32_add_trunc_store(ptr %src) {
 ; BE-NEXT:    ushll v0.8h, v0.8b, #0
 ; BE-NEXT:    ld1 { v0.b }[4], [x9]
 ; BE-NEXT:    add v0.4h, v0.4h, v1.4h
-; BE-NEXT:    xtn v1.8b, v0.8h
+; BE-NEXT:    uzp1 v1.8b, v0.8b, v0.8b
 ; BE-NEXT:    umov w8, v0.h[2]
 ; BE-NEXT:    rev32 v1.16b, v1.16b
 ; BE-NEXT:    str s1, [sp, #8]

>From 7d35b396450f07c6773ff68e66eef198d5e1febc Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo at fhahn.com>
Date: Wed, 13 Mar 2024 16:08:01 +0000
Subject: [PATCH 56/63] [VPlan] Use VPBuilder to create ActiveLaneMask (NFC).

---
 .../Vectorize/LoopVectorizationPlanner.h         | 16 ++++++++++++----
 .../lib/Transforms/Vectorize/VPlanTransforms.cpp |  8 ++++----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index a7ebf78e54ceb6..e86705e8988904 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -79,6 +79,13 @@ class VPBuilder {
   VPBasicBlock *getInsertBlock() const { return BB; }
   VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
 
+  /// Create a VPBuilder to insert after \p R.
+  static VPBuilder getToInsertAfter(VPRecipeBase *R) {
+    VPBuilder B;
+    B.setInsertPoint(R->getParent(), std::next(R->getIterator()));
+    return B;
+  }
+
   /// InsertPoint - A saved insertion point.
   class VPInsertPoint {
     VPBasicBlock *Block = nullptr;
@@ -131,8 +138,9 @@ class VPBuilder {
 
   /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
   /// its underlying Instruction.
-  VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                        Instruction *Inst = nullptr, const Twine &Name = "") {
+  VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                              Instruction *Inst = nullptr,
+                              const Twine &Name = "") {
     DebugLoc DL;
     if (Inst)
       DL = Inst->getDebugLoc();
@@ -140,8 +148,8 @@ class VPBuilder {
     NewVPInst->setUnderlyingValue(Inst);
     return NewVPInst;
   }
-  VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
-                        DebugLoc DL, const Twine &Name = "") {
+  VPInstruction *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+                              DebugLoc DL, const Twine &Name = "") {
     return createInstruction(Opcode, Operands, DL, Name);
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index f6b564ad931ca9..3b19db9f0d30df 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1192,10 +1192,10 @@ void VPlanTransforms::addActiveLaneMask(
     LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
         Plan, DataAndControlFlowWithoutRuntimeCheck);
   } else {
-    LaneMask = new VPInstruction(VPInstruction::ActiveLaneMask,
-                                 {WideCanonicalIV, Plan.getTripCount()},
-                                 nullptr, "active.lane.mask");
-    LaneMask->insertAfter(WideCanonicalIV);
+    VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
+    LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
+                              {WideCanonicalIV, Plan.getTripCount()}, nullptr,
+                              "active.lane.mask");
   }
 
   // Walk users of WideCanonicalIV and replace all compares of the form

>From 47a95db9b4164b5bfc6b626bac77088aae25bbb4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?= <clementval at gmail.com>
Date: Wed, 13 Mar 2024 09:14:40 -0700
Subject: [PATCH 57/63] [flang][cuda] Enable cuda with -x cuda option (#84944)

Flang driver was already able to enable the CUDA language feature base
on the file extension but there was no command line option. This PR adds
one.
---
 flang/lib/Frontend/CompilerInvocation.cpp |  9 +++++++++
 flang/lib/Frontend/FrontendAction.cpp     | 11 ++++++++---
 flang/test/Driver/cuda-option.f90         | 15 +++++++++++++++
 3 files changed, 32 insertions(+), 3 deletions(-)
 create mode 100644 flang/test/Driver/cuda-option.f90

diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp
index 4707de0e976ca7..2e3fa1f6e66039 100644
--- a/flang/lib/Frontend/CompilerInvocation.cpp
+++ b/flang/lib/Frontend/CompilerInvocation.cpp
@@ -581,6 +581,8 @@ static bool parseFrontendArgs(FrontendOptions &opts, llvm::opt::ArgList &args,
                 // pre-processed inputs.
                 .Case("f95", Language::Fortran)
                 .Case("f95-cpp-input", Language::Fortran)
+                // CUDA Fortran
+                .Case("cuda", Language::Fortran)
                 .Default(Language::Unknown);
 
     // Flang's intermediate representations.
@@ -877,6 +879,13 @@ static bool parseDialectArgs(CompilerInvocation &res, llvm::opt::ArgList &args,
   if (args.hasArg(clang::driver::options::OPT_flarge_sizes))
     res.getDefaultKinds().set_sizeIntegerKind(8);
 
+  // -x cuda
+  auto language = args.getLastArgValue(clang::driver::options::OPT_x);
+  if (language.equals("cuda")) {
+    res.getFrontendOpts().features.Enable(
+        Fortran::common::LanguageFeature::CUDA);
+  }
+
   // -fopenmp and -fopenacc
   if (args.hasArg(clang::driver::options::OPT_fopenacc)) {
     res.getFrontendOpts().features.Enable(
diff --git a/flang/lib/Frontend/FrontendAction.cpp b/flang/lib/Frontend/FrontendAction.cpp
index 599b4e11f0cfbd..bb1c239540d9f5 100644
--- a/flang/lib/Frontend/FrontendAction.cpp
+++ b/flang/lib/Frontend/FrontendAction.cpp
@@ -86,9 +86,14 @@ bool FrontendAction::beginSourceFile(CompilerInstance &ci,
     invoc.collectMacroDefinitions();
   }
 
-  // Enable CUDA Fortran if source file is *.cuf/*.CUF.
-  invoc.getFortranOpts().features.Enable(Fortran::common::LanguageFeature::CUDA,
-                                         getCurrentInput().getIsCUDAFortran());
+  if (!invoc.getFortranOpts().features.IsEnabled(
+          Fortran::common::LanguageFeature::CUDA)) {
+    // Enable CUDA Fortran if source file is *.cuf/*.CUF and not already
+    // enabled.
+    invoc.getFortranOpts().features.Enable(
+        Fortran::common::LanguageFeature::CUDA,
+        getCurrentInput().getIsCUDAFortran());
+  }
 
   // Decide between fixed and free form (if the user didn't express any
   // preference, use the file extension to decide)
diff --git a/flang/test/Driver/cuda-option.f90 b/flang/test/Driver/cuda-option.f90
new file mode 100644
index 00000000000000..112e1cb6c77f8c
--- /dev/null
+++ b/flang/test/Driver/cuda-option.f90
@@ -0,0 +1,15 @@
+! Test -fcuda option
+! RUN: %flang -fc1 -cpp -x cuda -fdebug-unparse %s -o - | FileCheck %s
+! RUN: not %flang -fc1 -cpp %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
+program main
+#if _CUDA
+  integer :: var = _CUDA
+#endif
+  integer, device :: dvar
+end program
+
+! CHECK-LABEL: PROGRAM main
+! CHECK: INTEGER :: var = 1
+! CHECK: INTEGER, DEVICE :: dvar
+
+! ERROR: cuda-option.f90:8:19: error: expected end of statement

>From d7b2f97f4f3d49ae5c6e5df17203358e9e220591 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers <nickdesaulniers at users.noreply.github.com>
Date: Wed, 13 Mar 2024 09:26:36 -0700
Subject: [PATCH 58/63] [libc][docs] document gpu support for stdbit.h (#85103)

Via:
https://github.com/llvm/llvm-project/pull/84938#issuecomment-1992120095

---------

Co-authored-by: Joseph Huber <huberjn at outlook.com>
---
 libc/docs/gpu/support.rst | 73 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst
index 250f0a7794de3c..31bbdc41090be6 100644
--- a/libc/docs/gpu/support.rst
+++ b/libc/docs/gpu/support.rst
@@ -89,6 +89,79 @@ strtok_r       |check|
 strxfrm        |check|
 =============  =========  ============
 
+stdbit.h
+--------
+
+============================  =========  ============
+Function Name                 Available  RPC Required
+============================  =========  ============
+stdc_leading_zeros_uc         |check|
+stdc_leading_zeros_us         |check|
+stdc_leading_zeros_ui         |check|
+stdc_leading_zeros_ul         |check|
+stdc_leading_zeros_ull        |check|
+stdc_trailing_zeros_uc        |check|
+stdc_trailing_zeros_us        |check|
+stdc_trailing_zeros_ui        |check|
+stdc_trailing_zeros_ul        |check|
+stdc_trailing_zeros_ull       |check|
+stdc_trailing_ones_uc         |check|
+stdc_trailing_ones_us         |check|
+stdc_trailing_ones_ui         |check|
+stdc_trailing_ones_ul         |check|
+stdc_trailing_ones_ull        |check|
+stdc_first_leading_zero_uc    |check|
+stdc_first_leading_zero_us    |check|
+stdc_first_leading_zero_ui    |check|
+stdc_first_leading_zero_ul    |check|
+stdc_first_leading_zero_ull   |check|
+stdc_first_leading_one_uc     |check|
+stdc_first_leading_one_us     |check|
+stdc_first_leading_one_ui     |check|
+stdc_first_leading_one_ul     |check|
+stdc_first_leading_one_ull    |check|
+stdc_first_trailing_zero_uc   |check|
+stdc_first_trailing_zero_us   |check|
+stdc_first_trailing_zero_ui   |check|
+stdc_first_trailing_zero_ul   |check|
+stdc_first_trailing_zero_ull  |check|
+stdc_first_trailing_one_uc    |check|
+stdc_first_trailing_one_us    |check|
+stdc_first_trailing_one_ui    |check|
+stdc_first_trailing_one_ul    |check|
+stdc_first_trailing_one_ull   |check|
+stdc_count_zeros_uc           |check|
+stdc_count_zeros_us           |check|
+stdc_count_zeros_ui           |check|
+stdc_count_zeros_ul           |check|
+stdc_count_zeros_ull          |check|
+stdc_count_ones_uc            |check|
+stdc_count_ones_us            |check|
+stdc_count_ones_ui            |check|
+stdc_count_ones_ul            |check|
+stdc_count_ones_ull           |check|
+stdc_has_single_bit_uc        |check|
+stdc_has_single_bit_us        |check|
+stdc_has_single_bit_ui        |check|
+stdc_has_single_bit_ul        |check|
+stdc_has_single_bit_ull       |check|
+stdc_bit_width_uc             |check|
+stdc_bit_width_us             |check|
+stdc_bit_width_ui             |check|
+stdc_bit_width_ul             |check|
+stdc_bit_width_ull            |check|
+stdc_bit_floor_uc             |check|
+stdc_bit_floor_us             |check|
+stdc_bit_floor_ui             |check|
+stdc_bit_floor_ul             |check|
+stdc_bit_floor_ull            |check|
+stdc_bit_ceil_uc              |check|
+stdc_bit_ceil_us              |check|
+stdc_bit_ceil_ui              |check|
+stdc_bit_ceil_ul              |check|
+stdc_bit_ceil_ull             |check|
+============================  =========  ============
+
 stdlib.h
 --------
 

>From 6e54dd5e213198d64fcdcfe36b991b0158a8f840 Mon Sep 17 00:00:00 2001
From: Christian Sigg <chsigg at users.noreply.github.com>
Date: Wed, 13 Mar 2024 17:26:50 +0100
Subject: [PATCH 59/63] [mlir] Fix inlining-threshold.mlir test for NDEBUG
 builds.

---
 mlir/test/Transforms/inlining-threshold.mlir | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/test/Transforms/inlining-threshold.mlir b/mlir/test/Transforms/inlining-threshold.mlir
index b94115d8f26416..649408aab577ac 100644
--- a/mlir/test/Transforms/inlining-threshold.mlir
+++ b/mlir/test/Transforms/inlining-threshold.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s --mlir-disable-threading -inline='default-pipeline='' inlining-threshold=100' -debug-only=inliner-pass 2>&1 | FileCheck %s
+// RUN: mlir-opt %s -inline='default-pipeline= inlining-threshold=100' | FileCheck %s
 
 // Check that inlining does not happen when the threshold is exceeded.
 func.func @callee1(%arg : i32) -> i32 {

>From 4c38ddd6f7a75a2a62b277ed1def320e82158f19 Mon Sep 17 00:00:00 2001
From: Bhuminjay Soni <Soni5Happy at gmail.com>
Date: Wed, 13 Mar 2024 21:58:25 +0530
Subject: [PATCH 60/63] Diagnose misuse of the cleanup attribute (#80040)

This pull request fixes #79443 when the cleanup attribute is intended to
be applied to a variable declaration, passing its address to a specified
function. The problem arises when standard functions like free,
closedir, fclose, etc., are used incorrectly with this attribute,
leading to incorrect behavior.

Fixes #79443
---
 clang/docs/ReleaseNotes.rst     |  4 ++++
 clang/include/clang/Sema/Sema.h |  5 +++--
 clang/lib/Sema/SemaDeclAttr.cpp | 24 ++++++++++++++++++++++++
 clang/test/Sema/attr-cleanup.c  | 28 ++++++++++++++++++++++++++--
 4 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 64a9fe0d8bcc48..7173c1400f53f4 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -237,6 +237,10 @@ Improvements to Clang's diagnostics
 - Clang now diagnoses lambda function expressions being implicitly cast to boolean values, under ``-Wpointer-bool-conversion``.
   Fixes #GH82512.
 
+- Clang now provides improved warnings for the ``cleanup`` attribute to detect misuse scenarios,
+  such as attempting to call ``free`` on an unallocated object. Fixes
+  `#79443 <https://github.com/llvm/llvm-project/issues/79443>`_.
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index d6ab2b0c2def99..4a853119a2bb8f 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -2152,14 +2152,15 @@ class Sema final {
 
   bool IsLayoutCompatible(QualType T1, QualType T2) const;
 
+  bool CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
+                         const FunctionProtoType *Proto);
+
 private:
   void CheckArrayAccess(const Expr *BaseExpr, const Expr *IndexExpr,
                         const ArraySubscriptExpr *ASE = nullptr,
                         bool AllowOnePastEnd = true, bool IndexNegated = false);
   void CheckArrayAccess(const Expr *E);
 
-  bool CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall,
-                         const FunctionProtoType *Proto);
   bool CheckObjCMethodCall(ObjCMethodDecl *Method, SourceLocation loc,
                            ArrayRef<const Expr *> Args);
   bool CheckPointerCall(NamedDecl *NDecl, CallExpr *TheCall,
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index e3da3e606435f4..ec00fdf3f88d9e 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -3787,6 +3787,30 @@ static void handleCleanupAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
       << NI.getName() << ParamTy << Ty;
     return;
   }
+  VarDecl *VD = cast<VarDecl>(D);
+  // Create a reference to the variable declaration. This is a fake/dummy
+  // reference.
+  DeclRefExpr *VariableReference = DeclRefExpr::Create(
+      S.Context, NestedNameSpecifierLoc{}, FD->getLocation(), VD, false,
+      DeclarationNameInfo{VD->getDeclName(), VD->getLocation()}, VD->getType(),
+      VK_LValue);
+
+  // Create a unary operator expression that represents taking the address of
+  // the variable. This is a fake/dummy expression.
+  Expr *AddressOfVariable = UnaryOperator::Create(
+      S.Context, VariableReference, UnaryOperatorKind::UO_AddrOf,
+      S.Context.getPointerType(VD->getType()), VK_PRValue, OK_Ordinary, Loc,
+      +false, FPOptionsOverride{});
+
+  // Create a function call expression. This is a fake/dummy call expression.
+  CallExpr *FunctionCallExpression =
+      CallExpr::Create(S.Context, E, ArrayRef{AddressOfVariable},
+                       S.Context.VoidTy, VK_PRValue, Loc, FPOptionsOverride{});
+
+  if (S.CheckFunctionCall(FD, FunctionCallExpression,
+                          FD->getType()->getAs<FunctionProtoType>())) {
+    return;
+  }
 
   D->addAttr(::new (S.Context) CleanupAttr(S.Context, AL, FD));
 }
diff --git a/clang/test/Sema/attr-cleanup.c b/clang/test/Sema/attr-cleanup.c
index 2c38687622c2bb..95baf2e675a086 100644
--- a/clang/test/Sema/attr-cleanup.c
+++ b/clang/test/Sema/attr-cleanup.c
@@ -1,7 +1,7 @@
-// RUN: %clang_cc1 %s -verify -fsyntax-only
+// RUN: %clang_cc1 -Wfree-nonheap-object -fsyntax-only -verify %s
 
 void c1(int *a);
-
+typedef __typeof__(sizeof(0)) size_t;
 extern int g1 __attribute((cleanup(c1))); // expected-warning {{'cleanup' attribute only applies to local variables}}
 int g2 __attribute((cleanup(c1))); // expected-warning {{'cleanup' attribute only applies to local variables}}
 static int g3 __attribute((cleanup(c1))); // expected-warning {{'cleanup' attribute only applies to local variables}}
@@ -48,3 +48,27 @@ void t6(void) {
 }
 
 void t7(__attribute__((cleanup(c4))) int a) {} // expected-warning {{'cleanup' attribute only applies to local variables}}
+
+extern void free(void *);
+extern void *malloc(size_t size);
+void t8(void) {
+  void *p
+  __attribute__((
+	  cleanup(
+		  free // expected-warning{{attempt to call free on non-heap object 'p'}}
+	  )
+	))
+  = malloc(10);
+}
+typedef __attribute__((aligned(2))) int Aligned2Int;
+void t9(void){
+  Aligned2Int __attribute__((cleanup(c1))) xwarn; // expected-warning{{passing 2-byte aligned argument to 4-byte aligned parameter 1 of 'c1' may result in an unaligned pointer access}}
+}
+
+__attribute__((enforce_tcb("TCB1"))) void func1(int *x) {
+    *x = 5;
+}
+__attribute__((enforce_tcb("TCB2"))) void t10() {
+  int __attribute__((cleanup(func1))) x = 5; // expected-warning{{calling 'func1' is a violation of trusted computing base 'TCB2'}}
+}
+

>From c12eabaad3c3cfd9f7530896ff4110fd7c0d98ca Mon Sep 17 00:00:00 2001
From: Sirraide <aeternalmail at gmail.com>
Date: Wed, 13 Mar 2024 17:39:23 +0100
Subject: [PATCH 61/63] [Clang] [Sema] Fix bug in `_Complex float`+`int`
 arithmetic (#83063)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

C23 6.3.1.8 ‘Usual arithmetic conversions’ p1 states (emphasis mine):
> Otherwise, if the corresponding real type of either operand is
`float`, the other operand is converted, *without change of type
domain*, to a type whose corresponding real type is `float`.

‘type domain’ here refers to `_Complex` vs real (i.e. non-`_Complex`);
there is another clause that states the same for `double`.

Consider the following code:
```c++
_Complex float f;
int x;
f / x;
```

After talking this over with @AaronBallman, we came to the conclusion
that `x` should be converted to `float` and *not* `_Complex float` (that
is, we should perform a division of `_Complex float / float`, and *not*
`_Complex float / _Complex float`; the same also applies to `-+*`). This
was already being done correctly for cases where `x` was already a
`float`; it’s just mixed `_Complex float`+`int` operations that
currently suffer from this problem.

This pr removes the extra `FloatingRealToComplex` conversion that we
were erroneously inserting and adds some tests to make sure we’re
actually doing `_Complex float / float` and not `_Complex float /
_Complex float` (and analogously for `double` and `-+*`).

The only exception here is `float / _Complex float`, which calls a
library function (`__divsc3`) that takes 4 `float`s, so we end up having
to convert the `float` to a `_Complex float` after all (and analogously
for `double`); I don’t believe there is a way around this.

Lastly, we were also missing tests for `_Complex` arithmetic at compile
time, so this adds some tests for that as well.
---
 clang/docs/ReleaseNotes.rst             |   7 ++
 clang/lib/Sema/SemaExpr.cpp             |  15 ++-
 clang/test/CodeGen/complex-math-mixed.c | 146 ++++++++++++++++++++++++
 clang/test/CodeGen/volatile.cpp         |  48 ++++----
 clang/test/Sema/complex-arithmetic.c    | 115 +++++++++++++++++++
 5 files changed, 298 insertions(+), 33 deletions(-)
 create mode 100644 clang/test/CodeGen/complex-math-mixed.c
 create mode 100644 clang/test/Sema/complex-arithmetic.c

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 7173c1400f53f4..c5488e8742f611 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -278,6 +278,13 @@ Bug Fixes in This Version
 - Clang now correctly generates overloads for bit-precise integer types for
   builtin operators in C++. Fixes #GH82998.
 
+- When performing mixed arithmetic between ``_Complex`` floating-point types and integers,
+  Clang now correctly promotes the integer to its corresponding real floating-point
+  type only rather than to the complex type (e.g. ``_Complex float / int`` is now evaluated
+  as ``_Complex float / float`` rather than ``_Complex float / _Complex float``), as mandated
+  by the C standard. This significantly improves codegen of `*` and `/` especially.
+  Fixes (`#31205 <https://github.com/llvm/llvm-project/issues/31205>`_).
+
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 93f82e68ab6440..8725b09f8546cf 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -1099,12 +1099,13 @@ ExprResult Sema::DefaultVariadicArgumentPromotion(Expr *E, VariadicCallType CT,
   return E;
 }
 
-/// Converts an integer to complex float type.  Helper function of
+/// Convert complex integers to complex floats and real integers to
+/// real floats as required for complex arithmetic. Helper function of
 /// UsualArithmeticConversions()
 ///
 /// \return false if the integer expression is an integer type and is
-/// successfully converted to the complex type.
-static bool handleIntegerToComplexFloatConversion(Sema &S, ExprResult &IntExpr,
+/// successfully converted to the (complex) float type.
+static bool handleComplexIntegerToFloatConversion(Sema &S, ExprResult &IntExpr,
                                                   ExprResult &ComplexExpr,
                                                   QualType IntTy,
                                                   QualType ComplexTy,
@@ -1114,8 +1115,6 @@ static bool handleIntegerToComplexFloatConversion(Sema &S, ExprResult &IntExpr,
   if (IntTy->isIntegerType()) {
     QualType fpTy = ComplexTy->castAs<ComplexType>()->getElementType();
     IntExpr = S.ImpCastExprToType(IntExpr.get(), fpTy, CK_IntegralToFloating);
-    IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexTy,
-                                  CK_FloatingRealToComplex);
   } else {
     assert(IntTy->isComplexIntegerType());
     IntExpr = S.ImpCastExprToType(IntExpr.get(), ComplexTy,
@@ -1160,11 +1159,11 @@ static QualType handleComplexFloatConversion(Sema &S, ExprResult &Shorter,
 static QualType handleComplexConversion(Sema &S, ExprResult &LHS,
                                         ExprResult &RHS, QualType LHSType,
                                         QualType RHSType, bool IsCompAssign) {
-  // if we have an integer operand, the result is the complex type.
-  if (!handleIntegerToComplexFloatConversion(S, RHS, LHS, RHSType, LHSType,
+  // Handle (complex) integer types.
+  if (!handleComplexIntegerToFloatConversion(S, RHS, LHS, RHSType, LHSType,
                                              /*SkipCast=*/false))
     return LHSType;
-  if (!handleIntegerToComplexFloatConversion(S, LHS, RHS, LHSType, RHSType,
+  if (!handleComplexIntegerToFloatConversion(S, LHS, RHS, LHSType, RHSType,
                                              /*SkipCast=*/IsCompAssign))
     return RHSType;
 
diff --git a/clang/test/CodeGen/complex-math-mixed.c b/clang/test/CodeGen/complex-math-mixed.c
new file mode 100644
index 00000000000000..050163cca80aab
--- /dev/null
+++ b/clang/test/CodeGen/complex-math-mixed.c
@@ -0,0 +1,146 @@
+// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown -o - | FileCheck %s --check-prefix=X86
+// RUN: %clang_cc1 %s -O0 -triple x86_64-unknown-unknown -fsyntax-only -ast-dump | FileCheck %s --check-prefix=AST
+
+// Check that for 'F _Complex + int' (F = real floating-point type), we emit an
+// implicit cast from 'int' to 'F', but NOT to 'F _Complex' (i.e. that we do
+// 'F _Complex + F', NOT 'F _Complex + F _Complex'), and likewise for -/*.
+
+// AST-NOT: FloatingRealToComplex
+
+float _Complex add_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @add_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fadd float {{.*}}, [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+float _Complex add_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @add_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fadd float [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+float _Complex sub_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @sub_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fsub float {{.*}}, [[I]]
+  // X86-NOT: fsub
+  return a - b;
+}
+
+float _Complex sub_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @sub_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fsub float [[I]]
+  // X86: fneg
+  // X86-NOT: fsub
+  return a - b;
+}
+
+float _Complex mul_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @mul_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fmul float {{.*}}, [[I]]
+  // X86: fmul float {{.*}}, [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+float _Complex mul_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @mul_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fmul float [[I]]
+  // X86: fmul float [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+float _Complex div_float_ci(float _Complex a, int b) {
+  // X86-LABEL: @div_float_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: fdiv float {{.*}}, [[I]]
+  // X86: fdiv float {{.*}}, [[I]]
+  // X86-NOT: @__divsc3
+  return a / b;
+}
+
+// There is no good way of doing this w/o converting the 'int' to a complex
+// number, so we expect complex division here.
+float _Complex div_float_ic(int a, float _Complex b) {
+  // X86-LABEL: @div_float_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to float
+  // X86: call {{.*}} @__divsc3(float {{.*}} [[I]], float noundef 0.{{0+}}e+00, float {{.*}}, float {{.*}})
+  return a / b;
+}
+
+double _Complex add_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @add_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fadd double {{.*}}, [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+double _Complex add_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @add_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fadd double [[I]]
+  // X86-NOT: fadd
+  return a + b;
+}
+
+double _Complex sub_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @sub_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fsub double {{.*}}, [[I]]
+  // X86-NOT: fsub
+  return a - b;
+}
+
+double _Complex sub_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @sub_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fsub double [[I]]
+  // X86: fneg
+  // X86-NOT: fsub
+  return a - b;
+}
+
+double _Complex mul_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @mul_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fmul double {{.*}}, [[I]]
+  // X86: fmul double {{.*}}, [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+double _Complex mul_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @mul_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fmul double [[I]]
+  // X86: fmul double [[I]]
+  // X86-NOT: fmul
+  return a * b;
+}
+
+double _Complex div_double_ci(double _Complex a, int b) {
+  // X86-LABEL: @div_double_ci
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: fdiv double {{.*}}, [[I]]
+  // X86: fdiv double {{.*}}, [[I]]
+  // X86-NOT: @__divdc3
+  return a / b;
+}
+
+// There is no good way of doing this w/o converting the 'int' to a complex
+// number, so we expect complex division here.
+double _Complex div_double_ic(int a, double _Complex b) {
+  // X86-LABEL: @div_double_ic
+  // X86: [[I:%.*]] = sitofp i32 {{%.*}} to double
+  // X86: call {{.*}} @__divdc3(double {{.*}} [[I]], double noundef 0.{{0+}}e+00, double {{.*}}, double {{.*}})
+  return a / b;
+}
diff --git a/clang/test/CodeGen/volatile.cpp b/clang/test/CodeGen/volatile.cpp
index 38724659ad8a35..70f523b93852ed 100644
--- a/clang/test/CodeGen/volatile.cpp
+++ b/clang/test/CodeGen/volatile.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -O2 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o -  | FileCheck %s -check-prefix CHECK
+// RUN: %clang_cc1 -O2 -triple=x86_64-unknown-linux-gnu -emit-llvm %s -o -  | FileCheck %s
 struct agg 
 {
 int a ;
@@ -10,34 +10,32 @@ _Complex float cf;
 int volatile vol =10;
 void f0() {
     const_cast<volatile _Complex float &>(cf) = const_cast<volatile _Complex float&>(cf) + 1;
-//  CHECK: %cf.real = load volatile float, ptr @cf
-//  CHECK: %cf.imag = load volatile float, ptr getelementptr
-//  CHECK: %add.r = fadd float %cf.real, 1.000000e+00
-//  CHECK: %add.i = fadd float %cf.imag, 0.000000e+00
-//  CHECK: store volatile float %add.r
-//  CHECK: store volatile float %add.i, ptr getelementptr
+//  CHECK: [[Re1:%.*]] = load volatile float, ptr @cf
+//  CHECK: [[Im1:%.*]] = load volatile float, ptr getelementptr
+//  CHECK: [[Add1:%.*]] = fadd float [[Re1]], 1.000000e+00
+//  CHECK: store volatile float [[Add1]], ptr @cf
+//  CHECK: store volatile float [[Im1]], ptr getelementptr
       static_cast<volatile _Complex float &>(cf) = static_cast<volatile _Complex float&>(cf) + 1;
-//  CHECK: %cf.real1 = load volatile float, ptr @cf
-//  CHECK: %cf.imag2 = load volatile float, ptr getelementptr
-//  CHECK: %add.r3 = fadd float %cf.real1, 1.000000e+00
-//  CHECK: %add.i4 = fadd float %cf.imag2, 0.000000e+00
-//  CHECK: store volatile float %add.r3, ptr @cf
-//  CHECK: store volatile float %add.i4, ptr getelementptr
+//  CHECK: [[Re2:%.*]] = load volatile float, ptr @cf
+//  CHECK: [[Im2:%.*]] = load volatile float, ptr getelementptr
+//  CHECK: [[Add2:%.*]] = fadd float [[Re2]], 1.000000e+00
+//  CHECK: store volatile float [[Add2]], ptr @cf
+//  CHECK: store volatile float [[Im2]], ptr getelementptr
     const_cast<volatile  int  &>(a.a) = const_cast<volatile int &>(t.a) ;
-//  CHECK: %0 = load volatile i32, ptr @t
-//  CHECK: store volatile i32 %0, ptr @a
+//  CHECK: [[I1:%.*]] = load volatile i32, ptr @t
+//  CHECK: store volatile i32 [[I1]], ptr @a
     static_cast<volatile  int  &>(a.b) = static_cast<volatile int  &>(t.a) ;
-//  CHECK: %1 = load volatile i32, ptr @t
-//  CHECK: store volatile i32 %1, ptr getelementptr
+//  CHECK: [[I2:%.*]] = load volatile i32, ptr @t
+//  CHECK: store volatile i32 [[I2]], ptr getelementptr
     const_cast<volatile int&>(vt) = const_cast<volatile int&>(vt) + 1;
-//  CHECK: %2 = load volatile i32, ptr @vt
-//  CHECK: %add = add nsw i32 %2, 1
-//  CHECK: store volatile i32 %add, ptr @vt
+//  CHECK: [[I3:%.*]] = load volatile i32, ptr @vt
+//  CHECK: [[Add3:%.*]] = add nsw i32 [[I3]], 1
+//  CHECK: store volatile i32 [[Add3]], ptr @vt
      static_cast<volatile int&>(vt) = static_cast<volatile int&>(vt) + 1;
-//  CHECK: %3 = load volatile i32, ptr @vt
-//  CHECK: %add5 = add nsw i32 %3, 1
-//  CHECK: store volatile i32 %add5, ptr @vt
+//  CHECK: [[I4:%.*]] = load volatile i32, ptr @vt
+//  CHECK: [[Add4:%.*]] = add nsw i32 [[I4]], 1
+//  CHECK: store volatile i32 [[Add4]], ptr @vt
     vt = const_cast<int&>(vol);
-//  %4 = load i32, ptr @vol
-//  store i32 %4, ptr @vt
+//  [[I5:%.*]] = load i32, ptr @vol
+//  store i32 [[I5]], ptr @vt
 }
diff --git a/clang/test/Sema/complex-arithmetic.c b/clang/test/Sema/complex-arithmetic.c
new file mode 100644
index 00000000000000..c9e84da6daa9dc
--- /dev/null
+++ b/clang/test/Sema/complex-arithmetic.c
@@ -0,0 +1,115 @@
+// RUN: %clang_cc1 -verify %s
+// expected-no-diagnostics
+
+// This tests evaluation of _Complex arithmetic at compile time.
+
+#define APPROX_EQ(a, b) (                             \
+  __builtin_fabs(__real (a) - __real (b)) < 0.0001 && \
+  __builtin_fabs(__imag (a) - __imag (b)) < 0.0001    \
+)
+
+#define EVAL(a, b) _Static_assert(a == b, "")
+#define EVALF(a, b) _Static_assert(APPROX_EQ(a, b), "")
+
+// _Complex float + _Complex float
+void a() {
+  EVALF((2.f + 3i) + (4.f + 5i), 6.f + 8i);
+  EVALF((2.f + 3i) - (4.f + 5i), -2.f - 2i);
+  EVALF((2.f + 3i) * (4.f + 5i), -7.f + 22i);
+  EVALF((2.f + 3i) / (4.f + 5i), 0.5609f + 0.0487i);
+
+  EVALF((2. + 3i) + (4. + 5i), 6. + 8i);
+  EVALF((2. + 3i) - (4. + 5i), -2. - 2i);
+  EVALF((2. + 3i) * (4. + 5i), -7. + 22i);
+  EVALF((2. + 3i) / (4. + 5i), .5609 + .0487i);
+}
+
+// _Complex int + _Complex int
+void b() {
+  EVAL((2 + 3i) + (4 + 5i), 6 + 8i);
+  EVAL((2 + 3i) - (4 + 5i), -2 - 2i);
+  EVAL((2 + 3i) * (4 + 5i), -7 + 22i);
+  EVAL((8 + 30i) / (4 + 5i), 4 + 1i);
+}
+
+// _Complex float + float
+void c() {
+  EVALF((2.f + 4i) + 3.f, 5.f + 4i);
+  EVALF((2.f + 4i) - 3.f, -1.f + 4i);
+  EVALF((2.f + 4i) * 3.f, 6.f + 12i);
+  EVALF((2.f + 4i) / 2.f, 1.f + 2i);
+
+  EVALF(3.f + (2.f + 4i), 5.f + 4i);
+  EVALF(3.f - (2.f + 4i), 1.f - 4i);
+  EVALF(3.f * (2.f + 4i), 6.f + 12i);
+  EVALF(3.f / (2.f + 4i), .3f - 0.6i);
+
+  EVALF((2. + 4i) + 3., 5. + 4i);
+  EVALF((2. + 4i) - 3., -1. + 4i);
+  EVALF((2. + 4i) * 3., 6. + 12i);
+  EVALF((2. + 4i) / 2., 1. + 2i);
+
+  EVALF(3. + (2. + 4i), 5. + 4i);
+  EVALF(3. - (2. + 4i), 1. - 4i);
+  EVALF(3. * (2. + 4i), 6. + 12i);
+  EVALF(3. / (2. + 4i), .3 - 0.6i);
+}
+
+// _Complex int + int
+void d() {
+  EVAL((2 + 4i) + 3, 5 + 4i);
+  EVAL((2 + 4i) - 3, -1 + 4i);
+  EVAL((2 + 4i) * 3, 6 + 12i);
+  EVAL((2 + 4i) / 2, 1 + 2i);
+
+  EVAL(3 + (2 + 4i), 5 + 4i);
+  EVAL(3 - (2 + 4i), 1 - 4i);
+  EVAL(3 * (2 + 4i), 6 + 12i);
+  EVAL(20 / (2 + 4i), 2 - 4i);
+}
+
+// _Complex float + int
+void e() {
+  EVALF((2.f + 4i) + 3, 5.f + 4i);
+  EVALF((2.f + 4i) - 3, -1.f + 4i);
+  EVALF((2.f + 4i) * 3, 6.f + 12i);
+  EVALF((2.f + 4i) / 2, 1.f + 2i);
+
+  EVALF(3 + (2.f + 4i), 5.f + 4i);
+  EVALF(3 - (2.f + 4i), 1.f - 4i);
+  EVALF(3 * (2.f + 4i), 6.f + 12i);
+  EVALF(3 / (2.f + 4i), .3f - 0.6i);
+
+  EVALF((2. + 4i) + 3, 5. + 4i);
+  EVALF((2. + 4i) - 3, -1. + 4i);
+  EVALF((2. + 4i) * 3, 6. + 12i);
+  EVALF((2. + 4i) / 2, 1. + 2i);
+
+  EVALF(3 + (2. + 4i), 5. + 4i);
+  EVALF(3 - (2. + 4i), 1. - 4i);
+  EVALF(3 * (2. + 4i), 6. + 12i);
+  EVALF(3 / (2. + 4i), .3 - 0.6i);
+}
+
+// _Complex int + float
+void f() {
+  EVALF((2 + 4i) + 3.f, 5.f + 4i);
+  EVALF((2 + 4i) - 3.f, -1.f + 4i);
+  EVALF((2 + 4i) * 3.f, 6.f + 12i);
+  EVALF((2 + 4i) / 2.f, 1.f + 2i);
+
+  EVALF(3.f + (2 + 4i), 5.f + 4i);
+  EVALF(3.f - (2 + 4i), 1.f - 4i);
+  EVALF(3.f * (2 + 4i), 6.f + 12i);
+  EVALF(3.f / (2 + 4i), .3f - 0.6i);
+
+  EVALF((2 + 4i) + 3., 5. + 4i);
+  EVALF((2 + 4i) - 3., -1. + 4i);
+  EVALF((2 + 4i) * 3., 6. + 12i);
+  EVALF((2 + 4i) / 2., 1. + 2i);
+
+  EVALF(3. + (2 + 4i), 5. + 4i);
+  EVALF(3. - (2 + 4i), 1. - 4i);
+  EVALF(3. * (2 + 4i), 6. + 12i);
+  EVALF(3. / (2 + 4i), .3 - 0.6i);
+}

>From 41d307e5c18ca94d1b6ed4a521e887dfe58ae360 Mon Sep 17 00:00:00 2001
From: Stephen Tozer <stephen.tozer at sony.com>
Date: Wed, 13 Mar 2024 16:39:35 +0000
Subject: [PATCH 62/63] [RemoveDI][NFC] Rename DPValue->DbgRecord in comments
 and varnames (#84939)

This patch continues the ongoing rename work, replacing DPValue with
DbgRecord in comments and the names of variables, both members and
fn-local. This is the most labour-intensive part of the rename, as it is
where the most decisions have to be made about whether a given comment
or variable is referring to DPValues (equivalent to debug variable
intrinsics) or DbgRecords (a catch-all for all debug intrinsics); these
decisions are not individually difficult, but comprise a fairly large
amount of text to review.

This patch still largely performs basic string substitutions followed by
clang-format; there are almost* no places where, for example, a comment
has been expanded or modified to reflect the semantic difference between
DPValues and DbgRecords. I don't believe such a change is generally
necessary in LLVM, but it may be useful in the docs, and so I'll be
submitting docs changes as a separate patch.

*In a few places, `dbg.values` was replaced with `debug intrinsics`.
---
 .../llvm/CodeGen/GlobalISel/IRTranslator.h    |   2 +-
 llvm/include/llvm/IR/BasicBlock.h             |  32 ++--
 .../include/llvm/IR/DebugProgramInstruction.h |  62 +++---
 llvm/include/llvm/IR/Instruction.h            |  31 +--
 llvm/include/llvm/IR/PassManager.h            |   2 +-
 llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp |   6 +-
 .../CodeGen/AssignmentTrackingAnalysis.cpp    |  35 ++--
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |   8 +-
 llvm/lib/CodeGen/MIRPrinter.cpp               |   4 +-
 llvm/lib/CodeGen/SelectOptimize.cpp           |  15 +-
 llvm/lib/IR/AsmWriter.cpp                     |   2 +-
 llvm/lib/IR/BasicBlock.cpp                    | 176 +++++++++---------
 llvm/lib/IR/DebugInfo.cpp                     |   2 +-
 llvm/lib/IR/DebugProgramInstruction.cpp       |  70 +++----
 llvm/lib/IR/Instruction.cpp                   |  37 ++--
 llvm/lib/IR/LLVMContextImpl.cpp               |   2 +-
 llvm/lib/IR/LLVMContextImpl.h                 |  12 +-
 llvm/lib/Transforms/Utils/Local.cpp           |   2 +-
 .../Transforms/Utils/LoopRotationUtils.cpp    |  28 +--
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     |   9 +-
 llvm/lib/Transforms/Utils/ValueMapper.cpp     |   8 +-
 llvm/unittests/IR/BasicBlockDbgInfoTest.cpp   |  76 ++++----
 llvm/unittests/IR/DebugInfoTest.cpp           |  26 +--
 .../Transforms/Utils/DebugifyTest.cpp         |   2 +-
 24 files changed, 330 insertions(+), 319 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index bfac54a65c5b4e..29f675b2203b6b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -205,7 +205,7 @@ class IRTranslator : public MachineFunctionPass {
   bool translate(const Constant &C, Register Reg);
 
   /// Examine any debug-info attached to the instruction (in the form of
-  /// DPValues) and translate it.
+  /// DbgRecords) and translate it.
   void translateDbgInfo(const Instruction &Inst,
                           MachineIRBuilder &MIRBuilder);
 
diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h
index 5bac113c9b7b42..71c1a8394896b7 100644
--- a/llvm/include/llvm/IR/BasicBlock.h
+++ b/llvm/include/llvm/IR/BasicBlock.h
@@ -78,13 +78,13 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   DPMarker *createMarker(InstListType::iterator It);
 
   /// Convert variable location debugging information stored in dbg.value
-  /// intrinsics into DPMarker / DPValue records. Deletes all dbg.values in
+  /// intrinsics into DPMarkers / DbgRecords. Deletes all dbg.values in
   /// the process and sets IsNewDbgInfoFormat = true. Only takes effect if
   /// the UseNewDbgInfoFormat LLVM command line option is given.
   void convertToNewDbgValues();
 
   /// Convert variable location debugging information stored in DPMarkers and
-  /// DPValues into the dbg.value intrinsic representation. Sets
+  /// DbgRecords into the dbg.value intrinsic representation. Sets
   /// IsNewDbgInfoFormat = false.
   void convertFromNewDbgValues();
 
@@ -93,50 +93,50 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
   /// if necessary.
   void setIsNewDbgInfoFormat(bool NewFlag);
 
-  /// Record that the collection of DPValues in \p M "trails" after the last
+  /// Record that the collection of DbgRecords in \p M "trails" after the last
   /// instruction of this block. These are equivalent to dbg.value intrinsics
   /// that exist at the end of a basic block with no terminator (a transient
   /// state that occurs regularly).
   void setTrailingDbgRecords(DPMarker *M);
 
-  /// Fetch the collection of DPValues that "trail" after the last instruction
+  /// Fetch the collection of DbgRecords that "trail" after the last instruction
   /// of this block, see \ref setTrailingDbgRecords. If there are none, returns
   /// nullptr.
   DPMarker *getTrailingDbgRecords();
 
-  /// Delete any trailing DPValues at the end of this block, see
+  /// Delete any trailing DbgRecords at the end of this block, see
   /// \ref setTrailingDbgRecords.
   void deleteTrailingDbgRecords();
 
   void dumpDbgValues() const;
 
-  /// Return the DPMarker for the position given by \p It, so that DPValues can
-  /// be inserted there. This will either be nullptr if not present, a DPMarker,
-  /// or TrailingDPValues if It is end().
+  /// Return the DPMarker for the position given by \p It, so that DbgRecords
+  /// can be inserted there. This will either be nullptr if not present, a
+  /// DPMarker, or TrailingDbgRecords if It is end().
   DPMarker *getMarker(InstListType::iterator It);
 
   /// Return the DPMarker for the position that comes after \p I. \see
   /// BasicBlock::getMarker, this can be nullptr, a DPMarker, or
-  /// TrailingDPValues if there is no next instruction.
+  /// TrailingDbgRecords if there is no next instruction.
   DPMarker *getNextMarker(Instruction *I);
 
-  /// Insert a DPValue into a block at the position given by \p I.
+  /// Insert a DbgRecord into a block at the position given by \p I.
   void insertDbgRecordAfter(DbgRecord *DPV, Instruction *I);
 
-  /// Insert a DPValue into a block at the position given by \p Here.
+  /// Insert a DbgRecord into a block at the position given by \p Here.
   void insertDbgRecordBefore(DbgRecord *DPV, InstListType::iterator Here);
 
-  /// Eject any debug-info trailing at the end of a block. DPValues can
+  /// Eject any debug-info trailing at the end of a block. DbgRecords can
   /// transiently be located "off the end" of a block if the blocks terminator
   /// is temporarily removed. Once a terminator is re-inserted this method will
-  /// move such DPValues back to the right place (ahead of the terminator).
-  void flushTerminatorDbgValues();
+  /// move such DbgRecords back to the right place (ahead of the terminator).
+  void flushTerminatorDbgRecords();
 
   /// In rare circumstances instructions can be speculatively removed from
   /// blocks, and then be re-inserted back into that position later. When this
   /// happens in RemoveDIs debug-info mode, some special patching-up needs to
   /// occur: inserting into the middle of a sequence of dbg.value intrinsics
-  /// does not have an equivalent with DPValues.
+  /// does not have an equivalent with DbgRecords.
   void reinsertInstInDbgRecords(Instruction *I,
                                 std::optional<DbgRecord::self_iterator> Pos);
 
@@ -522,7 +522,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also
                                  BasicBlock::iterator FromEndIt);
 
   /// Perform any debug-info specific maintenence for the given splice
-  /// activity. In the DPValue debug-info representation, debug-info is not
+  /// activity. In the DbgRecord debug-info representation, debug-info is not
   /// in instructions, and so it does not automatically move from one block
   /// to another.
   void spliceDebugInfo(BasicBlock::iterator ToIt, BasicBlock *FromBB,
diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h
index 507b652feeb016..1afc9259241d50 100644
--- a/llvm/include/llvm/IR/DebugProgramInstruction.h
+++ b/llvm/include/llvm/IR/DebugProgramInstruction.h
@@ -1,4 +1,4 @@
-//===-- llvm/DebugProgramInstruction.h - Stream of debug info -------*- C++ -*-===//
+//===-- llvm/DebugProgramInstruction.h - Stream of debug info ---*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -15,10 +15,10 @@
 //    %bar = void call @ext(%foo);
 //
 // and all information is stored in the Value / Metadata hierachy defined
-// elsewhere in LLVM. In the "DPValue" design, each instruction /may/ have a
-// connection with a DPMarker, which identifies a position immediately before the
-// instruction, and each DPMarker /may/ then have connections to DPValues which
-// record the variable assignment information. To illustrate:
+// elsewhere in LLVM. In the "DbgRecord" design, each instruction /may/ have a
+// connection with a DPMarker, which identifies a position immediately before
+// the instruction, and each DPMarker /may/ then have connections to DbgRecords
+// which record the variable assignment information. To illustrate:
 //
 //    %foo = add i32 1, %0
 //       ; foo->DbgMarker == nullptr
@@ -26,7 +26,7 @@
 //       ;; the instruction for %foo, therefore it has no DbgMarker.
 //    %bar = void call @ext(%foo)
 //       ; bar->DbgMarker = {
-//       ;   StoredDPValues = {
+//       ;   StoredDbgRecords = {
 //       ;     DPValue(metadata i32 %foo, ...)
 //       ;   }
 //       ; }
@@ -119,7 +119,7 @@ template <typename T> class DbgRecordParamRef {
 /// Base class for non-instruction debug metadata records that have positions
 /// within IR. Features various methods copied across from the Instruction
 /// class to aid ease-of-use. DbgRecords should always be linked into a
-/// DPMarker's StoredDPValues list. The marker connects a DbgRecord back to
+/// DPMarker's StoredDbgRecords list. The marker connects a DbgRecord back to
 /// it's position in the BasicBlock.
 ///
 /// We need a discriminator for dyn/isa casts. In order to avoid paying for a
@@ -557,8 +557,8 @@ class DPMarker {
   /// intrinsics. There is a one-to-one relationship between each debug
   /// intrinsic in a block and each DbgRecord once the representation has been
   /// converted, and the ordering is meaningful in the same way.
-  simple_ilist<DbgRecord> StoredDPValues;
-  bool empty() const { return StoredDPValues.empty(); }
+  simple_ilist<DbgRecord> StoredDbgRecords;
+  bool empty() const { return StoredDbgRecords.empty(); }
 
   const BasicBlock *getParent() const;
   BasicBlock *getParent();
@@ -576,54 +576,56 @@ class DPMarker {
   void print(raw_ostream &O, bool IsForDebug = false) const;
   void print(raw_ostream &ROS, ModuleSlotTracker &MST, bool IsForDebug) const;
 
-  /// Produce a range over all the DPValues in this Marker.
+  /// Produce a range over all the DbgRecords in this Marker.
   iterator_range<simple_ilist<DbgRecord>::iterator> getDbgRecordRange();
   iterator_range<simple_ilist<DbgRecord>::const_iterator>
   getDbgRecordRange() const;
-  /// Transfer any DPValues from \p Src into this DPMarker. If \p InsertAtHead
-  /// is true, place them before existing DPValues, otherwise afterwards.
+  /// Transfer any DbgRecords from \p Src into this DPMarker. If \p InsertAtHead
+  /// is true, place them before existing DbgRecords, otherwise afterwards.
   void absorbDebugValues(DPMarker &Src, bool InsertAtHead);
-  /// Transfer the DPValues in \p Range from \p Src into this DPMarker. If
-  /// \p InsertAtHead is true, place them before existing DPValues, otherwise
+  /// Transfer the DbgRecords in \p Range from \p Src into this DPMarker. If
+  /// \p InsertAtHead is true, place them before existing DbgRecords, otherwise
   // afterwards.
   void absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
                          DPMarker &Src, bool InsertAtHead);
-  /// Insert a DPValue into this DPMarker, at the end of the list. If
+  /// Insert a DbgRecord into this DPMarker, at the end of the list. If
   /// \p InsertAtHead is true, at the start.
   void insertDbgRecord(DbgRecord *New, bool InsertAtHead);
-  /// Insert a DPValue prior to a DPValue contained within this marker.
+  /// Insert a DbgRecord prior to a DbgRecord contained within this marker.
   void insertDbgRecord(DbgRecord *New, DbgRecord *InsertBefore);
-  /// Insert a DPValue after a DPValue contained within this marker.
+  /// Insert a DbgRecord after a DbgRecord contained within this marker.
   void insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter);
   /// Clone all DPMarkers from \p From into this marker. There are numerous
   /// options to customise the source/destination, due to gnarliness, see class
   /// comment.
-  /// \p FromHere If non-null, copy from FromHere to the end of From's DPValues
-  /// \p InsertAtHead Place the cloned DPValues at the start of StoredDPValues
-  /// \returns Range over all the newly cloned DPValues
+  /// \p FromHere If non-null, copy from FromHere to the end of From's
+  /// DbgRecords
+  /// \p InsertAtHead Place the cloned DbgRecords at the start of
+  /// StoredDbgRecords
+  /// \returns Range over all the newly cloned DbgRecords
   iterator_range<simple_ilist<DbgRecord>::iterator>
   cloneDebugInfoFrom(DPMarker *From,
                      std::optional<simple_ilist<DbgRecord>::iterator> FromHere,
                      bool InsertAtHead = false);
-  /// Erase all DPValues in this DPMarker.
+  /// Erase all DbgRecords in this DPMarker.
   void dropDbgRecords();
   /// Erase a single DbgRecord from this marker. In an ideal future, we would
   /// never erase an assignment in this way, but it's the equivalent to
   /// erasing a debug intrinsic from a block.
   void dropOneDbgRecord(DbgRecord *DR);
 
-  /// We generally act like all llvm Instructions have a range of DPValues
+  /// We generally act like all llvm Instructions have a range of DbgRecords
   /// attached to them, but in reality sometimes we don't allocate the DPMarker
-  /// to save time and memory, but still have to return ranges of DPValues. When
-  /// we need to describe such an unallocated DPValue range, use this static
-  /// markers range instead. This will bite us if someone tries to insert a
-  /// DPValue in that range, but they should be using the Official (TM) API for
-  /// that.
+  /// to save time and memory, but still have to return ranges of DbgRecords.
+  /// When we need to describe such an unallocated DbgRecord range, use this
+  /// static markers range instead. This will bite us if someone tries to insert
+  /// a DbgRecord in that range, but they should be using the Official (TM) API
+  /// for that.
   static DPMarker EmptyDPMarker;
   static iterator_range<simple_ilist<DbgRecord>::iterator>
   getEmptyDbgRecordRange() {
-    return make_range(EmptyDPMarker.StoredDPValues.end(),
-                      EmptyDPMarker.StoredDPValues.end());
+    return make_range(EmptyDPMarker.StoredDbgRecords.end(),
+                      EmptyDPMarker.StoredDbgRecords.end());
   }
 };
 
@@ -632,7 +634,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DPMarker &Marker) {
   return OS;
 }
 
-/// Inline helper to return a range of DPValues attached to a marker. It needs
+/// Inline helper to return a range of DbgRecords attached to a marker. It needs
 /// to be inlined as it's frequently called, but also come after the declaration
 /// of DPMarker. Thus: it's pre-declared by users like Instruction, then an
 /// inlineable body defined here.
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 817abd6afbcae0..d6cf1557752386 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -64,47 +64,48 @@ class Instruction : public User,
 
   /// Clone any debug-info attached to \p From onto this instruction. Used to
   /// copy debugging information from one block to another, when copying entire
-  /// blocks. \see DebugProgramInstruction.h , because the ordering of DPValues
-  /// is still important, fine grain control of which instructions are moved and
-  /// where they go is necessary.
+  /// blocks. \see DebugProgramInstruction.h , because the ordering of
+  /// DbgRecords is still important, fine grain control of which instructions
+  /// are moved and where they go is necessary.
   /// \p From The instruction to clone debug-info from.
-  /// \p from_here Optional iterator to limit DPValues cloned to be a range from
+  /// \p from_here Optional iterator to limit DbgRecords cloned to be a range
+  /// from
   ///    from_here to end().
-  /// \p InsertAtHead Whether the cloned DPValues should be placed at the end
-  ///    or the beginning of existing DPValues attached to this.
-  /// \returns A range over the newly cloned DPValues.
+  /// \p InsertAtHead Whether the cloned DbgRecords should be placed at the end
+  ///    or the beginning of existing DbgRecords attached to this.
+  /// \returns A range over the newly cloned DbgRecords.
   iterator_range<simple_ilist<DbgRecord>::iterator> cloneDebugInfoFrom(
       const Instruction *From,
       std::optional<simple_ilist<DbgRecord>::iterator> FromHere = std::nullopt,
       bool InsertAtHead = false);
 
-  /// Return a range over the DPValues attached to this instruction.
+  /// Return a range over the DbgRecords attached to this instruction.
   iterator_range<simple_ilist<DbgRecord>::iterator> getDbgRecordRange() const {
     return llvm::getDbgRecordRange(DbgMarker);
   }
 
-  /// Return an iterator to the position of the "Next" DPValue after this
+  /// Return an iterator to the position of the "Next" DbgRecord after this
   /// instruction, or std::nullopt. This is the position to pass to
   /// BasicBlock::reinsertInstInDbgRecords when re-inserting an instruction.
   std::optional<simple_ilist<DbgRecord>::iterator> getDbgReinsertionPosition();
 
-  /// Returns true if any DPValues are attached to this instruction.
+  /// Returns true if any DbgRecords are attached to this instruction.
   bool hasDbgRecords() const;
 
-  /// Transfer any DPValues on the position \p It onto this instruction,
-  /// by simply adopting the sequence of DPValues (which is efficient) if
+  /// Transfer any DbgRecords on the position \p It onto this instruction,
+  /// by simply adopting the sequence of DbgRecords (which is efficient) if
   /// possible, by merging two sequences otherwise.
   void adoptDbgRecords(BasicBlock *BB, InstListType::iterator It,
                        bool InsertAtHead);
 
-  /// Erase any DPValues attached to this instruction.
+  /// Erase any DbgRecords attached to this instruction.
   void dropDbgRecords();
 
-  /// Erase a single DPValue \p I that is attached to this instruction.
+  /// Erase a single DbgRecord \p I that is attached to this instruction.
   void dropOneDbgRecord(DbgRecord *I);
 
   /// Handle the debug-info implications of this instruction being removed. Any
-  /// attached DPValues need to "fall" down onto the next instruction.
+  /// attached DbgRecords need to "fall" down onto the next instruction.
   void handleMarkerRemoval();
 
 protected:
diff --git a/llvm/include/llvm/IR/PassManager.h b/llvm/include/llvm/IR/PassManager.h
index c03d49c3b7b978..ec8b809d40bfbc 100644
--- a/llvm/include/llvm/IR/PassManager.h
+++ b/llvm/include/llvm/IR/PassManager.h
@@ -227,7 +227,7 @@ class PassManager : public PassInfoMixin<
         detail::getAnalysisResult<PassInstrumentationAnalysis>(
             AM, IR, std::tuple<ExtraArgTs...>(ExtraArgs...));
 
-    // RemoveDIs: if requested, convert debug-info to DPValue representation
+    // RemoveDIs: if requested, convert debug-info to DbgRecord representation
     // for duration of these passes.
     bool ShouldConvertDbgInfo = shouldConvertDbgInfo(IR);
     if (ShouldConvertDbgInfo)
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 93fb2a821dee74..0eb9c246f2a9b6 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -19,7 +19,7 @@
 using namespace llvm;
 
 PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
-  // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
+  // RemoveDIs: there's no bitcode representation of the DbgRecord debug-info,
   // convert to dbg.values before writing out.
   bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
   if (IsNewDbgInfoFormat)
@@ -56,8 +56,8 @@ namespace {
     StringRef getPassName() const override { return "Bitcode Writer"; }
 
     bool runOnModule(Module &M) override {
-      // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
-      // convert to dbg.values before writing out.
+      // RemoveDIs: there's no bitcode representation of the DbgRecord
+      // debug-info, convert to dbg.values before writing out.
       bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
       if (IsNewDbgInfoFormat)
         M.convertFromNewDbgValues();
diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index a4b819a735c640..746926e56f2e17 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -217,13 +217,14 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) {
   // to the start and end position in the vector with VarLocsBeforeInst. This
   // block includes VarLocs for any DPValues attached to that instruction.
   for (auto &P : Builder.VarLocsBeforeInst) {
-    // Process VarLocs attached to a DPValue alongside their marker Instruction.
+    // Process VarLocs attached to a DbgRecord alongside their marker
+    // Instruction.
     if (isa<const DbgRecord *>(P.first))
       continue;
     const Instruction *I = cast<const Instruction *>(P.first);
     unsigned BlockStart = VarLocRecords.size();
-    // Any VarLocInfos attached to a DPValue should now be remapped to their
-    // marker Instruction, in order of DPValue appearance and prior to any
+    // Any VarLocInfos attached to a DbgRecord should now be remapped to their
+    // marker Instruction, in order of DbgRecord appearance and prior to any
     // VarLocInfos attached directly to that instruction.
     for (const DPValue &DPV : DPValue::filter(I->getDbgRecordRange())) {
       // Even though DPV defines a variable location, VarLocsBeforeInst can
@@ -1649,7 +1650,7 @@ void AssignmentTrackingLowering::processUntaggedInstruction(
     Ops.push_back(dwarf::DW_OP_deref);
     DIE = DIExpression::prependOpcodes(DIE, Ops, /*StackValue=*/false,
                                        /*EntryValue=*/false);
-    // Find a suitable insert point, before the next instruction or DPValue
+    // Find a suitable insert point, before the next instruction or DbgRecord
     // after I.
     auto InsertBefore = getNextNode(&I);
     assert(InsertBefore && "Shouldn't be inserting after a terminator");
@@ -1886,21 +1887,21 @@ void AssignmentTrackingLowering::resetInsertionPoint(DPValue &After) {
 }
 
 void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
-  // If the block starts with DPValues, we need to process those DPValues as
+  // If the block starts with DbgRecords, we need to process those DbgRecords as
   // their own frame without processing any instructions first.
-  bool ProcessedLeadingDPValues = !BB.begin()->hasDbgRecords();
+  bool ProcessedLeadingDbgRecords = !BB.begin()->hasDbgRecords();
   for (auto II = BB.begin(), EI = BB.end(); II != EI;) {
     assert(VarsTouchedThisFrame.empty());
     // Process the instructions in "frames". A "frame" includes a single
     // non-debug instruction followed any debug instructions before the
     // next non-debug instruction.
 
-    // Skip the current instruction if it has unprocessed DPValues attached (see
-    // comment above `ProcessedLeadingDPValues`).
-    if (ProcessedLeadingDPValues) {
+    // Skip the current instruction if it has unprocessed DbgRecords attached
+    // (see comment above `ProcessedLeadingDbgRecords`).
+    if (ProcessedLeadingDbgRecords) {
       // II is now either a debug intrinsic, a non-debug instruction with no
-      // attached DPValues, or a non-debug instruction with attached processed
-      // DPValues.
+      // attached DbgRecords, or a non-debug instruction with attached processed
+      // DbgRecords.
       // II has not been processed.
       if (!isa<DbgInfoIntrinsic>(&*II)) {
         if (II->isTerminator())
@@ -1912,8 +1913,8 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
       }
     }
     // II is now either a debug intrinsic, a non-debug instruction with no
-    // attached DPValues, or a non-debug instruction with attached unprocessed
-    // DPValues.
+    // attached DbgRecords, or a non-debug instruction with attached unprocessed
+    // DbgRecords.
     if (II != EI && II->hasDbgRecords()) {
       // Skip over non-variable debug records (i.e., labels). They're going to
       // be read from IR (possibly re-ordering them within the debug record
@@ -1924,7 +1925,7 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
         assert(LiveSet->isValid());
       }
     }
-    ProcessedLeadingDPValues = true;
+    ProcessedLeadingDbgRecords = true;
     while (II != EI) {
       auto *Dbg = dyn_cast<DbgInfoIntrinsic>(&*II);
       if (!Dbg)
@@ -1934,9 +1935,9 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
       assert(LiveSet->isValid());
       ++II;
     }
-    // II is now a non-debug instruction either with no attached DPValues, or
-    // with attached processed DPValues. II has not been processed, and all
-    // debug instructions or DPValues in the frame preceding II have been
+    // II is now a non-debug instruction either with no attached DbgRecords, or
+    // with attached processed DbgRecords. II has not been processed, and all
+    // debug instructions or DbgRecords in the frame preceding II have been
     // processed.
 
     // We've processed everything in the "frame". Now determine which variables
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 59a0c64d3c9f2d..055e275e143d7a 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2946,7 +2946,7 @@ class TypePromotionTransaction {
       Instruction *PrevInst;
       BasicBlock *BB;
     } Point;
-    std::optional<DPValue::self_iterator> BeforeDPValue = std::nullopt;
+    std::optional<DbgRecord::self_iterator> BeforeDbgRecord = std::nullopt;
 
     /// Remember whether or not the instruction had a previous instruction.
     bool HasPrevInstruction;
@@ -2958,9 +2958,9 @@ class TypePromotionTransaction {
       BasicBlock *BB = Inst->getParent();
 
       // Record where we would have to re-insert the instruction in the sequence
-      // of DPValues, if we ended up reinserting.
+      // of DbgRecords, if we ended up reinserting.
       if (BB->IsNewDbgInfoFormat)
-        BeforeDPValue = Inst->getDbgReinsertionPosition();
+        BeforeDbgRecord = Inst->getDbgReinsertionPosition();
 
       if (HasPrevInstruction) {
         Point.PrevInst = &*std::prev(Inst->getIterator());
@@ -2983,7 +2983,7 @@ class TypePromotionTransaction {
           Inst->insertBefore(*Point.BB, Position);
       }
 
-      Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDPValue);
+      Inst->getParent()->reinsertInstInDbgRecords(Inst, BeforeDbgRecord);
     }
   };
 
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index 4ed44d1c06f48c..8efe67a9a72be6 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -982,7 +982,7 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V,
 }
 
 void llvm::printMIR(raw_ostream &OS, const Module &M) {
-  // RemoveDIs: as there's no textual form for DPValues yet, print debug-info
+  // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
   // in dbg.value format.
   bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
   if (IsNewDbgInfoFormat)
@@ -996,7 +996,7 @@ void llvm::printMIR(raw_ostream &OS, const Module &M) {
 }
 
 void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
-  // RemoveDIs: as there's no textual form for DPValues yet, print debug-info
+  // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info
   // in dbg.value format.
   bool IsNewDbgInfoFormat = MF.getFunction().IsNewDbgInfoFormat;
   if (IsNewDbgInfoFormat)
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 40898d284a09ae..f65d5320bab90d 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -645,12 +645,13 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
       DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt());
     }
 
-    // Duplicate implementation for DPValues, the non-instruction debug-info
-    // record. Helper lambda for moving DPValues to the end block.
-    auto TransferDPValues = [&](Instruction &I) {
-      for (auto &DPValue : llvm::make_early_inc_range(I.getDbgRecordRange())) {
-        DPValue.removeFromParent();
-        EndBlock->insertDbgRecordBefore(&DPValue,
+    // Duplicate implementation for DbgRecords, the non-instruction debug-info
+    // format. Helper lambda for moving DbgRecords to the end block.
+    auto TransferDbgRecords = [&](Instruction &I) {
+      for (auto &DbgRecord :
+           llvm::make_early_inc_range(I.getDbgRecordRange())) {
+        DbgRecord.removeFromParent();
+        EndBlock->insertDbgRecordBefore(&DbgRecord,
                                         EndBlock->getFirstInsertionPt());
       }
     };
@@ -660,7 +661,7 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) {
     // middle" of the select group.
     auto R = make_range(std::next(SI.getI()->getIterator()),
                         std::next(LastSI.getI()->getIterator()));
-    llvm::for_each(R, TransferDPValues);
+    llvm::for_each(R, TransferDbgRecords);
 
     // These are the new basic blocks for the conditional branch.
     // At least one will become an actual new basic block.
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index 1beb4c069a6997..11383ea6214bf3 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -4592,7 +4592,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
 void AssemblyWriter::printDPMarker(const DPMarker &Marker) {
   // There's no formal representation of a DPMarker -- print purely as a
   // debugging aid.
-  for (const DbgRecord &DPR : Marker.StoredDPValues) {
+  for (const DbgRecord &DPR : Marker.StoredDbgRecords) {
     printDbgRecord(DPR);
     Out << "\n";
   }
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 7ead7ce3bf08ab..4dd1bdd6e2f4ad 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -63,9 +63,9 @@ DPMarker *BasicBlock::createMarker(InstListType::iterator It) {
 void BasicBlock::convertToNewDbgValues() {
   IsNewDbgInfoFormat = true;
 
-  // Iterate over all instructions in the instruction list, collecting dbg.value
-  // instructions and converting them to DPValues. Once we find a "real"
-  // instruction, attach all those DPValues to a DPMarker in that instruction.
+  // Iterate over all instructions in the instruction list, collecting debug
+  // info intrinsics and converting them to DbgRecords. Once we find a "real"
+  // instruction, attach all those DbgRecords to a DPMarker in that instruction.
   SmallVector<DbgRecord *, 4> DPVals;
   for (Instruction &I : make_early_inc_range(InstList)) {
     assert(!I.DbgMarker && "DbgMarker already set on old-format instrs?");
@@ -86,7 +86,7 @@ void BasicBlock::convertToNewDbgValues() {
     if (DPVals.empty())
       continue;
 
-    // Create a marker to store DPValues in.
+    // Create a marker to store DbgRecords in.
     createMarker(&I);
     DPMarker *Marker = I.DbgMarker;
 
@@ -102,7 +102,7 @@ void BasicBlock::convertFromNewDbgValues() {
   IsNewDbgInfoFormat = false;
 
   // Iterate over the block, finding instructions annotated with DPMarkers.
-  // Convert any attached DPValues to dbg.values and insert ahead of the
+  // Convert any attached DbgRecords to debug intrinsics and insert ahead of the
   // instruction.
   for (auto &Inst : *this) {
     if (!Inst.DbgMarker)
@@ -116,7 +116,7 @@ void BasicBlock::convertFromNewDbgValues() {
     Marker.eraseFromParent();
   }
 
-  // Assume no trailing DPValues: we could technically create them at the end
+  // Assume no trailing DbgRecords: we could technically create them at the end
   // of the block, after a terminator, but this would be non-cannonical and
   // indicates that something else is broken somewhere.
   assert(!getTrailingDbgRecords());
@@ -691,15 +691,15 @@ void BasicBlock::renumberInstructions() {
   NumInstrRenumberings++;
 }
 
-void BasicBlock::flushTerminatorDbgValues() {
-  // If we erase the terminator in a block, any DPValues will sink and "fall
+void BasicBlock::flushTerminatorDbgRecords() {
+  // If we erase the terminator in a block, any DbgRecords will sink and "fall
   // off the end", existing after any terminator that gets inserted. With
   // dbg.value intrinsics we would just insert the terminator at end() and
-  // the dbg.values would come before the terminator. With DPValues, we must
+  // the dbg.values would come before the terminator. With DbgRecords, we must
   // do this manually.
   // To get out of this unfortunate form, whenever we insert a terminator,
-  // check whether there's anything trailing at the end and move those DPValues
-  // in front of the terminator.
+  // check whether there's anything trailing at the end and move those
+  // DbgRecords in front of the terminator.
 
   // Do nothing if we're not in new debug-info format.
   if (!IsNewDbgInfoFormat)
@@ -710,15 +710,15 @@ void BasicBlock::flushTerminatorDbgValues() {
   if (!Term)
     return;
 
-  // Are there any dangling DPValues?
-  DPMarker *TrailingDPValues = getTrailingDbgRecords();
-  if (!TrailingDPValues)
+  // Are there any dangling DbgRecords?
+  DPMarker *TrailingDbgRecords = getTrailingDbgRecords();
+  if (!TrailingDbgRecords)
     return;
 
-  // Transfer DPValues from the trailing position onto the terminator.
+  // Transfer DbgRecords from the trailing position onto the terminator.
   createMarker(Term);
-  Term->DbgMarker->absorbDebugValues(*TrailingDPValues, false);
-  TrailingDPValues->eraseFromParent();
+  Term->DbgMarker->absorbDebugValues(*TrailingDbgRecords, false);
+  TrailingDbgRecords->eraseFromParent();
   deleteTrailingDbgRecords();
 }
 
@@ -735,7 +735,7 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   // If an optimisation pass attempts to splice the contents of the block from
   // BB1->begin() to BB1->getTerminator(), then the dbg.value will be
   // transferred to the destination.
-  // However, in the "new" DPValue format for debug-info, that range is empty:
+  // However, in the "new" DbgRecord format for debug-info, that range is empty:
   // begin() returns an iterator to the terminator, as there will only be a
   // single instruction in the block. We must piece together from the bits set
   // in the iterators whether there was the intention to transfer any debug
@@ -750,16 +750,16 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest,
   bool ReadFromHead = First.getHeadBit();
 
   // If the source block is completely empty, including no terminator, then
-  // transfer any trailing DPValues that are still hanging around. This can
+  // transfer any trailing DbgRecords that are still hanging around. This can
   // occur when a block is optimised away and the terminator has been moved
   // somewhere else.
   if (Src->empty()) {
-    DPMarker *SrcTrailingDPValues = Src->getTrailingDbgRecords();
-    if (!SrcTrailingDPValues)
+    DPMarker *SrcTrailingDbgRecords = Src->getTrailingDbgRecords();
+    if (!SrcTrailingDbgRecords)
       return;
 
     Dest->adoptDbgRecords(Src, Src->end(), InsertAtHead);
-    // adoptDbgRecords should have released the trailing DPValues.
+    // adoptDbgRecords should have released the trailing DbgRecords.
     assert(!Src->getTrailingDbgRecords());
     return;
   }
@@ -785,8 +785,8 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
   /* Do a quick normalisation before calling the real splice implementation. We
      might be operating on a degenerate basic block that has no instructions
      in it, a legitimate transient state. In that case, Dest will be end() and
-     any DPValues temporarily stored in the TrailingDPValues map in LLVMContext.
-     We might illustrate it thus:
+     any DbgRecords temporarily stored in the TrailingDbgRecords map in
+     LLVMContext. We might illustrate it thus:
 
                          Dest
                            |
@@ -795,35 +795,35 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
                                 |               |
                                First           Last
 
-     However: does the caller expect the "~" DPValues to end up before or after
-     the spliced segment? This is communciated in the "Head" bit of Dest, which
-     signals whether the caller called begin() or end() on this block.
+     However: does the caller expect the "~" DbgRecords to end up before or
+     after the spliced segment? This is communciated in the "Head" bit of Dest,
+     which signals whether the caller called begin() or end() on this block.
 
-     If the head bit is set, then all is well, we leave DPValues trailing just
+     If the head bit is set, then all is well, we leave DbgRecords trailing just
      like how dbg.value instructions would trail after instructions spliced to
      the beginning of this block.
 
-     If the head bit isn't set, then try to jam the "~" DPValues onto the front
-     of the First instruction, then splice like normal, which joins the "~"
-     DPValues with the "+" DPValues. However if the "+" DPValues are supposed to
-     be left behind in Src, then:
-      * detach the "+" DPValues,
-      * move the "~" DPValues onto First,
+     If the head bit isn't set, then try to jam the "~" DbgRecords onto the
+     front of the First instruction, then splice like normal, which joins the
+     "~" DbgRecords with the "+" DbgRecords. However if the "+" DbgRecords are
+     supposed to be left behind in Src, then:
+      * detach the "+" DbgRecords,
+      * move the "~" DbgRecords onto First,
       * splice like normal,
-      * replace the "+" DPValues onto the Last position.
+      * replace the "+" DbgRecords onto the Last position.
      Complicated, but gets the job done. */
 
-  // If we're inserting at end(), and not in front of dangling DPValues, then
-  // move the DPValues onto "First". They'll then be moved naturally in the
+  // If we're inserting at end(), and not in front of dangling DbgRecords, then
+  // move the DbgRecords onto "First". They'll then be moved naturally in the
   // splice process.
-  DPMarker *MoreDanglingDPValues = nullptr;
-  DPMarker *OurTrailingDPValues = getTrailingDbgRecords();
-  if (Dest == end() && !Dest.getHeadBit() && OurTrailingDPValues) {
-    // Are the "+" DPValues not supposed to move? If so, detach them
+  DPMarker *MoreDanglingDbgRecords = nullptr;
+  DPMarker *OurTrailingDbgRecords = getTrailingDbgRecords();
+  if (Dest == end() && !Dest.getHeadBit() && OurTrailingDbgRecords) {
+    // Are the "+" DbgRecords not supposed to move? If so, detach them
     // temporarily.
     if (!First.getHeadBit() && First->hasDbgRecords()) {
-      MoreDanglingDPValues = Src->getMarker(First);
-      MoreDanglingDPValues->removeFromParent();
+      MoreDanglingDbgRecords = Src->getMarker(First);
+      MoreDanglingDbgRecords->removeFromParent();
     }
 
     if (First->hasDbgRecords()) {
@@ -839,8 +839,8 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
       // No current marker, create one and absorb in. (FIXME: we can avoid an
       // allocation in the future).
       DPMarker *CurMarker = Src->createMarker(&*First);
-      CurMarker->absorbDebugValues(*OurTrailingDPValues, false);
-      OurTrailingDPValues->eraseFromParent();
+      CurMarker->absorbDebugValues(*OurTrailingDbgRecords, false);
+      OurTrailingDbgRecords->eraseFromParent();
     }
     deleteTrailingDbgRecords();
     First.setHeadBit(true);
@@ -849,16 +849,16 @@ void BasicBlock::spliceDebugInfo(BasicBlock::iterator Dest, BasicBlock *Src,
   // Call the main debug-info-splicing implementation.
   spliceDebugInfoImpl(Dest, Src, First, Last);
 
-  // Do we have some "+" DPValues hanging around that weren't supposed to move,
-  // and we detached to make things easier?
-  if (!MoreDanglingDPValues)
+  // Do we have some "+" DbgRecords hanging around that weren't supposed to
+  // move, and we detached to make things easier?
+  if (!MoreDanglingDbgRecords)
     return;
 
   // FIXME: we could avoid an allocation here sometimes. (adoptDbgRecords
   // requires an iterator).
   DPMarker *LastMarker = Src->createMarker(Last);
-  LastMarker->absorbDebugValues(*MoreDanglingDPValues, true);
-  MoreDanglingDPValues->eraseFromParent();
+  LastMarker->absorbDebugValues(*MoreDanglingDbgRecords, true);
+  MoreDanglingDbgRecords->eraseFromParent();
 }
 
 void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
@@ -870,15 +870,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
   bool InsertAtHead = Dest.getHeadBit();
   bool ReadFromHead = First.getHeadBit();
   // Use this flag to signal the abnormal case, where we don't want to copy the
-  // DPValues ahead of the "Last" position.
+  // DbgRecords ahead of the "Last" position.
   bool ReadFromTail = !Last.getTailBit();
   bool LastIsEnd = (Last == Src->end());
 
   /*
     Here's an illustration of what we're about to do. We have two blocks, this
     and Src, and two segments of list. Each instruction is marked by a capital
-    while potential DPValue debug-info is marked out by "-" characters and a few
-    other special characters (+:=) where I want to highlight what's going on.
+    while potential DbgRecord debug-info is marked out by "-" characters and a
+    few other special characters (+:=) where I want to highlight what's going
+    on.
 
                                                  Dest
                                                    |
@@ -889,18 +890,18 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
 
     The splice method is going to take all the instructions from First up to
     (but not including) Last and insert them in _front_ of Dest, forming one
-    long list. All the DPValues attached to instructions _between_ First and
+    long list. All the DbgRecords attached to instructions _between_ First and
     Last need no maintenence. However, we have to do special things with the
-    DPValues marked with the +:= characters. We only have three positions:
-    should the "+" DPValues be transferred, and if so to where? Do we move the
-    ":" DPValues? Would they go in front of the "=" DPValues, or should the "="
-    DPValues go before "+" DPValues?
+    DbgRecords marked with the +:= characters. We only have three positions:
+    should the "+" DbgRecords be transferred, and if so to where? Do we move the
+    ":" DbgRecords? Would they go in front of the "=" DbgRecords, or should the
+    "=" DbgRecords go before "+" DbgRecords?
 
     We're told which way it should be by the bits carried in the iterators. The
     "Head" bit indicates whether the specified position is supposed to be at the
-    front of the attached DPValues (true) or not (false). The Tail bit is true
-    on the other end of a range: is the range intended to include DPValues up to
-    the end (false) or not (true).
+    front of the attached DbgRecords (true) or not (false). The Tail bit is true
+    on the other end of a range: is the range intended to include DbgRecords up
+    to the end (false) or not (true).
 
     FIXME: the tail bit doesn't need to be distinct from the head bit, we could
     combine them.
@@ -934,15 +935,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
 
    */
 
-  // Detach the marker at Dest -- this lets us move the "====" DPValues around.
+  // Detach the marker at Dest -- this lets us move the "====" DbgRecords
+  // around.
   DPMarker *DestMarker = nullptr;
   if (Dest != end()) {
     if ((DestMarker = getMarker(Dest)))
       DestMarker->removeFromParent();
   }
 
-  // If we're moving the tail range of DPValues (":::"), absorb them into the
-  // front of the DPValues at Dest.
+  // If we're moving the tail range of DbgRecords (":::"), absorb them into the
+  // front of the DbgRecords at Dest.
   if (ReadFromTail && Src->getMarker(Last)) {
     DPMarker *FromLast = Src->getMarker(Last);
     if (LastIsEnd) {
@@ -956,7 +958,7 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     }
   }
 
-  // If we're _not_ reading from the head of First, i.e. the "++++" DPValues,
+  // If we're _not_ reading from the head of First, i.e. the "++++" DbgRecords,
   // move their markers onto Last. They remain in the Src block. No action
   // needed.
   if (!ReadFromHead && First->hasDbgRecords()) {
@@ -970,16 +972,16 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     }
   }
 
-  // Finally, do something with the "====" DPValues we detached.
+  // Finally, do something with the "====" DbgRecords we detached.
   if (DestMarker) {
     if (InsertAtHead) {
-      // Insert them at the end of the DPValues at Dest. The "::::" DPValues
+      // Insert them at the end of the DbgRecords at Dest. The "::::" DbgRecords
       // might be in front of them.
       DPMarker *NewDestMarker = createMarker(Dest);
       NewDestMarker->absorbDebugValues(*DestMarker, false);
     } else {
       // Insert them right at the start of the range we moved, ahead of First
-      // and the "++++" DPValues.
+      // and the "++++" DbgRecords.
       DPMarker *FirstMarker = createMarker(First);
       FirstMarker->absorbDebugValues(*DestMarker, true);
     }
@@ -990,10 +992,10 @@ void BasicBlock::spliceDebugInfoImpl(BasicBlock::iterator Dest, BasicBlock *Src,
     // any trailing debug-info at the end of the block would "normally" have
     // been pushed in front of "First". Move it there now.
     DPMarker *FirstMarker = getMarker(First);
-    DPMarker *TrailingDPValues = getTrailingDbgRecords();
-    if (TrailingDPValues) {
-      FirstMarker->absorbDebugValues(*TrailingDPValues, true);
-      TrailingDPValues->eraseFromParent();
+    DPMarker *TrailingDbgRecords = getTrailingDbgRecords();
+    if (TrailingDbgRecords) {
+      FirstMarker->absorbDebugValues(*TrailingDbgRecords, true);
+      TrailingDbgRecords->eraseFromParent();
       deleteTrailingDbgRecords();
     }
   }
@@ -1024,7 +1026,7 @@ void BasicBlock::splice(iterator Dest, BasicBlock *Src, iterator First,
   // And move the instructions.
   getInstList().splice(Dest, Src->getInstList(), First, Last);
 
-  flushTerminatorDbgValues();
+  flushTerminatorDbgRecords();
 }
 
 void BasicBlock::insertDbgRecordAfter(DbgRecord *DPV, Instruction *I) {
@@ -1057,38 +1059,40 @@ DPMarker *BasicBlock::getMarker(InstListType::iterator It) {
 }
 
 void BasicBlock::reinsertInstInDbgRecords(
-    Instruction *I, std::optional<DPValue::self_iterator> Pos) {
+    Instruction *I, std::optional<DbgRecord::self_iterator> Pos) {
   // "I" was originally removed from a position where it was
-  // immediately in front of Pos. Any DPValues on that position then "fell down"
-  // onto Pos. "I" has been re-inserted at the front of that wedge of DPValues,
-  // shuffle them around to represent the original positioning. To illustrate:
+  // immediately in front of Pos. Any DbgRecords on that position then "fell
+  // down" onto Pos. "I" has been re-inserted at the front of that wedge of
+  // DbgRecords, shuffle them around to represent the original positioning. To
+  // illustrate:
   //
   //   Instructions:  I1---I---I0
-  //       DPValues:    DDD DDD
+  //       DbgRecords:    DDD DDD
   //
   // Instruction "I" removed,
   //
   //   Instructions:  I1------I0
-  //       DPValues:    DDDDDD
+  //       DbgRecords:    DDDDDD
   //                       ^Pos
   //
   // Instruction "I" re-inserted (now):
   //
   //   Instructions:  I1---I------I0
-  //       DPValues:        DDDDDD
+  //       DbgRecords:        DDDDDD
   //                           ^Pos
   //
   // After this method completes:
   //
   //   Instructions:  I1---I---I0
-  //       DPValues:    DDD DDD
+  //       DbgRecords:    DDD DDD
 
-  // This happens if there were no DPValues on I0. Are there now DPValues there?
+  // This happens if there were no DbgRecords on I0. Are there now DbgRecords
+  // there?
   if (!Pos) {
     DPMarker *NextMarker = getNextMarker(I);
     if (!NextMarker)
       return;
-    if (NextMarker->StoredDPValues.empty())
+    if (NextMarker->StoredDbgRecords.empty())
       return;
     // There are DPMarkers there now -- they fell down from "I".
     DPMarker *ThisMarker = createMarker(I);
@@ -1096,15 +1100,15 @@ void BasicBlock::reinsertInstInDbgRecords(
     return;
   }
 
-  // Is there even a range of DPValues to move?
+  // Is there even a range of DbgRecords to move?
   DPMarker *DPM = (*Pos)->getMarker();
-  auto Range = make_range(DPM->StoredDPValues.begin(), (*Pos));
+  auto Range = make_range(DPM->StoredDbgRecords.begin(), (*Pos));
   if (Range.begin() == Range.end())
     return;
 
   // Otherwise: splice.
   DPMarker *ThisMarker = createMarker(I);
-  assert(ThisMarker->StoredDPValues.empty());
+  assert(ThisMarker->StoredDbgRecords.empty());
   ThisMarker->absorbDebugValues(Range, *DPM, true);
 }
 
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index e63b1e67dad7c2..d16895058ba252 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -895,7 +895,7 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
         if (I.hasMetadataOtherThanDebugLoc())
           I.setMetadata("heapallocsite", nullptr);
 
-        // Strip any DPValues attached.
+        // Strip any DbgRecords attached.
         I.dropDbgRecords();
       }
     }
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index 019b00c2e2087d..f34d3aecfa0a38 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -1,4 +1,4 @@
-//======-- DebugProgramInstruction.cpp - Implement DPValues/DPMarkers --======//
+//=====-- DebugProgramInstruction.cpp - Implement DbgRecords/DPMarkers --=====//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -541,21 +541,21 @@ void DbgRecord::moveAfter(DbgRecord *MoveAfter) {
 ///////////////////////////////////////////////////////////////////////////////
 
 // An empty, global, DPMarker for the purpose of describing empty ranges of
-// DPValues.
+// DbgRecords.
 DPMarker DPMarker::EmptyDPMarker;
 
 void DPMarker::dropDbgRecords() {
-  while (!StoredDPValues.empty()) {
-    auto It = StoredDPValues.begin();
+  while (!StoredDbgRecords.empty()) {
+    auto It = StoredDbgRecords.begin();
     DbgRecord *DR = &*It;
-    StoredDPValues.erase(It);
+    StoredDbgRecords.erase(It);
     DR->deleteRecord();
   }
 }
 
 void DPMarker::dropOneDbgRecord(DbgRecord *DR) {
   assert(DR->getMarker() == this);
-  StoredDPValues.erase(DR->getIterator());
+  StoredDbgRecords.erase(DR->getIterator());
   DR->deleteRecord();
 }
 
@@ -566,15 +566,15 @@ const BasicBlock *DPMarker::getParent() const {
 BasicBlock *DPMarker::getParent() { return MarkedInstr->getParent(); }
 
 void DPMarker::removeMarker() {
-  // Are there any DPValues in this DPMarker? If not, nothing to preserve.
+  // Are there any DbgRecords in this DPMarker? If not, nothing to preserve.
   Instruction *Owner = MarkedInstr;
-  if (StoredDPValues.empty()) {
+  if (StoredDbgRecords.empty()) {
     eraseFromParent();
     Owner->DbgMarker = nullptr;
     return;
   }
 
-  // The attached DPValues need to be preserved; attach them to the next
+  // The attached DbgRecords need to be preserved; attach them to the next
   // instruction. If there isn't a next instruction, put them on the
   // "trailing" list.
   DPMarker *NextMarker = Owner->getParent()->getNextMarker(Owner);
@@ -610,15 +610,15 @@ void DPMarker::eraseFromParent() {
 }
 
 iterator_range<DbgRecord::self_iterator> DPMarker::getDbgRecordRange() {
-  return make_range(StoredDPValues.begin(), StoredDPValues.end());
+  return make_range(StoredDbgRecords.begin(), StoredDbgRecords.end());
 }
 iterator_range<DbgRecord::const_self_iterator>
 DPMarker::getDbgRecordRange() const {
-  return make_range(StoredDPValues.begin(), StoredDPValues.end());
+  return make_range(StoredDbgRecords.begin(), StoredDbgRecords.end());
 }
 
 void DbgRecord::removeFromParent() {
-  getMarker()->StoredDPValues.erase(getIterator());
+  getMarker()->StoredDbgRecords.erase(getIterator());
   Marker = nullptr;
 }
 
@@ -628,29 +628,29 @@ void DbgRecord::eraseFromParent() {
 }
 
 void DPMarker::insertDbgRecord(DbgRecord *New, bool InsertAtHead) {
-  auto It = InsertAtHead ? StoredDPValues.begin() : StoredDPValues.end();
-  StoredDPValues.insert(It, *New);
+  auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
+  StoredDbgRecords.insert(It, *New);
   New->setMarker(this);
 }
 void DPMarker::insertDbgRecord(DbgRecord *New, DbgRecord *InsertBefore) {
   assert(InsertBefore->getMarker() == this &&
-         "DPValue 'InsertBefore' must be contained in this DPMarker!");
-  StoredDPValues.insert(InsertBefore->getIterator(), *New);
+         "DbgRecord 'InsertBefore' must be contained in this DPMarker!");
+  StoredDbgRecords.insert(InsertBefore->getIterator(), *New);
   New->setMarker(this);
 }
 void DPMarker::insertDbgRecordAfter(DbgRecord *New, DbgRecord *InsertAfter) {
   assert(InsertAfter->getMarker() == this &&
-         "DPValue 'InsertAfter' must be contained in this DPMarker!");
-  StoredDPValues.insert(++(InsertAfter->getIterator()), *New);
+         "DbgRecord 'InsertAfter' must be contained in this DPMarker!");
+  StoredDbgRecords.insert(++(InsertAfter->getIterator()), *New);
   New->setMarker(this);
 }
 
 void DPMarker::absorbDebugValues(DPMarker &Src, bool InsertAtHead) {
-  auto It = InsertAtHead ? StoredDPValues.begin() : StoredDPValues.end();
-  for (DbgRecord &DPV : Src.StoredDPValues)
+  auto It = InsertAtHead ? StoredDbgRecords.begin() : StoredDbgRecords.end();
+  for (DbgRecord &DPV : Src.StoredDbgRecords)
     DPV.setMarker(this);
 
-  StoredDPValues.splice(It, Src.StoredDPValues);
+  StoredDbgRecords.splice(It, Src.StoredDbgRecords);
 }
 
 void DPMarker::absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
@@ -659,45 +659,45 @@ void DPMarker::absorbDebugValues(iterator_range<DbgRecord::self_iterator> Range,
     DR.setMarker(this);
 
   auto InsertPos =
-      (InsertAtHead) ? StoredDPValues.begin() : StoredDPValues.end();
+      (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
 
-  StoredDPValues.splice(InsertPos, Src.StoredDPValues, Range.begin(),
-                        Range.end());
+  StoredDbgRecords.splice(InsertPos, Src.StoredDbgRecords, Range.begin(),
+                          Range.end());
 }
 
 iterator_range<simple_ilist<DbgRecord>::iterator> DPMarker::cloneDebugInfoFrom(
     DPMarker *From, std::optional<simple_ilist<DbgRecord>::iterator> from_here,
     bool InsertAtHead) {
   DbgRecord *First = nullptr;
-  // Work out what range of DPValues to clone: normally all the contents of the
-  // "From" marker, optionally we can start from the from_here position down to
-  // end().
+  // Work out what range of DbgRecords to clone: normally all the contents of
+  // the "From" marker, optionally we can start from the from_here position down
+  // to end().
   auto Range =
-      make_range(From->StoredDPValues.begin(), From->StoredDPValues.end());
+      make_range(From->StoredDbgRecords.begin(), From->StoredDbgRecords.end());
   if (from_here.has_value())
-    Range = make_range(*from_here, From->StoredDPValues.end());
+    Range = make_range(*from_here, From->StoredDbgRecords.end());
 
   // Clone each DPValue and insert into StoreDPValues; optionally place them at
   // the start or the end of the list.
-  auto Pos = (InsertAtHead) ? StoredDPValues.begin() : StoredDPValues.end();
+  auto Pos = (InsertAtHead) ? StoredDbgRecords.begin() : StoredDbgRecords.end();
   for (DbgRecord &DR : Range) {
     DbgRecord *New = DR.clone();
     New->setMarker(this);
-    StoredDPValues.insert(Pos, *New);
+    StoredDbgRecords.insert(Pos, *New);
     if (!First)
       First = New;
   }
 
   if (!First)
-    return {StoredDPValues.end(), StoredDPValues.end()};
+    return {StoredDbgRecords.end(), StoredDbgRecords.end()};
 
   if (InsertAtHead)
     // If InsertAtHead is set, we cloned a range onto the front of of the
-    // StoredDPValues collection, return that range.
-    return {StoredDPValues.begin(), Pos};
+    // StoredDbgRecords collection, return that range.
+    return {StoredDbgRecords.begin(), Pos};
   else
     // We inserted a block at the end, return that range.
-    return {First->getIterator(), StoredDPValues.end()};
+    return {First->getIterator(), StoredDbgRecords.end()};
 }
 
 } // end namespace llvm
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index e0892398f43445..7a677d7f3c2be8 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -143,7 +143,7 @@ void Instruction::insertBefore(BasicBlock &BB,
     return;
 
   // We've inserted "this": if InsertAtHead is set then it comes before any
-  // DPValues attached to InsertPos. But if it's not set, then any DPValues
+  // DPValues attached to InsertPos. But if it's not set, then any DbgRecords
   // should now come before "this".
   bool InsertAtHead = InsertPos.getHeadBit();
   if (!InsertAtHead) {
@@ -166,10 +166,10 @@ void Instruction::insertBefore(BasicBlock &BB,
   }
 
   // If we're inserting a terminator, check if we need to flush out
-  // TrailingDPValues. Inserting instructions at the end of an incomplete
+  // TrailingDbgRecords. Inserting instructions at the end of an incomplete
   // block is handled by the code block above.
   if (isTerminator())
-    getParent()->flushTerminatorDbgValues();
+    getParent()->flushTerminatorDbgRecords();
 }
 
 /// Unlink this instruction from its current basic block and insert it into the
@@ -212,12 +212,12 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   assert(I == BB.end() || I->getParent() == &BB);
   bool InsertAtHead = I.getHeadBit();
 
-  // If we've been given the "Preserve" flag, then just move the DPValues with
+  // If we've been given the "Preserve" flag, then just move the DbgRecords with
   // the instruction, no more special handling needed.
   if (BB.IsNewDbgInfoFormat && DbgMarker && !Preserve) {
     if (I != this->getIterator() || InsertAtHead) {
       // "this" is definitely moving in the list, or it's moving ahead of its
-      // attached DPValues. Detach any existing DPValues.
+      // attached DPValues. Detach any existing DbgRecords.
       handleMarkerRemoval();
     }
   }
@@ -229,15 +229,15 @@ void Instruction::moveBeforeImpl(BasicBlock &BB, InstListType::iterator I,
   if (BB.IsNewDbgInfoFormat && !Preserve) {
     DPMarker *NextMarker = getParent()->getNextMarker(this);
 
-    // If we're inserting at point I, and not in front of the DPValues attached
-    // there, then we should absorb the DPValues attached to I.
+    // If we're inserting at point I, and not in front of the DbgRecords
+    // attached there, then we should absorb the DbgRecords attached to I.
     if (!InsertAtHead && NextMarker && !NextMarker->empty()) {
       adoptDbgRecords(&BB, I, false);
     }
   }
 
   if (isTerminator())
-    getParent()->flushTerminatorDbgValues();
+    getParent()->flushTerminatorDbgRecords();
 }
 
 iterator_range<DbgRecord::self_iterator> Instruction::cloneDebugInfoFrom(
@@ -263,11 +263,11 @@ Instruction::getDbgReinsertionPosition() {
   if (!NextMarker)
     return std::nullopt;
 
-  // Are there any DPValues in the next marker?
-  if (NextMarker->StoredDPValues.empty())
+  // Are there any DbgRecords in the next marker?
+  if (NextMarker->StoredDbgRecords.empty())
     return std::nullopt;
 
-  return NextMarker->StoredDPValues.begin();
+  return NextMarker->StoredDbgRecords.begin();
 }
 
 bool Instruction::hasDbgRecords() const { return !getDbgRecordRange().empty(); }
@@ -275,20 +275,20 @@ bool Instruction::hasDbgRecords() const { return !getDbgRecordRange().empty(); }
 void Instruction::adoptDbgRecords(BasicBlock *BB, BasicBlock::iterator It,
                                   bool InsertAtHead) {
   DPMarker *SrcMarker = BB->getMarker(It);
-  auto ReleaseTrailingDPValues = [BB, It, SrcMarker]() {
+  auto ReleaseTrailingDbgRecords = [BB, It, SrcMarker]() {
     if (BB->end() == It) {
       SrcMarker->eraseFromParent();
       BB->deleteTrailingDbgRecords();
     }
   };
 
-  if (!SrcMarker || SrcMarker->StoredDPValues.empty()) {
-    ReleaseTrailingDPValues();
+  if (!SrcMarker || SrcMarker->StoredDbgRecords.empty()) {
+    ReleaseTrailingDbgRecords();
     return;
   }
 
   // If we have DPMarkers attached to this instruction, we have to honour the
-  // ordering of DPValues between this and the other marker. Fall back to just
+  // ordering of DbgRecords between this and the other marker. Fall back to just
   // absorbing from the source.
   if (DbgMarker || It == BB->end()) {
     // Ensure we _do_ have a marker.
@@ -304,10 +304,11 @@ void Instruction::adoptDbgRecords(BasicBlock *BB, BasicBlock::iterator It,
     // block, it's important to not leave the empty marker trailing. It will
     // give a misleading impression that some debug records have been left
     // trailing.
-    ReleaseTrailingDPValues();
+    ReleaseTrailingDbgRecords();
   } else {
-    // Optimisation: we're transferring all the DPValues from the source marker
-    // onto this empty location: just adopt the other instructions marker.
+    // Optimisation: we're transferring all the DbgRecords from the source
+    // marker onto this empty location: just adopt the other instructions
+    // marker.
     DbgMarker = SrcMarker;
     DbgMarker->MarkedInstr = this;
     It->DbgMarker = nullptr;
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index a0bf9cae7926bb..a471314ccb940a 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -50,7 +50,7 @@ LLVMContextImpl::~LLVMContextImpl() {
   // when it's terminator was removed were eventually replaced. This assertion
   // firing indicates that DPValues went missing during the lifetime of the
   // LLVMContext.
-  assert(TrailingDPValues.empty() && "DPValue records in blocks not cleaned");
+  assert(TrailingDbgRecords.empty() && "DbgRecords in blocks not cleaned");
 #endif
 
   // NOTE: We need to delete the contents of OwnedModules, but Module's dtor
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index c841b28ca4380b..b1dcb262fb657f 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -1684,19 +1684,19 @@ class LLVMContextImpl {
   /// such a way. These are stored in LLVMContext because typically LLVM only
   /// edits a small number of blocks at a time, so there's no need to bloat
   /// BasicBlock with such a data structure.
-  SmallDenseMap<BasicBlock *, DPMarker *> TrailingDPValues;
+  SmallDenseMap<BasicBlock *, DPMarker *> TrailingDbgRecords;
 
-  // Set, get and delete operations for TrailingDPValues.
+  // Set, get and delete operations for TrailingDbgRecords.
   void setTrailingDbgRecords(BasicBlock *B, DPMarker *M) {
-    assert(!TrailingDPValues.count(B));
-    TrailingDPValues[B] = M;
+    assert(!TrailingDbgRecords.count(B));
+    TrailingDbgRecords[B] = M;
   }
 
   DPMarker *getTrailingDbgRecords(BasicBlock *B) {
-    return TrailingDPValues.lookup(B);
+    return TrailingDbgRecords.lookup(B);
   }
 
-  void deleteTrailingDbgRecords(BasicBlock *B) { TrailingDPValues.erase(B); }
+  void deleteTrailingDbgRecords(BasicBlock *B) { TrailingDbgRecords.erase(B); }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 7b74caac9e0840..a87e5a3a923a48 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -2848,7 +2848,7 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool PreserveLCSSA,
       Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
     DTU->applyUpdates(Updates);
   }
-  BB->flushTerminatorDbgValues();
+  BB->flushTerminatorDbgRecords();
   return NumInstrsRemoved;
 }
 
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 8c6af7afa875c9..acfd87c64b79a5 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -577,28 +577,28 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
 
     Module *M = OrigHeader->getModule();
 
-    // Track the next DPValue to clone. If we have a sequence where an
+    // Track the next DbgRecord to clone. If we have a sequence where an
     // instruction is hoisted instead of being cloned:
-    //    DPValue blah
+    //    DbgRecord blah
     //    %foo = add i32 0, 0
-    //    DPValue xyzzy
+    //    DbgRecord xyzzy
     //    %bar = call i32 @foobar()
-    // where %foo is hoisted, then the DPValue "blah" will be seen twice, once
+    // where %foo is hoisted, then the DbgRecord "blah" will be seen twice, once
     // attached to %foo, then when %foo his hoisted it will "fall down" onto the
     // function call:
-    //    DPValue blah
-    //    DPValue xyzzy
+    //    DbgRecord blah
+    //    DbgRecord xyzzy
     //    %bar = call i32 @foobar()
     // causing it to appear attached to the call too.
     //
     // To avoid this, cloneDebugInfoFrom takes an optional "start cloning from
-    // here" position to account for this behaviour. We point it at any DPValues
-    // on the next instruction, here labelled xyzzy, before we hoist %foo.
-    // Later, we only only clone DPValues from that position (xyzzy) onwards,
-    // which avoids cloning DPValue "blah" multiple times.
-    // (Stored as a range because it gives us a natural way of testing whether
-    //  there were DPValues on the next instruction before we hoisted things).
-    iterator_range<DPValue::self_iterator> NextDbgInsts =
+    // here" position to account for this behaviour. We point it at any
+    // DbgRecords on the next instruction, here labelled xyzzy, before we hoist
+    // %foo. Later, we only only clone DbgRecords from that position (xyzzy)
+    // onwards, which avoids cloning DbgRecord "blah" multiple times. (Stored as
+    // a range because it gives us a natural way of testing whether
+    //  there were DbgRecords on the next instruction before we hoisted things).
+    iterator_range<DbgRecord::self_iterator> NextDbgInsts =
         (I != E) ? I->getDbgRecordRange() : DPMarker::getEmptyDbgRecordRange();
 
     while (I != E) {
@@ -777,7 +777,7 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
     // OrigPreHeader's old terminator (the original branch into the loop), and
     // remove the corresponding incoming values from the PHI nodes in OrigHeader.
     LoopEntryBranch->eraseFromParent();
-    OrigPreheader->flushTerminatorDbgValues();
+    OrigPreheader->flushTerminatorDbgRecords();
 
     // Update MemorySSA before the rewrite call below changes the 1:1
     // instruction:cloned_instruction_or_value mapping.
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 0f3d1403481dfc..6d2a6a3e7f11b5 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1572,7 +1572,8 @@ hoistLockstepIdenticalDPValues(Instruction *TI, Instruction *I1,
   while (none_of(Itrs, atEnd)) {
     bool HoistDPVs = allIdentical(Itrs);
     for (CurrentAndEndIt &Pair : Itrs) {
-      // Increment Current iterator now as we may be about to move the DPValue.
+      // Increment Current iterator now as we may be about to move the
+      // DbgRecord.
       DbgRecord &DR = *Pair.first++;
       if (HoistDPVs) {
         DR.removeFromParent();
@@ -5304,7 +5305,7 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
   // Ensure that any debug-info records that used to occur after the Unreachable
   // are moved to in front of it -- otherwise they'll "dangle" at the end of
   // the block.
-  BB->flushTerminatorDbgValues();
+  BB->flushTerminatorDbgRecords();
 
   // Debug-info records on the unreachable inst itself should be deleted, as
   // below we delete everything past the final executable instruction.
@@ -5326,8 +5327,8 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
     // block will be the unwind edges of Invoke/CatchSwitch/CleanupReturn,
     // and we can therefore guarantee this block will be erased.
 
-    // If we're deleting this, we're deleting any subsequent dbg.values, so
-    // delete DPValue records of variable information.
+    // If we're deleting this, we're deleting any subsequent debug info, so
+    // delete DbgRecords.
     BBI->dropDbgRecords();
 
     // Delete this instruction (any uses are guaranteed to be dead)
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index 3da161043d6c8c..abb7a4452a3796 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -146,7 +146,7 @@ class Mapper {
   Value *mapValue(const Value *V);
   void remapInstruction(Instruction *I);
   void remapFunction(Function &F);
-  void remapDPValue(DbgRecord &DPV);
+  void remapDbgRecord(DbgRecord &DPV);
 
   Constant *mapConstant(const Constant *C) {
     return cast_or_null<Constant>(mapValue(C));
@@ -537,7 +537,7 @@ Value *Mapper::mapValue(const Value *V) {
   return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
 }
 
-void Mapper::remapDPValue(DbgRecord &DR) {
+void Mapper::remapDbgRecord(DbgRecord &DR) {
   if (DPLabel *DPL = dyn_cast<DPLabel>(&DR)) {
     DPL->setLabel(cast<DILabel>(mapMetadata(DPL->getLabel())));
     return;
@@ -1067,7 +1067,7 @@ void Mapper::remapFunction(Function &F) {
     for (Instruction &I : BB) {
       remapInstruction(&I);
       for (DbgRecord &DR : I.getDbgRecordRange())
-        remapDPValue(DR);
+        remapDbgRecord(DR);
     }
   }
 }
@@ -1234,7 +1234,7 @@ void ValueMapper::remapInstruction(Instruction &I) {
 }
 
 void ValueMapper::remapDPValue(Module *M, DPValue &V) {
-  FlushingMapper(pImpl)->remapDPValue(V);
+  FlushingMapper(pImpl)->remapDbgRecord(V);
 }
 
 void ValueMapper::remapDPValueRange(
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index e23c7eaa4930d0..bfc64cb84143d3 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -149,7 +149,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   Instruction *Instr2 = Instr1->getNextNode();
   DPMarker *Marker1 = Instr1->DbgMarker;
   DPMarker *Marker2 = Instr2->DbgMarker;
-  // There's no TrailingDPValues marker allocated yet.
+  // There's no TrailingDbgRecords marker allocated yet.
   DPMarker *EndMarker = nullptr;
 
   // Check that the "getMarker" utilities operate as expected.
@@ -159,26 +159,26 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   EXPECT_EQ(BB.getNextMarker(Instr2), EndMarker); // Is nullptr.
 
   // There should be two DPValues,
-  EXPECT_EQ(Marker1->StoredDPValues.size(), 1u);
-  EXPECT_EQ(Marker2->StoredDPValues.size(), 1u);
+  EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
+  EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
 
   // Unlink them and try to re-insert them through the basic block.
-  DbgRecord *DPV1 = &*Marker1->StoredDPValues.begin();
-  DbgRecord *DPV2 = &*Marker2->StoredDPValues.begin();
+  DbgRecord *DPV1 = &*Marker1->StoredDbgRecords.begin();
+  DbgRecord *DPV2 = &*Marker2->StoredDbgRecords.begin();
   DPV1->removeFromParent();
   DPV2->removeFromParent();
-  EXPECT_TRUE(Marker1->StoredDPValues.empty());
-  EXPECT_TRUE(Marker2->StoredDPValues.empty());
+  EXPECT_TRUE(Marker1->StoredDbgRecords.empty());
+  EXPECT_TRUE(Marker2->StoredDbgRecords.empty());
 
   // This should appear in Marker1.
   BB.insertDbgRecordBefore(DPV1, BB.begin());
-  EXPECT_EQ(Marker1->StoredDPValues.size(), 1u);
-  EXPECT_EQ(DPV1, &*Marker1->StoredDPValues.begin());
+  EXPECT_EQ(Marker1->StoredDbgRecords.size(), 1u);
+  EXPECT_EQ(DPV1, &*Marker1->StoredDbgRecords.begin());
 
   // This should attach to Marker2.
   BB.insertDbgRecordAfter(DPV2, &*BB.begin());
-  EXPECT_EQ(Marker2->StoredDPValues.size(), 1u);
-  EXPECT_EQ(DPV2, &*Marker2->StoredDPValues.begin());
+  EXPECT_EQ(Marker2->StoredDbgRecords.size(), 1u);
+  EXPECT_EQ(DPV2, &*Marker2->StoredDbgRecords.begin());
 
   // Now, how about removing instructions? That should cause any DPValues to
   // "fall down".
@@ -186,7 +186,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   Marker1 = nullptr;
   // DPValues should now be in Marker2.
   EXPECT_EQ(BB.size(), 1u);
-  EXPECT_EQ(Marker2->StoredDPValues.size(), 2u);
+  EXPECT_EQ(Marker2->StoredDbgRecords.size(), 2u);
   // They should also be in the correct order.
   SmallVector<DbgRecord *, 2> DPVs;
   for (DbgRecord &DPV : Marker2->getDbgRecordRange())
@@ -201,7 +201,7 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   EXPECT_TRUE(BB.empty());
   EndMarker = BB.getTrailingDbgRecords();
   ASSERT_NE(EndMarker, nullptr);
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   // Again, these should arrive in the correct order.
 
   DPVs.clear();
@@ -213,13 +213,13 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   // Inserting a normal instruction at the beginning: shouldn't dislodge the
   // DPValues. It's intended to not go at the start.
   Instr1->insertBefore(BB, BB.begin());
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   Instr1->removeFromParent();
 
   // Inserting at end(): should dislodge the DPValues, if they were dbg.values
   // then they would sit "above" the new instruction.
   Instr1->insertBefore(BB, BB.end());
-  EXPECT_EQ(Instr1->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(Instr1->DbgMarker->StoredDbgRecords.size(), 2u);
   // We should de-allocate the trailing marker when something is inserted
   // at end().
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
@@ -227,14 +227,14 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   // Remove Instr1: now the DPValues will fall down again,
   Instr1->removeFromParent();
   EndMarker = BB.getTrailingDbgRecords();
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
 
   // Inserting a terminator, however it's intended, should dislodge the
   // trailing DPValues, as it's the clear intention of the caller that this be
   // the final instr in the block, and DPValues aren't allowed to live off the
   // end forever.
   Instr2->insertBefore(BB, BB.begin());
-  EXPECT_EQ(Instr2->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(Instr2->DbgMarker->StoredDbgRecords.size(), 2u);
   EXPECT_EQ(BB.getTrailingDbgRecords(), nullptr);
 
   // Teardown,
@@ -298,24 +298,24 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   Instruction *DInst = CInst->getNextNode();
   // CInst should have debug-info.
   ASSERT_TRUE(CInst->DbgMarker);
-  EXPECT_FALSE(CInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
 
   // If we move "c" to the start of the block, just normally, then the DPValues
   // should fall down to "d".
   CInst->moveBefore(BB, BeginIt2);
-  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
   ASSERT_TRUE(DInst->DbgMarker);
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
 
   // Wheras if we move D to the start of the block with moveBeforePreserving,
   // the DPValues should move with it.
   DInst->moveBeforePreserving(BB, BB.begin());
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), DInst);
 
   // Similarly, moveAfterPreserving "D" to "C" should move DPValues with "D".
   DInst->moveAfterPreserving(CInst);
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
 
   // (move back to the start...)
   DInst->moveBeforePreserving(BB, BB.begin());
@@ -324,8 +324,8 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   // If we move "C" to the beginning of the block, it should go before the
   // DPValues. They'll stay on "D".
   CInst->moveBefore(BB, BB.begin());
-  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDPValues.empty());
-  EXPECT_FALSE(DInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_TRUE(!CInst->DbgMarker || CInst->DbgMarker->StoredDbgRecords.empty());
+  EXPECT_FALSE(DInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
   EXPECT_EQ(CInst->getNextNode(), DInst);
 
@@ -341,8 +341,8 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   // run of dbg.values and the next instruction.
   CInst->moveBefore(BB, DInst->getIterator());
   // CInst gains the DPValues.
-  EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDPValues.empty());
-  EXPECT_FALSE(CInst->DbgMarker->StoredDPValues.empty());
+  EXPECT_TRUE(!DInst->DbgMarker || DInst->DbgMarker->StoredDbgRecords.empty());
+  EXPECT_FALSE(CInst->DbgMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
 
   UseNewDbgInfoFormat = false;
@@ -390,16 +390,16 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
 
   ASSERT_FALSE(BInst->DbgMarker);
   ASSERT_TRUE(CInst->DbgMarker);
-  ASSERT_EQ(CInst->DbgMarker->StoredDPValues.size(), 1u);
-  DbgRecord *DPV1 = &*CInst->DbgMarker->StoredDPValues.begin();
+  ASSERT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  DbgRecord *DPV1 = &*CInst->DbgMarker->StoredDbgRecords.begin();
   ASSERT_TRUE(DPV1);
   EXPECT_FALSE(BInst->hasDbgRecords());
 
   // Clone DPValues from one inst to another. Other arguments to clone are
   // tested in DPMarker test.
   auto Range1 = BInst->cloneDebugInfoFrom(CInst);
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
-  DbgRecord *DPV2 = &*BInst->DbgMarker->StoredDPValues.begin();
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
+  DbgRecord *DPV2 = &*BInst->DbgMarker->StoredDbgRecords.begin();
   EXPECT_EQ(std::distance(Range1.begin(), Range1.end()), 1u);
   EXPECT_EQ(&*Range1.begin(), DPV2);
   EXPECT_NE(DPV1, DPV2);
@@ -417,12 +417,12 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   // Dropping should be easy,
   BInst->dropDbgRecords();
   EXPECT_FALSE(BInst->hasDbgRecords());
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 0u);
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
   // And we should be able to drop individual DPValues.
   CInst->dropOneDbgRecord(DPV1);
   EXPECT_FALSE(CInst->hasDbgRecords());
-  EXPECT_EQ(CInst->DbgMarker->StoredDPValues.size(), 0u);
+  EXPECT_EQ(CInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
   UseNewDbgInfoFormat = false;
 }
@@ -531,9 +531,9 @@ class DbgSpliceTest : public ::testing::Test {
     Branch = &*Last;
     CInst = &*Dest;
 
-    DPVA = cast<DPValue>(&*BInst->DbgMarker->StoredDPValues.begin());
-    DPVB = cast<DPValue>(&*Branch->DbgMarker->StoredDPValues.begin());
-    DPVConst = cast<DPValue>(&*CInst->DbgMarker->StoredDPValues.begin());
+    DPVA = cast<DPValue>(&*BInst->DbgMarker->StoredDbgRecords.begin());
+    DPVB = cast<DPValue>(&*Branch->DbgMarker->StoredDbgRecords.begin());
+    DPVConst = cast<DPValue>(&*CInst->DbgMarker->StoredDbgRecords.begin());
   }
 
   void TearDown() override { UseNewDbgInfoFormat = false; }
@@ -1171,7 +1171,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   // spliced in.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->DbgMarker);
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
 
   UseNewDbgInfoFormat = false;
 }
@@ -1387,7 +1387,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   // should be in the correct order of %a, then 0.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->hasDbgRecords());
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 2u);
   SmallVector<DPValue *, 2> DPValues;
   for (DbgRecord &DPV : BInst->getDbgRecordRange())
     DPValues.push_back(cast<DPValue>(&DPV));
@@ -1457,7 +1457,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   // We should now have one dbg.values on the first instruction, %a.
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->hasDbgRecords());
-  EXPECT_EQ(BInst->DbgMarker->StoredDPValues.size(), 1u);
+  EXPECT_EQ(BInst->DbgMarker->StoredDbgRecords.size(), 1u);
   SmallVector<DPValue *, 2> DPValues;
   for (DbgRecord &DPV : BInst->getDbgRecordRange())
     DPValues.push_back(cast<DPValue>(&DPV));
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index 0b019c26148b5c..4bd11d26071b05 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -951,7 +951,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   ExitBlock->createMarker(FirstInst);
   ExitBlock->createMarker(RetInst);
 
-  // Insert DPValues into markers, order should come out DPV2, DPV1.
+  // Insert DbgRecords into markers, order should come out DPV2, DPV1.
   FirstInst->DbgMarker->insertDbgRecord(DPV1, false);
   FirstInst->DbgMarker->insertDbgRecord(DPV2, true);
   unsigned int ItCount = 0;
@@ -964,7 +964,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
 
   // Clone them onto the second marker -- should allocate new DPVs.
   RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, std::nullopt, false);
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
   ItCount = 0;
   // Check these things store the same information; but that they're not the same
   // objects.
@@ -980,25 +980,25 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   }
 
   RetInst->DbgMarker->dropDbgRecords();
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 0u);
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 0u);
 
   // Try cloning one single DPValue.
   auto DIIt = std::next(FirstInst->DbgMarker->getDbgRecordRange().begin());
   RetInst->DbgMarker->cloneDebugInfoFrom(FirstInst->DbgMarker, DIIt, false);
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 1u);
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 1u);
   // The second DPValue should have been cloned; it should have the same values
   // as DPV1.
-  EXPECT_EQ(cast<DPValue>(RetInst->DbgMarker->StoredDPValues.begin())
+  EXPECT_EQ(cast<DPValue>(RetInst->DbgMarker->StoredDbgRecords.begin())
                 ->getRawLocation(),
             DPV1->getRawLocation());
-  // We should be able to drop individual DPValues.
+  // We should be able to drop individual DbgRecords.
   RetInst->DbgMarker->dropOneDbgRecord(
-      &*RetInst->DbgMarker->StoredDPValues.begin());
+      &*RetInst->DbgMarker->StoredDbgRecords.begin());
 
   // "Aborb" a DPMarker: this means pretend that the instruction it's attached
   // to is disappearing so it needs to be transferred into "this" marker.
   RetInst->DbgMarker->absorbDebugValues(*FirstInst->DbgMarker, true);
-  EXPECT_EQ(RetInst->DbgMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(RetInst->DbgMarker->StoredDbgRecords.size(), 2u);
   // Should be the DPV1 and DPV2 objects.
   ItCount = 0;
   for (DbgRecord &Item : RetInst->DbgMarker->getDbgRecordRange()) {
@@ -1009,7 +1009,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
   }
 
   // Finally -- there are two DPValues left over. If we remove evrything in the
-  // basic block, then they should sink down into the "TrailingDPValues"
+  // basic block, then they should sink down into the "TrailingDbgRecords"
   // container for dangling debug-info. Future facilities will restore them
   // back when a terminator is inserted.
   FirstInst->DbgMarker->removeMarker();
@@ -1019,7 +1019,7 @@ TEST(MetadataTest, ConvertDbgToDPValue) {
 
   DPMarker *EndMarker = ExitBlock->getTrailingDbgRecords();
   ASSERT_NE(EndMarker, nullptr);
-  EXPECT_EQ(EndMarker->StoredDPValues.size(), 2u);
+  EXPECT_EQ(EndMarker->StoredDbgRecords.size(), 2u);
   // Test again that it's those two DPValues, DPV1 and DPV2.
   ItCount = 0;
   for (DbgRecord &Item : EndMarker->getDbgRecordRange()) {
@@ -1115,14 +1115,14 @@ TEST(MetadataTest, DPValueConversionRoutines) {
   EXPECT_EQ(FirstInst, FirstInst->DbgMarker->MarkedInstr);
   EXPECT_EQ(SecondInst, SecondInst->DbgMarker->MarkedInstr);
 
-  EXPECT_EQ(FirstInst->DbgMarker->StoredDPValues.size(), 1u);
+  EXPECT_EQ(FirstInst->DbgMarker->StoredDbgRecords.size(), 1u);
   DPValue *DPV1 =
       cast<DPValue>(&*FirstInst->DbgMarker->getDbgRecordRange().begin());
   EXPECT_EQ(DPV1->getMarker(), FirstInst->DbgMarker);
   // Should point at %a, an argument.
   EXPECT_TRUE(isa<Argument>(DPV1->getVariableLocationOp(0)));
 
-  EXPECT_EQ(SecondInst->DbgMarker->StoredDPValues.size(), 1u);
+  EXPECT_EQ(SecondInst->DbgMarker->StoredDbgRecords.size(), 1u);
   DPValue *DPV2 =
       cast<DPValue>(&*SecondInst->DbgMarker->getDbgRecordRange().begin());
   EXPECT_EQ(DPV2->getMarker(), SecondInst->DbgMarker);
@@ -1135,7 +1135,7 @@ TEST(MetadataTest, DPValueConversionRoutines) {
   EXPECT_TRUE(BB2->IsNewDbgInfoFormat);
   for (auto &Inst : *BB2)
     // Either there should be no marker, or it should be empty.
-    EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDPValues.empty());
+    EXPECT_TRUE(!Inst.DbgMarker || Inst.DbgMarker->StoredDbgRecords.empty());
 
   // Validating the first block should continue to not be a problem,
   Error = verifyModule(*M, &errs(), &BrokenDebugInfo);
diff --git a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
index 89fa1334b427ea..0b00734fc4d752 100644
--- a/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
+++ b/llvm/unittests/Transforms/Utils/DebugifyTest.cpp
@@ -60,7 +60,7 @@ struct DebugValueDrop : public FunctionPass {
       for (Instruction &I : BB) {
         if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
           Dbgs.push_back(DVI);
-        // If there are any non-intrinsic records (DPValues), drop those too.
+        // If there are any non-intrinsic records (DbgRecords), drop those too.
         I.dropDbgRecords();
       }
     }

>From 68e858d9668cead4682e329d636516ee35c06d26 Mon Sep 17 00:00:00 2001
From: Ganesh Gopalasubramanian <Ganesh.Gopalasubramanian at amd.com>
Date: Wed, 13 Mar 2024 12:08:46 +0530
Subject: [PATCH 63/63] [X86] Fast AVX-512-VNNI vpdpwssd tuning

---
 llvm/lib/Target/X86/X86.td                   |  6 +++---
 llvm/lib/Target/X86/X86InstrInfo.cpp         | 15 +++++++--------
 llvm/lib/Target/X86/X86InstrPredicates.td    |  2 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h |  2 +-
 llvm/test/CodeGen/X86/vpdpwssd.ll            |  7 ++-----
 5 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 3d426ec612bb2a..78bc043911f2fc 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -684,9 +684,9 @@ def TuningFastGather
                        "Indicates if gather is reasonably fast (this is true for Skylake client and all AVX-512 CPUs)">;
 
 // Generate vpdpwssd instead of vpmaddwd+vpaddd sequence.
-def TuningFastPWSSD
+def TuningFastDPWSSD
     : SubtargetFeature<
-          "fast-pwssd", "HasFastPWSSD", "true",
+          "fast-dpwssd", "HasFastDPWSSD", "true",
           "Prefer vpdpwssd instruction over vpmaddwd+vpaddd instruction sequence">;
 
 def TuningPreferNoGather
@@ -1510,7 +1510,7 @@ def ProcessorFeatures {
     !listconcat(ZN2Features, ZN3AdditionalFeatures);
 
 
-  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastPWSSD];
+  list<SubtargetFeature> ZN4AdditionalTuning = [TuningFastDPWSSD];
   list<SubtargetFeature> ZN4Tuning =
     !listconcat(ZN3Tuning, ZN4AdditionalTuning);
   list<SubtargetFeature> ZN4AdditionalFeatures = [FeatureAVX512,
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 57010f1b02e1f1..61b88edab01795 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10565,18 +10565,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
     bool DoRegPressureReduce) const {
   unsigned Opc = Root.getOpcode();
   switch (Opc) {
-  default:
-    return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns,
-                                                       DoRegPressureReduce);
   case X86::VPDPWSSDrr:
   case X86::VPDPWSSDrm:
   case X86::VPDPWSSDYrr:
   case X86::VPDPWSSDYrm: {
-    if (!Subtarget.hasFastPWSSD()) {
+    if (!Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
       return true;
     }
-    return false;
+    break;
   }
   case X86::VPDPWSSDZ128r:
   case X86::VPDPWSSDZ128m:
@@ -10584,13 +10581,15 @@ bool X86InstrInfo::getMachineCombinerPatterns(
   case X86::VPDPWSSDZ256m:
   case X86::VPDPWSSDZr:
   case X86::VPDPWSSDZm: {
-    if (Subtarget.hasBWI() && !Subtarget.hasFastPWSSD()) {
+   if (Subtarget.hasBWI() && !Subtarget.hasFastDPWSSD()) {
       Patterns.push_back(MachineCombinerPattern::DPWSSD);
       return true;
     }
-    return false;
-  }
+    break;
   }
+  } 
+  return TargetInstrInfo::getMachineCombinerPatterns(Root, 
+                                                     Patterns, DoRegPressureReduce);
 }
 
 static void
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 9fd846dd05c5d0..b8e7768bdaf3c4 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -238,6 +238,6 @@ def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
 def HasFSRM : Predicate<"Subtarget->hasFSRM()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
-def HasFastPWSSD: Predicate<"Subtarget->hasFastPWSSD()">;
+def HasFastDPWSSD: Predicate<"Subtarget->hasFastDPWSSD()">;
 def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
 def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 691e318cda3d33..23035f655098a7 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -94,7 +94,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
       X86::TuningNoDomainDelayBlend,
       X86::TuningPreferShiftShuffle,
       X86::TuningFastImmVectorShift,
-      X86::TuningFastPWSSD,
+      X86::TuningFastDPWSSD,
 
       // Perf-tuning flags.
       X86::TuningFastGather,
diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll
index a044378fae255c..e6a07b4aeb2719 100644
--- a/llvm/test/CodeGen/X86/vpdpwssd.ll
+++ b/llvm/test/CodeGen/X86/vpdpwssd.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s
 
 define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
 ; CHECK-LABEL: vpdpwssd_test:
@@ -9,7 +10,3 @@ define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) {
   %4 = tail call <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
   ret <16 x i32> %4
 }
-
-declare <16 x i32> @llvm.x86.avx512.vpdpwssd.512(<16 x i32>, <16 x i32>, <16 x i32>) #1
-
-attributes #1 = { mustprogress nocallback nofree nosync nounwind willreturn memory(none) }