[libcxx-commits] [clang] [libcxx] [llvm] [CIR][X86] Add support for vpshl/vpshr builtins (PR #179538)
Priyanshu Kumar via libcxx-commits
libcxx-commits at lists.llvm.org
Wed Feb 4 09:02:02 PST 2026
https://github.com/Priyanshu3820 updated https://github.com/llvm/llvm-project/pull/179538
>From 4d2ec7de3325f3f19b80190604f9805ab5f036dd Mon Sep 17 00:00:00 2001
From: Akash Dutta <137309513+akadutta at users.noreply.github.com>
Date: Fri, 30 Jan 2026 08:26:40 -0600
Subject: [PATCH 1/6] Add support for vpshl/vpshr builtins
---
libcxx/include/stdatomic.h | 2 +-
.../include_stdatomic_as_c.sh.cpp | 30 +
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 14 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +-
.../InstCombineSimplifyDemanded.cpp | 30 +
.../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2 +-
...amdgpu-attributor-nocallback-intrinsics.ll | 19 +-
.../AMDGPU/amdgpu-attributor-trap-leaf.ll | 65 ++
.../AMDGPU/av_movimm_pseudo_expansion.mir | 20 +
.../AMDGPU/misaligned-vgpr-regsequence.mir | 30 +
.../test/CodeGen/AMDGPU/peephole-fold-imm.mir | 5 +-
.../siloadstoreopt-misaligned-regsequence.ll | 21 +
.../CodeGen/AMDGPU/v_mov_b64_expansion.mir | 9 +
.../CodeGen/AMDGPU/wmma-gfx12-convergent.mir | 153 ++++-
.../InstCombine/simplify-demanded-fpclass.ll | 649 ++++++++++++++++++
.../AArch64/partial-reduce-chained.ll | 31 +-
18 files changed, 1079 insertions(+), 43 deletions(-)
create mode 100644 libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
diff --git a/libcxx/include/stdatomic.h b/libcxx/include/stdatomic.h
index 2991030eee456..e7b787560ddc4 100644
--- a/libcxx/include/stdatomic.h
+++ b/libcxx/include/stdatomic.h
@@ -231,7 +231,7 @@ using std::atomic_store_explicit _LIBCPP_USING_IF_EXISTS;
using std::atomic_signal_fence _LIBCPP_USING_IF_EXISTS;
using std::atomic_thread_fence _LIBCPP_USING_IF_EXISTS;
-# elif defined(_LIBCPP_COMPILER_CLANG_BASED)
+# elif !defined(__cplusplus) || defined(_LIBCPP_COMPILER_CLANG_BASED)
// Before C++23, we include the next <stdatomic.h> on the path to avoid hijacking
// the header. We do this because Clang has historically shipped a <stdatomic.h>
diff --git a/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp b/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp
new file mode 100644
index 0000000000000..31ad5c9053675
--- /dev/null
+++ b/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// We're building as C, so this test doesn't work when building with modules.
+// UNSUPPORTED: clang-modules-build
+
+// GCC complains about unrecognized arguments because we're compiling the
+// file as C, but we're passing C++ flags on the command-line.
+// UNSUPPORTED: gcc
+
+// Test that stdatomic.h gets the C header with its definitions.
+
+// NOTE: It's not common or recommended to have libc++ in the header search
+// path when compiling C files, but it does happen often enough.
+
+// RUN: %{cxx} -c -xc %s -fsyntax-only %{flags} %{compile_flags} -std=c99
+
+#include <stdatomic.h>
+
+int main(int argc, char** argv) {
+ (void)argc;
+ (void)argv;
+ [[maybe_unused]] atomic_bool x;
+ return 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0b2ee6371da06..4bcaabfd3263a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1343,7 +1343,6 @@ struct AAAMDGPUMinAGPRAlloc
Maximum.takeAssumedMaximum(NumRegs);
return true;
}
-
switch (CB.getIntrinsicID()) {
case Intrinsic::not_intrinsic:
break;
@@ -1361,10 +1360,21 @@ struct AAAMDGPUMinAGPRAlloc
return true;
}
+ // Trap-like intrinsics (llvm.trap, llvm.debugtrap, llvm.ubsantrap) do not
+ // carry the nocallback attribute, so without special handling the
+ // attributor would conservatively drop all implicitly-known inputs and
+ // AGPR allocation information. Still infer that no implicit inputs are
+ // required and that the AGPR allocation stays at zero, unless the call
+ // site has a "trap-func-name" attribute: in that case the trap lowers to
+ // a call to a user-provided handler, which may itself require AGPRs.
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ case Intrinsic::ubsantrap:
+ return CB.hasFnAttr(Attribute::NoCallback) ||
+ !CB.hasFnAttr("trap-func-name");
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
// required to use AGPRs.
-
// Assume !nocallback intrinsics may call a function which requires
// AGPRs.
return CB.hasFnAttr(Attribute::NoCallback);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fb82598709aab..e0e0bb0c05ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2173,11 +2173,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+ const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
+
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
- if (ST.hasMovB64()) {
- MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+ if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+ MI.setDesc(Mov64Desc);
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
break;
@@ -2186,17 +2189,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0); // clamp
+ const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+ const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
+
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+ PkMovRC->contains(Dst)) {
+ BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
} else {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
.addImm(Lo.getSExtValue())
@@ -5259,7 +5266,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// aligned register constraint.
// FIXME: We do not verify inline asm operands, but custom inline asm
// verification is broken anyway
- if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+ if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+ Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
if (const TargetRegisterClass *SubRC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 604a91fcb6e7f..d30e7fd0523a5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)> {
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 405e5dddba639..d111b8996ae75 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1397,13 +1397,13 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
}
if convertibleTo3Addr then {
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7b82613c73ea6..630016fe58a19 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -2143,6 +2143,32 @@ static Value *simplifyDemandedFPClassFnegFabs(KnownFPClass &Known, Value *Src,
return nullptr;
}
+static Value *simplifyDemandedFPClassCopysignMag(Value *MagSrc,
+ FPClassTest DemandedMask,
+ KnownFPClass KnownSrc,
+ bool NSZ) {
+ if (NSZ) {
+ constexpr FPClassTest NegOrZero = fcNegative | fcPosZero;
+ constexpr FPClassTest PosOrZero = fcPositive | fcNegZero;
+
+ if ((DemandedMask & ~NegOrZero) == fcNone &&
+ KnownSrc.isKnownAlways(NegOrZero))
+ return MagSrc;
+
+ if ((DemandedMask & ~PosOrZero) == fcNone &&
+ KnownSrc.isKnownAlways(PosOrZero))
+ return MagSrc;
+ } else {
+ if ((DemandedMask & ~fcNegative) == fcNone && KnownSrc.SignBit == true)
+ return MagSrc;
+
+ if ((DemandedMask & ~fcPositive) == fcNone && KnownSrc.SignBit == false)
+ return MagSrc;
+ }
+
+ return nullptr;
+}
+
static Value *
simplifyDemandedFPClassMinMax(KnownFPClass &Known, Intrinsic::ID IID,
const CallInst *CI, FPClassTest DemandedMask,
@@ -2764,6 +2790,10 @@ Value *InstCombinerImpl::SimplifyDemandedUseFPClass(Instruction *I,
return I;
}
+ if (Value *Simplified = simplifyDemandedFPClassCopysignMag(
+ CI->getArgOperand(0), DemandedMask, Known, FMF.noSignedZeros()))
+ return Simplified;
+
KnownFPClass KnownSign = computeKnownFPClass(CI->getArgOperand(1),
fcAllFlags, CxtI, Depth + 1);
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 7ffccf73bd39d..edcdaea4c31da 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -244,7 +244,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
unsigned IterCnt = 0;
(void)IterCnt;
while (LocalChange) {
- assert(IterCnt++ < 1000 && "Iterative simplification didn't converge!");
+ assert(IterCnt++ < 2000 && "Iterative simplification didn't converge!");
LocalChange = false;
// Loop over all of the basic blocks and remove them if they are unneeded.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
index 71c509afa8e64..163f1aa72e11f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
@@ -35,7 +35,7 @@ define void @use_assume(i1 %arg) {
define void @use_trap() {
; CHECK-LABEL: define void @use_trap(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.trap()
; CHECK-NEXT: ret void
;
@@ -43,9 +43,19 @@ define void @use_trap() {
ret void
}
+define void @use_trap_with_handler() {
+; CHECK-LABEL: define void @use_trap_with_handler(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap() #0
+ ret void
+}
+
define void @use_debugtrap() {
; CHECK-LABEL: define void @use_debugtrap(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.debugtrap()
; CHECK-NEXT: ret void
;
@@ -55,7 +65,7 @@ define void @use_debugtrap() {
define void @use_ubsantrap() {
; CHECK-LABEL: define void @use_ubsantrap(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
; CHECK-NEXT: ret void
;
@@ -63,6 +73,8 @@ define void @use_ubsantrap() {
ret void
}
+
+attributes #0 = { "trap-func-name"="handler" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
@@ -71,4 +83,5 @@ define void @use_ubsantrap() {
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7]] = { "trap-func-name"="handler" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll
new file mode 100644
index 0000000000000..e5e7328890146
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+; Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have the
+; nocallback attribute, so the AMDGPU attributor used to conservatively drop
+; all implicitly-known inputs and AGPR allocation information. Make sure we
+; still infer that no implicit inputs are required and that the AGPR allocation
+; stays at zero.
+
+declare void @llvm.trap()
+
+declare void @llvm.debugtrap()
+
+define amdgpu_kernel void @trap_kernel() {
+; CHECK-LABEL: define amdgpu_kernel void @trap_kernel(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap()
+ ret void
+}
+
+define amdgpu_kernel void @trap_kernel_with_handler() {
+; CHECK-LABEL: define amdgpu_kernel void @trap_kernel_with_handler(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap() #0
+ ret void
+}
+
+define amdgpu_kernel void @debugtrap_kernel() {
+; CHECK-LABEL: define amdgpu_kernel void @debugtrap_kernel(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void @llvm.debugtrap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.debugtrap()
+ ret void
+}
+
+; Test that a trap with both trap-func-name and nocallback is still safe
+define amdgpu_kernel void @trap_kernel_with_handler_and_nocallback() {
+; CHECK-LABEL: define amdgpu_kernel void @trap_kernel_with_handler_and_nocallback(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void @llvm.trap() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap() #1
+ ret void
+}
+
+attributes #0 = { "trap-func-name"="handler" }
+attributes #1 = { nocallback "trap-func-name"="handler" }
+
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "trap-func-name"="handler" }
+; CHECK: attributes #[[ATTR5]] = { nocallback "trap-func-name"="handler" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index c52347b680371..d08185a9e0ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -208,3 +208,23 @@ body: |
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
$vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
...
+
+---
+name: av_mov_b64_misalign_vgpr
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
+ ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+ ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+ $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_misalign_agpr
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
+ ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+ ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+ $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
new file mode 100644
index 0000000000000..26a6cc41ad8fa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -0,0 +1,30 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK: misaligned_regsequence:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+---
+name: misaligned_regsequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64 = COPY $sgpr4_sgpr5
+ %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %3:vreg_64_align2 = COPY %1
+ %4:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %5:vreg_96_align2 = REG_SEQUENCE killed %2, %subreg.sub0, killed %4, %subreg.sub1_sub2
+ FLAT_STORE_DWORDX3 %3, killed %5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96), align 4)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 21455a9f5074f..176d7752133cf 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -425,7 +425,7 @@ body: |
bb.0:
; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned
; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
- ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
%1:vreg_64 = COPY killed %0
@@ -437,7 +437,8 @@ name: fold_v_mov_b64_pseudo_64_to_unaligned
body: |
bb.0:
; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned
- ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN: [[V_MOV_B64_PSEUDO:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
%1:vreg_64 = COPY killed %0
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
new file mode 100644
index 0000000000000..e95aba71775b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define amdgpu_kernel void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
+; CHECK-NEXT: s_endpgm
+entry:
+ %1 = getelementptr inbounds i8, ptr %0, i64 4
+ store i32 0, ptr %0, align 4
+ store i64 0, ptr %1, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 70e2987454192..4c68c4519302a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,3 +93,12 @@ body: |
bb.0:
$vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
...
+
+# GCN-LABEL: name: v_mov_b64_misalign
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+name: v_mov_b64_misalign
+body: |
+ bb.0:
+ $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
index df3e780c61f46..955cf0dbe38d4 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
@@ -1,11 +1,16 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+# WMMA instructions must be marked convergent so that machine-sink does not
+# sink them across control-flow boundaries.
+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s
---
-name: wmma_test
+name: wmma_test_WMMA_F32_16X16X16_F16_w32_threeaddr
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: wmma_test
+ ; CHECK-LABEL: name: wmma_test_WMMA_F32_16X16X16_F16_w32_threeaddr
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
@@ -40,3 +45,147 @@ body: |
S_ENDPGM 0
...
+
+---
+name: wmma_test_V_WMMA_F32_16X16X16_F16_twoaddr_w32
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_F32_16X16X16_F16_twoaddr_w32
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc:vreg_256 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: wmma_test_V_WMMA_I32_16X16X16_IU8_twoaddr_w32
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_I32_16X16X16_IU8_twoaddr_w32
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc:vreg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: %vsrc2:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc2, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc:vreg_128 = IMPLICIT_DEF
+ %vsrc2:vreg_256 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc2, 0, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: wmma_test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc:vreg_256 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: wmma_test_V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc256:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %vsrc512:vreg_512 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr %vsrc512, %vsrc512, 8, %vsrc256, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc256:vreg_256 = IMPLICIT_DEF
+ %vsrc512:vreg_512 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr %vsrc512, %vsrc512, 8, %vsrc256, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
index 6688379665924..7825d380d6e50 100644
--- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
@@ -2041,3 +2041,652 @@ define nofpclass(nan) float @ret_nonan_fmul_select_nan_other_use_commute(i1 %con
%nan.user = fmul float %select, %y
ret float %nan.user
}
+
+define nofpclass(snan) float @copysign_src_known_positive__sign_known_negative_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float nofpclass(nan pinf pnorm psub pzero) %always.negative, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(snan) float @copysign_src_known_positive__sign_known_negative_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[ALWAYS_POSITIVE]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(snan) float @copysign_src_known_negative__sign_known_negative_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative0, float nofpclass(nan pinf pnorm psub pzero) %always.negative1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(snan) float @copysign_src_known_negative__sign_known_negative_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE0:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE0]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE0]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative0, float %always.negative1)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(snan) float @copysign_src_known_positive__sign_known_positive_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive0, float nofpclass(nan ninf nnorm nsub nzero) %always.positive1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(snan) float @copysign_src_known_positive__sign_known_positive_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: store float [[ALWAYS_POSITIVE0]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE0]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive0, float %always.positive1)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan) float @ret_nonan__copysign_src_known_negative_or_nan__sign_known_negative_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative0.or.nan, float nofpclass(nan pinf pnorm psub pzero) %always.negative1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_nonan__copysign_src_known_negative_or_nan__sign_known_negative_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE0_OR_NAN:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE0_OR_NAN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE0_OR_NAN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative0.or.nan, float %always.negative1)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign__sign_known_negative_multiple_use(float %unknown, float nofpclass(nan pinf pnorm psub pzero) %always.negative, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign__sign_known_negative_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float poison
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign__sign_known_positive_multiple_use(float %unknown, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign__sign_known_positive_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign__sign_known_negative_multiple_use(float %unknown, float nofpclass(nan pinf pnorm psub pzero) %always.negative, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign__sign_known_negative_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign__sign_known_positive_multiple_use(float %unknown, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign__sign_known_positive_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float poison
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_multiple_use(float %unknown.mag, float %unknown.sign, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_multiple_use
+; CHECK-SAME: (float [[UNKNOWN_MAG:%.*]], float [[UNKNOWN_SIGN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[UNKNOWN_MAG]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown.mag, float %unknown.sign)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_multiple_use(float %unknown.mag, float %unknown.sign, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_multiple_use
+; CHECK-SAME: (float [[UNKNOWN_MAG:%.*]], float [[UNKNOWN_SIGN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[UNKNOWN_MAG]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown.mag, float %unknown.sign)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Fold to direct use of %always.positive
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; Fold to direct use of %always.positive
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Fold to direct use of %always.negative
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+; Fold to direct use of %always.negative
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Negative test
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+; Negative test
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Negative test
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[ALWAYS_POSITIVE]]
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; Could still be positive for +inf input
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; The ninf flag gives the missing no-infs
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; The ninf flag gives the missing no-infs
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_POSITIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; No pinf from argument
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown(float nofpclass(nan inf nnorm nsub nzero) %always.positive.no.inf, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ALWAYS_POSITIVE_NO_INF:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_NO_INF]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.no.inf, float %unknown)
+ ret float %copysign
+}
+
+; No pinf from argument
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown_multiple_use(float nofpclass(nan inf nnorm nsub nzero) %always.positive.no.inf, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ALWAYS_POSITIVE_NO_INF:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_NO_INF]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.no.inf, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub) float @ret_only_positive_or_zero__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub) %always.positive.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_only_positive_or_zero__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Ignore 0 mismatch with nsz
+define nofpclass(nan ninf nnorm nsub) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub) %always.positive.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE_OR_ZERO]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.positive.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Do not use nsz with multiple uses
+define nofpclass(nan ninf nnorm nsub) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan ninf nnorm nsub) %always.positive.or.zero, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nsz float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.positive.or.zero, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub) float @ret_only_negative_or_pzero__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub) %always.negative.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_only_negative_or_pzero__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Ignore 0 mismatch with nsz
+define nofpclass(nan pinf pnorm psub) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub) %always.negative.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_ZERO]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.negative.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Do not use nsz with multiple uses
+define nofpclass(nan pinf pnorm psub) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub) %always.negative.or.zero, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nsz float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.negative.or.zero, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_NAN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; NOTE(review): the four "*positive_or_nan*__src_known_positive*" tests below appear to be copy-pasted from the negative variants above — the return attribute is still nofpclass(pinf pnorm psub pzero) and the argument is still a known-negative %always.negative, so the positive case is not actually tested. TODO: confirm intent and regenerate with update_test_checks.py.
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_NAN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; We can only tell the sign bit is negative due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.negative.or.nan, float %always.negative.or.pinf)
+ ret float %copysign
+}
+
+; We can only tell the sign bit is negative due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.negative.or.nan, float %always.negative.or.pinf)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; We can only tell the sign bit is positive due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf(float nofpclass(ninf nnorm nsub nzero) %always.positive.or.nan, float nofpclass(nan nnorm nsub nzero) %always.negative.or.pinf) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ALWAYS_POSITIVE_OR_NAN:%.*]], float nofpclass(nan nzero nsub nnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive.or.nan, float %always.negative.or.pinf)
+ ret float %copysign
+}
+
+; We can only tell the sign bit is positive due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf__multiple_use(float nofpclass(ninf nnorm nsub nzero) %always.positive.or.nan, float nofpclass(nan nnorm nsub nzero) %always.negative.or.pinf, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf__multiple_use
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ALWAYS_POSITIVE_OR_NAN:%.*]], float nofpclass(nan nzero nsub nnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive.or.nan, float %always.negative.or.pinf)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-inf return
+define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_PINF]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pinf, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; No-pinf result not sufficient
+define nofpclass(nan pinf) float @ret_no_nan_no_pinf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf) float @ret_no_nan_no_pinf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pinf, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; No-ninf result not sufficient
+define nofpclass(nan ninf) float @ret_no_nan_no_ninf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf) float @ret_no_nan_no_ninf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pinf, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-norm return
+define nofpclass(nan norm) float @ret_no_nan_no_sub__copysign__src_known_negative_or_nan__sign_known_negative_or_pnorm__multiple_use(float nofpclass(nan pinf psub pzero) %always.negative.or.pnorm, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan norm) float @ret_no_nan_no_sub__copysign__src_known_negative_or_nan__sign_known_negative_or_pnorm__multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub) [[ALWAYS_NEGATIVE_OR_PNORM:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PNORM]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pnorm, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-sub return
+define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_negative_or_nan__sign_known_negative_or_psub__multiple_use(float nofpclass(nan pinf pnorm pzero) %always.negative.or.psub, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_negative_or_nan__sign_known_negative_or_psub__multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero pnorm) [[ALWAYS_NEGATIVE_OR_PSUB:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PSUB]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.psub, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-zero return
+define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_negative_or_nan__sign_known_negative_or_pzero__multiple_use(float nofpclass(nan pinf pnorm psub) %always.negative.or.pzero, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_negative_or_nan__sign_known_negative_or_pzero__multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_PZERO:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PZERO]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pzero, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-inf return
+define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_positive_or_nan__sign_known_positive_or_ninf__multiple_use(float nofpclass(nan nnorm nsub nzero) %always.positive.or.ninf, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_positive_or_nan__sign_known_positive_or_ninf__multiple_use
+; CHECK-SAME: (float nofpclass(nan nzero nsub nnorm) [[ALWAYS_POSITIVE_OR_NINF:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NINF]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE_OR_NINF]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.ninf, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-norm return
+define nofpclass(nan norm) float @ret_no_nan_no_norm__copysign__src_known_positive_or_nan__sign_known_positive_or_norm__multiple_use(float nofpclass(nan ninf nsub nzero) %always.positive.or.nnorm, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan norm) float @ret_no_nan_no_norm__copysign__src_known_positive_or_nan__sign_known_positive_or_norm__multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub) [[ALWAYS_POSITIVE_OR_NNORM:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NNORM]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.nnorm, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-sub return
+define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_positive_or_nan__sign_known_positive_or_sub__multiple_use(float nofpclass(nan ninf nnorm nzero) %always.positive.or.nsub, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_positive_or_nan__sign_known_positive_or_sub__multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nnorm) [[ALWAYS_POSITIVE_OR_NSUB:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NSUB]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.nsub, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-zero return
+define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_positive_or_nan__sign_known_positive_or_zero__multiple_use(float nofpclass(nan ninf nnorm nsub) %always.positive.or.nzero, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_positive_or_nan__sign_known_positive_or_zero__multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_NZERO:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NZERO]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.nzero, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index f90b26013fcbc..f1dee958fa09c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
-; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
-; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
-; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-width=8 -scalable-vectorization=preferred -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve \
+; RUN: -vectorizer-maximize-bandwidth -force-vector-width=8 -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
@@ -158,7 +159,7 @@ for.body: ; preds = %for.body.preheader,
%sub = sub i32 %add, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -309,7 +310,7 @@ for.body: ; preds = %for.body.preheader,
%add.2 = add i32 %add, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -465,7 +466,7 @@ for.body: ; preds = %for.body.preheader,
%add = add i32 %sub, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -625,7 +626,7 @@ for.body: ; preds = %for.body.preheader,
%sub.2 = sub i32 %sub, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -787,7 +788,7 @@ for.body: ; preds = %for.body.preheader,
%sub.2 = add i32 %add, %mul.bc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -955,7 +956,7 @@ for.body: ; preds = %for.body.preheader,
%sub.2 = sub i32 %add, %mul.bc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
@@ -1103,7 +1104,7 @@ for.body: ; preds = %for.body.preheader,
%add2 = add i32 %add, %c.ext
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
@@ -1235,7 +1236,7 @@ for.body: ; preds = %for.body.preheader,
%add2 = add i32 %add, %b.ext
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
@@ -1384,11 +1385,11 @@ for.body: ; preds = %for.body.preheader,
%add2 = add i32 %add, %mul.ab
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
attributes #0 = { vscale_range(1,16) }
-
-!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!1 = distinct !{!0}
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!2 = !{!"llvm.loop.interleave.count", i32 1}
>From 179ef1fe99a24321d1ea9f572cf1c3da2be74b30 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Fri, 30 Jan 2026 15:03:54 +0000
Subject: [PATCH 2/6] NFC: Rename CodeGenOptions::StackUsageOutput to
StackUsageFile (#178898)
Preparation for #178005.
"Output" has too many different interpretations: it could be an
enabled/disabled, a file format, etc. Clarify that it's the destination
file.
---
clang/include/clang/Basic/CodeGenOptions.h | 2 +-
clang/include/clang/Options/Options.td | 2 +-
clang/lib/CodeGen/BackendUtil.cpp | 2 +-
clang/lib/Frontend/CompilerInvocation.cpp | 2 +-
llvm/include/llvm/Target/TargetOptions.h | 2 +-
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index c60ca507ff917..8ef0d87faaeaf 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -521,7 +521,7 @@ class CodeGenOptions : public CodeGenOptionsBase {
/// Name of the stack usage file (i.e., .su file) if user passes
/// -fstack-usage. If empty, it can be implied that -fstack-usage is not
/// passed on the command line.
- std::string StackUsageOutput;
+ std::string StackUsageFile;
/// Executable and command-line used to create a given CompilerInvocation.
/// Most of the time this will be the full -cc1 command.
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 41a23ba4cb33d..421208a812bbc 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -4664,7 +4664,7 @@ def fstack_usage : Flag<["-"], "fstack-usage">, Group<f_Group>,
def stack_usage_file : Separate<["-"], "stack-usage-file">,
Visibility<[CC1Option]>,
HelpText<"Filename (or -) to write stack usage output to">,
- MarshallingInfoString<CodeGenOpts<"StackUsageOutput">>;
+ MarshallingInfoString<CodeGenOpts<"StackUsageFile">>;
def fextend_variable_liveness_EQ : Joined<["-"], "fextend-variable-liveness=">,
Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Extend the liveness of user variables through optimizations to "
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index d411ef1bf8763..b286ff359ec40 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -453,7 +453,7 @@ static bool initTargetOptions(const CompilerInstance &CI,
Options.EmulatedTLS = CodeGenOpts.EmulatedTLS;
Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning();
Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection;
- Options.StackUsageOutput = CodeGenOpts.StackUsageOutput;
+ Options.StackUsageFile = CodeGenOpts.StackUsageFile;
Options.EmitAddrsig = CodeGenOpts.Addrsig;
Options.ForceDwarfFrameSection = CodeGenOpts.ForceDwarfFrameSection;
Options.EmitCallGraphSection = CodeGenOpts.CallGraphSection;
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 5a79634773866..2af4a7f536623 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2244,7 +2244,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
if (UsingSampleProfile)
NeedLocTracking = true;
- if (!Opts.StackUsageOutput.empty())
+ if (!Opts.StackUsageFile.empty())
NeedLocTracking = true;
// If the user requested a flag that requires source locations available in
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index 7af50691ec0e5..a9b86626cf598 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -368,7 +368,7 @@ class TargetOptions {
/// Name of the stack usage file (i.e., .su file) if user passes
/// -fstack-usage. If empty, it can be implied that -fstack-usage is not
/// passed on the command line.
- std::string StackUsageOutput;
+ std::string StackUsageFile;
/// If greater than 0, override TargetLoweringBase::PrefLoopAlignment.
unsigned LoopAlignment = 0;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 420f09c514d63..b64c7f5b44033 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1672,7 +1672,7 @@ void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) {
}
void AsmPrinter::emitStackUsage(const MachineFunction &MF) {
- const std::string &OutputFilename = MF.getTarget().Options.StackUsageOutput;
+ const std::string &OutputFilename = MF.getTarget().Options.StackUsageFile;
// OutputFilename empty implies -fstack-usage is not passed.
if (OutputFilename.empty())
>From 561032845b522bb340fd3988ca1d8e5d86fdd632 Mon Sep 17 00:00:00 2001
From: Joe Nash <joseph.nash at amd.com>
Date: Fri, 30 Jan 2026 10:13:33 -0500
Subject: [PATCH 3/6] [AMDGPU] Fix VOPD checks for commuting OpX and OpY
(#178772)
We need to check that OpX does not write the sources of OpY, but if we
swap OpX and OpY with respect to program order, the check was not
swapped correctly.
The checks on gfx1250 can be relaxed slightly, that is planned for a
future patch.
---------
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 26 ++-
llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 58 +++---
.../test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll | 5 +-
.../AMDGPU/GlobalISel/insertelement.i16.ll | 10 +-
.../AMDGPU/GlobalISel/insertelement.i8.ll | 10 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 36 ++--
llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll | 2 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 6 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 12 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 24 ++-
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 24 ++-
.../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 8 +-
.../atomic_optimizations_struct_buffer.ll | 171 ++++++------------
llvm/test/CodeGen/AMDGPU/bf16.ll | 45 +++--
.../AMDGPU/expand-waitcnt-profiling.ll | 32 ++--
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 13 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 2 +-
.../match-perm-extract-vector-elt-bug.ll | 6 +-
.../CodeGen/AMDGPU/narrow_math_for_and.ll | 6 +-
.../CodeGen/AMDGPU/vopd-combine-gfx1250.mir | 107 +++++++++++
20 files changed, 342 insertions(+), 261 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 27f40f1705bb4..72805aa9165b6 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -134,6 +134,7 @@ class GCNCreateVOPD {
LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
const SIInstrInfo *SII = ST->getInstrInfo();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
bool Changed = false;
unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(*ST);
bool HasVOPD3 = ST->hasVOPD3();
@@ -160,16 +161,25 @@ class GCNCreateVOPD {
llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD =
AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3);
- if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+ if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y &&
+ llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, VOPD3)) {
CI = VOPDCombineInfo(FirstMI, SecondMI, VOPD3);
- else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+ return true;
+ }
+ // We can try swapping the order of the instructions, but in that case
+ // neither instruction can write to a register the other reads from.
+ // OpX cannot write something OpY reads because that is the hardware
+ // rule, and OpY cannot write what OpX reads because that would
+ // violate the data dependency in the original order.
+ for (const auto &Use : SecondMI->uses())
+ if (Use.isReg() && FirstMI->modifiesRegister(Use.getReg(), TRI))
+ return false;
+ if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X &&
+ llvm::checkVOPDRegConstraints(*SII, *SecondMI, *FirstMI, VOPD3)) {
CI = VOPDCombineInfo(SecondMI, FirstMI, VOPD3);
- else
- return false;
- // checkVOPDRegConstraints cares about program order, but doReplace
- // cares about X-Y order in the constituted VOPD
- return llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI,
- VOPD3);
+ return true;
+ }
+ return false;
};
if (checkVOPD(false) || (HasVOPD3 && checkVOPD(true))) {
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 9e66909e41052..663f53889ac74 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -35,18 +35,18 @@ using namespace llvm;
#define DEBUG_TYPE "gcn-vopd-utils"
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
- const MachineInstr &FirstMI,
- const MachineInstr &SecondMI, bool IsVOPD3) {
+ const MachineInstr &MIX,
+ const MachineInstr &MIY, bool IsVOPD3) {
namespace VOPD = AMDGPU::VOPD;
- const MachineFunction *MF = FirstMI.getMF();
+ const MachineFunction *MF = MIX.getMF();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
if (IsVOPD3 && !ST.hasVOPD3())
return false;
- if (!IsVOPD3 && (TII.isVOP3(FirstMI) || TII.isVOP3(SecondMI)))
+ if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
return false;
- if (TII.isDPP(FirstMI) || TII.isDPP(SecondMI))
+ if (TII.isDPP(MIX) || TII.isDPP(MIY))
return false;
const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
@@ -61,32 +61,24 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
UniqueLiterals.push_back(&Op);
};
SmallVector<Register> UniqueScalarRegs;
- assert([&]() -> bool {
- for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
- MII != FirstMI.getParent()->instr_end(); ++MII) {
- if (&*MII == &SecondMI)
- return true;
- }
- return false;
- }() && "Expected FirstMI to precede SecondMI");
- // Cannot pair dependent instructions
- for (const auto &Use : SecondMI.uses())
- if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI))
+
+ // MIX must not modify any registers used by MIY.
+ for (const auto &Use : MIY.uses())
+ if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI))
return false;
auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
- const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? FirstMI : SecondMI;
+ const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY;
const MachineOperand &Operand = MI.getOperand(OperandIdx);
if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg()))
return Operand.getReg();
return Register();
};
- auto InstInfo =
- AMDGPU::getVOPDInstInfo(FirstMI.getDesc(), SecondMI.getDesc());
+ auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc());
for (auto CompIdx : VOPD::COMPONENTS) {
- const MachineInstr &MI = (CompIdx == VOPD::X) ? FirstMI : SecondMI;
+ const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY;
const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0);
if (Src0.isReg()) {
@@ -153,8 +145,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
// On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
// source-cache.
bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 &&
- FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
- SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32;
+ MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
bool AllowSameVGPR = ST.hasGFX1250Insts();
if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR,
@@ -163,22 +155,23 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if (IsVOPD3) {
// BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero.
- if (AMDGPU::hasNamedOperand(SecondMI.getOpcode(), AMDGPU::OpName::bitop3)) {
+ // MIX check is only relevant to scheduling?
+ if (AMDGPU::hasNamedOperand(MIX.getOpcode(), AMDGPU::OpName::bitop3)) {
const MachineOperand &Src2 =
- *TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2);
+ *TII.getNamedOperand(MIX, AMDGPU::OpName::src2);
if (!Src2.isImm() || Src2.getImm())
return false;
}
- if (AMDGPU::hasNamedOperand(FirstMI.getOpcode(), AMDGPU::OpName::bitop3)) {
+ if (AMDGPU::hasNamedOperand(MIY.getOpcode(), AMDGPU::OpName::bitop3)) {
const MachineOperand &Src2 =
- *TII.getNamedOperand(FirstMI, AMDGPU::OpName::src2);
+ *TII.getNamedOperand(MIY, AMDGPU::OpName::src2);
if (!Src2.isImm() || Src2.getImm())
return false;
}
}
- LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
- << "\n\tY: " << SecondMI << "\n");
+ LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX
+ << "\n\tY: " << MIY << "\n");
return true;
}
@@ -208,6 +201,15 @@ static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
(FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
return false;
+ assert([&]() -> bool {
+ for (auto MII = MachineBasicBlock::const_iterator(FirstMI);
+ MII != FirstMI->getParent()->instr_end(); ++MII) {
+ if (&*MII == &SecondMI)
+ return true;
+ }
+ return false;
+ }() && "Expected FirstMI to precede SecondMI");
+
return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3);
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll
index fa72eb72fd723..77b05dc460168 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll
@@ -404,8 +404,9 @@ define amdgpu_ps half @fptrunc_f64_to_f16_div(double %a) {
; GFX1250-NEXT: v_or_b32_e32 v3, v5, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
-; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 2, v3 :: v_dual_bitop2_b32 v4, 7, v3 bitop3:0x40
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v4, 7, v3
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
; GFX1250-NEXT: v_cmp_lt_i32_e64 s0, 5, v4
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 0e1bbbd1ea92b..c7b63f749e950 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -437,8 +437,9 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -709,8 +710,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 4598bcc04a505..7c212f1e110d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -1224,8 +1224,9 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1496,8 +1497,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index f589919992335..ea8023a3a227b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1295,18 +1295,12 @@ define i64 @v_shl_i64_63(i64 %value) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_shl_i64_63:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 31, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_shl_i64_63:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 31, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10PLUS-LABEL: v_shl_i64_63:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 31, v0
+; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = shl i64 %value, 63
ret i64 %result
}
@@ -1319,18 +1313,12 @@ define i64 @v_shl_i64_33(i64 %value) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_shl_i64_33:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_shl_i64_33:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10PLUS-LABEL: v_shl_i64_33:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = shl i64 %value, 33
ret i64 %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
index e5992e398ddbd..adcc86532e46d 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v1, 20, v0 :: v_dual_mov_b32 v0, s0
; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1
; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 15bef1949b27f..1b1f7fcadc540 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -208038,10 +208038,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v1, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_f32 v2, 0x40c00000, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index 322689c91425b..194ee0705a921 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -12680,7 +12680,8 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
@@ -16191,7 +16192,8 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
@@ -27727,7 +27729,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
@@ -31166,7 +31169,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 911c911fa1ad4..60ce818302ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -13756,10 +13756,12 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
@@ -17627,10 +17629,12 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
@@ -38965,7 +38969,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
@@ -38980,7 +38985,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
@@ -41124,7 +41130,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
@@ -41139,7 +41146,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 9f342f95cd8ee..19462c5bf8a9f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -14872,7 +14872,8 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v37
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
@@ -19086,7 +19087,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v37
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
@@ -32805,7 +32807,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v16
@@ -32816,7 +32819,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
@@ -36930,7 +36934,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v16
@@ -36941,7 +36946,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
@@ -42308,7 +42314,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
@@ -44660,7 +44667,8 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index b5ebca1846807..87a06cfe75265 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -94,12 +94,12 @@ define amdgpu_cs_chain void @test_alloca_var(i32 %count) {
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT: v_and_b32_e32 v1, -16, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_ctz_i32_b32 s2, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s3, v1, s2
; GFX12-NEXT: s_bitset0_b32 s1, s2
; GFX12-NEXT: s_max_u32 s0, s0, s3
@@ -261,12 +261,12 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT: v_and_b32_e32 v1, -16, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_ctz_i32_b32 s2, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s3, v1, s2
; GFX12-NEXT: s_bitset0_b32 s1, s2
; GFX12-NEXT: s_max_u32 s0, s0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 45b47a1f389e5..c886146b94465 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -1135,62 +1135,34 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11W64-LABEL: add_i32_varying_offset:
-; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W64-NEXT: s_mov_b32 s6, 0
-; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W64-NEXT: s_endpgm
-;
-; GFX11W32-LABEL: add_i32_varying_offset:
-; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W32-NEXT: s_mov_b32 s6, 0
-; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W32-NEXT: s_endpgm
-;
-; GFX12W64-LABEL: add_i32_varying_offset:
-; GFX12W64: ; %bb.0: ; %entry
-; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W64-NEXT: s_endpgm
-;
-; GFX12W32-LABEL: add_i32_varying_offset:
-; GFX12W32: ; %bb.0: ; %entry
-; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W32-NEXT: s_endpgm
+; GFX11-LABEL: add_i32_varying_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_mov_b32_e32 v2, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: add_i32_varying_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
@@ -2335,68 +2307,37 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11W64-LABEL: sub_i32_varying_offset:
-; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W64-NEXT: s_mov_b32 s6, 0
-; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W64-NEXT: s_endpgm
-;
-; GFX11W32-LABEL: sub_i32_varying_offset:
-; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W32-NEXT: s_mov_b32 s6, 0
-; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W32-NEXT: s_endpgm
-;
-; GFX12W64-LABEL: sub_i32_varying_offset:
-; GFX12W64: ; %bb.0: ; %entry
-; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W64-NEXT: s_endpgm
-;
-; GFX12W32-LABEL: sub_i32_varying_offset:
-; GFX12W32: ; %bb.0: ; %entry
-; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W32-NEXT: s_endpgm
+; GFX11-LABEL: sub_i32_varying_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_mov_b32_e32 v2, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sub_i32_varying_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
store i32 %old, ptr addrspace(1) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11: {{.*}}
-; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0394ed7f89633..a5763816e58cc 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -35793,8 +35793,8 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX1250-NEXT: v_xor_b32_e32 v10, v10, v2
; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[8:9], v[0:1]
; GFX1250-NEXT: v_xor_b32_e32 v9, v14, v4
-; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_bitop2_b32 v8, v7, v4 bitop3:0x14
-; GFX1250-NEXT: v_xor_b32_e32 v13, v12, v6
+; GFX1250-NEXT: v_xor_b32_e32 v8, v7, v4
+; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_bitop2_b32 v13, v12, v6 bitop3:0x14
; GFX1250-NEXT: v_xor_b32_e32 v12, v15, v6
; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[10:11], v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -37399,8 +37399,9 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
@@ -38305,12 +38306,13 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v9
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -38944,8 +38946,8 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
-; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8
@@ -40580,8 +40582,9 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -41294,11 +41297,12 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX1250FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -48573,9 +48577,10 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat>
; GFX11FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v11
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v14, v7
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_and_b32 v7, 0xffff0000, v10
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v12, v16, v11, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v3, 16, 1
@@ -49579,8 +49584,8 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add3_u32 v24, v28, v23, 0x7fff
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v26, v15
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v27 :: v_dual_and_b32 v15, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v25, v27, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11FAKE16-NEXT: v_bfe_u32 v26, v7, 16, 1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index 848a9d07084ed..07005fd020e81 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -376,10 +376,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-EXPAND-NEXT: s_endpgm
;
@@ -393,10 +393,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
@@ -410,10 +410,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-EXPAND-NEXT: s_endpgm
;
@@ -427,10 +427,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NOEXPAND-NEXT: s_endpgm
@@ -650,9 +650,9 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-EXPAND-NEXT: s_endpgm
;
@@ -666,9 +666,9 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
@@ -682,10 +682,10 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-EXPAND-NEXT: s_endpgm
;
@@ -699,10 +699,10 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NOEXPAND-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 23753bc5970dd..2c02d7d9b3b7d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1067,8 +1067,9 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo
@@ -5106,8 +5107,8 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_add_nc_u32 v9, v9, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_and_b32 v15, 7, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
@@ -6433,11 +6434,11 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v12
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v13 :: v_dual_add_nc_u32 v10, v16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v19
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index fc8467cb73ab6..dc28b2c01c897 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1995,7 +1995,7 @@ define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0xd1, v1, v[0:1]
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_sub_nc_u32 v1, v3, v0
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, v3, v0 :: v_dual_mov_b32 v0, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xffffff00000000d1
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 93d772fdb7854..35d7b6a7ad0e5 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -62,12 +62,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s4, s6, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s13, s13, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
index 57805063b92b1..bd9135a44371b 100644
--- a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -9,7 +9,8 @@ define i64 @narrow_add(i64 %a, i64 %b) {
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%zext0 = and i64 %a, 2147483647
%zext1 = and i64 %b, 2147483647
@@ -40,7 +41,8 @@ define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v2, v1, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir
index 05bbb0f54ef9e..c40b47ff4e4b4 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir
@@ -4501,3 +4501,110 @@ body: |
$vgpr142 = V_FMAMK_F32 $vgpr377, 1069066811, $vgpr142, implicit $mode, implicit $exec
$vgpr145 = V_FMAC_F32_e32 1069066811, $vgpr366, $vgpr145, implicit $mode, implicit $exec
...
+
+---
+name: no_combine_opx_writes_opy_src
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: no_combine_opx_writes_opy_src
+ ; SCHED: $vgpr6 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; SCHED-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ; SCHED-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ;
+ ; PAIR-LABEL: name: no_combine_opx_writes_opy_src
+ ; PAIR: $vgpr6 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; PAIR-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ; PAIR-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ;
+ ; LOWER-LABEL: name: no_combine_opx_writes_opy_src
+ ; LOWER: $vgpr6 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; LOWER-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ; LOWER-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ $vgpr5 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $sgpr15 = IMPLICIT_DEF
+ $vgpr6 = V_LSHLREV_B32_e32 8, $vgpr5, implicit $mode, implicit $exec
+ $vgpr7 = V_XOR_B32_e32 $sgpr15, $vgpr6, implicit $exec
+...
+
+---
+name: no_combine_opx_writes_opy_src_reorder
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: no_combine_opx_writes_opy_src_reorder
+ ; SCHED: $vgpr6 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; SCHED-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ; SCHED-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ;
+ ; PAIR-LABEL: name: no_combine_opx_writes_opy_src_reorder
+ ; PAIR: $vgpr6 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; PAIR-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ; PAIR-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ;
+ ; LOWER-LABEL: name: no_combine_opx_writes_opy_src_reorder
+ ; LOWER: $vgpr6 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; LOWER-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ; LOWER-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ $vgpr5 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $sgpr15 = IMPLICIT_DEF
+ $vgpr7 = V_XOR_B32_e32 $sgpr15, $vgpr6, implicit $exec
+ $vgpr6 = V_LSHLREV_B32_e32 8, $vgpr5, implicit $mode, implicit $exec
+...
+
+---
+name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+ ; SCHED: $vgpr2 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr6 = V_FMAC_F32_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; SCHED-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr0_vgpr1, implicit $mode, implicit $exec
+ ;
+ ; PAIR-LABEL: name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+ ; PAIR: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr6 = V_FMAC_F32_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; PAIR-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr0_vgpr1, implicit $mode, implicit $exec
+ ;
+ ; LOWER-LABEL: name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+ ; LOWER: $vgpr2 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr6 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr6 = V_FMAC_F32_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; LOWER-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr0_vgpr1, implicit $mode, implicit $exec
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = IMPLICIT_DEF
+ $vgpr3 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $vgpr6 = V_FMAC_F32_e64 0, $vgpr2, 0, $vgpr3, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec
+ $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, $vgpr0_vgpr1, implicit $mode, implicit $exec
+...
>From 4d74d93594fdcaba28d1916cabb31f4d8ba95e58 Mon Sep 17 00:00:00 2001
From: Leon Clark <Leon4116 at gmail.com>
Date: Fri, 30 Jan 2026 15:20:27 +0000
Subject: [PATCH 4/6] [VectorCombine] Trim low end of loads used in
shufflevector rebroadcasts. (#149093)
Following on from #128938, trim the low end of loads where only some of
the incoming lanes are used for rebroadcasts in shufflevector
instructions.
---------
Co-authored-by: Leon Clark <leoclark at amd.com>
Co-authored-by: Simon Pilgrim <llvm-dev at redking.me.uk>
---
.../Transforms/Vectorize/VectorCombine.cpp | 26 ++++++---
.../VectorCombine/load-shufflevector.ll | 55 +++++++++++--------
2 files changed, 50 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b3295576eb73e..d173afd24e54c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5420,7 +5420,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
// Get the range of vector elements used by shufflevector instructions.
if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
- unsigned const NewNumElements = Indices->second + 1u;
+ unsigned const NewNumElements = (Indices->second + 1) - Indices->first;
// If the range of vector elements is smaller than the full load, attempt
// to create a smaller load.
@@ -5442,19 +5442,23 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
SmallVector<UseEntry, 4u> NewUses;
- unsigned const MaxIndex = NewNumElements * 2u;
+ unsigned const LowOffset = Indices->first;
+ unsigned const HighOffset = OldNumElements - (Indices->second + 1);
for (llvm::Use &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
ArrayRef<int> OldMask = Shuffle->getShuffleMask();
// Create entry for new use.
- NewUses.push_back({Shuffle, OldMask});
-
- // Validate mask indices.
+ NewUses.push_back({Shuffle, {}});
+ std::vector<int> &NewMask = NewUses.back().second;
for (int Index : OldMask) {
- if (Index >= static_cast<int>(MaxIndex))
+ int NewIndex = Index >= static_cast<int>(OldNumElements)
+ ? Index - LowOffset - HighOffset
+ : Index - LowOffset;
+ if (NewIndex >= static_cast<int>(NewNumElements * 2u))
return false;
+ NewMask.push_back(NewIndex);
}
// Update costs.
@@ -5463,7 +5467,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
OldLoadTy, OldMask, CostKind);
NewCost +=
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- NewLoadTy, OldMask, CostKind);
+ NewLoadTy, NewMask, CostKind);
}
LLVM_DEBUG(
@@ -5475,8 +5479,14 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return false;
// Create new load of smaller vector.
+ Type *IndexTy = DL->getIndexType(PtrOp->getType());
+ Value *NewPtr = LowOffset > 0u
+ ? Builder.CreateInBoundsPtrAdd(
+ PtrOp, ConstantInt::get(IndexTy, LowOffset))
+ : PtrOp;
+
auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
+ Builder.CreateAlignedLoad(NewLoadTy, NewPtr, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
// Replace all uses.
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 7d9393ab77f20..2b5cec9ccfde5 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -47,8 +47,9 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -61,8 +62,9 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -108,13 +110,14 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -141,13 +144,14 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -202,8 +206,9 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -216,8 +221,9 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -296,13 +302,14 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -329,13 +336,14 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -362,13 +370,14 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
>From 62fbfe9a16bd7a08f36c3afacd91c9e79a216a6a Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Wed, 4 Feb 2026 16:47:58 +0000
Subject: [PATCH 5/6] Add support for vpshld/vpshrd (AVX-512 VBMI2 funnel-shift) builtins
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 19 +-
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 13 +-
.../X86/avx512vbmi2-builtins.c | 418 ++++++++++++++++++
3 files changed, 444 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 580ada8901cbb..c7d6cd8ae240e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1246,8 +1246,23 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
case Builtin::BI__builtin_elementwise_canonicalize:
case Builtin::BI__builtin_elementwise_copysign:
case Builtin::BI__builtin_elementwise_fma:
- case Builtin::BI__builtin_elementwise_fshl:
- case Builtin::BI__builtin_elementwise_fshr:
+ return errorBuiltinNYI(*this, e, builtinID);
+ case Builtin::BI__builtin_elementwise_fshl: {
+ mlir::Location loc = getLoc(e->getExprLoc());
+ mlir::Value a = emitScalarExpr(e->getArg(0));
+ mlir::Value b = emitScalarExpr(e->getArg(1));
+ mlir::Value c = emitScalarExpr(e->getArg(2));
+ return RValue::get(builder.emitIntrinsicCallOp(loc, "fshl", a.getType(),
+ mlir::ValueRange{a, b, c}));
+ }
+ case Builtin::BI__builtin_elementwise_fshr: {
+ mlir::Location loc = getLoc(e->getExprLoc());
+ mlir::Value a = emitScalarExpr(e->getArg(0));
+ mlir::Value b = emitScalarExpr(e->getArg(1));
+ mlir::Value c = emitScalarExpr(e->getArg(2));
+ return RValue::get(builder.emitIntrinsicCallOp(loc, "fshr", a.getType(),
+ mlir::ValueRange{a, b, c}));
+ }
case Builtin::BI__builtin_elementwise_add_sat:
case Builtin::BI__builtin_elementwise_sub_sat:
case Builtin::BI__builtin_elementwise_max:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 80022998448ad..4e190ab4de3fb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -2032,6 +2032,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_pternlogd256_maskz:
case X86::BI__builtin_ia32_pternlogq128_maskz:
case X86::BI__builtin_ia32_pternlogq256_maskz:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
case X86::BI__builtin_ia32_vpshldd512:
@@ -2041,6 +2045,8 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_vpshldw128:
case X86::BI__builtin_ia32_vpshldw256:
case X86::BI__builtin_ia32_vpshldw512:
+ return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+ ops[1], ops[2], false);
case X86::BI__builtin_ia32_vpshrdd128:
case X86::BI__builtin_ia32_vpshrdd256:
case X86::BI__builtin_ia32_vpshrdd512:
@@ -2050,10 +2056,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_vpshrdw128:
case X86::BI__builtin_ia32_vpshrdw256:
case X86::BI__builtin_ia32_vpshrdw512:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
- return mlir::Value{};
+ // Ops 0 and 1 are swapped.
+ return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[1],
+ ops[0], ops[2], true);
case X86::BI__builtin_ia32_reduce_fadd_pd512:
case X86::BI__builtin_ia32_reduce_fadd_ps512:
case X86::BI__builtin_ia32_reduce_fadd_ph512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
new file mode 100644
index 0000000000000..ae0cdf09f3f77
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
@@ -0,0 +1,418 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -target-feature +avx512vbmi2 -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -target-feature +avx512vbmi2 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vbmi2 -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=OGCG --input-file=%t.ll %s
+
+
+#include <immintrin.h>
+
+__m512i test_mm512_mask_shldi_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 47))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 47))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shldi_epi64(s, u, a, b, 47);
+}
+
+__m512i test_mm512_maskz_shldi_epi64(__mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 63))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 63))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shldi_epi64(u, a, b, 63);
+}
+
+__m512i test_mm512_shldi_epi64(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 31))
+ return _mm512_shldi_epi64(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldi_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 7))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 7))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shldi_epi32(s, u, a, b, 7);
+}
+
+__m512i test_mm512_maskz_shldi_epi32(__mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 15))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 15))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shldi_epi32(u, a, b, 15);
+}
+
+__m512i test_mm512_shldi_epi32(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 31))
+ return _mm512_shldi_epi32(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldi_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 3))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 3))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shldi_epi16(s, u, a, b, 3);
+}
+
+__m512i test_mm512_maskz_shldi_epi16(__mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 15))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 15))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shldi_epi16(u, a, b, 15);
+}
+
+__m512i test_mm512_shldi_epi16(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 31))
+ return _mm512_shldi_epi16(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldv_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shldv_epi64(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi64(__mmask8 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shldv_epi64(u, s, a, b);
+}
+
+__m512i test_mm512_shldv_epi64(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ return _mm512_shldv_epi64(s, a, b);
+}
+
+__m512i test_mm512_shldv_epi32(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ return _mm512_shldv_epi32(s, a, b);
+}
+
+__m512i test_mm512_mask_shldv_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shldv_epi16(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi16(__mmask32 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shldv_epi16(u, s, a, b);
+}
+
+__m512i test_mm512_shldv_epi16(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ return _mm512_shldv_epi16(s, a, b);
+}
+
+__m512i test_mm512_mask_shrdi_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 47))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 47))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shrdi_epi64(s, u, a, b, 47);
+}
+
+__m512i test_mm512_maskz_shrdi_epi64(__mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 63))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 63))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shrdi_epi64(u, a, b, 63);
+}
+
+__m512i test_mm512_shrdi_epi64(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 31))
+ return _mm512_shrdi_epi64(a, b, 31);
+}
+
+__m512i test_mm512_mask_shrdi_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 7))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 7))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shrdi_epi32(s, u, a, b, 7);
+}
+
+__m512i test_mm512_maskz_shrdi_epi32(__mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 15))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 15))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shrdi_epi32(u, a, b, 15);
+}
+
+__m512i test_mm512_shrdi_epi32(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 31))
+ return _mm512_shrdi_epi32(a, b, 31);
+}
+
+__m512i test_mm512_mask_shrdi_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 3))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 3))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shrdi_epi16(s, u, a, b, 3);
+}
+
+__m512i test_mm512_maskz_shrdi_epi16(__mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 15))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 15))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shrdi_epi16(u, a, b, 15);
+}
+
+__m512i test_mm512_shrdi_epi16(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 31))
+ return _mm512_shrdi_epi16(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldv_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shldv_epi32(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi32(__mmask16 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shldv_epi32(u, s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shrdv_epi64(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shrdv_epi64(__mmask8 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdv_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shrdv_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shrdv_epi64(u, s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shrdv_epi32(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shrdv_epi32(__mmask16 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shrdv_epi32(u, s, a, b);
+}
+
+__m512i test_mm512_shrdv_epi32(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ return _mm512_shrdv_epi32(s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shrdv_epi16(s, u, a, b);
+}
+
+__m512i test_mm512_shrdv_epi16(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shrdv_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_shrdv_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ return _mm512_shrdv_epi16(s, a, b);
+}
>From 3f776835667ed0264fade83a1c8efda65893eaba Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Wed, 4 Feb 2026 22:31:47 +0530
Subject: [PATCH 6/6] Delete clang/include/clang/Basic/CodeGenOptions.h
---
clang/include/clang/Basic/CodeGenOptions.h | 667 ---------------------
1 file changed, 667 deletions(-)
delete mode 100644 clang/include/clang/Basic/CodeGenOptions.h
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
deleted file mode 100644
index 8ef0d87faaeaf..0000000000000
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ /dev/null
@@ -1,667 +0,0 @@
-//===--- CodeGenOptions.h ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the CodeGenOptions interface.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_CLANG_BASIC_CODEGENOPTIONS_H
-#define LLVM_CLANG_BASIC_CODEGENOPTIONS_H
-
-#include "clang/Basic/CFProtectionOptions.h"
-#include "clang/Basic/PointerAuthOptions.h"
-#include "clang/Basic/Sanitizers.h"
-#include "clang/Basic/XRayInstr.h"
-#include "llvm/ADT/FloatingPointMode.h"
-#include "llvm/Frontend/Debug/Options.h"
-#include "llvm/Frontend/Driver/CodeGenOptions.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/Hash.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerOptions.h"
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-namespace llvm {
-class PassBuilder;
-}
-namespace clang {
-
-/// Bitfields of CodeGenOptions, split out from CodeGenOptions to ensure
-/// that this large collection of bitfields is a trivial class type.
-class CodeGenOptionsBase {
- friend class CompilerInvocation;
- friend class CompilerInvocationBase;
-
-public:
- /// For ASTs produced with different option value, signifies their level of
- /// compatibility.
- enum class CompatibilityKind {
- /// Does affect the construction of the AST in a way that does prevent
- /// module interoperability.
- NotCompatible,
- /// Does affect the construction of the AST in a way that doesn't prevent
- /// interoperability (that is, the value can be different between an
- /// explicit module and the user of that module).
- Compatible,
- /// Does not affect the construction of the AST in any way (that is, the
- /// value can be different between an implicit module and the user of that
- /// module).
- Benign,
- };
-
- using CFBranchLabelSchemeKind = clang::CFBranchLabelSchemeKind;
- using ProfileInstrKind = llvm::driver::ProfileInstrKind;
- using AsanDetectStackUseAfterReturnMode =
- llvm::AsanDetectStackUseAfterReturnMode;
- using AsanDtorKind = llvm::AsanDtorKind;
- using VectorLibrary = llvm::driver::VectorLibrary;
- using ZeroCallUsedRegsKind = llvm::ZeroCallUsedRegs::ZeroCallUsedRegsKind;
- using WinX64EHUnwindV2Mode = llvm::WinX64EHUnwindV2Mode;
-
- using DebugCompressionType = llvm::DebugCompressionType;
- using EmitDwarfUnwindType = llvm::EmitDwarfUnwindType;
- using DebugTemplateNamesKind = llvm::codegenoptions::DebugTemplateNamesKind;
- using DebugInfoKind = llvm::codegenoptions::DebugInfoKind;
- using DebuggerKind = llvm::DebuggerKind;
-
-#define CODEGENOPT(Name, Bits, Default, Compatibility) unsigned Name : Bits;
-#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility)
-#include "clang/Basic/CodeGenOptions.def"
-
-protected:
-#define CODEGENOPT(Name, Bits, Default, Compatibility)
-#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility) \
- unsigned Name : Bits;
-#include "clang/Basic/CodeGenOptions.def"
-};
-
-/// CodeGenOptions - Track various options which control how the code
-/// is optimized and passed to the backend.
-class CodeGenOptions : public CodeGenOptionsBase {
-public:
- enum InliningMethod {
- NormalInlining, // Use the standard function inlining pass.
- OnlyHintInlining, // Inline only (implicitly) hinted functions.
- OnlyAlwaysInlining // Only run the always inlining pass.
- };
-
- enum ObjCDispatchMethodKind {
- Legacy = 0,
- NonLegacy = 1,
- Mixed = 2
- };
-
- enum TLSModel {
- GeneralDynamicTLSModel,
- LocalDynamicTLSModel,
- InitialExecTLSModel,
- LocalExecTLSModel
- };
-
- enum StructReturnConventionKind {
- SRCK_Default, // No special option was passed.
- SRCK_OnStack, // Small structs on the stack (-fpcc-struct-return).
- SRCK_InRegs // Small structs in registers (-freg-struct-return).
- };
-
- enum EmbedBitcodeKind {
- Embed_Off, // No embedded bitcode.
- Embed_All, // Embed both bitcode and commandline in the output.
- Embed_Bitcode, // Embed just the bitcode in the output.
- Embed_Marker // Embed a marker as a placeholder for bitcode.
- };
-
- enum class ExtendVariableLivenessKind {
- None,
- This,
- All,
- };
-
- enum InlineAsmDialectKind {
- IAD_ATT,
- IAD_Intel,
- };
-
- enum DebugSrcHashKind {
- DSH_MD5,
- DSH_SHA1,
- DSH_SHA256,
- DSH_NONE,
- };
-
- // This field stores one of the allowed values for the option
- // -fbasic-block-sections=. The allowed values with this option are:
- // {"all", "list=<file>", "none"}.
- //
- // "all" : Generate basic block sections for all basic blocks.
- // "list=<file>": Generate basic block sections for a subset of basic blocks.
- // The functions and the machine basic block ids are specified
- // in the file.
- // "none": Disable sections for basic blocks.
- std::string BBSections;
-
- // If set, override the default value of MCAsmInfo::BinutilsVersion. If
- // DisableIntegratedAS is specified, the assembly output will consider GNU as
- // support. "none" means that all ELF features can be used, regardless of
- // binutils support.
- std::string BinutilsVersion;
-
- enum class FramePointerKind {
- NonLeafNoReserve, // Keep non-leaf frame pointers, allow the FP to be used
- // as a GPR in leaf functions.
- None, // Omit all frame pointers.
- Reserved, // Maintain valid frame pointer chain.
- NonLeaf, // Keep non-leaf frame pointers, don't allow the FP to be used as a
- // GPR in leaf functions.
- All, // Keep all frame pointers.
- };
-
- static StringRef getFramePointerKindName(FramePointerKind Kind) {
- switch (Kind) {
- case FramePointerKind::None:
- return "none";
- case FramePointerKind::Reserved:
- return "reserved";
- case FramePointerKind::NonLeafNoReserve:
- return "non-leaf-no-reserve";
- case FramePointerKind::NonLeaf:
- return "non-leaf";
- case FramePointerKind::All:
- return "all";
- }
-
- llvm_unreachable("invalid FramePointerKind");
- }
-
- /// Possible exception handling behavior.
- enum class ExceptionHandlingKind { None, SjLj, WinEH, DwarfCFI, Wasm };
-
- enum class SwiftAsyncFramePointerKind {
- Auto, // Choose Swift async extended frame info based on deployment target.
- Always, // Unconditionally emit Swift async extended frame info.
- Never, // Don't emit Swift async extended frame info.
- Default = Always,
- };
-
- enum FiniteLoopsKind {
- Language, // Not specified, use language standard.
- Always, // All loops are assumed to be finite.
- Never, // No loop is assumed to be finite.
- };
-
- enum AssignmentTrackingOpts {
- Disabled,
- Enabled,
- Forced,
- };
-
- enum SanitizeDebugTrapReasonKind {
- None, ///< Trap Messages are omitted. This offers the smallest debug info
- ///< size but at the cost of making traps hard to debug.
- Basic, ///< Trap Message is fixed per SanitizerKind. Produces smaller debug
- ///< info than `Detailed` but is not as helpful for debugging.
- Detailed, ///< Trap Message includes more context (e.g. the expression being
- ///< overflowed). This is more helpful for debugging but produces
- ///< larger debug info than `Basic`.
- };
-
- /// The code model to use (-mcmodel).
- std::string CodeModel;
-
- /// The code model-specific large data threshold to use
- /// (-mlarge-data-threshold).
- uint64_t LargeDataThreshold;
-
- /// The filename with path we use for coverage data files. The runtime
- /// allows further manipulation with the GCOV_PREFIX and GCOV_PREFIX_STRIP
- /// environment variables.
- std::string CoverageDataFile;
-
- /// The filename with path we use for coverage notes files.
- std::string CoverageNotesFile;
-
- /// Regexes separated by a semi-colon to filter the files to instrument.
- std::string ProfileFilterFiles;
-
- /// Regexes separated by a semi-colon to filter the files to not instrument.
- std::string ProfileExcludeFiles;
-
- /// The version string to put into coverage files.
- char CoverageVersion[4] = {'0', '0', '0', '0'};
-
- /// Enable additional debugging information.
- std::string DebugPass;
-
- /// The string to embed in debug information as the current working directory.
- std::string DebugCompilationDir;
-
- /// The string to embed in coverage mapping as the current working directory.
- std::string CoverageCompilationDir;
-
- /// The string to embed in the debug information for the compile unit, if
- /// non-empty.
- std::string DwarfDebugFlags;
-
- /// The string containing the commandline for the llvm.commandline metadata,
- /// if non-empty.
- std::string RecordCommandLine;
-
- llvm::SmallVector<std::pair<std::string, std::string>, 0> DebugPrefixMap;
-
- /// Prefix replacement map for source-based code coverage to remap source
- /// file paths in coverage mapping.
- llvm::SmallVector<std::pair<std::string, std::string>, 0> CoveragePrefixMap;
-
- /// The ABI to use for passing floating point arguments.
- std::string FloatABI;
-
- /// The file to use for dumping bug report by `Debugify` for original
- /// debug info.
- std::string DIBugsReportFilePath;
-
- /// The floating-point denormal mode to use.
- llvm::DenormalMode FPDenormalMode = llvm::DenormalMode::getIEEE();
-
- /// The floating-point denormal mode to use, for float.
- llvm::DenormalMode FP32DenormalMode = llvm::DenormalMode::getIEEE();
-
- /// The float precision limit to use, if non-empty.
- std::string LimitFloatPrecision;
-
- struct BitcodeFileToLink {
- /// The filename of the bitcode file to link in.
- std::string Filename;
- /// If true, we set attributes functions in the bitcode library according to
- /// our CodeGenOptions, much as we set attrs on functions that we generate
- /// ourselves.
- bool PropagateAttrs = false;
- /// If true, we use LLVM module internalizer.
- bool Internalize = false;
- /// Bitwise combination of llvm::Linker::Flags, passed to the LLVM linker.
- unsigned LinkFlags = 0;
- };
-
- /// The files specified here are linked in to the module before optimizations.
- std::vector<BitcodeFileToLink> LinkBitcodeFiles;
-
- /// The user provided name for the "main file", if non-empty. This is useful
- /// in situations where the input file name does not match the original input
- /// file, for example with -save-temps.
- std::string MainFileName;
-
- /// The name for the split debug info file used for the DW_AT_[GNU_]dwo_name
- /// attribute in the skeleton CU.
- std::string SplitDwarfFile;
-
- /// Output filename for the split debug info, not used in the skeleton CU.
- std::string SplitDwarfOutput;
-
- /// Output filename used in the COFF debug information.
- std::string ObjectFilenameForDebug;
-
- /// The name of the relocation model to use.
- llvm::Reloc::Model RelocationModel;
-
- /// If not an empty string, trap intrinsics are lowered to calls to this
- /// function instead of to trap instructions.
- std::string TrapFuncName;
-
- /// A list of dependent libraries.
- std::vector<std::string> DependentLibraries;
-
- /// A list of linker options to embed in the object file.
- std::vector<std::string> LinkerOptions;
-
- /// Name of the profile file to use as output for -fprofile-instr-generate,
- /// -fprofile-generate, and -fcs-profile-generate.
- std::string InstrProfileOutput;
-
- /// Name of the patchable function entry section with
- /// -fpatchable-function-entry.
- std::string PatchableFunctionEntrySection;
-
- /// Name of the profile file to use with -fprofile-sample-use.
- std::string SampleProfileFile;
-
- /// Name of the profile file to use as output for with -fmemory-profile.
- std::string MemoryProfileOutput;
-
- /// Name of the profile file to use as input for -fmemory-profile-use.
- std::string MemoryProfileUsePath;
-
- /// Name of the profile file to use as input for -fprofile-instr-use
- std::string ProfileInstrumentUsePath;
-
- /// Name of the profile remapping file to apply to the profile data supplied
- /// by -fprofile-sample-use or -fprofile-instr-use.
- std::string ProfileRemappingFile;
-
- /// Name of the function summary index file to use for ThinLTO function
- /// importing.
- std::string ThinLTOIndexFile;
-
- /// Name of a file that can optionally be written with minimized bitcode
- /// to be used as input for the ThinLTO thin link step, which only needs
- /// the summary and module symbol table (and not, e.g. any debug metadata).
- std::string ThinLinkBitcodeFile;
-
- /// Prefix to use for -save-temps output.
- std::string SaveTempsFilePrefix;
-
- /// Name of file passed with -fcuda-include-gpubinary option to forward to
- /// CUDA runtime back-end for incorporating them into host-side object file.
- std::string CudaGpuBinaryFileName;
-
- /// List of filenames passed in using the -fembed-offload-object option. These
- /// are offloading binaries containing device images and metadata.
- std::vector<std::string> OffloadObjects;
-
- /// The name of the file to which the backend should save YAML optimization
- /// records.
- std::string OptRecordFile;
-
- /// The regex that filters the passes that should be saved to the optimization
- /// records.
- std::string OptRecordPasses;
-
- /// The format used for serializing remarks (default: YAML)
- std::string OptRecordFormat;
-
- /// The name of the partition that symbols are assigned to, specified with
- /// -fsymbol-partition (see https://lld.llvm.org/Partitions.html).
- std::string SymbolPartition;
-
- /// If non-empty, allow the compiler to assume that the given source file
- /// identifier is unique at link time.
- std::string UniqueSourceFileIdentifier;
-
- enum RemarkKind {
- RK_Missing, // Remark argument not present on the command line.
- RK_Enabled, // Remark enabled via '-Rgroup'.
- RK_EnabledEverything, // Remark enabled via '-Reverything'.
- RK_Disabled, // Remark disabled via '-Rno-group'.
- RK_DisabledEverything, // Remark disabled via '-Rno-everything'.
- RK_WithPattern, // Remark pattern specified via '-Rgroup=regexp'.
- };
-
- /// Optimization remark with an optional regular expression pattern.
- struct OptRemark {
- RemarkKind Kind = RK_Missing;
- std::string Pattern;
- std::shared_ptr<llvm::Regex> Regex;
-
- /// By default, optimization remark is missing.
- OptRemark() = default;
-
- /// Returns true iff the optimization remark holds a valid regular
- /// expression.
- bool hasValidPattern() const { return Regex != nullptr; }
-
- /// Matches the given string against the regex, if there is some.
- bool patternMatches(StringRef String) const {
- return hasValidPattern() && Regex->match(String);
- }
- };
-
- /// Selected optimizations for which we should enable optimization remarks.
- /// Transformation passes whose name matches the contained (optional) regular
- /// expression (and support this feature), will emit a diagnostic whenever
- /// they perform a transformation.
- OptRemark OptimizationRemark;
-
- /// Selected optimizations for which we should enable missed optimization
- /// remarks. Transformation passes whose name matches the contained (optional)
- /// regular expression (and support this feature), will emit a diagnostic
- /// whenever they tried but failed to perform a transformation.
- OptRemark OptimizationRemarkMissed;
-
- /// Selected optimizations for which we should enable optimization analyses.
- /// Transformation passes whose name matches the contained (optional) regular
- /// expression (and support this feature), will emit a diagnostic whenever
- /// they want to explain why they decided to apply or not apply a given
- /// transformation.
- OptRemark OptimizationRemarkAnalysis;
-
- /// Set of sanitizer checks that are non-fatal (i.e. execution should be
- /// continued when possible).
- SanitizerSet SanitizeRecover;
-
- /// Set of sanitizer checks that trap rather than diagnose.
- SanitizerSet SanitizeTrap;
-
- /// Set of sanitizer checks that can merge handlers (smaller code size at
- /// the expense of debuggability).
- SanitizerSet SanitizeMergeHandlers;
-
- /// Set of thresholds in a range [0.0, 1.0]: the top hottest code responsible
- /// for the given fraction of PGO counters will be excluded from sanitization
- /// (0.0 [default] to skip none, 1.0 to skip all).
- SanitizerMaskCutoffs SanitizeSkipHotCutoffs;
-
- /// Set of sanitizer checks, for which the instrumentation will be annotated
- /// with extra debug info.
- SanitizerSet SanitizeAnnotateDebugInfo;
-
- std::optional<double> AllowRuntimeCheckSkipHotCutoff;
-
- /// List of backend command-line options for -fembed-bitcode.
- std::vector<uint8_t> CmdArgs;
-
- /// A list of all -fno-builtin-* function names (e.g., memset).
- std::vector<std::string> NoBuiltinFuncs;
-
- std::vector<std::string> Reciprocals;
-
- /// Configuration for pointer-signing.
- PointerAuthOptions PointerAuth;
-
- /// The preferred width for auto-vectorization transforms. This is intended to
- /// override default transforms based on the width of the architected vector
- /// registers.
- std::string PreferVectorWidth;
-
- /// Set of XRay instrumentation kinds to emit.
- XRayInstrSet XRayInstrumentationBundle;
-
- std::vector<std::string> DefaultFunctionAttrs;
-
- /// List of dynamic shared object files to be loaded as pass plugins.
- std::vector<std::string> PassPlugins;
-
- /// List of pass builder callbacks.
- std::vector<std::function<void(llvm::PassBuilder &)>> PassBuilderCallbacks;
-
- /// List of global variables explicitly specified by the user as toc-data.
- std::vector<std::string> TocDataVarsUserSpecified;
-
- /// List of global variables that over-ride the toc-data default.
- std::vector<std::string> NoTocDataVars;
-
- /// Path to allowlist file specifying which objects
- /// (files, functions) should exclusively be instrumented
- /// by sanitizer coverage pass.
- std::vector<std::string> SanitizeCoverageAllowlistFiles;
-
- /// The guard style used for stack protector to get a initial value, this
- /// value usually be gotten from TLS or get from __stack_chk_guard, or some
- /// other styles we may implement in the future.
- std::string StackProtectorGuard;
-
- /// The TLS base register when StackProtectorGuard is "tls", or register used
- /// to store the stack canary for "sysreg".
- /// On x86 this can be "fs" or "gs".
- /// On AArch64 this can only be "sp_el0".
- std::string StackProtectorGuardReg;
-
- /// Specify a symbol to be the guard value.
- std::string StackProtectorGuardSymbol;
-
- /// Path to ignorelist file specifying which objects
- /// (files, functions) listed for instrumentation by sanitizer
- /// coverage pass should actually not be instrumented.
- std::vector<std::string> SanitizeCoverageIgnorelistFiles;
-
- /// Path to ignorelist file specifying which objects
- /// (files, functions) listed for instrumentation by sanitizer
- /// binary metadata pass should not be instrumented.
- std::vector<std::string> SanitizeMetadataIgnorelistFiles;
-
- /// Hash algorithm to use for KCFI type IDs.
- llvm::KCFIHashAlgorithm SanitizeKcfiHash;
-
- /// Name of the stack usage file (i.e., .su file) if user passes
- /// -fstack-usage. If empty, it can be implied that -fstack-usage is not
- /// passed on the command line.
- std::string StackUsageFile;
-
- /// Executable and command-line used to create a given CompilerInvocation.
- /// Most of the time this will be the full -cc1 command.
- const char *Argv0 = nullptr;
- std::vector<std::string> CommandLineArgs;
-
- /// The minimum hotness value a diagnostic needs in order to be included in
- /// optimization diagnostics.
- ///
- /// The threshold is an Optional value, which maps to one of the 3 states:
- /// 1. 0 => threshold disabled. All remarks will be printed.
- /// 2. positive int => manual threshold by user. Remarks with hotness exceed
- /// threshold will be printed.
- /// 3. None => 'auto' threshold by user. The actual value is not
- /// available at command line, but will be synced with
- /// hotness threshold from profile summary during
- /// compilation.
- ///
- /// If threshold option is not specified, it is disabled by default.
- std::optional<uint64_t> DiagnosticsHotnessThreshold = 0;
-
- /// The maximum percentage profiling weights can deviate from the expected
- /// values in order to be included in misexpect diagnostics.
- std::optional<uint32_t> DiagnosticsMisExpectTolerance = 0;
-
- /// The name of a file to use with \c .secure_log_unique directives.
- std::string AsSecureLogFile;
-
- /// A list of functions that are replacable by the loader.
- std::vector<std::string> LoaderReplaceableFunctionNames;
- /// The name of a file that contains functions which will be compiled for
- /// hotpatching. See -fms-secure-hotpatch-functions-file.
- std::string MSSecureHotPatchFunctionsFile;
-
- /// A list of functions which will be compiled for hotpatching.
- /// See -fms-secure-hotpatch-functions-list.
- std::vector<std::string> MSSecureHotPatchFunctionsList;
-
-public:
- // Define accessors/mutators for code generation options of enumeration type.
-#define CODEGENOPT(Name, Bits, Default, Compatibility)
-#define ENUM_CODEGENOPT(Name, Type, Bits, Default, Compatibility) \
- Type get##Name() const { return static_cast<Type>(Name); } \
- void set##Name(Type Value) { Name = static_cast<unsigned>(Value); }
-#include "clang/Basic/CodeGenOptions.def"
-
- CodeGenOptions();
-
- const std::vector<std::string> &getNoBuiltinFuncs() const {
- return NoBuiltinFuncs;
- }
-
- bool hasSjLjExceptions() const {
- return getExceptionHandling() == ExceptionHandlingKind::SjLj;
- }
-
- bool hasSEHExceptions() const {
- return getExceptionHandling() == ExceptionHandlingKind::WinEH;
- }
-
- bool hasDWARFExceptions() const {
- return getExceptionHandling() == ExceptionHandlingKind::DwarfCFI;
- }
-
- bool hasWasmExceptions() const {
- return getExceptionHandling() == ExceptionHandlingKind::Wasm;
- }
-
- /// Check if Clang profile instrumenation is on.
- bool hasProfileClangInstr() const {
- return getProfileInstr() ==
- llvm::driver::ProfileInstrKind::ProfileClangInstr;
- }
-
- /// Check if IR level profile instrumentation is on.
- bool hasProfileIRInstr() const {
- return getProfileInstr() == llvm::driver::ProfileInstrKind::ProfileIRInstr;
- }
-
- /// Check if CS IR level profile instrumentation is on.
- bool hasProfileCSIRInstr() const {
- return getProfileInstr() ==
- llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
- }
-
- /// Check if any form of instrumentation is on.
- bool hasProfileInstr() const {
- return getProfileInstr() != llvm::driver::ProfileInstrKind::ProfileNone;
- }
-
- /// Check if Clang profile use is on.
- bool hasProfileClangUse() const {
- return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileClangInstr;
- }
-
- /// Check if IR level profile use is on.
- bool hasProfileIRUse() const {
- return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileIRInstr ||
- getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
- }
-
- /// Check if CSIR profile use is on.
- bool hasProfileCSIRUse() const {
- return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr;
- }
-
- /// Check if type and variable info should be emitted.
- bool hasReducedDebugInfo() const {
- return getDebugInfo() >= llvm::codegenoptions::DebugInfoConstructor;
- }
-
- /// Check if maybe unused type info should be emitted.
- bool hasMaybeUnusedDebugInfo() const {
- return getDebugInfo() >= llvm::codegenoptions::UnusedTypeInfo;
- }
-
- // Check if any one of SanitizeCoverage* is enabled.
- bool hasSanitizeCoverage() const {
- return SanitizeCoverageType || SanitizeCoverageIndirectCalls ||
- SanitizeCoverageTraceCmp || SanitizeCoverageTraceLoads ||
- SanitizeCoverageTraceStores || SanitizeCoverageControlFlow;
- }
-
- // Check if any one of SanitizeBinaryMetadata* is enabled.
- bool hasSanitizeBinaryMetadata() const {
- return SanitizeBinaryMetadataCovered || SanitizeBinaryMetadataAtomics ||
- SanitizeBinaryMetadataUAR;
- }
-
- /// Reset all of the options that are not considered when building a
- /// module.
- void resetNonModularOptions(StringRef ModuleFormat);
-
- // Is the given function name one of the functions that can be replaced by the
- // loader?
- bool isLoaderReplaceableFunctionName(StringRef FuncName) const {
- return llvm::is_contained(LoaderReplaceableFunctionNames, FuncName);
- }
-};
-
-} // end namespace clang
-
-#endif
More information about the libcxx-commits
mailing list