[clang] [libcxx] [llvm] [CIR][X86] Add support for vpshl/vpshr builtins (PR #179538)
Priyanshu Kumar via cfe-commits
cfe-commits at lists.llvm.org
Wed Feb 4 09:00:25 PST 2026
https://github.com/Priyanshu3820 updated https://github.com/llvm/llvm-project/pull/179538
From 4d2ec7de3325f3f19b80190604f9805ab5f036dd Mon Sep 17 00:00:00 2001
From: Akash Dutta <137309513+akadutta at users.noreply.github.com>
Date: Fri, 30 Jan 2026 08:26:40 -0600
Subject: [PATCH 1/5] Add support for vpshl/vpshr builtins
---
libcxx/include/stdatomic.h | 2 +-
.../include_stdatomic_as_c.sh.cpp | 30 +
llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 14 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 36 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 4 +-
.../InstCombineSimplifyDemanded.cpp | 30 +
.../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 2 +-
...amdgpu-attributor-nocallback-intrinsics.ll | 19 +-
.../AMDGPU/amdgpu-attributor-trap-leaf.ll | 65 ++
.../AMDGPU/av_movimm_pseudo_expansion.mir | 20 +
.../AMDGPU/misaligned-vgpr-regsequence.mir | 30 +
.../test/CodeGen/AMDGPU/peephole-fold-imm.mir | 5 +-
.../siloadstoreopt-misaligned-regsequence.ll | 21 +
.../CodeGen/AMDGPU/v_mov_b64_expansion.mir | 9 +
.../CodeGen/AMDGPU/wmma-gfx12-convergent.mir | 153 ++++-
.../InstCombine/simplify-demanded-fpclass.ll | 649 ++++++++++++++++++
.../AArch64/partial-reduce-chained.ll | 31 +-
18 files changed, 1079 insertions(+), 43 deletions(-)
create mode 100644 libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp
create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
create mode 100644 llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
diff --git a/libcxx/include/stdatomic.h b/libcxx/include/stdatomic.h
index 2991030eee456..e7b787560ddc4 100644
--- a/libcxx/include/stdatomic.h
+++ b/libcxx/include/stdatomic.h
@@ -231,7 +231,7 @@ using std::atomic_store_explicit _LIBCPP_USING_IF_EXISTS;
using std::atomic_signal_fence _LIBCPP_USING_IF_EXISTS;
using std::atomic_thread_fence _LIBCPP_USING_IF_EXISTS;
-# elif defined(_LIBCPP_COMPILER_CLANG_BASED)
+# elif !defined(__cplusplus) || defined(_LIBCPP_COMPILER_CLANG_BASED)
// Before C++23, we include the next <stdatomic.h> on the path to avoid hijacking
// the header. We do this because Clang has historically shipped a <stdatomic.h>
diff --git a/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp b/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp
new file mode 100644
index 0000000000000..31ad5c9053675
--- /dev/null
+++ b/libcxx/test/extensions/libcxx/depr/depr.c.headers/include_stdatomic_as_c.sh.cpp
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// We're building as C, so this test doesn't work when building with modules.
+// UNSUPPORTED: clang-modules-build
+
+// GCC complains about unrecognized arguments because we're compiling the
+// file as C, but we're passing C++ flags on the command-line.
+// UNSUPPORTED: gcc
+
+// Test that stdatomic.h gets the C header with its definitions.
+
+// NOTE: It's not common or recommended to have libc++ in the header search
+// path when compiling C files, but it does happen often enough.
+
+// RUN: %{cxx} -c -xc %s -fsyntax-only %{flags} %{compile_flags} -std=c99
+
+#include <stdatomic.h>
+
+int main(int argc, char** argv) {
+ (void)argc;
+ (void)argv;
+ [[maybe_unused]] atomic_bool x;
+ return 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 0b2ee6371da06..4bcaabfd3263a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -1343,7 +1343,6 @@ struct AAAMDGPUMinAGPRAlloc
Maximum.takeAssumedMaximum(NumRegs);
return true;
}
-
switch (CB.getIntrinsicID()) {
case Intrinsic::not_intrinsic:
break;
@@ -1361,10 +1360,21 @@ struct AAAMDGPUMinAGPRAlloc
return true;
}
+ // Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not carry
+ // the nocallback attribute, so the AMDGPU attributor would otherwise
+ // conservatively drop all implicitly-known inputs and AGPR allocation
+ // information. Make sure we still infer that no implicit inputs are
+ // required and that the AGPR allocation stays at zero. A trap with the
+ // "trap-func-name" attribute may invoke a handler function that requires
+ // AGPRs, so check for that attribute here.
+ case Intrinsic::trap:
+ case Intrinsic::debugtrap:
+ case Intrinsic::ubsantrap:
+ return CB.hasFnAttr(Attribute::NoCallback) ||
+ !CB.hasFnAttr("trap-func-name");
default:
// Some intrinsics may use AGPRs, but if we have a choice, we are not
// required to use AGPRs.
-
// Assume !nocallback intrinsics may call a function which requires
// AGPRs.
return CB.hasFnAttr(Attribute::NoCallback);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index fb82598709aab..e0e0bb0c05ea2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2173,11 +2173,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ const MCInstrDesc &Mov64Desc = get(AMDGPU::V_MOV_B64_e32);
+ const TargetRegisterClass *Mov64RC = getRegClass(Mov64Desc, /*OpNum=*/0);
+
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
assert(!SrcOp.isFPImm());
- if (ST.hasMovB64()) {
- MI.setDesc(get(AMDGPU::V_MOV_B64_e32));
+ if (ST.hasMovB64() && Mov64RC->contains(Dst)) {
+ MI.setDesc(Mov64Desc);
if (SrcOp.isReg() || isInlineConstant(MI, 1) ||
isUInt<32>(SrcOp.getImm()) || ST.has64BitLiterals())
break;
@@ -2186,17 +2189,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(SISrcMods::OP_SEL_1)
- .addImm(Lo.getSExtValue())
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0); // clamp
+ const MCInstrDesc &PkMovDesc = get(AMDGPU::V_PK_MOV_B32);
+ const TargetRegisterClass *PkMovRC = getRegClass(PkMovDesc, /*OpNum=*/0);
+
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo) &&
+ PkMovRC->contains(Dst)) {
+ BuildMI(MBB, MI, DL, PkMovDesc, Dst)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(SISrcMods::OP_SEL_1)
+ .addImm(Lo.getSExtValue())
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0); // clamp
} else {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
.addImm(Lo.getSExtValue())
@@ -5259,7 +5266,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// aligned register constraint.
// FIXME: We do not verify inline asm operands, but custom inline asm
// verification is broken anyway
- if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO) {
+ if (ST.needsAlignedVGPRs() && Opcode != AMDGPU::AV_MOV_B64_IMM_PSEUDO &&
+ Opcode != AMDGPU::V_MOV_B64_PSEUDO) {
const TargetRegisterClass *RC = RI.getRegClassForReg(MRI, Reg);
if (RI.hasVectorRegisters(RC) && MO.getSubReg()) {
if (const TargetRegisterClass *SubRC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 604a91fcb6e7f..d30e7fd0523a5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -131,7 +131,7 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
// 64-bit vector move instruction. This is mainly used by the
// SIFoldOperands pass to enable folding of inline immediates.
-def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64_AlignTarget:$vdst),
+def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)> {
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 405e5dddba639..d111b8996ae75 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1397,13 +1397,13 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
}
if convertibleTo3Addr then {
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let isConvergent = 1, Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7b82613c73ea6..630016fe58a19 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -2143,6 +2143,32 @@ static Value *simplifyDemandedFPClassFnegFabs(KnownFPClass &Known, Value *Src,
return nullptr;
}
+static Value *simplifyDemandedFPClassCopysignMag(Value *MagSrc,
+ FPClassTest DemandedMask,
+ KnownFPClass KnownSrc,
+ bool NSZ) {
+ if (NSZ) {
+ constexpr FPClassTest NegOrZero = fcNegative | fcPosZero;
+ constexpr FPClassTest PosOrZero = fcPositive | fcNegZero;
+
+ if ((DemandedMask & ~NegOrZero) == fcNone &&
+ KnownSrc.isKnownAlways(NegOrZero))
+ return MagSrc;
+
+ if ((DemandedMask & ~PosOrZero) == fcNone &&
+ KnownSrc.isKnownAlways(PosOrZero))
+ return MagSrc;
+ } else {
+ if ((DemandedMask & ~fcNegative) == fcNone && KnownSrc.SignBit == true)
+ return MagSrc;
+
+ if ((DemandedMask & ~fcPositive) == fcNone && KnownSrc.SignBit == false)
+ return MagSrc;
+ }
+
+ return nullptr;
+}
+
static Value *
simplifyDemandedFPClassMinMax(KnownFPClass &Known, Intrinsic::ID IID,
const CallInst *CI, FPClassTest DemandedMask,
@@ -2764,6 +2790,10 @@ Value *InstCombinerImpl::SimplifyDemandedUseFPClass(Instruction *I,
return I;
}
+ if (Value *Simplified = simplifyDemandedFPClassCopysignMag(
+ CI->getArgOperand(0), DemandedMask, Known, FMF.noSignedZeros()))
+ return Simplified;
+
KnownFPClass KnownSign = computeKnownFPClass(CI->getArgOperand(1),
fcAllFlags, CxtI, Depth + 1);
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 7ffccf73bd39d..edcdaea4c31da 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -244,7 +244,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
unsigned IterCnt = 0;
(void)IterCnt;
while (LocalChange) {
- assert(IterCnt++ < 1000 && "Iterative simplification didn't converge!");
+ assert(IterCnt++ < 2000 && "Iterative simplification didn't converge!");
LocalChange = false;
// Loop over all of the basic blocks and remove them if they are unneeded.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
index 71c509afa8e64..163f1aa72e11f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-nocallback-intrinsics.ll
@@ -35,7 +35,7 @@ define void @use_assume(i1 %arg) {
define void @use_trap() {
; CHECK-LABEL: define void @use_trap(
-; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.trap()
; CHECK-NEXT: ret void
;
@@ -43,9 +43,19 @@ define void @use_trap() {
ret void
}
+define void @use_trap_with_handler() {
+; CHECK-LABEL: define void @use_trap_with_handler(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap() #[[ATTR7:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap() #0
+ ret void
+}
+
define void @use_debugtrap() {
; CHECK-LABEL: define void @use_debugtrap(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.debugtrap()
; CHECK-NEXT: ret void
;
@@ -55,7 +65,7 @@ define void @use_debugtrap() {
define void @use_ubsantrap() {
; CHECK-LABEL: define void @use_ubsantrap(
-; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-SAME: ) #[[ATTR0]] {
; CHECK-NEXT: call void @llvm.ubsantrap(i8 0)
; CHECK-NEXT: ret void
;
@@ -63,6 +73,8 @@ define void @use_ubsantrap() {
ret void
}
+
+attributes #0 = { "trap-func-name"="handler" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
@@ -71,4 +83,5 @@ define void @use_ubsantrap() {
; CHECK: attributes #[[ATTR4:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: readwrite) "target-cpu"="gfx90a" }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7]] = { "trap-func-name"="handler" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll
new file mode 100644
index 0000000000000..e5e7328890146
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-trap-leaf.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 6
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+; Trap-like intrinsics such as llvm.trap and llvm.debugtrap do not have the
+; nocallback attribute, so the AMDGPU attributor used to conservatively drop
+; all implicitly-known inputs and AGPR allocation information. Make sure we
+; still infer that no implicit inputs are required and that the AGPR allocation
+; stays at zero.
+
+declare void @llvm.trap()
+
+declare void @llvm.debugtrap()
+
+define amdgpu_kernel void @trap_kernel() {
+; CHECK-LABEL: define amdgpu_kernel void @trap_kernel(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap()
+ ret void
+}
+
+define amdgpu_kernel void @trap_kernel_with_handler() {
+; CHECK-LABEL: define amdgpu_kernel void @trap_kernel_with_handler(
+; CHECK-SAME: ) #[[ATTR3:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.trap() #[[ATTR4:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap() #0
+ ret void
+}
+
+define amdgpu_kernel void @debugtrap_kernel() {
+; CHECK-LABEL: define amdgpu_kernel void @debugtrap_kernel(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void @llvm.debugtrap()
+; CHECK-NEXT: ret void
+;
+ call void @llvm.debugtrap()
+ ret void
+}
+
+; Test that a trap with both trap-func-name and nocallback is still safe
+define amdgpu_kernel void @trap_kernel_with_handler_and_nocallback() {
+; CHECK-LABEL: define amdgpu_kernel void @trap_kernel_with_handler_and_nocallback(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void @llvm.trap() #[[ATTR5:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @llvm.trap() #1
+ ret void
+}
+
+attributes #0 = { "trap-func-name"="handler" }
+attributes #1 = { nocallback "trap-func-name"="handler" }
+
+;.
+; CHECK: attributes #[[ATTR0:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR1:[0-9]+]] = { nounwind "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-agpr-alloc"="0" "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-cluster-id-x" "amdgpu-no-cluster-id-y" "amdgpu-no-cluster-id-z" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "trap-func-name"="handler" }
+; CHECK: attributes #[[ATTR5]] = { nocallback "trap-func-name"="handler" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
index c52347b680371..d08185a9e0ccd 100644
--- a/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_movimm_pseudo_expansion.mir
@@ -208,3 +208,23 @@ body: |
; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 -16, implicit $exec, implicit-def $vgpr1_vgpr2
$vgpr1_vgpr2 = AV_MOV_B64_IMM_PSEUDO 18446744004990074889, implicit $exec
...
+
+---
+name: av_mov_b64_misalign_vgpr
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_misalign_vgpr
+ ; CHECK: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+ ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+ $vgpr5_vgpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
+
+---
+name: av_mov_b64_misalign_agpr
+body: |
+ bb.0:
+ ; CHECK-LABEL: name: av_mov_b64_misalign_agpr
+ ; CHECK: $agpr5 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+ ; CHECK-NEXT: $agpr6 = V_ACCVGPR_WRITE_B32_e64 0, implicit $exec, implicit-def $agpr5_agpr6
+ $agpr5_agpr6 = AV_MOV_B64_IMM_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
new file mode 100644
index 0000000000000..26a6cc41ad8fa
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/misaligned-vgpr-regsequence.mir
@@ -0,0 +1,30 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-after=si-load-store-opt %s -o - | FileCheck %s
+
+# CHECK: misaligned_regsequence:
+# CHECK: ; %bb.0:
+# CHECK: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+# CHECK: s_load_dwordx2 s[0:1], s[4:5], 0x0
+# CHECK: v_mov_b32_e32 v5, 0
+# CHECK: v_mov_b32_e32 v4, 0
+# CHECK: v_mov_b32_e32 v6, 0
+# CHECK: s_waitcnt lgkmcnt(0)
+# CHECK: v_mov_b64_e32 v[2:3], s[0:1]
+# CHECK: flat_store_dwordx3 v[2:3], v[4:6]
+# CHECK: s_endpgm
+
+---
+name: misaligned_regsequence
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr4_sgpr5
+
+ %0:sgpr_64 = COPY $sgpr4_sgpr5
+ %1:sreg_64_xexec = S_LOAD_DWORDX2_IMM %0, 0, 0 :: (dereferenceable invariant load (s64), align 16, addrspace 4)
+ %2:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %3:vreg_64_align2 = COPY %1
+ %4:vreg_64_align2 = V_MOV_B64_PSEUDO 0, implicit $exec
+ %5:vreg_96_align2 = REG_SEQUENCE killed %2, %subreg.sub0, killed %4, %subreg.sub1_sub2
+ FLAT_STORE_DWORDX3 %3, killed %5, 0, 0, implicit $exec, implicit $flat_scr :: (store (s96), align 4)
+ S_ENDPGM 0
+...
diff --git a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
index 21455a9f5074f..176d7752133cf 100644
--- a/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
+++ b/llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir
@@ -425,7 +425,7 @@ body: |
bb.0:
; GCN-LABEL: name: fold_v_mov_b64_64_to_unaligned
; GCN: [[V_MOV_B64_e32_:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
- ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:vreg_64_align2 = V_MOV_B64_e32 1311768467750121200, implicit $exec
%1:vreg_64 = COPY killed %0
@@ -437,7 +437,8 @@ name: fold_v_mov_b64_pseudo_64_to_unaligned
body: |
bb.0:
; GCN-LABEL: name: fold_v_mov_b64_pseudo_64_to_unaligned
- ; GCN: [[V_MOV_B:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN: [[V_MOV_B64_PSEUDO:%[0-9]+]]:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
+ ; GCN-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
; GCN-NEXT: SI_RETURN_TO_EPILOG implicit [[V_MOV_B]]
%0:vreg_64_align2 = V_MOV_B64_PSEUDO 1311768467750121200, implicit $exec
%1:vreg_64 = COPY killed %0
diff --git a/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
new file mode 100644
index 0000000000000..e95aba71775b5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/siloadstoreopt-misaligned-regsequence.ll
@@ -0,0 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s
+
+define amdgpu_kernel void @foo(ptr %0) {
+; CHECK-LABEL: foo:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_mov_b32_e32 v3, v2
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
+; CHECK-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; CHECK-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
+; CHECK-NEXT: s_endpgm
+entry:
+ %1 = getelementptr inbounds i8, ptr %0, i64 4
+ store i32 0, ptr %0, align 4
+ store i64 0, ptr %1, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
index 70e2987454192..4c68c4519302a 100644
--- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
+++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir
@@ -93,3 +93,12 @@ body: |
bb.0:
$vgpr0_vgpr1 = V_MOV_B64_PSEUDO 4575657222473777152, implicit $exec
...
+
+# GCN-LABEL: name: v_mov_b64_misalign
+# GCN: $vgpr5 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+# GCN: $vgpr6 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr5_vgpr6
+name: v_mov_b64_misalign
+body: |
+ bb.0:
+ $vgpr5_vgpr6 = V_MOV_B64_PSEUDO 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
index df3e780c61f46..955cf0dbe38d4 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-convergent.mir
@@ -1,11 +1,16 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+
+# machine-sink must not sink WMMA* instructions. Ensure that they are
+# marked as convergent so that machine-sink leaves them in their
+# defining block.
+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx12-generic -run-pass=machine-sink %s -o - | FileCheck %s
---
-name: wmma_test
+name: wmma_test_WMMA_F32_16X16X16_F16_w32_threeaddr
tracksRegLiveness: true
body: |
- ; CHECK-LABEL: name: wmma_test
+ ; CHECK-LABEL: name: wmma_test_WMMA_F32_16X16X16_F16_w32_threeaddr
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
@@ -40,3 +45,147 @@ body: |
S_ENDPGM 0
...
+
+---
+name: wmma_test_V_WMMA_F32_16X16X16_F16_twoaddr_w32
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_F32_16X16X16_F16_twoaddr_w32
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc:vreg_256 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: wmma_test_V_WMMA_I32_16X16X16_IU8_twoaddr_w32
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_I32_16X16X16_IU8_twoaddr_w32
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc:vreg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: %vsrc2:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc2, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc:vreg_128 = IMPLICIT_DEF
+ %vsrc2:vreg_256 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_I32_16X16X16_IU8_twoaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc2, 0, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: wmma_test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_BF16_16X16X16_BF16_threeaddr_w32
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc:vreg_256 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_BF16_16X16X16_BF16_threeaddr_w32 8, %vsrc, 8, %vsrc, 8, %vsrc, 0, 0, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
+
+---
+name: wmma_test_V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr
+tracksRegLiveness: true
+body: |
+ ; CHECK-LABEL: name: wmma_test_V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vsrc256:vreg_256 = IMPLICIT_DEF
+ ; CHECK-NEXT: %vsrc512:vreg_512 = IMPLICIT_DEF
+ ; CHECK-NEXT: %ssrc:sreg_32 = IMPLICIT_DEF
+ ; CHECK-NEXT: early-clobber %vdst:vreg_256 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr %vsrc512, %vsrc512, 8, %vsrc256, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: %sdst:sreg_32 = SI_IF %ssrc, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_BRANCH %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.2(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: %vcopy:vgpr_32 = COPY %vdst.sub0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: SI_END_CF %sdst, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ %vsrc256:vreg_256 = IMPLICIT_DEF
+ %vsrc512:vreg_512 = IMPLICIT_DEF
+ %ssrc:sreg_32 = IMPLICIT_DEF
+ early-clobber %vdst:vreg_256 = V_WMMA_SCALE_F32_16X16X128_F8F6F4_f8_f8_w32_twoaddr %vsrc512, %vsrc512, 8, %vsrc256, 0, 0, 1, 2, 1, 1, 0, 0, 0, 0, 0, 0, implicit $exec
+ %sdst:sreg_32 = SI_IF %ssrc:sreg_32, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_BRANCH %bb.1
+ bb.1:
+ %vcopy:vgpr_32 = COPY %vdst.sub0
+ bb.2:
+ SI_END_CF %sdst:sreg_32, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
+ S_ENDPGM 0
+...
diff --git a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
index 6688379665924..7825d380d6e50 100644
--- a/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
+++ b/llvm/test/Transforms/InstCombine/simplify-demanded-fpclass.ll
@@ -2041,3 +2041,652 @@ define nofpclass(nan) float @ret_nonan_fmul_select_nan_other_use_commute(i1 %con
%nan.user = fmul float %select, %y
ret float %nan.user
}
+
+define nofpclass(snan) float @copysign_src_known_positive__sign_known_negative_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float nofpclass(nan pinf pnorm psub pzero) %always.negative, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(snan) float @copysign_src_known_positive__sign_known_negative_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[ALWAYS_POSITIVE]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(snan) float @copysign_src_known_negative__sign_known_negative_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative0, float nofpclass(nan pinf pnorm psub pzero) %always.negative1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(snan) float @copysign_src_known_negative__sign_known_negative_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE0:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE0]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE0]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative0, float %always.negative1)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(snan) float @copysign_src_known_positive__sign_known_positive_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive0, float nofpclass(nan ninf nnorm nsub nzero) %always.positive1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(snan) float @copysign_src_known_positive__sign_known_positive_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE0:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: store float [[ALWAYS_POSITIVE0]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE0]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive0, float %always.positive1)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan) float @ret_nonan__copysign_src_known_negative_or_nan__sign_known_negative_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative0.or.nan, float nofpclass(nan pinf pnorm psub pzero) %always.negative1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_nonan__copysign_src_known_negative_or_nan__sign_known_negative_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE0_OR_NAN:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE0_OR_NAN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE0_OR_NAN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative0.or.nan, float %always.negative1)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign__sign_known_negative_multiple_use(float %unknown, float nofpclass(nan pinf pnorm psub pzero) %always.negative, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign__sign_known_negative_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float poison
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign__sign_known_positive_multiple_use(float %unknown, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign__sign_known_positive_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign__sign_known_negative_multiple_use(float %unknown, float nofpclass(nan pinf pnorm psub pzero) %always.negative, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign__sign_known_negative_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign__sign_known_positive_multiple_use(float %unknown, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign__sign_known_positive_multiple_use
+; CHECK-SAME: (float [[UNKNOWN:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float poison
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_multiple_use(float %unknown.mag, float %unknown.sign, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_multiple_use
+; CHECK-SAME: (float [[UNKNOWN_MAG:%.*]], float [[UNKNOWN_SIGN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[UNKNOWN_MAG]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown.mag, float %unknown.sign)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_multiple_use(float %unknown.mag, float %unknown.sign, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_multiple_use
+; CHECK-SAME: (float [[UNKNOWN_MAG:%.*]], float [[UNKNOWN_SIGN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[UNKNOWN_MAG]], float [[UNKNOWN_SIGN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %unknown.mag, float %unknown.sign)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Fold to direct use of %always.positive
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; Fold to direct use of %always.positive
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Fold to direct use of %always.negative
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+; Fold to direct use of %always.negative
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Negative test
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+; Negative test
+define nofpclass(nan ninf nnorm nsub nzero) float @ret_only_positive__copysign_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @ret_only_positive__copysign_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Negative test
+define nofpclass(nan pinf pnorm psub pzero) float @ret_only_negative__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf pzero psub pnorm) float @ret_only_negative__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[ALWAYS_POSITIVE]]
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; Could still be positive for +inf input
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; The ninf flag gives the missing no-infs
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive, float %unknown)
+ ret float %copysign
+}
+
+; The ninf flag gives the missing no-infs
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan ninf nnorm nsub nzero) %always.positive, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_ninf_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_POSITIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; No pinf from argument
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown(float nofpclass(nan inf nnorm nsub nzero) %always.positive.no.inf, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ALWAYS_POSITIVE_NO_INF:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_NO_INF]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.no.inf, float %unknown)
+ ret float %copysign
+}
+
+; No pinf from argument
+define nofpclass(nan nnorm nsub nzero) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown_multiple_use(float nofpclass(nan inf nnorm nsub nzero) %always.positive.no.inf, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan nzero nsub nnorm) float @ret_only_positive_or_ninf__copysign_src_known_positive_no_inf__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan inf nzero nsub nnorm) [[ALWAYS_POSITIVE_NO_INF:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_NO_INF]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.no.inf, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan ninf nnorm nsub) float @ret_only_positive_or_zero__copysign_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub) %always.positive.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_only_positive_or_zero__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Ignore 0 mismatch with nsz
+define nofpclass(nan ninf nnorm nsub) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown(float nofpclass(nan ninf nnorm nsub) %always.positive.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE_OR_ZERO]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.positive.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Do not use nsz with multiple uses
+define nofpclass(nan ninf nnorm nsub) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan ninf nnorm nsub) %always.positive.or.zero, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf nsub nnorm) float @ret_only_positive_or_zero__copysign_nsz_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nsz float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.positive.or.zero, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(nan pinf pnorm psub) float @ret_only_negative_or_pzero__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub) %always.negative.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_only_negative_or_pzero__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Ignore 0 mismatch with nsz
+define nofpclass(nan pinf pnorm psub) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub) %always.negative.or.zero, float %unknown) {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_ZERO]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.negative.or.zero, float %unknown)
+ ret float %copysign
+}
+
+; Do not use nsz with multiple uses
+define nofpclass(nan pinf pnorm psub) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub) %always.negative.or.zero, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf psub pnorm) float @ret_only_negative_or_pzero__copysign_nsz_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_ZERO:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nsz float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_ZERO]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nsz float @llvm.copysign.f32(float %always.negative.or.zero, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_src_known_negative_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_NAN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_negative_or_nan__copysign_nnan_src_known_negative_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ ret float %copysign
+}
+
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown_multiple_use(float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_src_known_positive_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]]) {
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_NAN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ ret float %copysign
+}
+
+; Take nnan from flag
+define nofpclass(pinf pnorm psub pzero) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown_multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float %unknown, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(pinf pzero psub pnorm) float @ret_only_positive_or_nan__copysign_nnan_src_known_positive_or_nan__sign_unknown_multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float [[UNKNOWN:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call nnan float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[UNKNOWN]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call nnan float @llvm.copysign.f32(float %always.negative.or.nan, float %unknown)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; We can only tell the sign bit is negative due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.negative.or.nan, float %always.negative.or.pinf)
+ ret float %copysign
+}
+
+; We can only tell the sign bit is negative due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(pinf pnorm psub pzero) %always.negative.or.nan, float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(pinf pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_NAN:%.*]], float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_NEGATIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.negative.or.nan, float %always.negative.or.pinf)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; We can only tell the sign bit is positive due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf(float nofpclass(ninf nnorm nsub nzero) %always.positive.or.nan, float nofpclass(nan nnorm nsub nzero) %always.negative.or.pinf) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ALWAYS_POSITIVE_OR_NAN:%.*]], float nofpclass(nan nzero nsub nnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive.or.nan, float %always.negative.or.pinf)
+ ret float %copysign
+}
+
+; We can only tell the sign bit is positive due to the ninf flag
+define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf__multiple_use(float nofpclass(ninf nnorm nsub nzero) %always.positive.or.nan, float nofpclass(nan nnorm nsub nzero) %always.negative.or.pinf, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan) float @ret_no_nan__copysign_ninf__src_known_negative_or_nan__sign_known_positive_or_ninf__multiple_use
+; CHECK-SAME: (float nofpclass(ninf nzero nsub nnorm) [[ALWAYS_POSITIVE_OR_NAN:%.*]], float nofpclass(nan nzero nsub nnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call ninf float @llvm.copysign.f32(float [[ALWAYS_POSITIVE_OR_NAN]], float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call ninf float @llvm.copysign.f32(float %always.positive.or.nan, float %always.negative.or.pinf)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-inf return
+define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_NEGATIVE_OR_PINF]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pinf, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; No-pinf result not sufficient
+define nofpclass(nan pinf) float @ret_no_nan_no_pinf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan pinf) float @ret_no_nan_no_pinf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pinf, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; No-ninf result not sufficient
+define nofpclass(nan ninf) float @ret_no_nan_no_ninf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use(float nofpclass(nan pnorm psub pzero) %always.negative.or.pinf, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan ninf) float @ret_no_nan_no_ninf__copysign__src_known_negative_or_nan__sign_known_negative_or_pinf__multiple_use
+; CHECK-SAME: (float nofpclass(nan pzero psub pnorm) [[ALWAYS_NEGATIVE_OR_PINF:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PINF]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pinf, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-norm return
+define nofpclass(nan norm) float @ret_no_nan_no_norm__copysign__src_known_negative_or_nan__sign_known_negative_or_pnorm__multiple_use(float nofpclass(nan pinf psub pzero) %always.negative.or.pnorm, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan norm) float @ret_no_nan_no_norm__copysign__src_known_negative_or_nan__sign_known_negative_or_pnorm__multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero psub) [[ALWAYS_NEGATIVE_OR_PNORM:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PNORM]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pnorm, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-sub return
+define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_negative_or_nan__sign_known_negative_or_psub__multiple_use(float nofpclass(nan pinf pnorm pzero) %always.negative.or.psub, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_negative_or_nan__sign_known_negative_or_psub__multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf pzero pnorm) [[ALWAYS_NEGATIVE_OR_PSUB:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PSUB]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.psub, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is negative due to no-zero return
+define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_negative_or_nan__sign_known_negative_or_pzero__multiple_use(float nofpclass(nan pinf pnorm psub) %always.negative.or.pzero, float nofpclass(nan pinf pnorm psub pzero) %always.negative, float %unknown0, float %unknown1, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_negative_or_nan__sign_known_negative_or_pzero__multiple_use
+; CHECK-SAME: (float nofpclass(nan pinf psub pnorm) [[ALWAYS_NEGATIVE_OR_PZERO:%.*]], float nofpclass(nan pinf pzero psub pnorm) [[ALWAYS_NEGATIVE:%.*]], float [[UNKNOWN0:%.*]], float [[UNKNOWN1:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_NEGATIVE_OR_PZERO]])
+; CHECK-NEXT: [[COPYSIGN:%.*]] = fneg float [[TMP1]]
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.negative.or.pzero, float %always.negative)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-inf return
+define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_positive_or_nan__sign_known_positive_or_ninf__multiple_use(float nofpclass(nan nnorm nsub nzero) %always.positive.or.ninf, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan inf) float @ret_no_nan_no_inf__copysign__src_known_positive_or_nan__sign_known_positive_or_ninf__multiple_use
+; CHECK-SAME: (float nofpclass(nan nzero nsub nnorm) [[ALWAYS_POSITIVE_OR_NINF:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NINF]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[ALWAYS_POSITIVE_OR_NINF]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.ninf, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-norm return
+define nofpclass(nan norm) float @ret_no_nan_no_norm__copysign__src_known_positive_or_nan__sign_known_positive_or_norm__multiple_use(float nofpclass(nan ninf nsub nzero) %always.positive.or.nnorm, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan norm) float @ret_no_nan_no_norm__copysign__src_known_positive_or_nan__sign_known_positive_or_norm__multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nsub) [[ALWAYS_POSITIVE_OR_NNORM:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NNORM]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.nnorm, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-sub return
+define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_positive_or_nan__sign_known_positive_or_sub__multiple_use(float nofpclass(nan ninf nnorm nzero) %always.positive.or.nsub, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan sub) float @ret_no_nan_no_sub__copysign__src_known_positive_or_nan__sign_known_positive_or_sub__multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nzero nnorm) [[ALWAYS_POSITIVE_OR_NSUB:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NSUB]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.nsub, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
+
+; Can infer sign bit is positive due to no-zero return
+define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_positive_or_nan__sign_known_positive_or_zero__multiple_use(float nofpclass(nan ninf nnorm nsub) %always.positive.or.nzero, float nofpclass(nan ninf nnorm nsub nzero) %always.positive, ptr %ptr) {
+; CHECK-LABEL: define nofpclass(nan zero) float @ret_no_nan_no_zero__copysign__src_known_positive_or_nan__sign_known_positive_or_zero__multiple_use
+; CHECK-SAME: (float nofpclass(nan ninf nsub nnorm) [[ALWAYS_POSITIVE_OR_NZERO:%.*]], float nofpclass(nan ninf nzero nsub nnorm) [[ALWAYS_POSITIVE:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT: [[COPYSIGN:%.*]] = call float @llvm.fabs.f32(float [[ALWAYS_POSITIVE_OR_NZERO]])
+; CHECK-NEXT: store float [[COPYSIGN]], ptr [[PTR]], align 4
+; CHECK-NEXT: ret float [[COPYSIGN]]
+;
+ %copysign = call float @llvm.copysign.f32(float %always.positive.or.nzero, float %always.positive)
+ store float %copysign, ptr %ptr
+ ret float %copysign
+}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index f90b26013fcbc..f1dee958fa09c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
-; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
-; RUN: opt --mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
-; RUN: opt --mattr=+sve -vectorizer-maximize-bandwidth -passes=loop-vectorize -force-vector-width=8 -scalable-vectorization=preferred -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+neon,+dotprod -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve -S < %s | FileCheck %s --check-prefixes=CHECK-SVE
+; RUN: opt -passes=loop-vectorize -enable-epilogue-vectorization=false -mattr=+sve \
+; RUN: -vectorizer-maximize-bandwidth -force-vector-width=8 -S < %s | FileCheck %s --check-prefixes=CHECK-SVE-MAXBW
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
target triple = "aarch64-none-unknown-elf"
@@ -158,7 +159,7 @@ for.body: ; preds = %for.body.preheader,
%sub = sub i32 %add, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -309,7 +310,7 @@ for.body: ; preds = %for.body.preheader,
%add.2 = add i32 %add, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -465,7 +466,7 @@ for.body: ; preds = %for.body.preheader,
%add = add i32 %sub, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -625,7 +626,7 @@ for.body: ; preds = %for.body.preheader,
%sub.2 = sub i32 %sub, %mul.ac
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -787,7 +788,7 @@ for.body: ; preds = %for.body.preheader,
%sub.2 = add i32 %add, %mul.bc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 {
@@ -955,7 +956,7 @@ for.body: ; preds = %for.body.preheader,
%sub.2 = sub i32 %add, %mul.bc
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
@@ -1103,7 +1104,7 @@ for.body: ; preds = %for.body.preheader,
%add2 = add i32 %add, %c.ext
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
@@ -1235,7 +1236,7 @@ for.body: ; preds = %for.body.preheader,
%add2 = add i32 %add, %b.ext
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
@@ -1384,11 +1385,11 @@ for.body: ; preds = %for.body.preheader,
%add2 = add i32 %add, %mul.ab
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
%exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
- br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0
}
attributes #0 = { vscale_range(1,16) }
-
-!0 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
-!1 = distinct !{!0}
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!2 = !{!"llvm.loop.interleave.count", i32 1}
>From 179ef1fe99a24321d1ea9f572cf1c3da2be74b30 Mon Sep 17 00:00:00 2001
From: Marina Taylor <marina_taylor at apple.com>
Date: Fri, 30 Jan 2026 15:03:54 +0000
Subject: [PATCH 2/5] NFC: Rename CodeGenOptions::StackUsageOutput to
StackUsageFile (#178898)
Preparation for #178005.
"Output" has too many different interpretations: it could be an
enabled/disabled, a file format, etc. Clarify that it's the destination
file.
---
clang/include/clang/Basic/CodeGenOptions.h | 2 +-
clang/include/clang/Options/Options.td | 2 +-
clang/lib/CodeGen/BackendUtil.cpp | 2 +-
clang/lib/Frontend/CompilerInvocation.cpp | 2 +-
llvm/include/llvm/Target/TargetOptions.h | 2 +-
llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 2 +-
6 files changed, 6 insertions(+), 6 deletions(-)
diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h
index c60ca507ff917..8ef0d87faaeaf 100644
--- a/clang/include/clang/Basic/CodeGenOptions.h
+++ b/clang/include/clang/Basic/CodeGenOptions.h
@@ -521,7 +521,7 @@ class CodeGenOptions : public CodeGenOptionsBase {
/// Name of the stack usage file (i.e., .su file) if user passes
/// -fstack-usage. If empty, it can be implied that -fstack-usage is not
/// passed on the command line.
- std::string StackUsageOutput;
+ std::string StackUsageFile;
/// Executable and command-line used to create a given CompilerInvocation.
/// Most of the time this will be the full -cc1 command.
diff --git a/clang/include/clang/Options/Options.td b/clang/include/clang/Options/Options.td
index 41a23ba4cb33d..421208a812bbc 100644
--- a/clang/include/clang/Options/Options.td
+++ b/clang/include/clang/Options/Options.td
@@ -4664,7 +4664,7 @@ def fstack_usage : Flag<["-"], "fstack-usage">, Group<f_Group>,
def stack_usage_file : Separate<["-"], "stack-usage-file">,
Visibility<[CC1Option]>,
HelpText<"Filename (or -) to write stack usage output to">,
- MarshallingInfoString<CodeGenOpts<"StackUsageOutput">>;
+ MarshallingInfoString<CodeGenOpts<"StackUsageFile">>;
def fextend_variable_liveness_EQ : Joined<["-"], "fextend-variable-liveness=">,
Group<f_Group>, Visibility<[ClangOption, CC1Option]>,
HelpText<"Extend the liveness of user variables through optimizations to "
diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp
index d411ef1bf8763..b286ff359ec40 100644
--- a/clang/lib/CodeGen/BackendUtil.cpp
+++ b/clang/lib/CodeGen/BackendUtil.cpp
@@ -453,7 +453,7 @@ static bool initTargetOptions(const CompilerInstance &CI,
Options.EmulatedTLS = CodeGenOpts.EmulatedTLS;
Options.DebuggerTuning = CodeGenOpts.getDebuggerTuning();
Options.EmitStackSizeSection = CodeGenOpts.StackSizeSection;
- Options.StackUsageOutput = CodeGenOpts.StackUsageOutput;
+ Options.StackUsageFile = CodeGenOpts.StackUsageFile;
Options.EmitAddrsig = CodeGenOpts.Addrsig;
Options.ForceDwarfFrameSection = CodeGenOpts.ForceDwarfFrameSection;
Options.EmitCallGraphSection = CodeGenOpts.CallGraphSection;
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 5a79634773866..2af4a7f536623 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -2244,7 +2244,7 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args,
if (UsingSampleProfile)
NeedLocTracking = true;
- if (!Opts.StackUsageOutput.empty())
+ if (!Opts.StackUsageFile.empty())
NeedLocTracking = true;
// If the user requested a flag that requires source locations available in
diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h
index 7af50691ec0e5..a9b86626cf598 100644
--- a/llvm/include/llvm/Target/TargetOptions.h
+++ b/llvm/include/llvm/Target/TargetOptions.h
@@ -368,7 +368,7 @@ class TargetOptions {
/// Name of the stack usage file (i.e., .su file) if user passes
/// -fstack-usage. If empty, it can be implied that -fstack-usage is not
/// passed on the command line.
- std::string StackUsageOutput;
+ std::string StackUsageFile;
/// If greater than 0, override TargetLoweringBase::PrefLoopAlignment.
unsigned LoopAlignment = 0;
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 420f09c514d63..b64c7f5b44033 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1672,7 +1672,7 @@ void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) {
}
void AsmPrinter::emitStackUsage(const MachineFunction &MF) {
- const std::string &OutputFilename = MF.getTarget().Options.StackUsageOutput;
+ const std::string &OutputFilename = MF.getTarget().Options.StackUsageFile;
// OutputFilename empty implies -fstack-usage is not passed.
if (OutputFilename.empty())
>From 561032845b522bb340fd3988ca1d8e5d86fdd632 Mon Sep 17 00:00:00 2001
From: Joe Nash <joseph.nash at amd.com>
Date: Fri, 30 Jan 2026 10:13:33 -0500
Subject: [PATCH 3/5] [AMDGPU] Fix VOPD checks for commuting OpX and OpY
(#178772)
We need to check that OpX does not write the sources of OpY, but when
OpX and OpY are swapped with respect to program order, that check was
not being swapped accordingly.
The checks on gfx1250 can be relaxed slightly; that is planned for a
future patch.
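Roughly, the pairing rule being fixed can be sketched as follows. This is a
standalone toy model, not the in-tree code; Instruction, writesAnyOf and
canPairAsVOPD are made-up names used only for illustration:

#include <algorithm>
#include <cassert>
#include <vector>

struct Instruction {
  std::vector<int> Defs; // registers written by the instruction
  std::vector<int> Uses; // registers read by the instruction
};

// True if I writes any register in Regs.
static bool writesAnyOf(const Instruction &I, const std::vector<int> &Regs) {
  return std::any_of(Regs.begin(), Regs.end(), [&](int R) {
    return std::find(I.Defs.begin(), I.Defs.end(), R) != I.Defs.end();
  });
}

// First precedes Second in program order. The hardware rule is that OpX
// must not write a source of OpY. Keeping program order (X=First,
// Y=Second) needs only that check. Commuting (X=Second, Y=First) needs an
// additional check: First must not write a source of Second, otherwise the
// swap would break the original program-order data dependency.
static bool canPairAsVOPD(const Instruction &First, const Instruction &Second,
                          bool Commute) {
  if (!Commute)
    return !writesAnyOf(First, Second.Uses);  // X=First, Y=Second
  if (writesAnyOf(First, Second.Uses))        // original data dependency
    return false;
  return !writesAnyOf(Second, First.Uses);    // X=Second, Y=First
}

int main() {
  Instruction First{{1}, {2}};  // v1 = op v2
  Instruction Second{{3}, {1}}; // v3 = op v1, reads what First wrote
  assert(!canPairAsVOPD(First, Second, /*Commute=*/false));
  assert(!canPairAsVOPD(First, Second, /*Commute=*/true)); // case fixed here
  Instruction A{{1}, {2}}, B{{3}, {4}};       // independent instructions
  assert(canPairAsVOPD(A, B, false) && canPairAsVOPD(A, B, true));
  return 0;
}

This mirrors the shape of the change below: the pairing lambda first rejects
the commuted order when the earlier instruction defines a register the later
one reads, and only then applies the X/Y register constraints in the swapped
order.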
---------
Co-authored-by: Matt Arsenault <arsenm2 at gmail.com>
---
llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 26 ++-
llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 58 +++---
.../test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll | 5 +-
.../AMDGPU/GlobalISel/insertelement.i16.ll | 10 +-
.../AMDGPU/GlobalISel/insertelement.i8.ll | 10 +-
llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll | 36 ++--
llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll | 2 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll | 6 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll | 12 +-
.../CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll | 24 ++-
.../CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll | 24 ++-
.../AMDGPU/amdgpu-cs-chain-fp-nosave.ll | 8 +-
.../atomic_optimizations_struct_buffer.ll | 171 ++++++------------
llvm/test/CodeGen/AMDGPU/bf16.ll | 45 +++--
.../AMDGPU/expand-waitcnt-profiling.ll | 32 ++--
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 13 +-
llvm/test/CodeGen/AMDGPU/mad_64_32.ll | 2 +-
.../match-perm-extract-vector-elt-bug.ll | 6 +-
.../CodeGen/AMDGPU/narrow_math_for_and.ll | 6 +-
.../CodeGen/AMDGPU/vopd-combine-gfx1250.mir | 107 +++++++++++
20 files changed, 342 insertions(+), 261 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index 27f40f1705bb4..72805aa9165b6 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -134,6 +134,7 @@ class GCNCreateVOPD {
LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
const SIInstrInfo *SII = ST->getInstrInfo();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
bool Changed = false;
unsigned EncodingFamily = AMDGPU::getVOPDEncodingFamily(*ST);
bool HasVOPD3 = ST->hasVOPD3();
@@ -160,16 +161,25 @@ class GCNCreateVOPD {
llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD =
AMDGPU::getCanBeVOPD(Opc2, EncodingFamily, VOPD3);
- if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+ if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y &&
+ llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI, VOPD3)) {
CI = VOPDCombineInfo(FirstMI, SecondMI, VOPD3);
- else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+ return true;
+ }
+ // We can try swapping the order of the instructions, but in that case
+ // neither instruction can write to a register the other reads from.
+ // OpX cannot write something OpY reads because that is the hardware
+ // rule, and OpY cannot write what OpX reads because that would
+ // violate the data dependency in the original order.
+ for (const auto &Use : SecondMI->uses())
+ if (Use.isReg() && FirstMI->modifiesRegister(Use.getReg(), TRI))
+ return false;
+ if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X &&
+ llvm::checkVOPDRegConstraints(*SII, *SecondMI, *FirstMI, VOPD3)) {
CI = VOPDCombineInfo(SecondMI, FirstMI, VOPD3);
- else
- return false;
- // checkVOPDRegConstraints cares about program order, but doReplace
- // cares about X-Y order in the constituted VOPD
- return llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI,
- VOPD3);
+ return true;
+ }
+ return false;
};
if (checkVOPD(false) || (HasVOPD3 && checkVOPD(true))) {
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 9e66909e41052..663f53889ac74 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -35,18 +35,18 @@ using namespace llvm;
#define DEBUG_TYPE "gcn-vopd-utils"
bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
- const MachineInstr &FirstMI,
- const MachineInstr &SecondMI, bool IsVOPD3) {
+ const MachineInstr &MIX,
+ const MachineInstr &MIY, bool IsVOPD3) {
namespace VOPD = AMDGPU::VOPD;
- const MachineFunction *MF = FirstMI.getMF();
+ const MachineFunction *MF = MIX.getMF();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
if (IsVOPD3 && !ST.hasVOPD3())
return false;
- if (!IsVOPD3 && (TII.isVOP3(FirstMI) || TII.isVOP3(SecondMI)))
+ if (!IsVOPD3 && (TII.isVOP3(MIX) || TII.isVOP3(MIY)))
return false;
- if (TII.isDPP(FirstMI) || TII.isDPP(SecondMI))
+ if (TII.isDPP(MIX) || TII.isDPP(MIY))
return false;
const SIRegisterInfo *TRI = dyn_cast<SIRegisterInfo>(ST.getRegisterInfo());
@@ -61,32 +61,24 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
UniqueLiterals.push_back(&Op);
};
SmallVector<Register> UniqueScalarRegs;
- assert([&]() -> bool {
- for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
- MII != FirstMI.getParent()->instr_end(); ++MII) {
- if (&*MII == &SecondMI)
- return true;
- }
- return false;
- }() && "Expected FirstMI to precede SecondMI");
- // Cannot pair dependent instructions
- for (const auto &Use : SecondMI.uses())
- if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI))
+
+ // MIX must not modify any registers used by MIY.
+ for (const auto &Use : MIY.uses())
+ if (Use.isReg() && MIX.modifiesRegister(Use.getReg(), TRI))
return false;
auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
- const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? FirstMI : SecondMI;
+ const MachineInstr &MI = (OpcodeIdx == VOPD::X) ? MIX : MIY;
const MachineOperand &Operand = MI.getOperand(OperandIdx);
if (Operand.isReg() && TRI->isVectorRegister(MRI, Operand.getReg()))
return Operand.getReg();
return Register();
};
- auto InstInfo =
- AMDGPU::getVOPDInstInfo(FirstMI.getDesc(), SecondMI.getDesc());
+ auto InstInfo = AMDGPU::getVOPDInstInfo(MIX.getDesc(), MIY.getDesc());
for (auto CompIdx : VOPD::COMPONENTS) {
- const MachineInstr &MI = (CompIdx == VOPD::X) ? FirstMI : SecondMI;
+ const MachineInstr &MI = (CompIdx == VOPD::X) ? MIX : MIY;
const MachineOperand &Src0 = *TII.getNamedOperand(MI, AMDGPU::OpName::src0);
if (Src0.isReg()) {
@@ -153,8 +145,8 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
// On GFX12+ if both OpX and OpY are V_MOV_B32 then OPY uses SRC2
// source-cache.
bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 &&
- FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
- SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32;
+ MIX.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ MIY.getOpcode() == AMDGPU::V_MOV_B32_e32;
bool AllowSameVGPR = ST.hasGFX1250Insts();
if (InstInfo.hasInvalidOperand(getVRegIdx, *TRI, SkipSrc, AllowSameVGPR,
@@ -163,22 +155,23 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
if (IsVOPD3) {
// BITOP3 can be converted to DUAL_BITOP2 only if src2 is zero.
- if (AMDGPU::hasNamedOperand(SecondMI.getOpcode(), AMDGPU::OpName::bitop3)) {
+ // MIX check is only relevant to scheduling?
+ if (AMDGPU::hasNamedOperand(MIX.getOpcode(), AMDGPU::OpName::bitop3)) {
const MachineOperand &Src2 =
- *TII.getNamedOperand(SecondMI, AMDGPU::OpName::src2);
+ *TII.getNamedOperand(MIX, AMDGPU::OpName::src2);
if (!Src2.isImm() || Src2.getImm())
return false;
}
- if (AMDGPU::hasNamedOperand(FirstMI.getOpcode(), AMDGPU::OpName::bitop3)) {
+ if (AMDGPU::hasNamedOperand(MIY.getOpcode(), AMDGPU::OpName::bitop3)) {
const MachineOperand &Src2 =
- *TII.getNamedOperand(FirstMI, AMDGPU::OpName::src2);
+ *TII.getNamedOperand(MIY, AMDGPU::OpName::src2);
if (!Src2.isImm() || Src2.getImm())
return false;
}
}
- LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
- << "\n\tY: " << SecondMI << "\n");
+ LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << MIX
+ << "\n\tY: " << MIY << "\n");
return true;
}
@@ -208,6 +201,15 @@ static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
(FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
return false;
+ assert([&]() -> bool {
+ for (auto MII = MachineBasicBlock::const_iterator(FirstMI);
+ MII != FirstMI->getParent()->instr_end(); ++MII) {
+ if (&*MII == &SecondMI)
+ return true;
+ }
+ return false;
+ }() && "Expected FirstMI to precede SecondMI");
+
return checkVOPDRegConstraints(STII, *FirstMI, SecondMI, VOPD3);
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll
index fa72eb72fd723..77b05dc460168 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fptrunc.ll
@@ -404,8 +404,9 @@ define amdgpu_ps half @fptrunc_f64_to_f16_div(double %a) {
; GFX1250-NEXT: v_or_b32_e32 v3, v5, v3
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo
-; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 2, v3 :: v_dual_bitop2_b32 v4, 7, v3 bitop3:0x40
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_and_b32_e32 v4, 7, v3
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 2, v3
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4
; GFX1250-NEXT: v_cmp_lt_i32_e64 s0, 5, v4
; GFX1250-NEXT: s_or_b32 s0, vcc_lo, s0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 0e1bbbd1ea92b..c7b63f749e950 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -437,8 +437,9 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 4, v1
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -709,8 +710,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 4598bcc04a505..7c212f1e110d1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -1224,8 +1224,9 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 3, v1
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v1, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v1, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v1, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
@@ -1496,8 +1497,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: v_lshlrev_b32_e64 v2, v0, 0xff
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v3, v0, v1
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX11-NEXT: v_lshlrev_b32_e32 v3, v0, v1
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: v_mov_b32_e32 v1, 0
; GFX11-NEXT: v_not_b32_e32 v2, v2
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index f589919992335..ea8023a3a227b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -1295,18 +1295,12 @@ define i64 @v_shl_i64_63(i64 %value) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_shl_i64_63:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 31, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_shl_i64_63:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 31, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10PLUS-LABEL: v_shl_i64_63:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 31, v0
+; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = shl i64 %value, 63
ret i64 %result
}
@@ -1319,18 +1313,12 @@ define i64 @v_shl_i64_33(i64 %value) {
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_shl_i64_33:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: v_shl_i64_33:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_lshlrev_b32 v1, 1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10PLUS-LABEL: v_shl_i64_33:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10PLUS-NEXT: v_lshlrev_b32_e32 v1, 1, v0
+; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0
+; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
%result = shl i64 %value, 33
ret i64 %result
}
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
index e5992e398ddbd..adcc86532e46d 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
; GFX1250-SDAG-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0
; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_lshlrev_b32 v1, 20, v0
+; GFX1250-SDAG-NEXT: v_dual_lshlrev_b32 v1, 20, v0 :: v_dual_mov_b32 v0, s0
; GFX1250-SDAG-NEXT: s_cmp_lg_u32 s0, -1
; GFX1250-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0
; GFX1250-SDAG-NEXT: v_add_nc_u64_e32 v[0:1], src_flat_scratch_base_lo, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
index 15bef1949b27f..1b1f7fcadc540 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll
@@ -208038,10 +208038,10 @@ define inreg <64 x half> @bitcast_v64bf16_to_v64f16_scalar(<64 x bfloat> inreg %
; GFX11-FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v3, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v24
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_nc_u32 v0, v1, v5
-; GFX11-FAKE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v6
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, v1, v5
+; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v2, v4 :: v_dual_add_f32 v2, 0x40c00000, v6
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v24
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v0, 0x7fff, v0
; GFX11-FAKE16-NEXT: v_or_b32_e32 v6, 0x400000, v5
; GFX11-FAKE16-NEXT: v_add_f32_e32 v3, 0x40c00000, v3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
index 322689c91425b..194ee0705a921 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.768bit.ll
@@ -12680,7 +12680,8 @@ define inreg <48 x i16> @bitcast_v24f32_to_v48i16_scalar(<24 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
@@ -16191,7 +16192,8 @@ define inreg <48 x half> @bitcast_v24f32_to_v48f16_scalar(<24 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v16
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v33, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v25
@@ -27727,7 +27729,8 @@ define inreg <48 x i16> @bitcast_v12f64_to_v48i16_scalar(<12 x double> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
@@ -31166,7 +31169,8 @@ define inreg <48 x half> @bitcast_v12f64_to_v48f16_scalar(<12 x double> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v20
; GFX11-FAKE16-NEXT: v_and_b32_e32 v20, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v26, v26, 16, v22
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
index 911c911fa1ad4..60ce818302ce7 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.832bit.ll
@@ -13756,10 +13756,12 @@ define inreg <52 x i16> @bitcast_v26f32_to_v52i16_scalar(<26 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
@@ -17627,10 +17629,12 @@ define inreg <52 x half> @bitcast_v26f32_to_v52f16_scalar(<26 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v49, 16, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v39, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v28, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v37, 16, v1
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, v33
@@ -38965,7 +38969,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
@@ -38980,7 +38985,8 @@ define inreg <52 x half> @bitcast_v52i16_to_v52f16_scalar(<52 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
@@ -41124,7 +41130,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v20, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v29 :: v_dual_and_b32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v31, v52, 16, v54
; GFX11-FAKE16-NEXT: v_and_b32_e32 v10, 0xffff, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v33, v66, 16, v68
@@ -41139,7 +41146,8 @@ define inreg <52 x i16> @bitcast_v52f16_to_v52i16_scalar(<52 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v3
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v6
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v31 :: v_dual_and_b32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v31
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v8, v51, 16, v10
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v10, v49, 16, v55
; GFX11-FAKE16-NEXT: v_and_b32_e32 v49, 0xffff, v12
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
index 9f342f95cd8ee..19462c5bf8a9f 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.896bit.ll
@@ -14872,7 +14872,8 @@ define inreg <56 x i16> @bitcast_v28f32_to_v56i16_scalar(<28 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v37
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
@@ -19086,7 +19087,8 @@ define inreg <56 x half> @bitcast_v28f32_to_v56f16_scalar(<28 x float> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v28
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v28, v71, 16, v20
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v20, v53, 16, v2
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v7, v37 :: v_dual_and_b32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v7
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v7, v37
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v36, 16, v22
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
@@ -32805,7 +32807,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v16
@@ -32816,7 +32819,8 @@ define inreg <56 x i16> @bitcast_v14f64_to_v56i16_scalar(<14 x double> inreg %a,
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
@@ -36930,7 +36934,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX11-FAKE16-NEXT: v_and_b32_e32 v19, 0xffff, v14
; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v22, v51, 16, v4
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v9, v29 :: v_dual_and_b32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v9
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v9, v29
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v11, v70, 16, v12
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v12, v69, 16, v15
; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 0xffff, v16
@@ -36941,7 +36946,8 @@ define inreg <56 x half> @bitcast_v14f64_to_v56f16_scalar(<14 x double> inreg %a
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v13, v68, 16, v15
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v15, v66, 16, v19
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v19, v54, 16, v1
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v6, v36 :: v_dual_and_b32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v6
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v6, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v34, 16, v23
; GFX11-FAKE16-NEXT: v_and_b32_e32 v25, 0xffff, v26
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v30, v30, 16, v27
@@ -42308,7 +42314,8 @@ define inreg <56 x half> @bitcast_v56i16_to_v56f16_scalar(<56 x i16> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
@@ -44660,7 +44667,8 @@ define inreg <56 x i16> @bitcast_v56f16_to_v56i16_scalar(<56 x half> inreg %a, i
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v18, v18, 16, v0
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v5, v35 :: v_dual_and_b32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v5
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v5, v35
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v34, v67, 16, v36
; GFX11-FAKE16-NEXT: v_lshl_or_b32 v36, v65, 16, v69
; GFX11-FAKE16-NEXT: v_and_b32_e32 v65, 0xffff, v10
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
index b5ebca1846807..87a06cfe75265 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-cs-chain-fp-nosave.ll
@@ -94,12 +94,12 @@ define amdgpu_cs_chain void @test_alloca_var(i32 %count) {
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT: v_and_b32_e32 v1, -16, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_ctz_i32_b32 s2, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s3, v1, s2
; GFX12-NEXT: s_bitset0_b32 s1, s2
; GFX12-NEXT: s_max_u32 s0, s0, s3
@@ -261,12 +261,12 @@ define amdgpu_cs_chain void @test_alloca_and_call_var(i32 %count) {
; GFX12-NEXT: s_mov_b32 s0, 0
; GFX12-NEXT: s_mov_b32 s32, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, -16, v0
+; GFX12-NEXT: v_and_b32_e32 v1, -16, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
; GFX12-NEXT: .LBB5_1: ; =>This Inner Loop Header: Depth=1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
; GFX12-NEXT: s_ctz_i32_b32 s2, s1
; GFX12-NEXT: s_wait_alu depctr_sa_sdst(0)
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_readlane_b32 s3, v1, s2
; GFX12-NEXT: s_bitset0_b32 s1, s2
; GFX12-NEXT: s_max_u32 s0, s0, s3
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
index 45b47a1f389e5..c886146b94465 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll
@@ -1135,62 +1135,34 @@ define amdgpu_kernel void @add_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11W64-LABEL: add_i32_varying_offset:
-; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W64-NEXT: s_mov_b32 s6, 0
-; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W64-NEXT: s_endpgm
-;
-; GFX11W32-LABEL: add_i32_varying_offset:
-; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W32-NEXT: s_mov_b32 s6, 0
-; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W32-NEXT: s_endpgm
-;
-; GFX12W64-LABEL: add_i32_varying_offset:
-; GFX12W64: ; %bb.0: ; %entry
-; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W64-NEXT: s_endpgm
-;
-; GFX12W32-LABEL: add_i32_varying_offset:
-; GFX12W32: ; %bb.0: ; %entry
-; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W32-NEXT: s_endpgm
+; GFX11-LABEL: add_i32_varying_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_mov_b32_e32 v2, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: add_i32_varying_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_add_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.add(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
@@ -2335,68 +2307,37 @@ define amdgpu_kernel void @sub_i32_varying_offset(ptr addrspace(1) %out, ptr add
; GFX10-NEXT: global_store_dword v0, v2, s[0:1]
; GFX10-NEXT: s_endpgm
;
-; GFX11W64-LABEL: sub_i32_varying_offset:
-; GFX11W64: ; %bb.0: ; %entry
-; GFX11W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W64-NEXT: s_mov_b32 s6, 0
-; GFX11W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX11W64-NEXT: v_mov_b32_e32 v0, s6
-; GFX11W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W64-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W64-NEXT: s_endpgm
-;
-; GFX11W32-LABEL: sub_i32_varying_offset:
-; GFX11W32: ; %bb.0: ; %entry
-; GFX11W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX11W32-NEXT: s_mov_b32 s6, 0
-; GFX11W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11W32-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX11W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
-; GFX11W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX11W32-NEXT: v_mov_b32_e32 v0, 0
-; GFX11W32-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX11W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX11W32-NEXT: s_endpgm
-;
-; GFX12W64-LABEL: sub_i32_varying_offset:
-; GFX12W64: ; %bb.0: ; %entry
-; GFX12W64-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W64-NEXT: v_and_b32_e32 v1, 0x3ff, v0
-; GFX12W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX12W64-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W64-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W64-NEXT: s_wait_loadcnt 0x0
-; GFX12W64-NEXT: s_wait_kmcnt 0x0
-; GFX12W64-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W64-NEXT: s_endpgm
-;
-; GFX12W32-LABEL: sub_i32_varying_offset:
-; GFX12W32: ; %bb.0: ; %entry
-; GFX12W32-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
-; GFX12W32-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
-; GFX12W32-NEXT: v_mov_b32_e32 v2, 1
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
-; GFX12W32-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
-; GFX12W32-NEXT: s_wait_loadcnt 0x0
-; GFX12W32-NEXT: s_wait_kmcnt 0x0
-; GFX12W32-NEXT: global_store_b32 v0, v2, s[0:1]
-; GFX12W32-NEXT: s_endpgm
+; GFX11-LABEL: sub_i32_varying_offset:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, s6
+; GFX11-NEXT: v_mov_b32_e32 v2, 1
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], 0 idxen offen glc
+; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX11-NEXT: s_endpgm
+;
+; GFX12-LABEL: sub_i32_varying_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX12-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: v_mov_b32_e32 v2, 1
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_u32 v2, v[0:1], s[0:3], null idxen offen th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
+; GFX12-NEXT: s_endpgm
entry:
%lane = call i32 @llvm.amdgcn.workitem.id.x()
%old = call i32 @llvm.amdgcn.struct.ptr.buffer.atomic.sub(i32 1, ptr addrspace(8) %inout, i32 0, i32 %lane, i32 0, i32 0)
store i32 %old, ptr addrspace(1) %out
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX11: {{.*}}
-; GFX12: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 0394ed7f89633..a5763816e58cc 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -35793,8 +35793,8 @@ define <4 x i64> @v_fptosi_v4bf16_to_v4i64(<4 x bfloat> %x) {
; GFX1250-NEXT: v_xor_b32_e32 v10, v10, v2
; GFX1250-NEXT: v_sub_nc_u64_e32 v[0:1], v[8:9], v[0:1]
; GFX1250-NEXT: v_xor_b32_e32 v9, v14, v4
-; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_bitop2_b32 v8, v7, v4 bitop3:0x14
-; GFX1250-NEXT: v_xor_b32_e32 v13, v12, v6
+; GFX1250-NEXT: v_xor_b32_e32 v8, v7, v4
+; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_bitop2_b32 v13, v12, v6 bitop3:0x14
; GFX1250-NEXT: v_xor_b32_e32 v12, v15, v6
; GFX1250-NEXT: v_sub_nc_u64_e32 v[2:3], v[10:11], v[2:3]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -37399,8 +37399,9 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v2
; GFX1250-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
@@ -38305,12 +38306,13 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
; GFX1250FAKE16-NEXT: v_min_u32_e32 v0, 1, v0
; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v6, 32, v6
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1250FAKE16-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v9 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_4)
+; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v7 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v9
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v2, v2
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX1250FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -38944,8 +38946,8 @@ define <4 x bfloat> @v_sitofp_v4i64_to_v4bf16(<4 x i64> %x) {
; GFX1250-NEXT: v_min_u32_e32 v2, 1, v2
; GFX1250-NEXT: v_sub_nc_u32_e32 v7, 32, v10
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v4, v5, v4 bitop3:0x54
-; GFX1250-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX1250-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX1250-NEXT: v_dual_sub_nc_u32 v5, 32, v9 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3)
; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v8
@@ -40580,8 +40582,9 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX1250-NEXT: v_lshlrev_b64_e32 v[0:1], v2, v[0:1]
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_min_u32_e32 v0, 1, v0
-; GFX1250-NEXT: v_dual_sub_nc_u32 v1, 32, v2 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX1250-NEXT: v_sub_nc_u32_e32 v1, 32, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX1250-NEXT: v_ldexp_f32 v0, v0, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
@@ -41294,11 +41297,12 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v8, 32, v8 :: v_dual_bitop2_b32 v2, v3, v2 bitop3:0x54
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v3, 32, v6 :: v_dual_bitop2_b32 v0, v1, v0 bitop3:0x54
-; GFX1250FAKE16-NEXT: v_dual_sub_nc_u32 v4, 32, v7 :: v_dual_bitop2_b32 v1, v5, v4 bitop3:0x54
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: v_or_b32_e32 v1, v5, v4
+; GFX1250FAKE16-NEXT: v_sub_nc_u32_e32 v4, 32, v7
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX1250FAKE16-NEXT: v_cvt_f32_u32_e32 v2, v2
; GFX1250FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX1250FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1
; GFX1250FAKE16-NEXT: v_ldexp_f32 v2, v2, v3
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -48573,9 +48577,10 @@ define <8 x bfloat> @v_fma_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat>
; GFX11FAKE16-NEXT: v_bfe_u32 v16, v11, 16, 1
; GFX11FAKE16-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX11FAKE16-NEXT: v_or_b32_e32 v17, 0x400000, v11
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v3, v14, v7
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v10, v13, v15 :: v_dual_and_b32 v7, 0xffff0000, v10
+; GFX11FAKE16-NEXT: v_and_b32_e32 v7, 0xffff0000, v10
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v10, v13, v15, vcc_lo
; GFX11FAKE16-NEXT: v_add3_u32 v12, v16, v11, 0x7fff
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v14, 16, v1
; GFX11FAKE16-NEXT: v_bfe_u32 v13, v3, 16, 1
@@ -49579,8 +49584,8 @@ define <16 x bfloat> @v_fma_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bf
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_add3_u32 v24, v28, v23, 0x7fff
; GFX11FAKE16-NEXT: v_fmac_f32_e32 v7, v26, v15
-; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11FAKE16-NEXT: v_dual_cndmask_b32 v22, v25, v27 :: v_dual_and_b32 v15, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_and_b32_e32 v15, 0xffff0000, v22
+; GFX11FAKE16-NEXT: v_cndmask_b32_e32 v22, v25, v27, vcc_lo
; GFX11FAKE16-NEXT: v_or_b32_e32 v25, 0x400000, v23
; GFX11FAKE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v23, v23
; GFX11FAKE16-NEXT: v_bfe_u32 v26, v7, 16, 1
diff --git a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
index 848a9d07084ed..07005fd020e81 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-waitcnt-profiling.ll
@@ -376,10 +376,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-EXPAND-NEXT: s_endpgm
;
@@ -393,10 +393,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX11-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(1)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
@@ -410,10 +410,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX12-EXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x1
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-EXPAND-NEXT: s_endpgm
;
@@ -427,10 +427,10 @@ define amdgpu_kernel void @test_lgkmcnt_lds_operations(ptr addrspace(3) %lds_ptr
; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v2 offset1:1
; GFX12-NOEXPAND-NEXT: ds_load_b32 v2, v2 offset:8
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x1
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v2
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v2
; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-NOEXPAND-NEXT: s_endpgm
@@ -650,9 +650,9 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX11-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-EXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-EXPAND-NEXT: s_endpgm
;
@@ -666,9 +666,9 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX11-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX11-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX11-NOEXPAND-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX11-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX11-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX11-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX11-NOEXPAND-NEXT: s_endpgm
;
@@ -682,10 +682,10 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX12-EXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-EXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX12-EXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-EXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-EXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-EXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-EXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX12-EXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-EXPAND-NEXT: s_endpgm
;
@@ -699,10 +699,10 @@ define amdgpu_kernel void @test_outoforder_lds_and_smem(ptr addrspace(3) %lds_pt
; GFX12-NOEXPAND-NEXT: s_load_b32 s0, s[0:1], 0x0
; GFX12-NOEXPAND-NEXT: ds_load_2addr_b32 v[0:1], v0 offset1:1
; GFX12-NOEXPAND-NEXT: s_wait_dscnt 0x0
-; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, v0, v1
; GFX12-NOEXPAND-NEXT: s_wait_kmcnt 0x0
; GFX12-NOEXPAND-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NOEXPAND-NEXT: v_add_nc_u32_e32 v0, s0, v0
+; GFX12-NOEXPAND-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, s0, v0
; GFX12-NOEXPAND-NEXT: global_store_b32 v1, v0, s[2:3]
; GFX12-NOEXPAND-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 23753bc5970dd..2c02d7d9b3b7d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1067,8 +1067,9 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v1
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_or_b32_e32 v4, v4, v5
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v4, 0x7e00 :: v_dual_add_nc_u32 v3, v3, v4
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v3, v3, v4
+; GFX11-FAKE16-NEXT: v_mov_b32_e32 v4, 0x7e00
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v4, vcc_lo
@@ -5106,8 +5107,8 @@ define <3 x half> @v_copysign_out_v3f16_mag_v3f64_sign_v3f16(<3 x double> %mag,
; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc_lo
; GFX11-FAKE16-NEXT: v_or_b32_e32 v12, v14, v12
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_add_nc_u32 v9, v9, v12
-; GFX11-FAKE16-NEXT: v_and_b32_e32 v15, 7, v5
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v9, v9, v12
+; GFX11-FAKE16-NEXT: v_dual_mov_b32 v12, 0x7e00 :: v_dual_and_b32 v15, 7, v5
; GFX11-FAKE16-NEXT: v_and_b32_e32 v16, 7, v11
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 2, v5
; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 2, v11
@@ -6433,11 +6434,11 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f64_sign_v4f16(<4 x double> %mag,
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v10, v6, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v12
-; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v11, 0x7c00, v13 :: v_dual_add_nc_u32 v10, v16, v11
+; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v10, v16, v11
+; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7c00, v13, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v18, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v19
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4)
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7c00, v10, vcc_lo
; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v18, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index fc8467cb73ab6..dc28b2c01c897 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1995,7 +1995,7 @@ define i64 @lshr_mad_i64_negative_2(i64 %arg0) #0 {
; GFX1250-NEXT: v_mad_nc_u64_u32 v[2:3], 0xd1, v1, v[0:1]
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 8, v1
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_sub_nc_u32 v1, v3, v0
+; GFX1250-NEXT: v_dual_sub_nc_u32 v1, v3, v0 :: v_dual_mov_b32 v0, v2
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%lsh = lshr i64 %arg0, 32
%mul = mul i64 %lsh, s0xffffff00000000d1
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 93d772fdb7854..35d7b6a7ad0e5 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -62,12 +62,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
+; GFX11-NEXT: v_and_b32_e32 v1, 0x3ff, v0
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s4, s6, 0xffff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s13, s13, s4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
index 57805063b92b1..bd9135a44371b 100644
--- a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -9,7 +9,8 @@ define i64 @narrow_add(i64 %a, i64 %b) {
; CHECK-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v2
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%zext0 = and i64 %a, 2147483647
%zext1 = and i64 %b, 2147483647
@@ -40,7 +41,8 @@ define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
; CHECK-NEXT: v_and_b32_e32 v3, 0x7ffffffe, v6
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; CHECK-NEXT: v_add_nc_u32_e32 v0, v0, v2
-; CHECK-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v2, v1, v3
+; CHECK-NEXT: v_add_nc_u32_e32 v2, v1, v3
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mov_b32_e32 v3, 0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir
index 05bbb0f54ef9e..c40b47ff4e4b4 100644
--- a/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir
+++ b/llvm/test/CodeGen/AMDGPU/vopd-combine-gfx1250.mir
@@ -4501,3 +4501,110 @@ body: |
$vgpr142 = V_FMAMK_F32 $vgpr377, 1069066811, $vgpr142, implicit $mode, implicit $exec
$vgpr145 = V_FMAC_F32_e32 1069066811, $vgpr366, $vgpr145, implicit $mode, implicit $exec
...
+
+---
+name: no_combine_opx_writes_opy_src
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: no_combine_opx_writes_opy_src
+ ; SCHED: $vgpr6 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; SCHED-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ; SCHED-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ;
+ ; PAIR-LABEL: name: no_combine_opx_writes_opy_src
+ ; PAIR: $vgpr6 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; PAIR-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ; PAIR-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ;
+ ; LOWER-LABEL: name: no_combine_opx_writes_opy_src
+ ; LOWER: $vgpr6 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; LOWER-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ; LOWER-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ $vgpr5 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $sgpr15 = IMPLICIT_DEF
+ $vgpr6 = V_LSHLREV_B32_e32 8, $vgpr5, implicit $mode, implicit $exec
+ $vgpr7 = V_XOR_B32_e32 $sgpr15, $vgpr6, implicit $exec
+...
+
+---
+name: no_combine_opx_writes_opy_src_reorder
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: no_combine_opx_writes_opy_src_reorder
+ ; SCHED: $vgpr6 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; SCHED-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ; SCHED-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ;
+ ; PAIR-LABEL: name: no_combine_opx_writes_opy_src_reorder
+ ; PAIR: $vgpr6 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; PAIR-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ; PAIR-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ ;
+ ; LOWER-LABEL: name: no_combine_opx_writes_opy_src_reorder
+ ; LOWER: $vgpr6 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr5 = IMPLICIT_DEF
+ ; LOWER-NEXT: $sgpr15 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr7 = V_XOR_B32_e32 killed $sgpr15, killed $vgpr6, implicit $exec
+ ; LOWER-NEXT: $vgpr6 = V_LSHLREV_B32_e32 8, killed $vgpr5, implicit $mode, implicit $exec
+ $vgpr5 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $sgpr15 = IMPLICIT_DEF
+ $vgpr7 = V_XOR_B32_e32 $sgpr15, $vgpr6, implicit $exec
+ $vgpr6 = V_LSHLREV_B32_e32 8, $vgpr5, implicit $mode, implicit $exec
+...
+
+---
+name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+tracksRegLiveness: true
+body: |
+ bb.0:
+
+ ; SCHED-LABEL: name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+ ; SCHED: $vgpr2 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr6 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; SCHED-NEXT: $vgpr6 = V_FMAC_F32_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; SCHED-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr0_vgpr1, implicit $mode, implicit $exec
+ ;
+ ; PAIR-LABEL: name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+ ; PAIR: $vgpr2 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr6 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; PAIR-NEXT: $vgpr6 = V_FMAC_F32_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; PAIR-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr0_vgpr1, implicit $mode, implicit $exec
+ ;
+ ; LOWER-LABEL: name: no_combine_add_f64_fmac_f32_e64_neg_overlap_dst
+ ; LOWER: $vgpr2 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr3 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr6 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr0 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr1 = IMPLICIT_DEF
+ ; LOWER-NEXT: $vgpr6 = V_FMAC_F32_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr6, 0, 0, implicit $mode, implicit $exec
+ ; LOWER-NEXT: $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, killed $vgpr0_vgpr1, implicit $mode, implicit $exec
+ $vgpr0 = IMPLICIT_DEF
+ $vgpr1 = IMPLICIT_DEF
+ $vgpr2 = IMPLICIT_DEF
+ $vgpr3 = IMPLICIT_DEF
+ $vgpr6 = IMPLICIT_DEF
+ $vgpr6 = V_FMAC_F32_e64 0, $vgpr2, 0, $vgpr3, 0, $vgpr6, 0, 0, implicit $mode, implicit $exec
+ $vgpr2_vgpr3 = V_ADD_F64_pseudo_e32 10, $vgpr0_vgpr1, implicit $mode, implicit $exec
+...
>From 4d74d93594fdcaba28d1916cabb31f4d8ba95e58 Mon Sep 17 00:00:00 2001
From: Leon Clark <Leon4116 at gmail.com>
Date: Fri, 30 Jan 2026 15:20:27 +0000
Subject: [PATCH 4/5] [VectorCombine] Trim low end of loads used in
shufflevector rebroadcasts. (#149093)
Following on from #128938, trim the low end of loads when the lowest
incoming lanes are unused by the rebroadcasting shufflevector
instructions.
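
As a minimal hypothetical sketch of the intent (value names, types and
offsets below are made up for illustration and are not taken from the
tests in this patch): if every shufflevector user reads only lanes 1
and 2 of a four-element load, the load can be shrunk to a two-element
load starting at element 1, with each mask rebased by the low offset:

  ; before: lanes 0 and 3 of the load are never used
  %v = load <4 x i32>, ptr addrspace(1) %p, align 16
  %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>

  ; after (sketch): load only the used elements and rebase the mask
  %q = getelementptr inbounds i32, ptr addrspace(1) %p, i64 1
  %w = load <2 x i32>, ptr addrspace(1) %q, align 4
  %s2 = shufflevector <2 x i32> %w, <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>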
---------
Co-authored-by: Leon Clark <leoclark at amd.com>
Co-authored-by: Simon Pilgrim <llvm-dev at redking.me.uk>
---
.../Transforms/Vectorize/VectorCombine.cpp | 26 ++++++---
.../VectorCombine/load-shufflevector.ll | 55 +++++++++++--------
2 files changed, 50 insertions(+), 31 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b3295576eb73e..d173afd24e54c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -5420,7 +5420,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
// Get the range of vector elements used by shufflevector instructions.
if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
- unsigned const NewNumElements = Indices->second + 1u;
+ unsigned const NewNumElements = (Indices->second + 1) - Indices->first;
// If the range of vector elements is smaller than the full load, attempt
// to create a smaller load.
@@ -5442,19 +5442,23 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
SmallVector<UseEntry, 4u> NewUses;
- unsigned const MaxIndex = NewNumElements * 2u;
+ unsigned const LowOffset = Indices->first;
+ unsigned const HighOffset = OldNumElements - (Indices->second + 1);
for (llvm::Use &Use : I.uses()) {
auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
ArrayRef<int> OldMask = Shuffle->getShuffleMask();
// Create entry for new use.
- NewUses.push_back({Shuffle, OldMask});
-
- // Validate mask indices.
+ NewUses.push_back({Shuffle, {}});
+ std::vector<int> &NewMask = NewUses.back().second;
for (int Index : OldMask) {
- if (Index >= static_cast<int>(MaxIndex))
+ int NewIndex = Index >= static_cast<int>(OldNumElements)
+ ? Index - LowOffset - HighOffset
+ : Index - LowOffset;
+ if (NewIndex >= static_cast<int>(NewNumElements * 2u))
return false;
+ NewMask.push_back(NewIndex);
}
// Update costs.
@@ -5463,7 +5467,7 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
OldLoadTy, OldMask, CostKind);
NewCost +=
TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
- NewLoadTy, OldMask, CostKind);
+ NewLoadTy, NewMask, CostKind);
}
LLVM_DEBUG(
@@ -5475,8 +5479,14 @@ bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
return false;
// Create new load of smaller vector.
+ Type *IndexTy = DL->getIndexType(PtrOp->getType());
+ Value *NewPtr = LowOffset > 0u
+ ? Builder.CreateInBoundsPtrAdd(
+ PtrOp, ConstantInt::get(IndexTy, LowOffset))
+ : PtrOp;
+
auto *NewLoad = cast<LoadInst>(
- Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
+ Builder.CreateAlignedLoad(NewLoadTy, NewPtr, OldLoad->getAlign()));
NewLoad->copyMetadata(I);
// Replace all uses.
diff --git a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
index 7d9393ab77f20..2b5cec9ccfde5 100644
--- a/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
+++ b/llvm/test/Transforms/VectorCombine/load-shufflevector.ll
@@ -47,8 +47,9 @@ define <4 x half> @shuffle_v4_v4f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: ret <4 x half> [[TMP1]]
;
entry:
@@ -61,8 +62,9 @@ define <8 x half> @shuffle_v4_v8f16_r1_2(ptr addrspace(1) nocapture readonly %ar
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[TMP2]], <2 x half> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x half> [[TMP1]]
;
entry:
@@ -108,13 +110,14 @@ define <4 x half> @shuffle_v4_v4f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <4 x half> @shuffle_v4_v4f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -141,13 +144,14 @@ define <8 x half> @shuffle_v4_v8f16_cond_r1_2(ptr addrspace(1) nocapture readonl
; CHECK-LABEL: define <8 x half> @shuffle_v4_v8f16_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x half>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x half> [[TMP0]], <3 x half> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x half> [[TMP1]], <2 x half> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x half> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -202,8 +206,9 @@ define <4 x i32> @shuffle_v4_v4i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> <i32 0, i32 0, i32 1, i32 1>
; CHECK-NEXT: ret <4 x i32> [[TMP1]]
;
entry:
@@ -216,8 +221,9 @@ define <8 x i32> @shuffle_v4_v8i32_r1_2(ptr addrspace(1) nocapture readonly %arg
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: ret <8 x i32> [[TMP1]]
;
entry:
@@ -296,13 +302,14 @@ define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <4 x i32> @shuffle_v4_v4i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <4 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -329,13 +336,14 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_2(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP0:%.*]] = load <3 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> zeroinitializer
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP0]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
@@ -362,13 +370,14 @@ define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(ptr addrspace(1) nocapture readonly
; CHECK-LABEL: define <8 x i32> @shuffle_v4_v8i32_cond_r1_4(
; CHECK-SAME: ptr addrspace(1) readonly captures(none) [[ARG0:%.*]], i1 [[COND:%.*]]) local_unnamed_addr {
; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[VAL0:%.*]] = load <4 x i32>, ptr addrspace(1) [[ARG0]], align 32
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ARG0]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = load <3 x i32>, ptr addrspace(1) [[TMP0]], align 32
; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
; CHECK: [[THEN]]:
-; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[VAL1:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: br label %[[FINALLY:.*]]
; CHECK: [[ELSE]]:
-; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <4 x i32> [[VAL0]], <4 x i32> poison, <8 x i32> <i32 3, i32 3, i32 3, i32 3, i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[VAL2:%.*]] = shufflevector <3 x i32> [[TMP1]], <3 x i32> poison, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 3, i32 3, i32 3, i32 3>
; CHECK-NEXT: br label %[[FINALLY]]
; CHECK: [[FINALLY]]:
; CHECK-NEXT: [[VAL3:%.*]] = phi <8 x i32> [ [[VAL1]], %[[THEN]] ], [ [[VAL2]], %[[ELSE]] ]
>From 62fbfe9a16bd7a08f36c3afacd91c9e79a216a6a Mon Sep 17 00:00:00 2001
From: Priyanshu Kumar <10b.priyanshu at gmail.com>
Date: Wed, 4 Feb 2026 16:47:58 +0000
Subject: [PATCH 5/5] Add support for vpshl/vpshr builtins
---
clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 19 +-
clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp | 13 +-
.../X86/avx512vbmi2-builtins.c | 418 ++++++++++++++++++
3 files changed, 444 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
index 580ada8901cbb..c7d6cd8ae240e 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp
@@ -1246,8 +1246,23 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID,
case Builtin::BI__builtin_elementwise_canonicalize:
case Builtin::BI__builtin_elementwise_copysign:
case Builtin::BI__builtin_elementwise_fma:
- case Builtin::BI__builtin_elementwise_fshl:
- case Builtin::BI__builtin_elementwise_fshr:
+ return errorBuiltinNYI(*this, e, builtinID);
+ case Builtin::BI__builtin_elementwise_fshl: {
+ mlir::Location loc = getLoc(e->getExprLoc());
+ mlir::Value a = emitScalarExpr(e->getArg(0));
+ mlir::Value b = emitScalarExpr(e->getArg(1));
+ mlir::Value c = emitScalarExpr(e->getArg(2));
+ return RValue::get(builder.emitIntrinsicCallOp(loc, "fshl", a.getType(),
+ mlir::ValueRange{a, b, c}));
+ }
+ case Builtin::BI__builtin_elementwise_fshr: {
+ mlir::Location loc = getLoc(e->getExprLoc());
+ mlir::Value a = emitScalarExpr(e->getArg(0));
+ mlir::Value b = emitScalarExpr(e->getArg(1));
+ mlir::Value c = emitScalarExpr(e->getArg(2));
+ return RValue::get(builder.emitIntrinsicCallOp(loc, "fshr", a.getType(),
+ mlir::ValueRange{a, b, c}));
+ }
case Builtin::BI__builtin_elementwise_add_sat:
case Builtin::BI__builtin_elementwise_sub_sat:
case Builtin::BI__builtin_elementwise_max:
diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
index 80022998448ad..4e190ab4de3fb 100644
--- a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
+++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp
@@ -2032,6 +2032,10 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_pternlogd256_maskz:
case X86::BI__builtin_ia32_pternlogq128_maskz:
case X86::BI__builtin_ia32_pternlogq256_maskz:
+ cgm.errorNYI(expr->getSourceRange(),
+ std::string("unimplemented X86 builtin call: ") +
+ getContext().BuiltinInfo.getName(builtinID));
+ return mlir::Value{};
case X86::BI__builtin_ia32_vpshldd128:
case X86::BI__builtin_ia32_vpshldd256:
case X86::BI__builtin_ia32_vpshldd512:
@@ -2041,6 +2045,8 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_vpshldw128:
case X86::BI__builtin_ia32_vpshldw256:
case X86::BI__builtin_ia32_vpshldw512:
+ return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[0],
+ ops[1], ops[2], false);
case X86::BI__builtin_ia32_vpshrdd128:
case X86::BI__builtin_ia32_vpshrdd256:
case X86::BI__builtin_ia32_vpshrdd512:
@@ -2050,10 +2056,9 @@ CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, const CallExpr *expr) {
case X86::BI__builtin_ia32_vpshrdw128:
case X86::BI__builtin_ia32_vpshrdw256:
case X86::BI__builtin_ia32_vpshrdw512:
- cgm.errorNYI(expr->getSourceRange(),
- std::string("unimplemented X86 builtin call: ") +
- getContext().BuiltinInfo.getName(builtinID));
- return mlir::Value{};
+ // Ops 0 and 1 are swapped.
+ return emitX86FunnelShift(builder, getLoc(expr->getExprLoc()), ops[1],
+ ops[0], ops[2], true);
case X86::BI__builtin_ia32_reduce_fadd_pd512:
case X86::BI__builtin_ia32_reduce_fadd_ps512:
case X86::BI__builtin_ia32_reduce_fadd_ph512:
diff --git a/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c b/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
new file mode 100644
index 0000000000000..ae0cdf09f3f77
--- /dev/null
+++ b/clang/test/CIR/CodeGenBuiltins/X86/avx512vbmi2-builtins.c
@@ -0,0 +1,418 @@
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -target-feature +avx512vbmi2 -emit-cir %s -o %t.cir
+// RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding -triple x86_64-unknown-linux-gnu -fclangir -target-feature +avx512vbmi2 -emit-llvm %s -o %t.ll
+// RUN: FileCheck --check-prefixes=LLVM --input-file=%t.ll %s
+// RUN: %clang_cc1 -flax-vector-conversions=none -ffreestanding %s -triple=x86_64-unknown-linux -target-feature +avx512vbmi2 -emit-llvm -o %t.ll -Wall -Werror -Wsign-conversion
+// RUN: FileCheck --check-prefixes=OGCG --input-file=%t.ll %s
+
+
+#include <immintrin.h>
+
+__m512i test_mm512_mask_shldi_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 47))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 47))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shldi_epi64(s, u, a, b, 47);
+}
+
+__m512i test_mm512_maskz_shldi_epi64(__mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 63))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 63))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shldi_epi64(u, a, b, 63);
+}
+
+__m512i test_mm512_shldi_epi64(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldi_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 31))
+ return _mm512_shldi_epi64(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldi_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 7))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 7))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shldi_epi32(s, u, a, b, 7);
+}
+
+__m512i test_mm512_maskz_shldi_epi32(__mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 15))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 15))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shldi_epi32(u, a, b, 15);
+}
+
+__m512i test_mm512_shldi_epi32(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shldi_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 31))
+ return _mm512_shldi_epi32(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldi_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 3))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 3))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shldi_epi16(s, u, a, b, 3);
+}
+
+__m512i test_mm512_maskz_shldi_epi16(__mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 15))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 15))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shldi_epi16(u, a, b, 15);
+}
+
+__m512i test_mm512_shldi_epi16(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shldi_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 31))
+ // OGCG-LABEL: @test_mm512_shldi_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 31))
+ return _mm512_shldi_epi16(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldv_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shldv_epi64(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi64(__mmask8 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shldv_epi64(u, s, a, b);
+}
+
+__m512i test_mm512_shldv_epi64(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shldv_epi64
+ // LLVM: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_shldv_epi64
+ // OGCG: call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ return _mm512_shldv_epi64(s, a, b);
+}
+
+__m512i test_mm512_shldv_epi32(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ return _mm512_shldv_epi32(s, a, b);
+}
+
+__m512i test_mm512_mask_shldv_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shldv_epi16(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi16(__mmask32 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shldv_epi16(u, s, a, b);
+}
+
+__m512i test_mm512_shldv_epi16(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shldv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shldv_epi16
+ // LLVM: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_shldv_epi16
+ // OGCG: call <32 x i16> @llvm.fshl.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ return _mm512_shldv_epi16(s, a, b);
+}
+
+__m512i test_mm512_mask_shrdi_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 47))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 47))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shrdi_epi64(s, u, a, b, 47);
+}
+
+__m512i test_mm512_maskz_shrdi_epi64(__mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 63))
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 63))
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shrdi_epi64(u, a, b, 63);
+}
+
+__m512i test_mm512_shrdi_epi64(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64> splat (i64 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> splat (i64 31))
+ return _mm512_shrdi_epi64(a, b, 31);
+}
+
+__m512i test_mm512_mask_shrdi_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 7))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 7))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shrdi_epi32(s, u, a, b, 7);
+}
+
+__m512i test_mm512_maskz_shrdi_epi32(__mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 15))
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 15))
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shrdi_epi32(u, a, b, 15);
+}
+
+__m512i test_mm512_shrdi_epi32(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32> splat (i32 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> splat (i32 31))
+ return _mm512_shrdi_epi32(a, b, 31);
+}
+
+__m512i test_mm512_mask_shrdi_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 3))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 3))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shrdi_epi16(s, u, a, b, 3);
+}
+
+__m512i test_mm512_maskz_shrdi_epi16(__mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 15))
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_maskz_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 15))
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_maskz_shrdi_epi16(u, a, b, 15);
+}
+
+__m512i test_mm512_shrdi_epi16(__m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdi_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shrdi_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16> splat (i16 31))
+ // OGCG-LABEL: @test_mm512_shrdi_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> splat (i16 31))
+ return _mm512_shrdi_epi16(a, b, 31);
+}
+
+__m512i test_mm512_mask_shldv_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shldv_epi32(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shldv_epi32(__mmask16 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shldv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshl" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shldv_epi32
+ // LLVM: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shldv_epi32
+ // OGCG: call <16 x i32> @llvm.fshl.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shldv_epi32(u, s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi64(__m512i s, __mmask8 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_mask_shrdv_epi64(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shrdv_epi64(__mmask8 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdv_epi64
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<8 x !s64i>, !cir.vector<8 x !s64i>, !cir.vector<8 x !u64i>) -> !cir.vector<8 x !s64i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<8 x !cir.int<s, 1>>, !cir.vector<8 x !s64i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdv_epi64
+ // LLVM: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // LLVM: select <8 x i1> {{.*}}, <8 x i64> {{.*}}, <8 x i64>
+ // OGCG-LABEL: @test_mm512_maskz_shrdv_epi64
+ // OGCG: call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64>
+ // OGCG: select <8 x i1> %{{.*}}, <8 x i64> %{{.*}}, <8 x i64> %{{.*}}
+ return _mm512_maskz_shrdv_epi64(u, s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi32(__m512i s, __mmask16 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_mask_shrdv_epi32(s, u, a, b);
+}
+
+__m512i test_mm512_maskz_shrdv_epi32(__mmask16 u, __m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_maskz_shrdv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<16 x !cir.int<s, 1>>, !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_maskz_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // LLVM: select <16 x i1> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_maskz_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ // OGCG: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
+ return _mm512_maskz_shrdv_epi32(u, s, a, b);
+}
+
+__m512i test_mm512_shrdv_epi32(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdv_epi32
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<16 x !s32i>, !cir.vector<16 x !s32i>, !cir.vector<16 x !u32i>) -> !cir.vector<16 x !s32i>
+ // LLVM-LABEL: @test_mm512_shrdv_epi32
+ // LLVM: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> {{.*}}, <16 x i32> {{.*}}, <16 x i32>
+ // OGCG-LABEL: @test_mm512_shrdv_epi32
+ // OGCG: call <16 x i32> @llvm.fshr.v16i32(<16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32>
+ return _mm512_shrdv_epi32(s, a, b);
+}
+
+__m512i test_mm512_mask_shrdv_epi16(__m512i s, __mmask32 u, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_mask_shrdv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // CIR: cir.vec.ternary(%{{.*}}, %{{.*}}, %{{.*}}) : !cir.vector<32 x !cir.int<s, 1>>, !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_mask_shrdv_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // LLVM: select <32 x i1> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_mask_shrdv_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ // OGCG: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}}
+ return _mm512_mask_shrdv_epi16(s, u, a, b);
+}
+
+__m512i test_mm512_shrdv_epi16(__m512i s, __m512i a, __m512i b) {
+ // CIR-LABEL: @test_mm512_shrdv_epi16
+ // CIR: cir.call_llvm_intrinsic "fshr" %{{.*}}, %{{.*}}, %{{.*}} : (!cir.vector<32 x !s16i>, !cir.vector<32 x !s16i>, !cir.vector<32 x !u16i>) -> !cir.vector<32 x !s16i>
+ // LLVM-LABEL: @test_mm512_shrdv_epi16
+ // LLVM: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> {{.*}}, <32 x i16> {{.*}}, <32 x i16>
+ // OGCG-LABEL: @test_mm512_shrdv_epi16
+ // OGCG: call <32 x i16> @llvm.fshr.v32i16(<32 x i16> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16>
+ return _mm512_shrdv_epi16(s, a, b);
+}
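
For reference, a minimal scalar sketch of the lane-wise funnel-shift mapping the tests above exercise. This is illustrative only and not part of the patch; it assumes 64-bit lanes with the shift amount reduced modulo 64, and the fshl64/fshr64 helper names are made up for the example.

    #include <stdint.h>

    /* llvm.fshl(a, b, s): high 64 bits of the 128-bit value (a:b) shifted left by s. */
    static uint64_t fshl64(uint64_t a, uint64_t b, unsigned s) {
      s &= 63;
      return s ? (a << s) | (b >> (64 - s)) : a;
    }

    /* llvm.fshr(a, b, s): low 64 bits of the 128-bit value (a:b) shifted right by s. */
    static uint64_t fshr64(uint64_t a, uint64_t b, unsigned s) {
      s &= 63;
      return s ? (b >> s) | (a << (64 - s)) : b;
    }

    /* Lane-wise, _mm512_shldi_epi64(a, b, i) behaves like fshl64(a[k], b[k], i),
     * while _mm512_shrdi_epi64(a, b, i) behaves like fshr64(b[k], a[k], i);
     * the reversed first two arguments are why the vpshrd* lowering above swaps
     * ops 0 and 1 before emitting the funnel-shift intrinsic. */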