[llvm] [AMDGPU] Add option to prevent insns straddling half cache-line boundaries (PR #150239)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jul 23 08:35:04 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: None (LU-JOHN)

<details>
<summary>Changes</summary>

Add an option to prevent instructions from straddling half cache-line boundaries (32 bytes) by inserting NOPs. This is done to improve performance.

In testing, NOPs had to be added before 8496 out of 138953 MachineInstrs (6.1%).
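
The check behind this is simple modular arithmetic on a running byte offset: if the next instruction's encoding would cross a 32-byte boundary, pad up to that boundary first. Below is a minimal standalone sketch of the idea (illustrative names and sizes only, not the patch's actual code):

```cpp
#include <cstdio>
#include <vector>

// Sketch of the half cache-line straddle check. Offsets and sizes are in
// bytes; instruction sizes here are made up for illustration.
int main() {
  constexpr unsigned HalfCacheLine = 32; // half of a 64-byte cache line
  unsigned Offset = 0;                   // offset from a 32-byte-aligned point

  // Pretend encodings for a basic block (4- and 8-byte instructions).
  std::vector<unsigned> InstSizes = {4, 8, 4, 8, 8, 4, 8, 4};

  for (unsigned Size : InstSizes) {
    unsigned Rem = Offset % HalfCacheLine;
    if (Rem + Size > HalfCacheLine) {
      // Emitting here would straddle a 32-byte boundary: pad with NOPs
      // up to the next boundary first.
      unsigned Pad = HalfCacheLine - Rem;
      std::printf("pad %u NOP bytes at offset %u\n", Pad, Offset);
      Offset += Pad;
    }
    Offset += Size;
  }
  return 0;
}
```

In the patch itself the running alignment and offset are tracked in SIMachineFunctionInfo, and the padding is emitted via emitAlignment(Align(32)) in the AsmPrinter.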

---

Patch is 517.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150239.diff


19 Files Affected:

- (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+31-2) 
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (+65-10) 
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+6) 
- (added) llvm/test/CodeGen/AMDGPU/has_cache_straddle.py (+29) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle.ll (+153) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll (+2024) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_fcanonicalize.f16.ll (+452) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_global_store_short_saddr_t16.ll (+19) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+665) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+692) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_memmove-var-size.ll (+149) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_mfma-loop.ll (+303) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_mfma-no-register-aliasing.ll (+42) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_mfma-vgpr-cd-select.ll (+122) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_saddsat.ll (+61) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_shufflevector.v2p3.v8p3.ll (+4435) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_sub.ll (+131) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_wave32.ll (+783) 
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_wqm.ll (+1033) 


``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c0920e3e71bee..733cea8ac69ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -159,10 +159,17 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
 }
 
 void AMDGPUAsmPrinter::emitFunctionBodyStart() {
-  const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+  SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
   const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
   const Function &F = MF->getFunction();
 
+  const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+  if (MAI->hasFunctionAlignment()) {
+    Align Alignment = MF->getAlignment();
+    MFI.Alignment = Alignment.value();
+    MFI.Offset = 0;
+  }
+
   // TODO: We're checking this late, would be nice to check it earlier.
   if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
     reportFatalUsageError(
@@ -298,6 +305,18 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
     HexLines.emplace_back("");
   }
   AsmPrinter::emitBasicBlockStart(MBB);
+
+  Align Alignment = MBB.getAlignment();
+  if (Alignment != Align(1)) {
+    const MachineFunction *MF = MBB.getParent();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+    unsigned BlockAlignment = Alignment.value();
+    // Do not decrease known Alignment.  Increment Offset to satisfy
+    // BlockAlignment.
+    MFI->Alignment = std::max(MFI->Alignment, BlockAlignment);
+    MFI->Offset += (BlockAlignment - (MFI->Offset % BlockAlignment));
+  }
 }
 
 void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
@@ -640,6 +659,12 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
   return KernelDescriptor;
 }
 
+cl::opt<bool> PreventHalfCacheLineStraddling(
+    "amdgpu-prevent-half-cache-line-straddling", cl::Hidden,
+    cl::desc(
+        "Add NOPs to prevent instructions from straddling half a cache-line"),
+    cl::init(false));
+
 bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   // Init target streamer lazily on the first function so that previous passes
   // can set metadata.
@@ -654,7 +679,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
 
   // The starting address of all shader programs must be 256 bytes aligned.
   // Regular functions just need the basic required instruction alignment.
-  MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
+  // However, align regular functions to half a cache-line (64/2 = 32 bytes)
+  // if PreventHalfCacheLineStraddling is enabled.
+  MF.setAlignment(MFI->isEntryFunction()           ? Align(256)
+                  : PreventHalfCacheLineStraddling ? Align(32)
+                                                   : Align(4));
 
   SetupMachineFunction(MF);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2dec16de940d1..6d8c7b72b0e2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
   OS.emitRawComment(" transferring at most " + TransferredRegs);
 }
 
+extern cl::opt<bool> PreventHalfCacheLineStraddling;
+
+static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
+                                     const GCNSubtarget &STI,
+                                     MCContext &OutContext) {
+  SmallVector<MCFixup, 4> Fixups;
+  SmallVector<char, 16> CodeBytes;
+
+  std::unique_ptr<MCCodeEmitter> InstEmitter(
+      createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
+  InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
+  return CodeBytes.size();
+}
+
 void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
   // FIXME: Enable feature predicate checks once all the test pass.
   // AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
   //                                        getSubtargetInfo().getFeatureBits());
 
+  auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
+                                           const MachineFunction *MF,
+                                           const MCInst &LoweredMCI) -> void {
+    const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
+    SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+        MF->getInfo<SIMachineFunctionInfo>());
+
+    unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
+
+    // getInstSizeInBytes conservatively overestimates the size of branches due
+    // to a NOP added for the 0x3f offset bug.  Any inaccuracies in instruction
+    // sizes will cause problems when avoiding straddling half cache-line
+    // boundaries. A NOP is usually not added, so remove the +4 that was added.
+    if (MI->isBranch() && STI.hasOffset3fBug())
+      InstSizeInBytes -= 4;
+    // Rarely, some MachineInstrs do not have accurate instruction sizes.  Try
+    // to calculate the size from the lowered MCInst.
+    else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
+             !(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
+               MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
+      InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);
+
+    // FIXME: Workaround bug in V_MADMK_F32 size.
+    if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
+      InstSizeInBytes = 8;
+
+    unsigned Alignment = MFI->Alignment;
+    unsigned Offset = MFI->Offset;
+    constexpr unsigned HalfCacheLineBoundary = 32;
+
+    unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
+    Offset %= Boundary;
+
+    if (Offset + InstSizeInBytes > Boundary) {
+      emitAlignment(Align(HalfCacheLineBoundary));
+      // Do not decrease known Alignment.  Increment Offset to satisfy
+      // HalfCacheLineBoundary.
+      MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
+      MFI->Offset +=
+          (HalfCacheLineBoundary - (MFI->Offset % HalfCacheLineBoundary));
+    }
+    MFI->Offset += InstSizeInBytes;
+  };
+
   if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+    if (PreventHalfCacheLineStraddling)
+      AvoidHalfCacheLineBoundary(MI, MF, OutInst);
     EmitToStreamer(*OutStreamer, OutInst);
     return;
   }
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
 
     MCInst TmpInst;
     MCInstLowering.lower(MI, TmpInst);
+    if (PreventHalfCacheLineStraddling)
+      AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
     EmitToStreamer(*OutStreamer, TmpInst);
 
 #ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
     //
     // We also overestimate branch sizes with the offset bug.
     if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
-        (!STI.hasOffset3fBug() || !MI->isBranch())) {
-      SmallVector<MCFixup, 4> Fixups;
-      SmallVector<char, 16> CodeBytes;
-
-      std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
-          *STI.getInstrInfo(), OutContext));
-      InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);
-
-      assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
-    }
+        (!STI.hasOffset3fBug() || !MI->isBranch()))
+      assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
+             STI.getInstrInfo()->getInstSizeInBytes(*MI));
 #endif
 
     if (DumpCodeInstEmitter) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60adb8d07..d3def4ce51319 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
 
 public:
+  // Current known instruction alignment and offset in bytes.
+  // Used to prevent instructions from straddling half cache-line boundaries
+  // for performance.
+  unsigned Alignment = 1;
+  unsigned Offset = 0;
+
   struct VGPRSpillToAGPR {
     SmallVector<MCPhysReg, 32> Lanes;
     bool FullyAllocated = false;
diff --git a/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py b/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
new file mode 100755
index 0000000000000..19d6e8d3e86c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+
+if len(sys.argv) != 2:
+    print("Usage: has_cache_straddle.py <disassembly file>")
+    sys.exit(1)
+
+inputFilename = sys.argv[1]
+address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"
+
+file = open(inputFilename)
+
+for line in file:
+    match = re.search(address_and_encoding_regex, line)
+    if match:
+        hexaddress = match.group(1)
+        encoding = match.group(2)
+        dwords = encoding.split()
+        address = int(hexaddress, 16)
+        address_end = address + len(dwords)*4 - 1
+        # Cache-line is 64 bytes.  Check for half cache-line straddle.
+        if address//32 != address_end//32:
+            print("Straddling instruction found at:")
+            print(line)
+            sys.exit(1)
+
+sys.exit(0)
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle.ll b/llvm/test/CodeGen/AMDGPU/no_straddle.ll
new file mode 100644
index 0000000000000..b5010a03a60cb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle.ll
@@ -0,0 +1,153 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji  -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d  - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load <2 x i32>, ptr addrspace(1) %in0
+  %b = load <2 x i32>, ptr addrspace(1) %in1
+  %result = xor <2 x i32> %a, %b
+  store <2 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load <4 x i32>, ptr addrspace(1) %in0
+  %b = load <4 x i32>, ptr addrspace(1) %in1
+  %result = xor <4 x i32> %a, %b
+  store <4 x i32> %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load float, ptr addrspace(1) %in0
+  %b = load float, ptr addrspace(1) %in1
+  %acmp = fcmp oge float %a, 0.000000e+00
+  %bcmp = fcmp oge float %b, 1.000000e+00
+  %xor = xor i1 %acmp, %bcmp
+  %result = select i1 %xor, float %a, float %b
+  store float %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load volatile i1, ptr addrspace(1) %in0
+  %b = load volatile i1, ptr addrspace(1) %in1
+  %xor = xor i1 %a, %b
+  store i1 %xor, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i32, ptr addrspace(1) %in0
+  %b = load i32, ptr addrspace(1) %in1
+  %result = xor i32 %a, %b
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+  %result = xor i32 %a, %b
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
+  %result = xor i32 %a, -1
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i32, ptr addrspace(1) %in0
+  %b = load i32, ptr addrspace(1) %in1
+  %result = xor i32 %a, -1
+  store i32 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i64, ptr addrspace(1) %in0
+  %b = load i64, ptr addrspace(1) %in1
+  %result = xor i64 %a, %b
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+  %result = xor i64 %a, %b
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
+  %result = xor i64 %a, -1
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+  %a = load i64, ptr addrspace(1) %in0
+  %b = load i64, ptr addrspace(1) %in1
+  %result = xor i64 %a, -1
+  store i64 %result, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
+entry:
+  %0 = icmp eq i64 %a, 0
+  br i1 %0, label %if, label %else
+
+if:
+  %1 = xor i64 %a, %b
+  br label %endif
+
+else:
+  %2 = load i64, ptr addrspace(1) %in
+  br label %endif
+
+endif:
+  %3 = phi i64 [%1, %if], [%2, %else]
+  store i64 %3, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+  %or = xor i64 %a, 4261135838621753
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
+  %or = xor i64 %a, 4261135838621753
+  store i64 %or, ptr addrspace(1) %out
+
+  %foo = add i64 %b, 4261135838621753
+  store volatile i64 %foo, ptr addrspace(1) poison
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+  %or = xor i64 %a, 63
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+  %or = xor i64 %a, -8
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 8
+  %or = xor i64 %loada, -8
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+  %loada = load i64, ptr addrspace(1) %a, align 8
+  %or = xor i64 %loada, 22470723082367
+  store i64 %or, ptr addrspace(1) %out
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll
new file mode 100644
index 0000000000000..d4096e2be2b54
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll
@@ -0,0 +1,2024 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=tonga  -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=tonga -d  - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx900  -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx900 -d  - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1100  -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1100 -d  - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i32> %a, splat (i32 3)
+  %a2 = bitcast <32 x i32> %a1 to <32 x float>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i32> %a to <32 x float>
+  br label %end
+
+end:
+  %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x float> %phi
+}
+
+define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, i32 inreg %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i32> %a, splat (i32 3)
+  %a2 = bitcast <32 x i32> %a1 to <32 x float>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i32> %a to <32 x float>
+  br label %end
+
+end:
+  %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x float> %phi
+}
+
+define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = fadd <32 x float> %a, splat (float 1.000000e+00)
+  %a2 = bitcast <32 x float> %a1 to <32 x i32>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x float> %a to <32 x i32>
+  br label %end
+
+end:
+  %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x i32> %phi
+}
+
+define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, i32 inreg %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = fadd <32 x float> %a, splat (float 1.000000e+00)
+  %a2 = bitcast <32 x float> %a1 to <32 x i32>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x float> %a to <32 x i32>
+  br label %end
+
+end:
+  %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x i32> %phi
+}
+
+define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i32> %a, splat (i32 3)
+  %a2 = bitcast <32 x i32> %a1 to <16 x i64>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i32> %a to <16 x i64>
+  br label %end
+
+end:
+  %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <16 x i64> %phi
+}
+
+define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i32 inreg %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i32> %a, splat (i32 3)
+  %a2 = bitcast <32 x i32> %a1 to <16 x i64>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i32> %a to <16 x i64>
+  br label %end
+
+end:
+  %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <16 x i64> %phi
+}
+
+define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <16 x i64> %a, splat (i64 3)
+  %a2 = bitcast <16 x i64> %a1 to <32 x i32>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <16 x i64> %a to <32 x i32>
+  br label %end
+
+end:
+  %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x i32> %phi
+}
+
+define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i32 inreg %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <16 x i64> %a, splat (i64 3)
+  %a2 = bitcast <16 x i64> %a1 to <32 x i32>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <16 x i64> %a to <32 x i32>
+  br label %end
+
+end:
+  %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <32 x i32> %phi
+}
+
+define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i32> %a, splat (i32 3)
+  %a2 = bitcast <32 x i32> %a1 to <16 x double>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i32> %a to <16 x double>
+  br label %end
+
+end:
+  %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <16 x double> %phi
+}
+
+define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, i32 inreg %b) {
+  %cmp = icmp eq i32 %b, 0
+  br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+  %a1 = add <32 x i32> %a, splat (i32 3)
+  %a2 = bitcast <32 x i32> %a1 to <16 x double>
+  br label %end
+
+cmp.false:
+  %a3 = bitcast <32 x i32> %a to <16 x double>
+  br label %end
+
+end:
+  %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+  ret <16 x double> %phi
+}
+
+define <32 x i32> @bit...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/150239


More information about the llvm-commits mailing list