[llvm] [AMDGPU] Add option to prevent insns straddling half cache-line boundaries (PR #150239)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 23 08:35:04 PDT 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: None (LU-JOHN)
<details>
<summary>Changes</summary>
Add an option that inserts NOPs to prevent instructions from straddling half cache-line (32-byte) boundaries. This is done to improve performance.
In testing, NOPs had to be added before 8496 out of 138953 MachineInstrs (6.1%).
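For context, the straddle condition being avoided is plain address arithmetic: an instruction straddles a half cache-line when its first and last bytes fall into different 32-byte blocks. A minimal sketch of that check (illustrative only; the function and variable names below are not taken from the patch):

```cpp
#include <cstdio>

// True if an instruction occupying bytes [Offset, Offset + Size) crosses a
// 32-byte (half cache-line) boundary, measured from a 32-byte-aligned base.
static bool straddlesHalfCacheLine(unsigned Offset, unsigned Size) {
  constexpr unsigned HalfCacheLine = 32;
  return Offset / HalfCacheLine != (Offset + Size - 1) / HalfCacheLine;
}

int main() {
  // A 12-byte instruction starting at offset 24 spans bytes 24..35 and so
  // crosses the boundary at byte 32; padding it to offset 32 avoids that.
  std::printf("%d\n", straddlesHalfCacheLine(24, 12)); // prints 1
  std::printf("%d\n", straddlesHalfCacheLine(32, 12)); // prints 0
  return 0;
}
```

The added `has_cache_straddle.py` test helper applies the same check to addresses parsed from llvm-objdump output.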
---
Patch is 517.98 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150239.diff
19 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp (+31-2)
- (modified) llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp (+65-10)
- (modified) llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h (+6)
- (added) llvm/test/CodeGen/AMDGPU/has_cache_straddle.py (+29)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle.ll (+153)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll (+2024)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_fcanonicalize.f16.ll (+452)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_global_store_short_saddr_t16.ll (+19)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.16x16x128.f8f6f4.ll (+665)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_llvm.amdgcn.mfma.scale.f32.32x32x64.f8f6f4.ll (+692)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_memmove-var-size.ll (+149)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_mfma-loop.ll (+303)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_mfma-no-register-aliasing.ll (+42)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_mfma-vgpr-cd-select.ll (+122)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_saddsat.ll (+61)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_shufflevector.v2p3.v8p3.ll (+4435)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_sub.ll (+131)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_wave32.ll (+783)
- (added) llvm/test/CodeGen/AMDGPU/no_straddle_wqm.ll (+1033)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c0920e3e71bee..733cea8ac69ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -159,10 +159,17 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
}
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
- const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
+ SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
const Function &F = MF->getFunction();
+ const MCAsmInfo *MAI = MF->getTarget().getMCAsmInfo();
+ if (MAI->hasFunctionAlignment()) {
+ Align Alignment = MF->getAlignment();
+ MFI.Alignment = Alignment.value();
+ MFI.Offset = 0;
+ }
+
// TODO: We're checking this late, would be nice to check it earlier.
if (STM.requiresCodeObjectV6() && CodeObjectVersion < AMDGPU::AMDHSA_COV6) {
reportFatalUsageError(
@@ -298,6 +305,18 @@ void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
HexLines.emplace_back("");
}
AsmPrinter::emitBasicBlockStart(MBB);
+
+ Align Alignment = MBB.getAlignment();
+ if (Alignment != Align(1)) {
+ const MachineFunction *MF = MBB.getParent();
+ SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+ MF->getInfo<SIMachineFunctionInfo>());
+ unsigned BlockAlignment = Alignment.value();
+ // Do not decrease known Alignment. Increment Offset to satisfy
+ // BlockAlignment.
+ MFI->Alignment = std::max(MFI->Alignment, BlockAlignment);
+ MFI->Offset += (BlockAlignment - (MFI->Offset % BlockAlignment));
+ }
}
void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
@@ -640,6 +659,12 @@ AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(const MachineFunction &MF,
return KernelDescriptor;
}
+cl::opt<bool> PreventHalfCacheLineStraddling(
+ "amdgpu-prevent-half-cache-line-straddling", cl::Hidden,
+ cl::desc(
+ "Add NOPs to prevent instructions from straddling half a cache-line"),
+ cl::init(false));
+
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Init target streamer lazily on the first function so that previous passes
// can set metadata.
@@ -654,7 +679,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
- MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
+  // However, if PreventHalfCacheLineStraddling is enabled, align regular
+  // functions to half a cache-line (32 bytes).
+ MF.setAlignment(MFI->isEntryFunction() ? Align(256)
+ : PreventHalfCacheLineStraddling ? Align(32)
+ : Align(4));
SetupMachineFunction(MF);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 2dec16de940d1..6d8c7b72b0e2c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -274,12 +274,72 @@ static void emitVGPRBlockComment(const MachineInstr *MI, const SIInstrInfo *TII,
OS.emitRawComment(" transferring at most " + TransferredRegs);
}
+extern cl::opt<bool> PreventHalfCacheLineStraddling;
+
+static unsigned getMCInstSizeInBytes(const MCInst &LoweredMCI,
+ const GCNSubtarget &STI,
+ MCContext &OutContext) {
+ SmallVector<MCFixup, 4> Fixups;
+ SmallVector<char, 16> CodeBytes;
+
+ std::unique_ptr<MCCodeEmitter> InstEmitter(
+ createAMDGPUMCCodeEmitter(*STI.getInstrInfo(), OutContext));
+ InstEmitter->encodeInstruction(LoweredMCI, CodeBytes, Fixups, STI);
+  return CodeBytes.size();
+}
+
void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// FIXME: Enable feature predicate checks once all the test pass.
// AMDGPU_MC::verifyInstructionPredicates(MI->getOpcode(),
// getSubtargetInfo().getFeatureBits());
+ auto AvoidHalfCacheLineBoundary = [this](const MachineInstr *MI,
+ const MachineFunction *MF,
+ const MCInst &LoweredMCI) -> void {
+ const GCNSubtarget &STI = MF->getSubtarget<GCNSubtarget>();
+ SIMachineFunctionInfo *MFI = const_cast<SIMachineFunctionInfo *>(
+ MF->getInfo<SIMachineFunctionInfo>());
+
+ unsigned InstSizeInBytes = STI.getInstrInfo()->getInstSizeInBytes(*MI);
+
+    // getInstSizeInBytes conservatively overestimates the size of branches due
+    // to a NOP added for the 0x3f offset bug. Any inaccuracy in instruction
+    // sizes will cause problems when avoiding half cache-line boundary
+    // straddles. A NOP is usually not added, so remove the +4 that was added.
+ if (MI->isBranch() && STI.hasOffset3fBug())
+ InstSizeInBytes -= 4;
+ // Rarely, some MachineInstr do not have accurate instruction sizes. Try to
+ // calculate the size from the lowered MCInst.
+ else if (InstSizeInBytes == 0 && STI.isCPUStringValid(STI.getCPU()) &&
+ !(MI->getOpcode() == AMDGPU::SI_ILLEGAL_COPY ||
+ MI->getOpcode() == AMDGPU::ATOMIC_FENCE))
+ InstSizeInBytes = getMCInstSizeInBytes(LoweredMCI, STI, OutContext);
+
+ // FIXME: Workaround bug in V_MADMK_F32 size.
+ if (MI->getOpcode() == AMDGPU::V_MADMK_F32)
+ InstSizeInBytes = 8;
+
+ unsigned Alignment = MFI->Alignment;
+ unsigned Offset = MFI->Offset;
+ constexpr unsigned HalfCacheLineBoundary = 32;
+
+ unsigned Boundary = std::min(Alignment, HalfCacheLineBoundary);
+ Offset %= Boundary;
+
+ if (Offset + InstSizeInBytes > Boundary) {
+ emitAlignment(Align(HalfCacheLineBoundary));
+ // Do not decrease known Alignment. Increment Offset to satisfy
+ // HalfCacheLineBoundary.
+ MFI->Alignment = std::max(Alignment, HalfCacheLineBoundary);
+ MFI->Offset +=
+ (HalfCacheLineBoundary - (MFI->Offset % HalfCacheLineBoundary));
+ }
+ MFI->Offset += InstSizeInBytes;
+ };
+
if (MCInst OutInst; lowerPseudoInstExpansion(MI, OutInst)) {
+ if (PreventHalfCacheLineStraddling)
+ AvoidHalfCacheLineBoundary(MI, MF, OutInst);
EmitToStreamer(*OutStreamer, OutInst);
return;
}
@@ -370,6 +430,8 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.lower(MI, TmpInst);
+ if (PreventHalfCacheLineStraddling)
+ AvoidHalfCacheLineBoundary(MI, MF, TmpInst);
EmitToStreamer(*OutStreamer, TmpInst);
#ifdef EXPENSIVE_CHECKS
@@ -382,16 +444,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
//
// We also overestimate branch sizes with the offset bug.
if (!MI->isPseudo() && STI.isCPUStringValid(STI.getCPU()) &&
- (!STI.hasOffset3fBug() || !MI->isBranch())) {
- SmallVector<MCFixup, 4> Fixups;
- SmallVector<char, 16> CodeBytes;
-
- std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
- *STI.getInstrInfo(), OutContext));
- InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);
-
- assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
- }
+ (!STI.hasOffset3fBug() || !MI->isBranch()))
+ assert(getMCInstSizeInBytes(TmpInst, STI, OutContext) ==
+ STI.getInstrInfo()->getInstSizeInBytes(*MI));
#endif
if (DumpCodeInstEmitter) {
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 274a60adb8d07..d3def4ce51319 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -525,6 +525,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
public:
+ // Current known instruction alignment and offset in bytes.
+ // Used to prevent instructions from straddling half cache-line boundaries
+ // for performance.
+ unsigned Alignment = 1;
+ unsigned Offset = 0;
+
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
bool FullyAllocated = false;
diff --git a/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py b/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
new file mode 100755
index 0000000000000..19d6e8d3e86c7
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/has_cache_straddle.py
@@ -0,0 +1,29 @@
+#!/usr/bin/env python3
+
+import re
+import sys
+
+if len(sys.argv) != 2:
+    print("Usage: has_cache_straddle.py <disassembly file>")
+    sys.exit(1)
+
+inputFilename = sys.argv[1]
+address_and_encoding_regex = r"// (\S{12}):(( [0-9A-F]{8})+)"
+
+file = open(inputFilename)
+
+for line in file:
+    match = re.search(address_and_encoding_regex, line)
+    if match:
+ hexaddress = match.group(1)
+ encoding = match.group(2)
+ dwords = encoding.split()
+ address = int(hexaddress, 16)
+ address_end = address + len(dwords)*4 - 1
+ #Cache-line is 64 bytes. Check for half cache-line straddle.
+ if address//32 != address_end//32:
+ print("Straddling instruction found at:")
+ print(line)
+ sys.exit(1)
+
+sys.exit(0)
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle.ll b/llvm/test/CodeGen/AMDGPU/no_straddle.ll
new file mode 100644
index 0000000000000..b5010a03a60cb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle.ll
@@ -0,0 +1,153 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=fiji -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=fiji -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load <2 x i32>, ptr addrspace(1) %in0
+ %b = load <2 x i32>, ptr addrspace(1) %in1
+ %result = xor <2 x i32> %a, %b
+ store <2 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @xor_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load <4 x i32>, ptr addrspace(1) %in0
+ %b = load <4 x i32>, ptr addrspace(1) %in1
+ %result = xor <4 x i32> %a, %b
+ store <4 x i32> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load float, ptr addrspace(1) %in0
+ %b = load float, ptr addrspace(1) %in1
+ %acmp = fcmp oge float %a, 0.000000e+00
+ %bcmp = fcmp oge float %b, 1.000000e+00
+ %xor = xor i1 %acmp, %bcmp
+ %result = select i1 %xor, float %a, float %b
+ store float %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load volatile i1, ptr addrspace(1) %in0
+ %b = load volatile i1, ptr addrspace(1) %in1
+ %xor = xor i1 %a, %b
+ store i1 %xor, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load i32, ptr addrspace(1) %in0
+ %b = load i32, ptr addrspace(1) %in1
+ %result = xor i32 %a, %b
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
+ %result = xor i32 %a, %b
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_not_i32(ptr addrspace(1) %out, i32 %a) {
+ %result = xor i32 %a, -1
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_not_i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load i32, ptr addrspace(1) %in0
+ %b = load i32, ptr addrspace(1) %in1
+ %result = xor i32 %a, -1
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load i64, ptr addrspace(1) %in0
+ %b = load i64, ptr addrspace(1) %in1
+ %result = xor i64 %a, %b
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
+ %result = xor i64 %a, %b
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_not_i64(ptr addrspace(1) %out, i64 %a) {
+ %result = xor i64 %a, -1
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_not_i64(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
+ %a = load i64, ptr addrspace(1) %in0
+ %b = load i64, ptr addrspace(1) %in1
+ %result = xor i64 %a, -1
+ store i64 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %a, i64 %b) {
+entry:
+ %0 = icmp eq i64 %a, 0
+ br i1 %0, label %if, label %else
+
+if:
+ %1 = xor i64 %a, %b
+ br label %endif
+
+else:
+ %2 = load i64, ptr addrspace(1) %in
+ br label %endif
+
+endif:
+ %3 = phi i64 [%1, %if], [%2, %else]
+ store i64 %3, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+ %or = xor i64 %a, 4261135838621753
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, i64 %b) {
+ %or = xor i64 %a, 4261135838621753
+ store i64 %or, ptr addrspace(1) %out
+
+ %foo = add i64 %b, 4261135838621753
+ store volatile i64 %foo, ptr addrspace(1) poison
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+ %or = xor i64 %a, 63
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) {
+ %or = xor i64 %a, -8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = xor i64 %loada, -8
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
+ %loada = load i64, ptr addrspace(1) %a, align 8
+ %or = xor i64 %loada, 22470723082367
+ store i64 %or, ptr addrspace(1) %out
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll b/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll
new file mode 100644
index 0000000000000..d4096e2be2b54
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/no_straddle_amdgcn.bitcast.1024bit.ll
@@ -0,0 +1,2024 @@
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=tonga -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=tonga -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx900 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx900 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+;RUN: llc --amdgpu-prevent-half-cache-line-straddling -mtriple=amdgcn -mcpu=gfx1100 -mattr=dumpcode --filetype=obj < %s | llvm-objdump --triple=amdgcn --mcpu=gfx1100 -d - > %t.dis
+;RUN: %python %p/has_cache_straddle.py %t.dis
+
+define <32 x float> @bitcast_v32i32_to_v32f32(<32 x i32> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <32 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <32 x float>
+ br label %end
+
+end:
+ %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x float> %phi
+}
+
+define inreg <32 x float> @bitcast_v32i32_to_v32f32_scalar(<32 x i32> inreg %a, i32 inreg %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <32 x float>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <32 x float>
+ br label %end
+
+end:
+ %phi = phi <32 x float> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x float> %phi
+}
+
+define <32 x i32> @bitcast_v32f32_to_v32i32(<32 x float> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <32 x float> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x float> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define inreg <32 x i32> @bitcast_v32f32_to_v32i32_scalar(<32 x float> inreg %a, i32 inreg %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = fadd <32 x float> %a, splat (float 1.000000e+00)
+ %a2 = bitcast <32 x float> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x float> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define <16 x i64> @bitcast_v32i32_to_v16i64(<32 x i32> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <16 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <16 x i64>
+ br label %end
+
+end:
+ %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i64> %phi
+}
+
+define inreg <16 x i64> @bitcast_v32i32_to_v16i64_scalar(<32 x i32> inreg %a, i32 inreg %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <16 x i64>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <16 x i64>
+ br label %end
+
+end:
+ %phi = phi <16 x i64> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x i64> %phi
+}
+
+define <32 x i32> @bitcast_v16i64_to_v32i32(<16 x i64> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i64> %a, splat (i64 3)
+ %a2 = bitcast <16 x i64> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i64> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define inreg <32 x i32> @bitcast_v16i64_to_v32i32_scalar(<16 x i64> inreg %a, i32 inreg %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <16 x i64> %a, splat (i64 3)
+ %a2 = bitcast <16 x i64> %a1 to <32 x i32>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <16 x i64> %a to <32 x i32>
+ br label %end
+
+end:
+ %phi = phi <32 x i32> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <32 x i32> %phi
+}
+
+define <16 x double> @bitcast_v32i32_to_v16f64(<32 x i32> %a, i32 %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <16 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <16 x double>
+ br label %end
+
+end:
+ %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x double> %phi
+}
+
+define inreg <16 x double> @bitcast_v32i32_to_v16f64_scalar(<32 x i32> inreg %a, i32 inreg %b) {
+ %cmp = icmp eq i32 %b, 0
+ br i1 %cmp, label %cmp.true, label %cmp.false
+
+cmp.true:
+ %a1 = add <32 x i32> %a, splat (i32 3)
+ %a2 = bitcast <32 x i32> %a1 to <16 x double>
+ br label %end
+
+cmp.false:
+ %a3 = bitcast <32 x i32> %a to <16 x double>
+ br label %end
+
+end:
+ %phi = phi <16 x double> [ %a2, %cmp.true ], [ %a3, %cmp.false ]
+ ret <16 x double> %phi
+}
+
+define <32 x i32> @bit...
[truncated]
``````````
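For readers skimming the truncated diff: the decision made in `AvoidHalfCacheLineBoundary` reduces to computing how many padding bytes push the next instruction past a 32-byte boundary. Below is a standalone sketch of that arithmetic, under the assumptions that the function start is at least 32-byte aligned and no instruction exceeds 32 bytes (the helper name is illustrative; the patch itself tracks `SIMachineFunctionInfo::Alignment`/`Offset` and calls `emitAlignment(Align(32))` rather than returning a byte count):

```cpp
#include <cassert>

// Number of NOP padding bytes needed so an instruction of InstSize bytes,
// emitted at byte Offset from a 32-byte-aligned base, does not cross a
// 32-byte boundary. Assumes InstSize <= 32.
static unsigned paddingToAvoidStraddle(unsigned Offset, unsigned InstSize) {
  constexpr unsigned HalfCacheLine = 32;
  unsigned Rem = Offset % HalfCacheLine;
  if (Rem + InstSize > HalfCacheLine) // instruction would cross the boundary
    return HalfCacheLine - Rem;       // pad up to the boundary instead
  return 0;
}

int main() {
  assert(paddingToAvoidStraddle(28, 8) == 4); // pad 4 bytes, emit at offset 32
  assert(paddingToAvoidStraddle(24, 8) == 0); // ends exactly at 32, no straddle
  assert(paddingToAvoidStraddle(0, 12) == 0); // already at a boundary
  return 0;
}
```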
</details>
https://github.com/llvm/llvm-project/pull/150239