[llvm] ad479dd - Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#135788)"

Tue Jun 10 16:50:17 PDT 2025

Author: Matt Arsenault
Date: 2025-06-11T08:49:13+09:00
New Revision: ad479ddb343c2756e6eed0f2999bbdb88a65c7c5

URL: https://github.com/llvm/llvm-project/commit/ad479ddb343c2756e6eed0f2999bbdb88a65c7c5
DIFF: https://github.com/llvm/llvm-project/commit/ad479ddb343c2756e6eed0f2999bbdb88a65c7c5.diff

LOG: Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#135788)"

This reverts commit 13ccce28776d8ad27b0c6a92b5a452d62da05663.

The tests are on non-canonical IR, and the change adds an extra
unrelated pre-processing step to the pass. I'm assuming this is a
workaround for the known-bits recursion depth limit in instcombine.

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Removed: 
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll


################################################################################
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 6fae9f1dd2404..320b79203c0b3 100644

--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,7 +174,6 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -191,7 +190,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
@@ -200,8 +198,6 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
-#define DEBUG_TYPE "separate-offset-gep"
-
 static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
@@ -492,42 +488,6 @@ class SeparateConstOffsetFromGEP {
   DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
-/// A helper class that aims to convert xor operations into or operations when
-/// their operands are disjoint and the result is used in a GEP's index. This
-/// can then enable further GEP optimizations by effectively turning BaseVal |
-/// Const into BaseVal + Const when they are disjoint, which
-/// SeparateConstOffsetFromGEP can then process. This is a common pattern that
-/// sets up a grid of memory accesses across a wave where each thread acesses
-/// data at various offsets.
-class XorToOrDisjointTransformer {
-public:
-  XorToOrDisjointTransformer(Function &F, DominatorTree &DT,
-                             const DataLayout &DL)
-      : F(F), DT(DT), DL(DL) {}
-
-  bool run();
-
-private:
-  Function &F;
-  DominatorTree &DT;
-  const DataLayout &DL;
-  /// Maps a common operand to all Xor instructions
-  using XorOpList = SmallVector<std::pair<BinaryOperator *, APInt>, 8>;
-  using XorBaseValInst = DenseMap<Instruction *, XorOpList>;
-  XorBaseValInst XorGroups;
-
-  /// Checks if the given value has at least one GetElementPtr user
-  static bool hasGEPUser(const Value *V);
-
-  /// Helper function to check if BaseXor dominates all XORs in the group
-  bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup);
-
-  /// Processes a group of XOR instructions that share the same non-constant
-  /// base operand. Returns true if this group's processing modified the
-  /// function.
-  bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup);
-};
-
 } // end anonymous namespace
 
 char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
@@ -1263,154 +1223,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   return true;
 }
 
-// Helper function to check if an instruction has at least one GEP user
-bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) {
-  return llvm::any_of(V->users(), [](const User *U) {
-    return isa<llvm::GetElementPtrInst>(U);
-  });
-}
-
-bool XorToOrDisjointTransformer::dominatesAllXors(
-    BinaryOperator *BaseXor, const XorOpList &XorsInGroup) {
-  return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) {
-    BinaryOperator *XorInst = XorEntry.first;
-    // Do not evaluate the BaseXor, otherwise we end up cloning it.
-    return XorInst == BaseXor || DT.dominates(BaseXor, XorInst);
-  });
-}
-
-bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst,
-                                                 XorOpList &XorsInGroup) {
-  bool Changed = false;
-  if (XorsInGroup.size() <= 1)
-    return false;
-
-  // Sort XorsInGroup by the constant offset value in increasing order.
-  llvm::sort(XorsInGroup, [](const auto &A, const auto &B) {
-    return A.second.slt(B.second);
-  });
-
-  // Dominance check
-  // The "base" XOR for dominance purposes is the one with the smallest
-  // constant.
-  BinaryOperator *XorWithSmallConst = XorsInGroup[0].first;
-
-  if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) {
-    LLVM_DEBUG(dbgs() << DEBUG_TYPE
-                      << ": Cloning and inserting XOR with smallest constant ("
-                      << *XorWithSmallConst
-                      << ") as it does not dominate all other XORs"
-                      << " in function " << F.getName() << "\n");
-
-    BinaryOperator *ClonedXor =
-        cast<BinaryOperator>(XorWithSmallConst->clone());
-    ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone");
-    ClonedXor->insertAfter(OriginalBaseInst);
-    LLVM_DEBUG(dbgs() << "  Cloned Inst: " << *ClonedXor << "\n");
-    Changed = true;
-    XorWithSmallConst = ClonedXor;
-  }
-
-  SmallVector<Instruction *, 8> InstructionsToErase;
-  const APInt SmallestConst =
-      cast<ConstantInt>(XorWithSmallConst->getOperand(1))->getValue();
-
-  // Main transformation loop: Iterate over the original XORs in the sorted
-  // group.
-  for (const auto &XorEntry : XorsInGroup) {
-    BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction
-    const APInt ConstOffsetVal = XorEntry.second;
-
-    // Do not process the one with smallest constant as it is the base.
-    if (XorInst == XorWithSmallConst)
-      continue;
-
-    // Disjointness Check 1
-    APInt NewConstVal = ConstOffsetVal - SmallestConst;
-    if ((NewConstVal & SmallestConst) != 0) {
-      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function "
-                        << F.getName() << ":\n"
-                        << "  New Const: " << NewConstVal
-                        << "  Smallest Const: " << SmallestConst
-                        << "  are not disjoint \n");
-      continue;
-    }
-
-    // Disjointness Check 2
-    if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL),
-                          0)) {
-      LLVM_DEBUG(dbgs() << DEBUG_TYPE
-                        << ": Transforming XOR to OR (disjoint) in function "
-                        << F.getName() << ":\n"
-                        << "  Xor: " << *XorInst << "\n"
-                        << "  Base Val: " << *XorWithSmallConst << "\n"
-                        << "  New Const: " << NewConstVal << "\n");
-
-      auto *NewOrInst = BinaryOperator::CreateDisjointOr(
-          XorWithSmallConst,
-          ConstantInt::get(OriginalBaseInst->getType(), NewConstVal),
-          XorInst->getName() + ".or_disjoint", XorInst->getIterator());
-
-      NewOrInst->copyMetadata(*XorInst);
-      XorInst->replaceAllUsesWith(NewOrInst);
-      LLVM_DEBUG(dbgs() << "  New Inst: " << *NewOrInst << "\n");
-      InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion
-
-      Changed = true;
-    } else {
-      LLVM_DEBUG(
-          dbgs() << DEBUG_TYPE
-                 << ": Cannot transform XOR (not proven disjoint) in function "
-                 << F.getName() << ":\n"
-                 << "  Xor: " << *XorInst << "\n"
-                 << "  Base Val: " << *XorWithSmallConst << "\n"
-                 << "  New Const: " << NewConstVal << "\n");
-    }
-  }
-
-  for (Instruction *I : InstructionsToErase)
-    I->eraseFromParent();
-
-  return Changed;
-}
-
-// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
-// the base for memory operations. This transformation is true under the
-// following conditions
-// Check 1 -  B and C are disjoint.
-// Check 2 - XOR(A,C) and B are disjoint.
-//
-// This transformation is beneficial particularly for GEPs because:
-// 1. OR operations often map better to addressing modes than XOR
-// 2. Disjoint OR operations preserve the semantics of the original XOR
-// 3. This can enable further optimizations in the GEP offset folding pipeline
-bool XorToOrDisjointTransformer::run() {
-  bool Changed = false;
-
-  // Collect all candidate XORs
-  for (Instruction &I : instructions(F)) {
-    Instruction *Op0 = nullptr;
-    ConstantInt *C1 = nullptr;
-    BinaryOperator *MatchedXorOp = nullptr;
-
-    // Attempt to match the instruction 'I' as XOR operation.
-    if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)),
-                               m_BinOp(MatchedXorOp))) &&
-        hasGEPUser(MatchedXorOp))
-      XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue());
-  }
-
-  if (XorGroups.empty())
-    return false;
-
-  // Process each group of XORs
-  for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups)
-    if (processXorGroup(OriginalBaseInst, XorsInGroup))
-      Changed = true;
-
-  return Changed;
-}
-
 bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -1430,11 +1242,6 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
-
-  // Decompose xor in to "or disjoint" if possible.
-  XorToOrDisjointTransformer XorTransformer(F, *DT, *DL);
-  Changed |= XorTransformer.run();
-
   for (BasicBlock &B : F) {
     if (!DT->isReachableFromEntry(&B))
       continue;

diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
deleted file mode 100644
index 825227292fe14..0000000000000
--- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
+++ /dev/null
@@ -1,204 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
-; RUN: -S < %s | FileCheck %s
-
-
-; Test a simple case of xor to or disjoint transformation
-define half @test_basic_transformation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_basic_transformation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 32
-  %addr2 = xor i64 %base, 2080
-  %addr3 = xor i64 %base, 4128
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test the decreasing order of offset xor to or disjoint transformation
-define half @test_descending_offset_transformation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_descending_offset_transformation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 4128
-  %addr2 = xor i64 %base, 2080
-  %addr3 = xor i64 %base, 32
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test that %addr2 is not transformed to or disjoint.
-define half @test_no_transfomation(ptr %ptr, i64 %input) {
-; CHECK-LABEL: define half @test_no_transfomation(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
-; CHECK-NEXT:    [[ADDR2:%.*]] = xor i64 [[BASE]], 64
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]]
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
-; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
-; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
-; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192    ; Clear low bits
-  %addr1 = xor i64 %base, 32
-  %addr2 = xor i64 %base, 64  ; Should not be transformed
-  %addr3 = xor i64 %base, 2080
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val1 = load half, ptr %gep1
-  %val2 = load half, ptr %gep2
-  %val3 = load half, ptr %gep3
-  %val1.f = fpext half %val1 to float
-  %val2.f = fpext half %val2 to float
-  %val3.f = fpext half %val3 to float
-  %sum1.f = fadd float %val1.f, %val2.f
-  %sum_total.f = fadd float %sum1.f, %val3.f
-  %result.h = fptrunc float %sum_total.f to half
-  ret half %result.h
-}
-
-
-; Test case with xor instructions in different basic blocks
-define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) {
-; CHECK-LABEL: define half @test_dom_tree(
-; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) {
-; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
-; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 16
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
-; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
-; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
-; CHECK:       [[THEN]]:
-; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32
-; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
-; CHECK-NEXT:    br label %[[MERGE:.*]]
-; CHECK:       [[ELSE]]:
-; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96
-; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
-; CHECK-NEXT:    br label %[[MERGE]]
-; CHECK:       [[MERGE]]:
-; CHECK-NEXT:    [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ]
-; CHECK-NEXT:    [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224
-; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]]
-; CHECK-NEXT:    [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2
-; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
-; CHECK-NEXT:    [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float
-; CHECK-NEXT:    [[VAL4_F:%.*]] = fpext half [[VAL4]] to float
-; CHECK-NEXT:    [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]]
-; CHECK-NEXT:    [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]]
-; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half
-; CHECK-NEXT:    ret half [[RESULT_H]]
-;
-entry:
-  %base = and i64 %input, -8192   ; Clear low bits
-  %addr1 = xor i64 %base,16
-  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
-  %val1 = load half, ptr %gep1
-  br i1 %cond, label %then, label %else
-
-then:
-  %addr2 = xor i64 %base, 48
-  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
-  %val2 = load half, ptr %gep2
-  br label %merge
-
-else:
-  %addr3 = xor i64 %base, 112
-  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
-  %val3 = load half, ptr %gep3
-  br label %merge
-
-merge:
-  %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ]
-  %addr4 = xor i64 %base, 240
-  %gep4 = getelementptr i8, ptr %ptr, i64 %addr4
-  %val4 = load half, ptr %gep4
-  %val1.f = fpext half %val1 to float
-  %val_from_branch.f = fpext half %val_from_branch to float
-  %val4.f = fpext half %val4 to float
-  %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f
-  %final_sum.f = fadd float %sum_intermediate.f, %val4.f
-  %result.h = fptrunc float %final_sum.f to half
-  ret half %result.h
-}
-