[llvm] 13ccce2 - [SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#135788)

Tue Jun 10 13:46:30 PDT 2025

Author: Sumanth Gundapaneni
Date: 2025-06-10T15:46:27-05:00
New Revision: 13ccce28776d8ad27b0c6a92b5a452d62da05663

URL: https://github.com/llvm/llvm-project/commit/13ccce28776d8ad27b0c6a92b5a452d62da05663
DIFF: https://github.com/llvm/llvm-project/commit/13ccce28776d8ad27b0c6a92b5a452d62da05663.diff

LOG: [SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#135788)

Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
the base for memory operations. This transformation is true under the
following conditions
Check 1 -  B and C are disjoint.
Check 2 - XOR(A,C) and B are disjoint.

This transformation is beneficial particularly for GEPs because
Disjoint OR operations often map better to addressing modes than XOR.
This can enable further optimizations in the GEP offset folding pipeline

Added: 
    llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll

Modified: 
    llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 320b79203c0b3..6fae9f1dd2404 100644

--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -174,6 +174,7 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -190,6 +191,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <cstdint>
@@ -198,6 +200,8 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+#define DEBUG_TYPE "separate-offset-gep"
+
 static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
     "disable-separate-const-offset-from-gep", cl::init(false),
     cl::desc("Do not separate the constant offset from a GEP instruction"),
@@ -488,6 +492,42 @@ class SeparateConstOffsetFromGEP {
   DenseMap<ExprKey, SmallVector<Instruction *, 2>> DominatingSubs;
 };
 
+/// A helper class that aims to convert xor operations into or operations when
+/// their operands are disjoint and the result is used in a GEP's index. This
+/// can then enable further GEP optimizations by effectively turning BaseVal |
+/// Const into BaseVal + Const when they are disjoint, which
+/// SeparateConstOffsetFromGEP can then process. This is a common pattern that
+/// sets up a grid of memory accesses across a wave where each thread acesses
+/// data at various offsets.
+class XorToOrDisjointTransformer {
+public:
+  XorToOrDisjointTransformer(Function &F, DominatorTree &DT,
+                             const DataLayout &DL)
+      : F(F), DT(DT), DL(DL) {}
+
+  bool run();
+
+private:
+  Function &F;
+  DominatorTree &DT;
+  const DataLayout &DL;
+  /// Maps a common operand to all Xor instructions
+  using XorOpList = SmallVector<std::pair<BinaryOperator *, APInt>, 8>;
+  using XorBaseValInst = DenseMap<Instruction *, XorOpList>;
+  XorBaseValInst XorGroups;
+
+  /// Checks if the given value has at least one GetElementPtr user
+  static bool hasGEPUser(const Value *V);
+
+  /// Helper function to check if BaseXor dominates all XORs in the group
+  bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup);
+
+  /// Processes a group of XOR instructions that share the same non-constant
+  /// base operand. Returns true if this group's processing modified the
+  /// function.
+  bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup);
+};
+
 } // end anonymous namespace
 
 char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
@@ -1223,6 +1263,154 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
   return true;
 }
 
+// Helper function to check if an instruction has at least one GEP user
+bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) {
+  return llvm::any_of(V->users(), [](const User *U) {
+    return isa<llvm::GetElementPtrInst>(U);
+  });
+}
+
+bool XorToOrDisjointTransformer::dominatesAllXors(
+    BinaryOperator *BaseXor, const XorOpList &XorsInGroup) {
+  return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) {
+    BinaryOperator *XorInst = XorEntry.first;
+    // Do not evaluate the BaseXor, otherwise we end up cloning it.
+    return XorInst == BaseXor || DT.dominates(BaseXor, XorInst);
+  });
+}
+
+bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst,
+                                                 XorOpList &XorsInGroup) {
+  bool Changed = false;
+  if (XorsInGroup.size() <= 1)
+    return false;
+
+  // Sort XorsInGroup by the constant offset value in increasing order.
+  llvm::sort(XorsInGroup, [](const auto &A, const auto &B) {
+    return A.second.slt(B.second);
+  });
+
+  // Dominance check
+  // The "base" XOR for dominance purposes is the one with the smallest
+  // constant.
+  BinaryOperator *XorWithSmallConst = XorsInGroup[0].first;
+
+  if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) {
+    LLVM_DEBUG(dbgs() << DEBUG_TYPE
+                      << ": Cloning and inserting XOR with smallest constant ("
+                      << *XorWithSmallConst
+                      << ") as it does not dominate all other XORs"
+                      << " in function " << F.getName() << "\n");
+
+    BinaryOperator *ClonedXor =
+        cast<BinaryOperator>(XorWithSmallConst->clone());
+    ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone");
+    ClonedXor->insertAfter(OriginalBaseInst);
+    LLVM_DEBUG(dbgs() << "  Cloned Inst: " << *ClonedXor << "\n");
+    Changed = true;
+    XorWithSmallConst = ClonedXor;
+  }
+
+  SmallVector<Instruction *, 8> InstructionsToErase;
+  const APInt SmallestConst =
+      cast<ConstantInt>(XorWithSmallConst->getOperand(1))->getValue();
+
+  // Main transformation loop: Iterate over the original XORs in the sorted
+  // group.
+  for (const auto &XorEntry : XorsInGroup) {
+    BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction
+    const APInt ConstOffsetVal = XorEntry.second;
+
+    // Do not process the one with smallest constant as it is the base.
+    if (XorInst == XorWithSmallConst)
+      continue;
+
+    // Disjointness Check 1
+    APInt NewConstVal = ConstOffsetVal - SmallestConst;
+    if ((NewConstVal & SmallestConst) != 0) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function "
+                        << F.getName() << ":\n"
+                        << "  New Const: " << NewConstVal
+                        << "  Smallest Const: " << SmallestConst
+                        << "  are not disjoint \n");
+      continue;
+    }
+
+    // Disjointness Check 2
+    if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL),
+                          0)) {
+      LLVM_DEBUG(dbgs() << DEBUG_TYPE
+                        << ": Transforming XOR to OR (disjoint) in function "
+                        << F.getName() << ":\n"
+                        << "  Xor: " << *XorInst << "\n"
+                        << "  Base Val: " << *XorWithSmallConst << "\n"
+                        << "  New Const: " << NewConstVal << "\n");
+
+      auto *NewOrInst = BinaryOperator::CreateDisjointOr(
+          XorWithSmallConst,
+          ConstantInt::get(OriginalBaseInst->getType(), NewConstVal),
+          XorInst->getName() + ".or_disjoint", XorInst->getIterator());
+
+      NewOrInst->copyMetadata(*XorInst);
+      XorInst->replaceAllUsesWith(NewOrInst);
+      LLVM_DEBUG(dbgs() << "  New Inst: " << *NewOrInst << "\n");
+      InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion
+
+      Changed = true;
+    } else {
+      LLVM_DEBUG(
+          dbgs() << DEBUG_TYPE
+                 << ": Cannot transform XOR (not proven disjoint) in function "
+                 << F.getName() << ":\n"
+                 << "  Xor: " << *XorInst << "\n"
+                 << "  Base Val: " << *XorWithSmallConst << "\n"
+                 << "  New Const: " << NewConstVal << "\n");
+    }
+  }
+
+  for (Instruction *I : InstructionsToErase)
+    I->eraseFromParent();
+
+  return Changed;
+}
+
+// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes
+// the base for memory operations. This transformation is true under the
+// following conditions
+// Check 1 -  B and C are disjoint.
+// Check 2 - XOR(A,C) and B are disjoint.
+//
+// This transformation is beneficial particularly for GEPs because:
+// 1. OR operations often map better to addressing modes than XOR
+// 2. Disjoint OR operations preserve the semantics of the original XOR
+// 3. This can enable further optimizations in the GEP offset folding pipeline
+bool XorToOrDisjointTransformer::run() {
+  bool Changed = false;
+
+  // Collect all candidate XORs
+  for (Instruction &I : instructions(F)) {
+    Instruction *Op0 = nullptr;
+    ConstantInt *C1 = nullptr;
+    BinaryOperator *MatchedXorOp = nullptr;
+
+    // Attempt to match the instruction 'I' as XOR operation.
+    if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)),
+                               m_BinOp(MatchedXorOp))) &&
+        hasGEPUser(MatchedXorOp))
+      XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue());
+  }
+
+  if (XorGroups.empty())
+    return false;
+
+  // Process each group of XORs
+  for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups)
+    if (processXorGroup(OriginalBaseInst, XorsInGroup))
+      Changed = true;
+
+  return Changed;
+}
+
 bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
@@ -1242,6 +1430,11 @@ bool SeparateConstOffsetFromGEP::run(Function &F) {
 
   DL = &F.getDataLayout();
   bool Changed = false;
+
+  // Decompose xor in to "or disjoint" if possible.
+  XorToOrDisjointTransformer XorTransformer(F, *DT, *DL);
+  Changed |= XorTransformer.run();
+
   for (BasicBlock &B : F) {
     if (!DT->isReachableFromEntry(&B))
       continue;

diff  --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
new file mode 100644
index 0000000000000..825227292fe14
--- /dev/null
+++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll
@@ -0,0 +1,204 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \
+; RUN: -S < %s | FileCheck %s
+
+
+; Test a simple case of xor to or disjoint transformation
+define half @test_basic_transformation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_basic_transformation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 32
+  %addr2 = xor i64 %base, 2080
+  %addr3 = xor i64 %base, 4128
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test the decreasing order of offset xor to or disjoint transformation
+define half @test_descending_offset_transformation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_descending_offset_transformation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 4128
+  %addr2 = xor i64 %base, 2080
+  %addr3 = xor i64 %base, 32
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test that %addr2 is not transformed to or disjoint.
+define half @test_no_transfomation(ptr %ptr, i64 %input) {
+; CHECK-LABEL: define half @test_no_transfomation(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 32
+; CHECK-NEXT:    [[ADDR2:%.*]] = xor i64 [[BASE]], 64
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]]
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL2_F:%.*]] = fpext half [[VAL2]] to float
+; CHECK-NEXT:    [[VAL3_F:%.*]] = fpext half [[VAL3]] to float
+; CHECK-NEXT:    [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]]
+; CHECK-NEXT:    [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192    ; Clear low bits
+  %addr1 = xor i64 %base, 32
+  %addr2 = xor i64 %base, 64  ; Should not be transformed
+  %addr3 = xor i64 %base, 2080
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val1 = load half, ptr %gep1
+  %val2 = load half, ptr %gep2
+  %val3 = load half, ptr %gep3
+  %val1.f = fpext half %val1 to float
+  %val2.f = fpext half %val2 to float
+  %val3.f = fpext half %val3 to float
+  %sum1.f = fadd float %val1.f, %val2.f
+  %sum_total.f = fadd float %sum1.f, %val3.f
+  %result.h = fptrunc float %sum_total.f to half
+  ret half %result.h
+}
+
+
+; Test case with xor instructions in 
diff erent basic blocks
+define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) {
+; CHECK-LABEL: define half @test_dom_tree(
+; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[BASE:%.*]] = and i64 [[INPUT]], -8192
+; CHECK-NEXT:    [[ADDR1:%.*]] = xor i64 [[BASE]], 16
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]]
+; CHECK-NEXT:    [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2
+; CHECK-NEXT:    br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]]
+; CHECK:       [[THEN]]:
+; CHECK-NEXT:    [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2
+; CHECK-NEXT:    br label %[[MERGE:.*]]
+; CHECK:       [[ELSE]]:
+; CHECK-NEXT:    [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96
+; CHECK-NEXT:    [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2
+; CHECK-NEXT:    br label %[[MERGE]]
+; CHECK:       [[MERGE]]:
+; CHECK-NEXT:    [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ]
+; CHECK-NEXT:    [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224
+; CHECK-NEXT:    [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]]
+; CHECK-NEXT:    [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2
+; CHECK-NEXT:    [[VAL1_F:%.*]] = fpext half [[VAL1]] to float
+; CHECK-NEXT:    [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float
+; CHECK-NEXT:    [[VAL4_F:%.*]] = fpext half [[VAL4]] to float
+; CHECK-NEXT:    [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]]
+; CHECK-NEXT:    [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]]
+; CHECK-NEXT:    [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half
+; CHECK-NEXT:    ret half [[RESULT_H]]
+;
+entry:
+  %base = and i64 %input, -8192   ; Clear low bits
+  %addr1 = xor i64 %base,16
+  %gep1 = getelementptr i8, ptr %ptr, i64 %addr1
+  %val1 = load half, ptr %gep1
+  br i1 %cond, label %then, label %else
+
+then:
+  %addr2 = xor i64 %base, 48
+  %gep2 = getelementptr i8, ptr %ptr, i64 %addr2
+  %val2 = load half, ptr %gep2
+  br label %merge
+
+else:
+  %addr3 = xor i64 %base, 112
+  %gep3 = getelementptr i8, ptr %ptr, i64 %addr3
+  %val3 = load half, ptr %gep3
+  br label %merge
+
+merge:
+  %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ]
+  %addr4 = xor i64 %base, 240
+  %gep4 = getelementptr i8, ptr %ptr, i64 %addr4
+  %val4 = load half, ptr %gep4
+  %val1.f = fpext half %val1 to float
+  %val_from_branch.f = fpext half %val_from_branch to float
+  %val4.f = fpext half %val4 to float
+  %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f
+  %final_sum.f = fadd float %sum_intermediate.f, %val4.f
+  %result.h = fptrunc float %final_sum.f to half
+  ret half %result.h
+}
+