[llvm] [AggressiveInstCombine] Support store merge with non-consecutive parts (PR #149807)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 21 05:43:07 PDT 2025
https://github.com/nikic created https://github.com/llvm/llvm-project/pull/149807
This is a minor extension of #147540, resolving one of the FIXMEs. If the collected parts contain some non-consecutive elements, we can still handle smaller ranges that *are* consecutive.
This is not common in practice and mostly shows up when the same value is stored at two different offsets.
llvm-opt-benchmark results: https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2593
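For illustration, a minimal before/after sketch of the new behavior, mirroring the test_multiple_parts_with_gap1 test added below (the %lo name is hypothetical, chosen here just for the merged value): parts of %x are stored at byte offsets 0, 1 and 3, so only the consecutive run at offsets 0-1 is merged into an i16 store, while the store at offset 3 is left untouched.

  ; Before: three i8 stores of parts of %x at offsets 0, 1 and 3.
  %x.0 = trunc i32 %x to i8
  store i8 %x.0, ptr %p
  %shr.1 = lshr i32 %x, 8
  %x.1 = trunc i32 %shr.1 to i8
  %gep.1 = getelementptr i8, ptr %p, i64 1
  store i8 %x.1, ptr %gep.1
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3

  ; After: the consecutive run at offsets 0-1 becomes one i16 store;
  ; the store at offset 3 is unchanged.
  %lo = trunc i32 %x to i16
  store i16 %lo, ptr %p, align 1
  %shr.3 = lshr i32 %x, 24
  %x.3 = trunc i32 %shr.3 to i8
  %gep.3 = getelementptr i8, ptr %p, i64 3
  store i8 %x.3, ptr %gep.3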
From d3f21ab3bfb87b89b913dce44226731623818cc8 Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov at redhat.com>
Date: Mon, 21 Jul 2025 12:56:23 +0200
Subject: [PATCH] [AggressiveInstCombine] Support store merge with
non-consecutive parts
This is a minor extension of #147540, resolving one of the FIXMEs.
If the collected parts contain some non-consecutive elements, we
can still handle smaller ranges that *are* consecutive.
This is not common in practice and mostly shows up when the same
value is stored at two different offsets.
llvm-opt-benchmark results: https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2593
---
.../AggressiveInstCombine.cpp | 60 +++++++----
.../AggressiveInstCombine/X86/store-merge.ll | 99 +++++++++++++++++++
2 files changed, 138 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index a3a0e31f887ab..7fa6e6c5161cf 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -886,35 +886,20 @@ static std::optional<PartStore> matchPartStore(Instruction &I,
return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
}
-static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
- const DataLayout &DL, TargetTransformInfo &TTI) {
+static bool mergeConsecutivePartStores(ArrayRef<PartStore> Parts,
+ unsigned Width, const DataLayout &DL,
+ TargetTransformInfo &TTI) {
if (Parts.size() < 2)
return false;
- // We now have multiple parts of the same value stored to the same pointer.
- // Sort the parts by pointer offset, and make sure they are consistent with
- // the value offsets. Also check that the value is fully covered without
- // overlaps.
- // FIXME: We could support merging stores for only part of the value here.
- llvm::sort(Parts);
- int64_t LastEndOffsetFromFirst = 0;
- const PartStore &First = Parts[0];
- for (const PartStore &Part : Parts) {
- APInt PtrOffsetFromFirst = Part.PtrOffset - First.PtrOffset;
- int64_t ValOffsetFromFirst = Part.ValOffset - First.ValOffset;
- if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
- LastEndOffsetFromFirst != ValOffsetFromFirst)
- return false;
- LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
- }
-
// Check whether combining the stores is profitable.
// FIXME: We could generate smaller stores if we can't produce a large one.
+ const PartStore &First = Parts.front();
LLVMContext &Ctx = First.Store->getContext();
- Type *NewTy = Type::getIntNTy(Ctx, LastEndOffsetFromFirst);
+ Type *NewTy = Type::getIntNTy(Ctx, Width);
unsigned Fast = 0;
if (!TTI.isTypeLegal(NewTy) ||
- !TTI.allowsMisalignedMemoryAccesses(Ctx, LastEndOffsetFromFirst,
+ !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
First.Store->getPointerAddressSpace(),
First.Store->getAlign(), &Fast) ||
!Fast)
@@ -941,6 +926,39 @@ static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
return true;
}
+static bool mergePartStores(SmallVectorImpl<PartStore> &Parts,
+ const DataLayout &DL, TargetTransformInfo &TTI) {
+ if (Parts.size() < 2)
+ return false;
+
+ // We now have multiple parts of the same value stored to the same pointer.
+ // Sort the parts by pointer offset, and make sure they are consistent with
+ // the value offsets. Also check that the value is fully covered without
+ // overlaps.
+ bool Changed = false;
+ llvm::sort(Parts);
+ int64_t LastEndOffsetFromFirst = 0;
+ const PartStore *First = &Parts[0];
+ for (const PartStore &Part : Parts) {
+ APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
+ int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
+ if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
+ LastEndOffsetFromFirst != ValOffsetFromFirst) {
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, &Part),
+ LastEndOffsetFromFirst, DL, TTI);
+ First = &Part;
+ LastEndOffsetFromFirst = Part.ValWidth;
+ continue;
+ }
+
+ LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
+ }
+
+ Changed |= mergeConsecutivePartStores(ArrayRef(First, Parts.end()),
+ LastEndOffsetFromFirst, DL, TTI);
+ return Changed;
+}
+
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL,
TargetTransformInfo &TTI, AliasAnalysis &AA) {
// FIXME: Add big endian support.
diff --git a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
index 38a55e1566a77..4ab8d18eb69b5 100644
--- a/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/X86/store-merge.ll
@@ -792,6 +792,105 @@ define void @test_i32_tbaa(i32 %x, ptr %p) {
ret void
}
+define void @test_multiple_parts_with_gap1(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap1(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[X]], 24
+; CHECK-NEXT: [[X_3:%.*]] = trunc i32 [[SHR_3]] to i8
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: store i8 [[X_3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_multiple_parts_with_gap2(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap2(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[X_0:%.*]] = trunc i32 [[X]] to i8
+; CHECK-NEXT: store i8 [[X_0]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 [[X]], 16
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.2 = lshr i32 %x, 16
+ %x.2 = trunc i32 %shr.2 to i8
+ %gep.2 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.2, ptr %gep.2
+ %shr.3 = lshr i32 %x, 24
+ %x.3 = trunc i32 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.3, ptr %gep.3
+ ret void
+}
+
+define void @test_multiple_parts_with_gap3(i64 %x, ptr %p) {
+; CHECK-LABEL: define void @test_multiple_parts_with_gap3(
+; CHECK-SAME: i64 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr i8, ptr [[P]], i64 3
+; CHECK-NEXT: [[TMP2:%.*]] = lshr i64 [[X]], 24
+; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i16
+; CHECK-NEXT: store i16 [[TMP3]], ptr [[GEP_3]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i64 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i64 %x, 8
+ %x.1 = trunc i64 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %shr.3 = lshr i64 %x, 24
+ %x.3 = trunc i64 %shr.3 to i8
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.3, ptr %gep.3
+ %shr.4 = lshr i64 %x, 32
+ %x.4 = trunc i64 %shr.4 to i8
+ %gep.4 = getelementptr i8, ptr %p, i64 4
+ store i8 %x.4, ptr %gep.4
+ ret void
+}
+
+define void @test_store_same_parts_twice(i32 %x, ptr %p) {
+; CHECK-LABEL: define void @test_store_same_parts_twice(
+; CHECK-SAME: i32 [[X:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP1]], ptr [[P]], align 1
+; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr i8, ptr [[P]], i64 2
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 [[X]] to i16
+; CHECK-NEXT: store i16 [[TMP2]], ptr [[GEP_2]], align 1
+; CHECK-NEXT: ret void
+;
+ %x.0 = trunc i32 %x to i8
+ store i8 %x.0, ptr %p
+ %shr.1 = lshr i32 %x, 8
+ %x.1 = trunc i32 %shr.1 to i8
+ %gep.1 = getelementptr i8, ptr %p, i64 1
+ store i8 %x.1, ptr %gep.1
+ %gep.2 = getelementptr i8, ptr %p, i64 2
+ store i8 %x.0, ptr %gep.2
+ %gep.3 = getelementptr i8, ptr %p, i64 3
+ store i8 %x.1, ptr %gep.3
+ ret void
+}
+
!0 = !{!1}
!1 = !{!1, !2}
!2 = !{!2}