[llvm] [DSE] Optimizing shrinking of memory intrinsic (PR #106425)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 28 15:33:34 PDT 2024
https://github.com/goldsteinn updated https://github.com/llvm/llvm-project/pull/106425
>From 133883fbb26539b4fa97d9f6b9c1bd2d03c3f9c5 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Thu, 29 Aug 2024 11:03:47 -0700
Subject: [PATCH 1/2] [DSE] Add more tests for optimizing shrinkage of
memset/memcpy; NFC
---
.../OverwriteStoreBegin.ll | 160 ++++++++++++++++--
.../DeadStoreElimination/OverwriteStoreEnd.ll | 98 ++++++++++-
2 files changed, 241 insertions(+), 17 deletions(-)
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
index bc1756f6ca9d1b..6a257792db833c 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=dse -S | FileCheck %s
+; RUN: opt < %s -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM4
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM16
define void @write4to7(ptr nocapture %p) {
; CHECK-LABEL: @write4to7(
@@ -23,8 +24,8 @@ define void @write4to7_weird_element_type(ptr nocapture %p) {
; CHECK-LABEL: @write4to7_weird_element_type(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 24, i1 false)
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false)
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
; CHECK-NEXT: ret void
@@ -269,14 +270,23 @@ entry:
}
define void @write8To15AndThen0To7(ptr nocapture %P) {
-; CHECK-LABEL: @write8To15AndThen0To7(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16
-; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[TMP0]], i8 0, i64 16, i1 false)
-; CHECK-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1
-; CHECK-NEXT: store i64 1, ptr [[BASE64_1]], align 4
-; CHECK-NEXT: store i64 2, ptr [[P]], align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @write8To15AndThen0To7(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16
+; CHECK-MEM4-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[TMP0]], i8 0, i64 16, i1 false)
+; CHECK-MEM4-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1
+; CHECK-MEM4-NEXT: store i64 1, ptr [[BASE64_1]], align 4
+; CHECK-MEM4-NEXT: store i64 2, ptr [[P]], align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @write8To15AndThen0To7(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16
+; CHECK-MEM16-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[TMP0]], i8 0, i64 16, i1 false)
+; CHECK-MEM16-NEXT: [[BASE64_1:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 1
+; CHECK-MEM16-NEXT: store i64 1, ptr [[BASE64_1]], align 8
+; CHECK-MEM16-NEXT: store i64 2, ptr [[P]], align 8
+; CHECK-MEM16-NEXT: ret void
;
entry:
@@ -402,3 +412,131 @@ entry:
store i64 1, ptr %p, align 1
ret void
}
+
+define void @memset_optimize_size_lo_33_to_x86_32_generic_28(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_33_to_x86_32_generic_28(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 5
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 3
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 33, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_33_misaligned_x86_fail_generic_save_unit(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_33_misaligned_x86_fail_generic_save_unit(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 29, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 3
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 33, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 4
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 32, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_34_to_32(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_34_to_32(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 30, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 4
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 34, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_34_x86_misaligned_fail_generic_save_unit(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_34_x86_misaligned_fail_generic_save_unit(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 30, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 4
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 4 %p0, i8 0, i64 34, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_34_to_32_no_align_okay(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_34_to_32_no_align_okay(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 30, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 4
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 34, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_33_to_31_save_unit_no_change(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_33_to_31_save_unit_no_change(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 2
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 31, i1 false)
+; CHECK-NEXT: store i32 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 1
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 33, i1 false)
+ store i32 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_lo_36_to_32(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_lo_36_to_32(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP1]], i8 0, i64 32, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 1
+ %p1 = getelementptr inbounds i8, ptr %p, i64 0
+ call void @llvm.memset.p0.i64(ptr align 4 %p0, i8 0, i64 36, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
index ac8eee7088ad8f..9fc3f827e614db 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
@@ -1,9 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=dse -S | FileCheck %s
+; RUN: opt < %s -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM4
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=dse -S | FileCheck %s --check-prefixes=CHECK,CHECK-MEM16
+
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
-%struct.vec2 = type { <4 x i32>, <4 x i32> }
-%struct.vec2plusi = type { <4 x i32>, <4 x i32>, i32 }
+%struct.vec2 = type {
+<4 x i32>, <4 x i32>
+}
+
+%struct.vec2plusi = type {
+<4 x i32>, <4 x i32>, i32
+}
@glob1 = global %struct.vec2 zeroinitializer, align 16
@glob2 = global %struct.vec2plusi zeroinitializer, align 16
@@ -231,7 +238,9 @@ declare void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr nocapture, ptr
declare void @llvm.memset.p0.i64(ptr nocapture, i8, i64, i1) nounwind
declare void @llvm.memset.element.unordered.atomic.p0.i64(ptr nocapture, i8, i64, i32) nounwind
-%struct.trapframe = type { i64, i64, i64 }
+%struct.trapframe = type {
+i64, i64, i64
+}
; bugzilla 11455 - make sure negative GEP's don't break this optimisation
define void @cpu_lwp_fork(ptr %md_regs, i64 %pcb_rsp0) nounwind uwtable noinline ssp {
@@ -259,8 +268,8 @@ define void @write16To23AndThen24To31(ptr nocapture %P, i64 %n64, i32 %n32, i16
; CHECK-NEXT: tail call void @llvm.memset.p0.i64(ptr align 8 [[P:%.*]], i8 0, i64 16, i1 false)
; CHECK-NEXT: [[BASE64_2:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 2
; CHECK-NEXT: [[BASE64_3:%.*]] = getelementptr inbounds i64, ptr [[P]], i64 3
-; CHECK-NEXT: store i64 3, ptr [[BASE64_2]]
-; CHECK-NEXT: store i64 3, ptr [[BASE64_3]]
+; CHECK-NEXT: store i64 3, ptr [[BASE64_2]], align 8
+; CHECK-NEXT: store i64 3, ptr [[BASE64_3]], align 8
; CHECK-NEXT: ret void
;
entry:
@@ -392,3 +401,80 @@ entry:
ret void
}
+define void @memset_optimize_size_hi_31_to_24(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_hi_31_to_24(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 23
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 23, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 0
+ %p1 = getelementptr inbounds i8, ptr %p, i64 23
+ call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 31, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_hi_32_no_change_x86_change_generic(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_hi_32_no_change_x86_change_generic(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 0
+ %p1 = getelementptr inbounds i8, ptr %p, i64 28
+ call void @llvm.memset.p0.i64(ptr align 1 %p0, i8 0, i64 32, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_hi_28_to_24(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_hi_28_to_24(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 21
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P0]], i8 0, i64 24, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 0
+ %p1 = getelementptr inbounds i8, ptr %p, i64 21
+ call void @llvm.memset.p0.i64(ptr align 8 %p0, i8 0, i64 28, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_hi_31_to_28(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_hi_31_to_28(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[P0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 0
+ %p1 = getelementptr inbounds i8, ptr %p, i64 27
+ call void @llvm.memset.p0.i64(ptr align 2 %p0, i8 0, i64 31, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+
+define void @memset_optimize_size_hi_33_to_x86_32_generic_28(ptr %p) {
+; CHECK-LABEL: @memset_optimize_size_hi_33_to_x86_32_generic_28(
+; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P0]], i8 0, i64 28, i1 false)
+; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-NEXT: ret void
+;
+ %p0 = getelementptr inbounds i8, ptr %p, i64 0
+ %p1 = getelementptr inbounds i8, ptr %p, i64 27
+ call void @llvm.memset.p0.i64(ptr align 4 %p0, i8 0, i64 33, i1 false)
+ store i64 0, ptr %p1, align 1
+ ret void
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-MEM16: {{.*}}
+; CHECK-MEM4: {{.*}}
>From 54d72d12973fc392d1f479703619187a33660995 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Wed, 28 Aug 2024 10:37:46 -0700
Subject: [PATCH 2/2] [DSE] Optimizing shrinking of memory intrinsic
Currently for the following snippet:
`memcpy(dst, src, 8); dst[7] = 0`
DSE will transform it to:
`memcpy(dst, src, 7); dst[7] = 0`
Likewise if we have:
`memcpy(dst, src, 9); dst[7] = 0; dst[8] = 0`
DSE will transform it to:
`memcpy(dst, src, 7); dst[7] = 0`
However, in both cases we would prefer to emit an 8-byte `memcpy`
followed by any overwrite of the trailing byte(s).
This patch attempts to optimize the new intrinsic length within the
available range of the original size and the maximally shrunk size.
---
.../Scalar/DeadStoreElimination.cpp | 141 ++++++++--
.../OverwriteStoreBegin.ll | 177 ++++++++-----
.../DeadStoreElimination/OverwriteStoreEnd.ll | 242 ++++++++++++------
.../stores-of-existing-values.ll | 4 +-
4 files changed, 407 insertions(+), 157 deletions(-)
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index a304f7b056f5f7..f25bdccafb3cc3 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -48,6 +48,7 @@
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
@@ -558,9 +559,10 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest,
for_each(LinkedDVRAssigns, InsertAssignForOverlap);
}
-static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
- uint64_t &DeadSize, int64_t KillingStart,
- uint64_t KillingSize, bool IsOverwriteEnd) {
+static bool tryToShorten(Instruction *DeadI, int64_t DeadStart,
+ uint64_t DeadSize, int64_t KillingStart,
+ uint64_t KillingSize, bool IsOverwriteEnd,
+ const TargetTransformInfo &TTI) {
auto *DeadIntrinsic = cast<AnyMemIntrinsic>(DeadI);
Align PrefAlign = DeadIntrinsic->getDestAlign().valueOrOne();
@@ -583,11 +585,7 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
// Compute start and size of the region to remove. Make sure 'PrefAlign' is
// maintained on the remaining store.
if (IsOverwriteEnd) {
- // Calculate required adjustment for 'KillingStart' in order to keep
- // remaining store size aligned on 'PerfAlign'.
- uint64_t Off =
- offsetToAlignment(uint64_t(KillingStart - DeadStart), PrefAlign);
- ToRemoveStart = KillingStart + Off;
+ ToRemoveStart = KillingStart;
if (DeadSize <= uint64_t(ToRemoveStart - DeadStart))
return false;
ToRemoveSize = DeadSize - uint64_t(ToRemoveStart - DeadStart);
@@ -612,6 +610,108 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
assert(DeadSize > ToRemoveSize && "Can't remove more than original size");
uint64_t NewSize = DeadSize - ToRemoveSize;
+
+ // Try to coerce the new memcpy/memset size to a "fast" value. This typically
+ // means some exact multiple of the register width of the loads/stores.
+
+ // If scalar size >= vec size, assume target will use scalars for implementing
+ // memset/memcpy.
+ TypeSize ScalarSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_Scalar);
+ TypeSize VecSize =
+ TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
+ uint64_t MemUnit = 0;
+ if (ScalarSize >= VecSize)
+ MemUnit = ScalarSize.getFixedValue();
+ // Otherwise assume memset/memcpy will be lowered with Vec's
+ else
+ MemUnit =
+ TTI.getLoadStoreVecRegBitWidth(DeadIntrinsic->getDestAddressSpace());
+
+ MemUnit /= 8U;
+
+ // Assume loads/stores are issued by power of 2 regions. Try to minimize
+ // number of power of 2 blocks.
+ // ie if we have DeadSize = 15
+ // NewSize = 7 -> 8 (4 + 3 + 2 + 1) -> (8)
+ // NewSize = 9 -> 9 (8 + 1) == (8 + 1)
+ // NewSize = 11 -> 12 (8 + 2 + 1) -> (8 + 4)
+ uint64_t Upper = DeadSize;
+ uint64_t Lower = NewSize;
+
+ uint64_t RoundLower = MemUnit * (Lower / MemUnit);
+
+ // We have some trailing loads/stores we can try to optimize.
+ if (RoundLower != Lower && Lower != 0 && (RoundLower + MemUnit) != 0) {
+ Upper = std::min(Upper, RoundLower + MemUnit - 1);
+ // Don't bust inlining doing this.
+ uint64_t InlineThresh = TTI.getMaxMemIntrinsicInlineSizeThreshold();
+ if (Upper > InlineThresh && Lower <= InlineThresh)
+ Upper = InlineThresh;
+
+ // Replace Lower with value in range [Lower, Upper] that has min popcount
+ // (selecting for minimum value as tiebreaker when popcount is the same).
+ // The idea here is this will require the minimum number of load/stores and
+ // within that will use the presumably preferable minimum width.
+
+ // Get highest bit that differs between Lower and Upper. Anything above this
+ // bit must be in the new value. Anything below it that's larger than Lower
+ // is fair game.
+ uint64_t Dif = (Lower - 1) ^ Upper;
+ uint64_t HighestBit = 63 - llvm::countl_zero(Dif);
+
+ // Make Lo/Hi masks from the HighestDif bit. Lo mask is used to find value we
+ // can roundup for minimum power of 2 chunk, Hi mask is preserved.
+ uint64_t HighestP2 = static_cast<uint64_t>(1) << HighestBit;
+ uint64_t LoMask = HighestP2 - 1;
+ uint64_t HiMask = -HighestP2;
+
+ // Minimum power of 2 for the "tail"
+ uint64_t LoVal = Lower & LoMask;
+ if (LoVal)
+ LoVal = llvm::bit_ceil(LoVal);
+ // Preserved high bits to stay in range.
+ uint64_t HiVal = Lower & HiMask;
+ Lower = LoVal | HiVal;
+
+ // If we have more than two tail stores see if we can just roundup the next
+ // memunit.
+ if (llvm::popcount(Lower % MemUnit) > 1 &&
+ DeadSize >= (RoundLower + MemUnit))
+ Lower = RoundLower + MemUnit;
+
+ uint64_t OptimizedNewSize = NewSize;
+ // If we are over-writing the beginning, make sure we don't mess up the
+ // alignment.
+ if (IsOverwriteEnd || isAligned(PrefAlign, DeadSize - Lower)) {
+ OptimizedNewSize = Lower;
+ } else {
+ // Our minimal value isn't properly aligned, see if we can
+ // increase the size of a tail loads/stores.
+ Lower = HiVal | HighestP2;
+ if (isAligned(PrefAlign, DeadSize - Lower))
+ OptimizedNewSize = Lower;
+ // If we can't adjust size without messing up alignment, see if the new
+ // size is actually preferable.
+ // TODO: We should probably do better here than just giving up.
+ else if ((NewSize <= InlineThresh) == (DeadSize <= InlineThresh) &&
+ llvm::popcount(NewSize) > llvm::popcount(DeadSize) &&
+ DeadSize / MemUnit == NewSize / MemUnit)
+ return false;
+ }
+
+ // Adjust new starting point for the memset/memcpy.
+ if (OptimizedNewSize != NewSize) {
+ if (!IsOverwriteEnd)
+ ToRemoveSize = DeadSize - OptimizedNewSize;
+ NewSize = OptimizedNewSize;
+ }
+
+ // Our optimal length is the original length, skip the transform.
+ if (NewSize == DeadSize)
+ return false;
+ }
+
if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(DeadI)) {
// When shortening an atomic memory intrinsic, the newly shortened
// length must remain an integer multiple of the element size.
@@ -654,7 +754,8 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart,
}
static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
- int64_t &DeadStart, uint64_t &DeadSize) {
+ int64_t &DeadStart, uint64_t &DeadSize,
+ const TargetTransformInfo &TTI) {
if (IntervalMap.empty() || !isShortenableAtTheEnd(DeadI))
return false;
@@ -672,7 +773,7 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
// be non negative due to preceding checks.
KillingSize >= DeadSize - (uint64_t)(KillingStart - DeadStart)) {
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
- true)) {
+ true, TTI)) {
IntervalMap.erase(OII);
return true;
}
@@ -682,7 +783,8 @@ static bool tryToShortenEnd(Instruction *DeadI, OverlapIntervalsTy &IntervalMap,
static bool tryToShortenBegin(Instruction *DeadI,
OverlapIntervalsTy &IntervalMap,
- int64_t &DeadStart, uint64_t &DeadSize) {
+ int64_t &DeadStart, uint64_t &DeadSize,
+ const TargetTransformInfo &TTI) {
if (IntervalMap.empty() || !isShortenableAtTheBeginning(DeadI))
return false;
@@ -701,7 +803,7 @@ static bool tryToShortenBegin(Instruction *DeadI,
assert(KillingSize - (uint64_t)(DeadStart - KillingStart) < DeadSize &&
"Should have been handled as OW_Complete");
if (tryToShorten(DeadI, DeadStart, DeadSize, KillingStart, KillingSize,
- false)) {
+ false, TTI)) {
IntervalMap.erase(OII);
return true;
}
@@ -852,6 +954,7 @@ struct DSEState {
DominatorTree &DT;
PostDominatorTree &PDT;
const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
const DataLayout &DL;
const LoopInfo &LI;
@@ -896,9 +999,9 @@ struct DSEState {
DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
- const LoopInfo &LI)
+ const TargetTransformInfo &TTI, const LoopInfo &LI)
: F(F), AA(AA), EI(DT, &LI), BatchAA(AA, &EI), MSSA(MSSA), DT(DT),
- PDT(PDT), TLI(TLI), DL(F.getDataLayout()), LI(LI) {
+ PDT(PDT), TLI(TLI), TTI(TTI), DL(F.getDataLayout()), LI(LI) {
// Collect blocks with throwing instructions not modeled in MemorySSA and
// alloc-like objects.
unsigned PO = 0;
@@ -2103,10 +2206,10 @@ struct DSEState {
uint64_t DeadSize = Loc.Size.getValue();
GetPointerBaseWithConstantOffset(Ptr, DeadStart, DL);
OverlapIntervalsTy &IntervalMap = OI.second;
- Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize);
+ Changed |= tryToShortenEnd(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
if (IntervalMap.empty())
continue;
- Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize);
+ Changed |= tryToShortenBegin(DeadI, IntervalMap, DeadStart, DeadSize, TTI);
}
return Changed;
}
@@ -2347,9 +2450,10 @@ bool DSEState::eliminateDeadDefs(const MemoryDefWrapper &KillingDefWrapper) {
static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
DominatorTree &DT, PostDominatorTree &PDT,
const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI,
const LoopInfo &LI) {
bool MadeChange = false;
- DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
+ DSEState State(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
// For each store:
for (unsigned I = 0; I < State.MemDefs.size(); I++) {
MemoryDef *KillingDef = State.MemDefs[I];
@@ -2383,12 +2487,13 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
AliasAnalysis &AA = AM.getResult<AAManager>(F);
const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
- bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
+ bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, TTI, LI);
#ifdef LLVM_ENABLE_STATS
if (AreStatisticsEnabled())
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
index 6a257792db833c..135b4a18341e92 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll
@@ -234,14 +234,22 @@ entry:
}
define void @write2to10(ptr nocapture %p) {
-; CHECK-LABEL: @write2to10(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
-; CHECK-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @write2to10(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-MEM4-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @write2to10(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-MEM16-NEXT: store i64 1, ptr [[ARRAYIDX2]], align 8
+; CHECK-MEM16-NEXT: ret void
;
entry:
%arrayidx0 = getelementptr inbounds i32, ptr %p, i64 1
@@ -252,14 +260,22 @@ entry:
}
define void @write2to10_atomic(ptr nocapture %p) {
-; CHECK-LABEL: @write2to10_atomic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i32 4)
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
-; CHECK-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @write2to10_atomic(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX0]], i64 4
+; CHECK-MEM4-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i32 4)
+; CHECK-MEM4-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-MEM4-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @write2to10_atomic(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[ARRAYIDX0:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[ARRAYIDX0]], i8 0, i64 32, i32 4)
+; CHECK-MEM16-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[P]], i64 1
+; CHECK-MEM16-NEXT: store atomic i64 1, ptr [[ARRAYIDX2]] unordered, align 8
+; CHECK-MEM16-NEXT: ret void
;
entry:
%arrayidx0 = getelementptr inbounds i32, ptr %p, i64 1
@@ -367,13 +383,20 @@ declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture, i8, i64, i1) nounwi
declare void @llvm.memset.element.unordered.atomic.p0.i64(ptr nocapture, i8, i64, i32) nounwind
define void @ow_begin_align1(ptr nocapture %p) {
-; CHECK-LABEL: @ow_begin_align1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP0]], i8 0, i64 25, i1 false)
-; CHECK-NEXT: store i64 1, ptr [[P]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @ow_begin_align1(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 7
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP0]], i8 0, i64 25, i1 false)
+; CHECK-MEM4-NEXT: store i64 1, ptr [[P]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @ow_begin_align1(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 1, ptr [[P]], align 1
+; CHECK-MEM16-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, ptr %p, i64 1
@@ -383,13 +406,20 @@ entry:
}
define void @ow_end_align4(ptr nocapture %p) {
-; CHECK-LABEL: @ow_end_align4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: store i64 1, ptr [[P]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @ow_end_align4(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 4
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: store i64 1, ptr [[P]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @ow_end_align4(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 1, ptr [[P]], align 1
+; CHECK-MEM16-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, ptr %p, i64 1
@@ -414,13 +444,21 @@ entry:
}
define void @memset_optimize_size_lo_33_to_x86_32_generic_28(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_lo_33_to_x86_32_generic_28(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 5
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_lo_33_to_x86_32_generic_28(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 5
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_lo_33_to_x86_32_generic_28(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 3
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM16-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 3
%p1 = getelementptr inbounds i8, ptr %p, i64 0
@@ -446,13 +484,20 @@ define void @memset_optimize_size_lo_33_misaligned_x86_fail_generic_save_unit(pt
}
define void @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[P0]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 4
%p1 = getelementptr inbounds i8, ptr %p, i64 0
@@ -462,13 +507,21 @@ define void @memset_optimize_size_lo_32_x86_misaligned_fail_generic_save_unit2(p
}
define void @memset_optimize_size_lo_34_to_32(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_lo_34_to_32(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 30, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_lo_34_to_32(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 30, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_lo_34_to_32(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM16-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 2
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 2 [[TMP1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 4
%p1 = getelementptr inbounds i8, ptr %p, i64 0
@@ -494,13 +547,21 @@ define void @memset_optimize_size_lo_34_x86_misaligned_fail_generic_save_unit(pt
}
define void @memset_optimize_size_lo_34_to_32_no_align_okay(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_lo_34_to_32_no_align_okay(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 30, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_lo_34_to_32_no_align_okay(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM4-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 4
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 30, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_lo_34_to_32_no_align_okay(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 0
+; CHECK-MEM16-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P0]], i64 2
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 4
%p1 = getelementptr inbounds i8, ptr %p, i64 0
diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
index 9fc3f827e614db..4ad84f213c08d2 100644
--- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreEnd.ll
@@ -68,12 +68,19 @@ entry:
}
define void @write28to32(ptr nocapture %p) nounwind uwtable ssp {
-; CHECK-LABEL: @write28to32(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
-; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @write28to32(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM4-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @write28to32(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM16-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
+; CHECK-MEM16-NEXT: ret void
;
entry:
call void @llvm.memset.p0.i64(ptr align 4 %p, i8 0, i64 32, i1 false)
@@ -83,12 +90,19 @@ entry:
}
define void @write28to32_atomic(ptr nocapture %p) nounwind uwtable ssp {
-; CHECK-LABEL: @write28to32_atomic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i32 4)
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
-; CHECK-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @write28to32_atomic(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 28, i32 4)
+; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM4-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @write28to32_atomic(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 [[P:%.*]], i8 0, i64 32, i32 4)
+; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM16-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
+; CHECK-MEM16-NEXT: ret void
;
entry:
call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 4 %p, i8 0, i64 32, i32 4)
@@ -98,12 +112,19 @@ entry:
}
define void @dontwrite28to32memset(ptr nocapture %p) nounwind uwtable ssp {
-; CHECK-LABEL: @dontwrite28to32memset(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i1 false)
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
-; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @dontwrite28to32memset(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM4-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @dontwrite28to32memset(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM16-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
+; CHECK-MEM16-NEXT: ret void
;
entry:
call void @llvm.memset.p0.i64(ptr align 16 %p, i8 0, i64 32, i1 false)
@@ -113,12 +134,19 @@ entry:
}
define void @dontwrite28to32memset_atomic(ptr nocapture %p) nounwind uwtable ssp {
-; CHECK-LABEL: @dontwrite28to32memset_atomic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i32 4)
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
-; CHECK-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @dontwrite28to32memset_atomic(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 28, i32 4)
+; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM4-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @dontwrite28to32memset_atomic(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 [[P:%.*]], i8 0, i64 32, i32 4)
+; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 7
+; CHECK-MEM16-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
+; CHECK-MEM16-NEXT: ret void
;
entry:
call void @llvm.memset.element.unordered.atomic.p0.i64(ptr align 16 %p, i8 0, i64 32, i32 4)
@@ -204,12 +232,19 @@ entry:
}
define void @dontwrite28to32memcpy(ptr nocapture %p) nounwind uwtable ssp {
-; CHECK-LABEL: @dontwrite28to32memcpy(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i1 false)
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7
-; CHECK-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @dontwrite28to32memcpy(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7
+; CHECK-MEM4-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @dontwrite28to32memcpy(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7
+; CHECK-MEM16-NEXT: store i32 1, ptr [[ARRAYIDX1]], align 4
+; CHECK-MEM16-NEXT: ret void
;
entry:
tail call void @llvm.memcpy.p0.p0.i64(ptr align 16 %p, ptr align 16 @glob1, i64 32, i1 false)
@@ -219,12 +254,19 @@ entry:
}
define void @dontwrite28to32memcpy_atomic(ptr nocapture %p) nounwind uwtable ssp {
-; CHECK-LABEL: @dontwrite28to32memcpy_atomic(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i32 4)
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7
-; CHECK-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @dontwrite28to32memcpy_atomic(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 28, i32 4)
+; CHECK-MEM4-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7
+; CHECK-MEM4-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @dontwrite28to32memcpy_atomic(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 [[P:%.*]], ptr align 16 @glob1, i64 32, i32 4)
+; CHECK-MEM16-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_VEC2:%.*]], ptr [[P]], i64 0, i32 0, i64 7
+; CHECK-MEM16-NEXT: store atomic i32 1, ptr [[ARRAYIDX1]] unordered, align 4
+; CHECK-MEM16-NEXT: ret void
;
entry:
tail call void @llvm.memcpy.element.unordered.atomic.p0.p0.i64(ptr align 16 %p, ptr align 16 @glob1, i64 32, i32 4)
@@ -351,13 +393,21 @@ entry:
}
define void @ow_end_align1(ptr nocapture %p) {
-; CHECK-LABEL: @ow_end_align1(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 27, i1 false)
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
-; CHECK-NEXT: store i64 1, ptr [[P2]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @ow_end_align1(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
+; CHECK-MEM4-NEXT: store i64 1, ptr [[P2]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @ow_end_align1(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
+; CHECK-MEM16-NEXT: store i64 1, ptr [[P2]], align 1
+; CHECK-MEM16-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, ptr %p, i64 1
@@ -368,13 +418,21 @@ entry:
}
define void @ow_end_align4(ptr nocapture %p) {
-; CHECK-LABEL: @ow_end_align4(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
-; CHECK-NEXT: store i64 1, ptr [[P2]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @ow_end_align4(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
+; CHECK-MEM4-NEXT: store i64 1, ptr [[P2]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @ow_end_align4(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
+; CHECK-MEM16-NEXT: store i64 1, ptr [[P2]], align 1
+; CHECK-MEM16-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, ptr %p, i64 1
@@ -385,13 +443,21 @@ entry:
}
define void @ow_end_align8(ptr nocapture %p) {
-; CHECK-LABEL: @ow_end_align8(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P1]], i8 0, i64 32, i1 false)
-; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
-; CHECK-NEXT: store i64 1, ptr [[P2]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @ow_end_align8(
+; CHECK-MEM4-NEXT: entry:
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P1]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
+; CHECK-MEM4-NEXT: store i64 1, ptr [[P2]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @ow_end_align8(
+; CHECK-MEM16-NEXT: entry:
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 1
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P1]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 27
+; CHECK-MEM16-NEXT: store i64 1, ptr [[P2]], align 1
+; CHECK-MEM16-NEXT: ret void
;
entry:
%p1 = getelementptr inbounds i8, ptr %p, i64 1
@@ -405,7 +471,7 @@ define void @memset_optimize_size_hi_31_to_24(ptr %p) {
; CHECK-LABEL: @memset_optimize_size_hi_31_to_24(
; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 23
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 23, i1 false)
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 24, i1 false)
; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
; CHECK-NEXT: ret void
;
@@ -417,12 +483,19 @@ define void @memset_optimize_size_hi_31_to_24(ptr %p) {
}
define void @memset_optimize_size_hi_32_no_change_x86_change_generic(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_hi_32_no_change_x86_change_generic(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_hi_32_no_change_x86_change_generic(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_hi_32_no_change_x86_change_generic(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 28
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[P0]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 0
%p1 = getelementptr inbounds i8, ptr %p, i64 28
@@ -432,12 +505,19 @@ define void @memset_optimize_size_hi_32_no_change_x86_change_generic(ptr %p) {
}
define void @memset_optimize_size_hi_28_to_24(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_hi_28_to_24(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 21
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P0]], i8 0, i64 24, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_hi_28_to_24(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 21
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P0]], i8 0, i64 21, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_hi_28_to_24(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 21
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[P0]], i8 0, i64 24, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 0
%p1 = getelementptr inbounds i8, ptr %p, i64 21
@@ -462,12 +542,19 @@ define void @memset_optimize_size_hi_31_to_28(ptr %p) {
}
define void @memset_optimize_size_hi_33_to_x86_32_generic_28(ptr %p) {
-; CHECK-LABEL: @memset_optimize_size_hi_33_to_x86_32_generic_28(
-; CHECK-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
-; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P0]], i8 0, i64 28, i1 false)
-; CHECK-NEXT: store i64 0, ptr [[P1]], align 1
-; CHECK-NEXT: ret void
+; CHECK-MEM4-LABEL: @memset_optimize_size_hi_33_to_x86_32_generic_28(
+; CHECK-MEM4-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-MEM4-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27
+; CHECK-MEM4-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P0]], i8 0, i64 28, i1 false)
+; CHECK-MEM4-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM4-NEXT: ret void
+;
+; CHECK-MEM16-LABEL: @memset_optimize_size_hi_33_to_x86_32_generic_28(
+; CHECK-MEM16-NEXT: [[P0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 0
+; CHECK-MEM16-NEXT: [[P1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 27
+; CHECK-MEM16-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[P0]], i8 0, i64 32, i1 false)
+; CHECK-MEM16-NEXT: store i64 0, ptr [[P1]], align 1
+; CHECK-MEM16-NEXT: ret void
;
%p0 = getelementptr inbounds i8, ptr %p, i64 0
%p1 = getelementptr inbounds i8, ptr %p, i64 27
@@ -475,6 +562,3 @@ define void @memset_optimize_size_hi_33_to_x86_32_generic_28(ptr %p) {
store i64 0, ptr %p1, align 1
ret void
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-MEM16: {{.*}}
-; CHECK-MEM4: {{.*}}
diff --git a/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll b/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll
index c9a0943de8cd98..2d04179eeb6e0f 100644
--- a/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/stores-of-existing-values.ll
@@ -549,8 +549,8 @@ define void @test12_memset_later_store_exceeds_memset(ptr %ptr) {
define void @test12_memset_later_store_before_memset(ptr %ptr) {
; CHECK-LABEL: @test12_memset_later_store_before_memset(
; CHECK-NEXT: [[PTR_1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR_1]], i64 7
-; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 3, i1 false)
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PTR_1]], i64 6
+; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 1 [[TMP1]], i8 0, i64 4, i1 false)
; CHECK-NEXT: store i64 0, ptr [[PTR]], align 8
; CHECK-NEXT: ret void
;
More information about the llvm-commits
mailing list