[llvm] [SROA] Allow rewriting memcpy depending on tbaa.struct (PR #77597)

Wed Jan 10 04:53:55 PST 2024

https://github.com/bjope created https://github.com/llvm/llvm-project/pull/77597

The bugfix in commit 54067c5fbe, related to
  https://github.com/llvm/llvm-project/issues/64081
limited the ability of SROA to handle non byte-sized types
when used in aggregates that are memcpy'd.

Main problem was that the LLVM types used in an alloca doesn't
always reflect if the stack slot can be used to for multiple
types (typically unions). So even if we for example have
  %p = alloca i6
that stack slot may be used for other types than i6. And it
would be legal to for example store an i8 to that stack slot.

Thus, if %p was dereferenced in a memcpy we needed to consider
that also padding bits (seen from the i6 perspective) could be
of importance.

The limitation added to SROA in commit https://github.com/llvm/llvm-project/commit/54067c5fbe9fc13ab195cdddb8f17e18d72b5fe4 resulted
in huge regressions for a downstream target. Since the frontend
typically emit memcpy for aggregate copy it seems quite normal
that one end up with a memcpy that is copying padding bits even
when there are no unions or type punning. So that limitation
seem unfortunate in general.

In this patch we try to lift the restrictions a bit. If the
memcpy is decorated with tbaa.struct metadata we look at that
metadata. If we find out if the slice used by our new alloca is
touching memory described by a single scalar type according to
the tbaa.struct, then we assume that the type derived for the
new alloca is correct for all accesses made to the stack slot.
And then we can allow replacing the memcpy by regular load/store
instructions operating on that type (disregarding any padding
bits).

>From 780d723b41a8bad5ff565919af631fb6ffbd60b2 Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Tue, 9 Jan 2024 16:24:06 +0100
Subject: [PATCH 1/2] [SROA] Pre-commit test cases involving aggregates with
 non-byte sized types

The bugfix in commit 54067c5fbe9f, related to
  https://github.com/llvm/llvm-project/issues/64081
limited the ability of SROA to handle non byte-sized types
when used in aggregates that are memcpy'd.

Adding some test cases involving tbaa.struct annotations
that might allow us to eliminate more memcpy/alloca.
---
 llvm/test/Transforms/SROA/pr64081.ll | 80 ++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)

diff --git a/llvm/test/Transforms/SROA/pr64081.ll b/llvm/test/Transforms/SROA/pr64081.ll
index 4b893842138263..b10144e1949eee 100644
--- a/llvm/test/Transforms/SROA/pr64081.ll
+++ b/llvm/test/Transforms/SROA/pr64081.ll
@@ -30,3 +30,83 @@ bb:
 declare void @use(ptr)
 
 declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg)
+
+
+; No unions or overlaps in the tbaa.struct. So we can rely on the types
+define void @test2(i3 %x) {
+; CHECK-LABEL: define void @test2(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[RES:%.*]] = alloca [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP_SROA_0:%.*]] = alloca i1, align 8
+; CHECK-NEXT:    [[TMP_SROA_2:%.*]] = alloca i3, align 1
+; CHECK-NEXT:    store i1 true, ptr [[TMP_SROA_0]], align 8
+; CHECK-NEXT:    store i3 [[X]], ptr [[TMP_SROA_2]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RES]], ptr align 8 [[TMP_SROA_0]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT0:![0-9]+]]
+; CHECK-NEXT:    [[TMP_SROA_2_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP_SROA_2_0_RES_SROA_IDX]], ptr align 1 [[TMP_SROA_2]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @use(ptr [[RES]])
+; CHECK-NEXT:    ret void
+;
+bb:
+  %res = alloca %B
+  %tmp = alloca %B
+  %tmp.1 = getelementptr i8, ptr %tmp, i64 1
+  store i1 1, ptr %tmp
+  store i3 %x, ptr %tmp.1
+  call void @llvm.memcpy.p0.p0.i64(ptr %res, ptr %tmp, i64 2, i1 false), !tbaa.struct !6
+  call i8 @use(ptr %res)
+  ret void
+}
+
+; Union preventing SROA from removing the memcpy for the first byte.
+define void @test3(i3 %x) {
+; CHECK-LABEL: define void @test3(
+; CHECK-SAME: i3 [[X:%.*]]) {
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[RES:%.*]] = alloca [[B:%.*]], align 8
+; CHECK-NEXT:    [[TMP_SROA_0:%.*]] = alloca i1, align 8
+; CHECK-NEXT:    [[TMP_SROA_2:%.*]] = alloca i3, align 1
+; CHECK-NEXT:    store i1 true, ptr [[TMP_SROA_0]], align 8
+; CHECK-NEXT:    store i3 [[X]], ptr [[TMP_SROA_2]], align 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RES]], ptr align 8 [[TMP_SROA_0]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
+; CHECK-NEXT:    [[TMP_SROA_2_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP_SROA_2_0_RES_SROA_IDX]], ptr align 1 [[TMP_SROA_2]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT7]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @use(ptr [[RES]])
+; CHECK-NEXT:    ret void
+;
+bb:
+  %res = alloca %B
+  %tmp = alloca %B
+  %tmp.1 = getelementptr i8, ptr %tmp, i64 1
+  store i1 1, ptr %tmp
+  store i3 %x, ptr %tmp.1
+  call void @llvm.memcpy.p0.p0.i64(ptr %res, ptr %tmp, i64 2, i1 false), !tbaa.struct !9
+  call i8 @use(ptr %res)
+  ret void
+}
+
+!1 = !{!"_BitInt(7)", !4, i64 0}
+!2 = !{!"_BitInt(1)", !4, i64 0}
+!3 = !{!"_BitInt(3)", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{i64 0, i64 1, !7, i64 1, i64 1, !8}
+!7 = !{!2, !2, i64 0}
+!8 = !{!3, !3, i64 0}
+!9 = !{i64 0, i64 1, !10, i64 0, i64 1, !7, i64 1, i64 1, !8}
+!10 = !{!1, !1, i64 0}
+
+;.
+; CHECK: [[TBAA_STRUCT0]] = !{i64 0, i64 1, [[META1:![0-9]+]], i64 1, i64 1, [[META5:![0-9]+]]}
+; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META2]], i64 0}
+; CHECK: [[META2]] = !{!"_BitInt(1)", [[META3:![0-9]+]], i64 0}
+; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0}
+; CHECK: [[META4]] = !{!"Simple C++ TBAA"}
+; CHECK: [[META5]] = !{[[META6:![0-9]+]], [[META6]], i64 0}
+; CHECK: [[META6]] = !{!"_BitInt(3)", [[META3]], i64 0}
+; CHECK: [[TBAA_STRUCT7]] = !{i64 0, i64 1, [[META5]]}
+; CHECK: [[TBAA_STRUCT8]] = !{i64 0, i64 1, [[META9:![0-9]+]], i64 0, i64 1, [[META1]], i64 1, i64 1, [[META5]]}
+; CHECK: [[META9]] = !{[[META10:![0-9]+]], [[META10]], i64 0}
+; CHECK: [[META10]] = !{!"_BitInt(7)", [[META3]], i64 0}
+;.

>From db74d290b229efb64698996e8647e1cca264d84f Mon Sep 17 00:00:00 2001
From: Bjorn Pettersson <bjorn.a.pettersson at ericsson.com>
Date: Wed, 10 Jan 2024 13:46:24 +0100
Subject: [PATCH 2/2] [SROA] Allow rewriting memcpy of non-byte-sized type
 depending on tbaa.struct

The bugfix in commit 54067c5fbe9f, related to
  https://github.com/llvm/llvm-project/issues/64081
limited the ability of SROA to handle non byte-sized types
when used in aggregates that are memcpy'd.

Main problem was that the LLVM types used in an alloca doesn't
always reflect if the stack slot can be used to for multiple
types (typically unions). So even if we for example have
  %p = alloca i6
that stack slot may be used for other types than i6. And it
would be legal to for example store an i8 to that stack slot.

Thus, if %p was dereferenced in a memcpy we needed to consider
that also padding bits (seen from the i6 perspective) could be
of importance.

The limitation added to SROA in commit 54067c5fbe9f resulted
in huge regressions for a downstream target. Since the frontend
typically emit memcpy for aggregate copy it seems quite normal
that one end up with a memcpy that is copying padding bits even
when there are no unions or type punning. So that limitation
seem unfortunate in general.

In this patch we try to lift the restrictions a bit. If the
memcpy is decorated with tbaa.struct metadata we look at that
metadata. If we find out if the slice used by our new alloca is
touching memory described by a single scalar type according to
the tbaa.struct, then we assume that the type derived for the
new alloca is correct for all accesses made to the stack slot.
And then we can allow replacing the memcpy by regular load/store
instructions operating on that type (disregarding any padding
bits).
---
 llvm/lib/Transforms/Scalar/SROA.cpp  | 38 ++++++++++++++++++++++++++--
 llvm/test/Transforms/SROA/pr64081.ll | 12 +++------
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 75cddfa16d6db5..4439765f68db0b 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -3276,6 +3276,39 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
     // memmove with memcpy, and we don't need to worry about all manner of
     // downsides to splitting and transforming the operations.
 
+    // The tbaa.struct is only being explicit about byte padding. Here we assume
+    // that if the derived type used for the NewAI maps to a single scalar type,
+    // as given by the tbaa.struct, then it is safe to assume that we can use
+    // that type when doing the copying even if it include bit padding. If there
+    // for example would be a union of "_BitInt(3)" and "char" types the
+    // tbaa.struct would have multiple entries indicating the different types
+    // (or there wouldn't be any tbaa.struct)..
+    auto IsSingleTypeAccordingToTBAA = [&]() -> bool {
+      // Only consider the case when we have a tbaa.struct.
+      if (!(AATags && AATags.TBAAStruct))
+        return false;
+      MDNode *MD = AATags.TBAAStruct;
+      uint64_t Offset = NewBeginOffset - BeginOffset;
+      unsigned Count = 0;
+      for (size_t i = 0, size = MD->getNumOperands(); i < size; i += 3) {
+        uint64_t InnerOffset =
+          mdconst::extract<ConstantInt>(MD->getOperand(i))->getZExtValue();
+        uint64_t InnerSize =
+          mdconst::extract<ConstantInt>(MD->getOperand(i + 1))->getZExtValue();
+        // Ignore entries that aren't overlapping with our slice.
+        if (InnerOffset + InnerSize <= Offset ||
+            InnerOffset >= Offset + SliceSize)
+          continue;
+        // Only allow a single match (no unions).
+        if (++Count > 1)
+          return false;
+        // Size/offset must match up.
+        if (InnerSize != SliceSize || Offset != InnerOffset)
+          return false;
+      }
+      return Count == 1;
+    };
+
     // If this doesn't map cleanly onto the alloca type, and that type isn't
     // a single value type, just emit a memcpy.
     bool EmitMemCpy =
@@ -3283,8 +3316,9 @@ class AllocaSliceRewriter : public InstVisitor<AllocaSliceRewriter, bool> {
         (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
          SliceSize !=
              DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedValue() ||
-         !DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) ||
-         !NewAI.getAllocatedType()->isSingleValueType());
+         !NewAI.getAllocatedType()->isSingleValueType() ||
+         (!DL.typeSizeEqualsStoreSize(NewAI.getAllocatedType()) &&
+          !IsSingleTypeAccordingToTBAA()));
 
     // If we're just going to emit a memcpy, the alloca hasn't changed, and the
     // size hasn't been shrunk based on analysis of the viable range, this is
diff --git a/llvm/test/Transforms/SROA/pr64081.ll b/llvm/test/Transforms/SROA/pr64081.ll
index b10144e1949eee..ba83e495f56c27 100644
--- a/llvm/test/Transforms/SROA/pr64081.ll
+++ b/llvm/test/Transforms/SROA/pr64081.ll
@@ -38,13 +38,9 @@ define void @test2(i3 %x) {
 ; CHECK-SAME: i3 [[X:%.*]]) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[RES:%.*]] = alloca [[B:%.*]], align 8
-; CHECK-NEXT:    [[TMP_SROA_0:%.*]] = alloca i1, align 8
-; CHECK-NEXT:    [[TMP_SROA_2:%.*]] = alloca i3, align 1
-; CHECK-NEXT:    store i1 true, ptr [[TMP_SROA_0]], align 8
-; CHECK-NEXT:    store i3 [[X]], ptr [[TMP_SROA_2]], align 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RES]], ptr align 8 [[TMP_SROA_0]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT0:![0-9]+]]
+; CHECK-NEXT:    store i1 true, ptr [[RES]], align 1, !tbaa.struct [[TBAA_STRUCT0:![0-9]+]]
 ; CHECK-NEXT:    [[TMP_SROA_2_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP_SROA_2_0_RES_SROA_IDX]], ptr align 1 [[TMP_SROA_2]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]]
+; CHECK-NEXT:    store i3 [[X]], ptr [[TMP_SROA_2_0_RES_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT7:![0-9]+]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @use(ptr [[RES]])
 ; CHECK-NEXT:    ret void
 ;
@@ -66,12 +62,10 @@ define void @test3(i3 %x) {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    [[RES:%.*]] = alloca [[B:%.*]], align 8
 ; CHECK-NEXT:    [[TMP_SROA_0:%.*]] = alloca i1, align 8
-; CHECK-NEXT:    [[TMP_SROA_2:%.*]] = alloca i3, align 1
 ; CHECK-NEXT:    store i1 true, ptr [[TMP_SROA_0]], align 8
-; CHECK-NEXT:    store i3 [[X]], ptr [[TMP_SROA_2]], align 1
 ; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[RES]], ptr align 8 [[TMP_SROA_0]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT8:![0-9]+]]
 ; CHECK-NEXT:    [[TMP_SROA_2_0_RES_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[RES]], i64 1
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[TMP_SROA_2_0_RES_SROA_IDX]], ptr align 1 [[TMP_SROA_2]], i64 1, i1 false), !tbaa.struct [[TBAA_STRUCT7]]
+; CHECK-NEXT:    store i3 [[X]], ptr [[TMP_SROA_2_0_RES_SROA_IDX]], align 1, !tbaa.struct [[TBAA_STRUCT7]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i8 @use(ptr [[RES]])
 ; CHECK-NEXT:    ret void
 ;