[llvm] [CGP][AArch64] Do not sink instructions that might read/write memory. (PR #176182)

Thu Jan 15 07:26:36 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-llvm-transforms

Author: David Green (davemgreen)

<details>
<summary>Changes</summary>

The test case's call instruction was being sunk past the point where the memory
it accessed was valid. Add a check so that CGP does not try to sink instructions
that might read or write memory, as such instructions may be invalid to move.

Fixes #176095

---
Full diff: https://github.com/llvm/llvm-project/pull/176182.diff


2 Files Affected:

- (modified) llvm/lib/CodeGen/CodeGenPrepare.cpp (+1-1) 
- (modified) llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll (+54-6) 


``````````diff

diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 28662665e86fa..bc6765d7e4e0e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -7943,7 +7943,7 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
 
   for (Use *U : reverse(OpsToSink)) {
     auto *UI = cast<Instruction>(U->get());
-    if (isa<PHINode>(UI))
+    if (isa<PHINode>(UI) || UI->mayReadOrWriteMemory())
       continue;
     if (UI->getParent() == TargetBB) {
       if (InstOrdering[UI] < InstOrdering[InsertPoint])
diff --git a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
index 0ccfd9c20c12e..643755ce80ffc 100644
--- a/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
+++ b/llvm/test/Transforms/CodeGenPrepare/AArch64/sink-free-instructions.ll
@@ -142,12 +142,12 @@ entry:
 
 if.then:
   %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2)
   ret <8 x i16> %vmull0
 
 if.else:
   %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4)
   ret <8 x i16> %vmull1
 }
 
@@ -174,12 +174,12 @@ entry:
 
 if.then:
   %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s1, <8 x i8> %s2)
   ret <8 x i16> %vmull0
 
 if.else:
   %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %s3, <8 x i8> %s4)
   ret <8 x i16> %vmull1
 }
 
@@ -294,12 +294,12 @@ entry:
 
 if.then:
   %s2 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2) #3
+  %vmull0 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s1, <8 x i8> %s2)
   ret <8 x i16> %vmull0
 
 if.else:
   %s4 = shufflevector <16 x i8> %b, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 10, i32 12, i32 13, i32 14, i32 15>
-  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4) #3
+  %vmull1 = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %s3, <8 x i8> %s4)
   ret <8 x i16> %vmull1
 }
 
@@ -1003,3 +1003,51 @@ entry:
   %2 = sub <vscale x 8 x i16> %0, %1
   ret <vscale x 8 x i16> %2
 }
+
+declare range(i64 0, 65536) i64 @backsmith_pure_3(ptr dead_on_return readonly captures(none) %0, <8 x i8> %BS_ARG_1, i32 %BS_ARG_2)
+define i32 @dont_sink_calls(ptr %func_1_a) {
+; CHECK-LABEL: @dont_sink_calls(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[BYVAL_TEMP:%.*]] = alloca <16 x i16>, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(ptr nonnull [[BYVAL_TEMP]])
+; CHECK-NEXT:    store <16 x i16> zeroinitializer, ptr [[BYVAL_TEMP]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @backsmith_pure_3(ptr dead_on_return nonnull [[BYVAL_TEMP]], <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 10, i8 0, i8 0>, i32 0)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(ptr nonnull [[BYVAL_TEMP]])
+; CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[FUNC_1_A:%.*]], align 8
+; CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i64 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[CLEANUP:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[VQADDQ_V_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> <i8 3, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[VQADDQ_V_I]], <16 x i8> poison, <16 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[VECINIT21:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x i64> poison, i64 [[TMP2]], i64 0
+; CHECK-NEXT:    [[VECINIT38:%.*]] = shufflevector <16 x i64> [[TMP3]], <16 x i64> poison, <16 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[MUL:%.*]] = mul nuw nsw <16 x i64> [[VECINIT38]], [[VECINIT21]]
+; CHECK-NEXT:    store <16 x i64> [[MUL]], ptr [[FUNC_1_A]], align 128
+; CHECK-NEXT:    br label [[CLEANUP]]
+; CHECK:       cleanup:
+; CHECK-NEXT:    ret i32 0
+;
+entry:
+  %byval-temp = alloca <16 x i16>, align 16
+  call void @llvm.lifetime.start.p0(ptr nonnull %byval-temp)
+  store <16 x i16> zeroinitializer, ptr %byval-temp, align 16
+  %call4 = call i64 @backsmith_pure_3(ptr dead_on_return nonnull %byval-temp, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 10, i8 0, i8 0>, i32 0)
+  call void @llvm.lifetime.end.p0(ptr nonnull %byval-temp)
+  %0 = load i64, ptr %func_1_a, align 8
+  %tobool.not = icmp eq i64 %0, 0
+  br i1 %tobool.not, label %if.end, label %cleanup
+
+if.end:                                           ; preds = %entry
+  %vqaddq_v.i = tail call <16 x i8> @llvm.aarch64.neon.uqadd.v16i8(<16 x i8> <i8 3, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> zeroinitializer)
+  %1 = shufflevector <16 x i8> %vqaddq_v.i, <16 x i8> poison, <16 x i32> <i32 0, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %vecinit21 = zext <16 x i8> %1 to <16 x i64>
+  %2 = insertelement <16 x i64> poison, i64 %call4, i64 0
+  %vecinit38 = shufflevector <16 x i64> %2, <16 x i64> poison, <16 x i32> <i32 0, i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %mul = mul nuw nsw <16 x i64> %vecinit38, %vecinit21
+  store <16 x i64> %mul, ptr %func_1_a
+  br label %cleanup
+
+cleanup:                                          ; preds = %entry, %if.end
+  ret i32 0
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/176182