[llvm] [LoadStoreVectorizer] Allow vectorization of partially overlapped vector-stores (PR #169946)

Tue Dec 9 10:54:10 PST 2025

================
@@ -0,0 +1,108 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=load-store-vectorizer -S -o - %s | FileCheck %s
+
+define void @onevec(ptr %ptr, <1 x i32> %sd0, i32 %sd1, i32 %sd2, <1 x i32> %sd3, <1 x i32> %sd4, <1 x i32> %sd5) {
+; CHECK-LABEL: define void @onevec(
+; CHECK-SAME: ptr [[PTR:%.*]], <1 x i32> [[SD0:%.*]], i32 [[SD1:%.*]], i32 [[SD2:%.*]], <1 x i32> [[SD3:%.*]], <1 x i32> [[SD4:%.*]], <1 x i32> [[SD5:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <1 x i32> [[SD0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <1 x i32> poison, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <1 x i32> [[TMP2]], i32 [[SD1]], i32 0
+; CHECK-NEXT:    store <1 x i32> [[TMP3]], ptr [[PTR]], align 4
+; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 16
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <1 x i32> poison, i32 [[SD2]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <1 x i32> [[SD3]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <1 x i32> [[TMP4]], i32 [[TMP5]], i32 0
+; CHECK-NEXT:    store <1 x i32> [[TMP6]], ptr [[GEP1]], align 4
+; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i32 32
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <1 x i32> [[SD4]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <1 x i32> poison, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <1 x i32> [[SD5]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <1 x i32> [[TMP8]], i32 [[TMP9]], i32 0
+; CHECK-NEXT:    store <1 x i32> [[TMP10]], ptr [[GEP2]], align 4
+; CHECK-NEXT:    ret void
+;
+  store <1 x i32> %sd0, ptr %ptr, align 4
+  store i32 %sd1, ptr %ptr, align 4
+
+  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 16
+  store i32 %sd2, ptr %gep1, align 4
+  store <1 x i32> %sd3, ptr %gep1, align 4
+
+  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 32
+  store <1 x i32> %sd4, ptr %gep2, align 4
+  store <1 x i32> %sd5, ptr %gep2, align 4
+  ret void
+}
+
+define void @test(ptr %ptr, i32 %sd0, <2 x i32> %sd1, <2 x i32> %sd2, i32 %sd3) {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[PTR:%.*]], i32 [[SD0:%.*]], <2 x i32> [[SD1:%.*]], <2 x i32> [[SD2:%.*]], i32 [[SD3:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[SD0]], i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i32> [[SD1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP2]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[SD1]], i32 1
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[SD2]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i32 2
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i32> [[SD2]], i32 1
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[SD3]], i32 2
+; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  store i32 %sd0, ptr %ptr, align 4
+  %gep1 = getelementptr inbounds i8, ptr %ptr, i32 4
+  store <2 x i32> %sd1, ptr %gep1, align 4
+  %gep2 = getelementptr inbounds i8, ptr %ptr, i32 8
+  store <2 x i32> %sd2, ptr %gep2, align 4
+  %gep3 = getelementptr inbounds i8, ptr %ptr, i32 8
+  store i32 %sd3, ptr %gep3, align 4
+  ret void
----------------
dakersnar wrote:

To support @cmc-rep's argument, I think this test is a decent illustration of why this is _not_ simply a DSE replacement. For DSE + the current LSV to achieve this vectorized output, DSE would have to reason about these overlaps and split the vectors up into scalars, leaving behind only the part of each store that is not dead. I'm not sure whether DSE is capable of that, but I could buy the argument that it might be too complex for that pass to reason about. I could also buy the argument that this constitutes an improvement to the capabilities of the vectorizer, as it is able to create a v4 store due to this change.

https://github.com/llvm/llvm-project/pull/169946