[llvm] fde7464 - [LV] Add tests for incorrect narrowing of IGs with scalable vectors.
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 12 10:00:40 PDT 2026
Author: Florian Hahn
Date: 2026-03-12T17:00:24Z
New Revision: fde746470b73dca357dace2c77634fa7c1fec903
URL: https://github.com/llvm/llvm-project/commit/fde746470b73dca357dace2c77634fa7c1fec903
DIFF: https://github.com/llvm/llvm-project/commit/fde746470b73dca357dace2c77634fa7c1fec903.diff
LOG: [LV] Add tests for incorrect narrowing of IGs with scalable vectors.
Add test coverage for the mis-compile in https://github.com/llvm/llvm-project/issues/185860.
Currently, with scalable VFs, a wide load gets incorrectly narrowed to a
single-scalar load, which loads only one element instead of the vscale
elements that are needed.
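
For context, a rough IR sketch of the mismatch (illustrative only, not part
of the commit; value names are made up). After narrowing, the vector loop
steps by vscale, so the shared scales load must supply vscale distinct
values per iteration, e.g. something of the shape:

  %scale.ptr = getelementptr inbounds double, ptr %scales, i64 %index
  %scales.vec = load <vscale x 1 x double>, ptr %scale.ptr, align 8

but the incorrect narrowing instead emits a single scalar load that is
splatted across all lanes, so every lane sees the same scale value:

  %scale = load double, ptr %scale.ptr, align 8
  %ins = insertelement <vscale x 2 x double> poison, double %scale, i64 0
  %splat = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer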
Added:
Modified:
llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
Removed:
################################################################################
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
index f4e1dd836da73..baa1f7ed85c1d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll
@@ -408,6 +408,134 @@ exit:
ret void
}
+; Shared wide load (scales) feeds all members of the store interleave group
+; through fmul.
+; FIXME: With scalable VFs, we narrow the shared load to a uniform scalar,
+; which is incorrect because the loop step is vscale and we need vscale
+; distinct values.
+; Test case for https://github.com/llvm/llvm-project/issues/185860.
+define void @shared_wide_load_not_narrowed(ptr noalias %src, ptr noalias %scales, ptr noalias %dst, i64 %n) {
+; CHECK-LABEL: define void @shared_wide_load_not_narrowed(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SCALES:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP2]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [16 x i8], ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x double>, ptr [[TMP3]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [8 x i8], ptr [[SCALES]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[TMP4]], align 8
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x double> poison, double [[TMP5]], i64 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x double> [[BROADCAST_SPLATINSERT]], <vscale x 2 x double> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP6:%.*]] = fmul <vscale x 2 x double> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [16 x i8], ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 2 x double> [[TMP6]], ptr [[TMP7]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP8]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %struct.ptr = getelementptr inbounds [16 x i8], ptr %src, i64 %iv
+ %x = load double, ptr %struct.ptr, align 8
+ %y.ptr = getelementptr inbounds nuw i8, ptr %struct.ptr, i64 8
+ %y = load double, ptr %y.ptr, align 8
+ %scale.ptr = getelementptr inbounds [8 x i8], ptr %scales, i64 %iv
+ %scale = load double, ptr %scale.ptr, align 8
+ %ax = fmul double %x, %scale
+ %ay = fmul double %y, %scale
+ %out.ptr = getelementptr inbounds [16 x i8], ptr %dst, i64 %iv
+ store double %ax, ptr %out.ptr, align 8
+ %out.y.ptr = getelementptr inbounds nuw i8, ptr %out.ptr, i64 8
+ store double %ay, ptr %out.y.ptr, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Same as shared_wide_load_not_narrowed, but with an fpext cast between the
+; shared wide load and the fmul.
+define void @shared_wide_load_with_cast_not_narrowed(ptr noalias %src, ptr noalias %scales, ptr noalias %dst, i64 %n) {
+; CHECK-LABEL: define void @shared_wide_load_with_cast_not_narrowed(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[SCALES:%.*]], ptr noalias [[DST:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[ENTRY:.*:]]
+; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP4]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP8]], 2
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP7]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [16 x i8], ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x double>, ptr [[TMP0]], align 8
+; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[WIDE_VEC]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [4 x i8], ptr [[SCALES]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[TMP1]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = fpext <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x double>
+; CHECK-NEXT: [[TMP13:%.*]] = fmul <vscale x 4 x double> [[TMP11]], [[TMP12]]
+; CHECK-NEXT: [[TMP10:%.*]] = fmul <vscale x 4 x double> [[TMP6]], [[TMP12]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [16 x i8], ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x double> @llvm.vector.interleave2.nxv8f64(<vscale x 4 x double> [[TMP13]], <vscale x 4 x double> [[TMP10]])
+; CHECK-NEXT: store <vscale x 8 x double> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
+; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP9]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %struct.ptr = getelementptr inbounds [16 x i8], ptr %src, i64 %iv
+ %x = load double, ptr %struct.ptr, align 8
+ %y.ptr = getelementptr inbounds nuw i8, ptr %struct.ptr, i64 8
+ %y = load double, ptr %y.ptr, align 8
+ %scale.ptr = getelementptr inbounds [4 x i8], ptr %scales, i64 %iv
+ %scale = load float, ptr %scale.ptr, align 4
+ %scale.ext = fpext float %scale to double
+ %ax = fmul double %x, %scale.ext
+ %ay = fmul double %y, %scale.ext
+ %out.ptr = getelementptr inbounds [16 x i8], ptr %dst, i64 %iv
+ store double %ax, ptr %out.ptr, align 8
+ %out.y.ptr = getelementptr inbounds nuw i8, ptr %out.ptr, i64 8
+ store double %ay, ptr %out.y.ptr, align 8
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
+
; Test case for https://github.com/llvm/llvm-project/issues/183345.
define void @interleave_group_with_gather(ptr %indices, ptr %src, i64 %n) {
; CHECK-LABEL: define void @interleave_group_with_gather(