[llvm] 8b41551 - [AArch64] Add a slp vectorization test for extract and shuffle costs. NFC

Tue May 6 10:09:33 PDT 2025

Author: David Green
Date: 2025-05-06T18:09:29+01:00
New Revision: 8b41551651b66939255e0bcd13e829bb6944e0d6

URL: https://github.com/llvm/llvm-project/commit/8b41551651b66939255e0bcd13e829bb6944e0d6
DIFF: https://github.com/llvm/llvm-project/commit/8b41551651b66939255e0bcd13e829bb6944e0d6.diff

LOG: [AArch64] Add a slp vectorization test for extract and shuffle costs. NFC

Added: 
    

Modified: 
    llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll

Removed: 
    


################################################################################
diff  --git a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
index f9953df6c1735..667fc41c069e1 100644

--- a/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/extractelements-to-shuffle.ll
@@ -635,6 +635,56 @@ while.end71:                                      ; preds = %while.body38, %whil
   ret void
 }
 
+; FIXME: This should not be vectorizing (further) with expensive shuffles.
+; The old cost of the or+extract should be 2*1 (or) + 4*2 (extract). The new
+; cost should be 1*1 (or) + 2*2 (extract) + at least 4 (shuffles).
+define i1 @tryMapToRange(ptr %values, ptr %result, <2 x i64> %hi, <2 x i64> %lo) {
+; CHECK-LABEL: @tryMapToRange(
+; CHECK-NEXT:    [[L:%.*]] = load <2 x i64>, ptr [[VALUES:%.*]], align 8
+; CHECK-NEXT:    [[C1:%.*]] = icmp sgt <2 x i64> [[L]], [[HI:%.*]]
+; CHECK-NEXT:    [[S1:%.*]] = sext <2 x i1> [[C1]] to <2 x i64>
+; CHECK-NEXT:    [[BC1:%.*]] = bitcast <2 x i64> [[S1]] to <16 x i8>
+; CHECK-NEXT:    [[A1:%.*]] = and <16 x i8> [[BC1]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[C2:%.*]] = icmp slt <2 x i64> [[L]], [[LO:%.*]]
+; CHECK-NEXT:    [[S2:%.*]] = sext <2 x i1> [[C2]] to <2 x i64>
+; CHECK-NEXT:    [[BC2:%.*]] = bitcast <2 x i64> [[S2]] to <16 x i8>
+; CHECK-NEXT:    [[A2:%.*]] = and <16 x i8> [[BC2]], <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+; CHECK-NEXT:    [[REASS_SUB:%.*]] = sub <2 x i64> [[L]], [[LO]]
+; CHECK-NEXT:    [[ADD_I_I_I_I_I_I:%.*]] = add <2 x i64> [[REASS_SUB]], splat (i64 1)
+; CHECK-NEXT:    store <2 x i64> [[ADD_I_I_I_I_I_I]], ptr [[RESULT:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 8, i32 24>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[A1]], <16 x i8> [[A2]], <2 x i32> <i32 0, i32 16>
+; CHECK-NEXT:    [[TMP3:%.*]] = or <2 x i8> [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i8> [[TMP3]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i8> [[TMP3]], i32 1
+; CHECK-NEXT:    [[O3:%.*]] = or i8 [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 [[O3]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %l = load <2 x i64>, ptr %values, align 8
+  %c1 = icmp sgt <2 x i64> %l, %hi
+  %s1 = sext <2 x i1> %c1 to <2 x i64>
+  %bc1 = bitcast <2 x i64> %s1 to <16 x i8>
+  %a1 = and <16 x i8> %bc1, <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+  %e1 = extractelement <16 x i8> %a1, i64 0
+  %e2 = extractelement <16 x i8> %a1, i64 8
+  %c2 = icmp slt <2 x i64> %l, %lo
+  %s2 = sext <2 x i1> %c2 to <2 x i64>
+  %bc2 = bitcast <2 x i64> %s2 to <16 x i8>
+  %a2 = and <16 x i8> %bc2, <i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 1, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison, i8 poison>
+  %e3 = extractelement <16 x i8> %a2, i64 0
+  %e4 = extractelement <16 x i8> %a2, i64 8
+  %reass.sub = sub <2 x i64> %l, %lo
+  %add.i.i.i.i.i.i = add <2 x i64> %reass.sub, splat (i64 1)
+  store <2 x i64> %add.i.i.i.i.i.i, ptr %result, align 8
+  %o1 = or i8 %e2, %e1
+  %o2 = or i8 %e4, %e3
+  %o3 = or i8 %o1, %o2
+  %c = icmp eq i8 %o3, 0
+  ret i1 %c
+}
+
+
 declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>) #1
 declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) #2
 declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) #2