[llvm] [InstCombine] Transform `vector.reduce.add` and `splat` into multiplication (PR #161020)
Nikita Popov via llvm-commits
llvm-commits at lists.llvm.org
Sun Sep 28 07:42:24 PDT 2025
================
@@ -308,3 +308,148 @@ define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
%r = sub i32 %r0, %r1
ret i32 %r
}
+
+define i32 @constant_multiplied_at_0(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_at_0(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+ ret i32 %4
+}
+
+define i64 @constant_multiplied_at_0_64bits(i64 %0) {
+; CHECK-LABEL: @constant_multiplied_at_0_64bits(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP0:%.*]], 2
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %2 = insertelement <4 x i64> poison, i64 %0, i64 0
+ %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <4 x i32> zeroinitializer
+ %4 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %3)
+ ret i64 %4
+}
+
+define i32 @constant_multiplied_at_0_two_pow8(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_at_0_two_pow8(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 3
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <8 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %3)
+ ret i32 %4
+}
+
+
+define i32 @constant_multiplied_at_0_two_pow16(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_at_0_two_pow16(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 4
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <16 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %3)
+ ret i32 %4
+}
+
+
+define i32 @constant_multiplied_at_1(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_at_1(
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[TMP0:%.*]], 2
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 1
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison,
+ <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+ %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @negative_constant_multiplied_at_1(i32 %0) {
+; CHECK-LABEL: @negative_constant_multiplied_at_1(
+; CHECK-NEXT: ret i32 poison
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 1
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %3)
+ ret i32 %4
+}
+
+define i32 @constant_multiplied_non_power_of_2(i32 %0) {
+; CHECK-LABEL: @constant_multiplied_non_power_of_2(
+; CHECK-NEXT: [[TMP2:%.*]] = mul i32 [[TMP0:%.*]], 6
+; CHECK-NEXT: ret i32 [[TMP2]]
+;
+ %2 = insertelement <4 x i32> poison, i32 %0, i64 0
+ %3 = shufflevector <4 x i32> %2, <4 x i32> poison, <6 x i32> zeroinitializer
+ %4 = tail call i32 @llvm.vector.reduce.add.v6i32(<6 x i32> %3)
+ ret i32 %4
+}
+
+define i64 @constant_multiplied_non_power_of_2_i64(i64 %0) {
+; CHECK-LABEL: @constant_multiplied_non_power_of_2_i64(
+; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP0:%.*]], 6
+; CHECK-NEXT: ret i64 [[TMP2]]
+;
+ %2 = insertelement <4 x i64> poison, i64 %0, i64 0
+ %3 = shufflevector <4 x i64> %2, <4 x i64> poison, <6 x i32> zeroinitializer
+ %4 = tail call i64 @llvm.vector.reduce.add.v6i64(<6 x i64> %3)
+ ret i64 %4
+}
+
+define i1 @constant_multiplied_non_power_of_2_i1(i1 %0) {
+; CHECK-LABEL: @constant_multiplied_non_power_of_2_i1(
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i1> poison, i1 [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP6]], <8 x i1> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <8 x i1> [[TMP3]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP4]])
+; CHECK-NEXT: [[TMP2:%.*]] = trunc i8 [[TMP5]] to i1
+; CHECK-NEXT: ret i1 [[TMP2]]
+;
+ %2 = insertelement <8 x i1> poison, i1 %0, i32 0
+ %3 = shufflevector <8 x i1> %2, <8 x i1> poison, <8 x i32> zeroinitializer
+ %4 = tail call i1 @llvm.vector.reduce.add.v8i1(<8 x i1> %3)
+ ret i1 %4
+}
+
+define i1 @constant_multiplied_non_power_of_2_i1x4(i1 %0) {
+; CHECK-LABEL: @constant_multiplied_non_power_of_2_i1x4(
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i1> poison, i1 [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i1> [[TMP2]], <4 x i1> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <4 x i1> [[TMP3]] to i4
+; CHECK-NEXT: [[TMP5:%.*]] = call range(i4 0, 5) i4 @llvm.ctpop.i4(i4 [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i4 [[TMP5]] to i1
+; CHECK-NEXT: ret i1 [[TMP6]]
+;
+ %2 = insertelement <4 x i1> poison, i1 %0, i32 0
+ %3 = shufflevector <4 x i1> %2, <4 x i1> poison, <4 x i32> zeroinitializer
+ %4 = tail call i1 @llvm.vector.reduce.add.v4i1(<4 x i1> %3)
+ ret i1 %4
+}
+
+define i1 @constant_multiplied_non_power_of_2_i1x2(i1 %0) {
+; CHECK-LABEL: @constant_multiplied_non_power_of_2_i1x2(
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i1> poison, i1 [[TMP0:%.*]], i64 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <2 x i1> [[TMP3]] to i2
+; CHECK-NEXT: [[TMP5:%.*]] = call range(i2 0, -1) i2 @llvm.ctpop.i2(i2 [[TMP4]])
+; CHECK-NEXT: [[TMP6:%.*]] = trunc i2 [[TMP5]] to i1
----------------
nikic wrote:
No need for so many i1 tests that don't hit this code path anyway. I'd suggest adding additional i2 tests instead, which make it a bit clearer what is going on (e.g. v5i2 and v6i2).
https://github.com/llvm/llvm-project/pull/161020