[llvm] [InstCombine][NFC] Precommit a test for folding a binary op of reductions. (PR #121568)

Wed Jan 8 14:29:10 PST 2025

https://github.com/mgudim updated https://github.com/llvm/llvm-project/pull/121568

>From f161113052e758e7835222bf89f62cbd757a30cb Mon Sep 17 00:00:00 2001
From: Mikhail Gudim <mgudim at ventanamicro.com>
Date: Fri, 3 Jan 2025 05:02:13 -0800
Subject: [PATCH] [InstCombine] Precommit a test for folding a binary op of
 reductions.

---
 .../InstCombine/fold-binop-of-reductions.ll   | 166 ++++++++++++++++++
 1 file changed, 166 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll

diff --git a/llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll b/llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll
new file mode 100644
index 00000000000000..77ddf5ccdab417
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/fold-binop-of-reductions.ll
@@ -0,0 +1,166 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i32 @add_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @add_of_reduce_add(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @sub_of_reduce_add(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @sub_of_reduce_add(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = sub i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @mul_of_reduce_mul(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @mul_of_reduce_mul(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v1)
+  %res = mul i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @and_of_reduce_and(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @and_of_reduce_and(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = and <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v1)
+  %res = and i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @or_of_reduce_or(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @or_of_reduce_or(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v1)
+  %res = or i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @xor_of_reduce_xor(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @xor_of_reduce_xor(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v1)
+  %res = xor i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @reduction_does_not_match_binop(<16 x i32> %v0, <16 x i32> %v1) {
+; CHECK-LABEL: define i32 @reduction_does_not_match_binop(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @element_counts_do_not_match(<16 x i32> %v0, <8 x i32> %v1) {
+; CHECK-LABEL: define i32 @element_counts_do_not_match(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @multiple_use_of_reduction_0(<16 x i32> %v0, <16 x i32> %v1, ptr %p) {
+; CHECK-LABEL: define i32 @multiple_use_of_reduction_0(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    store i32 [[V0_RED]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  store i32 %v0_red, ptr %p
+  ret i32 %res
+}
+
+define i32 @multiple_use_of_reduction_1(<16 x i32> %v0, <16 x i32> %v1, ptr %p) {
+; CHECK-LABEL: define i32 @multiple_use_of_reduction_1(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V0_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V0]])
+; CHECK-NEXT:    [[V1_RED:%.*]] = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[V1]])
+; CHECK-NEXT:    [[RES:%.*]] = add i32 [[V0_RED]], [[V1_RED]]
+; CHECK-NEXT:    store i32 [[V1_RED]], ptr [[P]], align 4
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add i32 %v0_red, %v1_red
+  store i32 %v1_red, ptr %p
+  ret i32 %res
+}
+
+define i32 @do_not_preserve_overflow_flags(<16 x i32> %v0, <16 x i32> %v1, ptr %p) {
+; CHECK-LABEL: define i32 @do_not_preserve_overflow_flags(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v1)
+  %res = add nsw nuw i32 %v0_red, %v1_red
+  ret i32 %res
+}
+
+define i32 @preserve_disjoint_flags(<16 x i32> %v0, <16 x i32> %v1, ptr %p) {
+; CHECK-LABEL: define i32 @preserve_disjoint_flags(
+; CHECK-SAME: <16 x i32> [[V0:%.*]], <16 x i32> [[V1:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = or disjoint <16 x i32> [[V0]], [[V1]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %v0_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v0)
+  %v1_red = tail call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v1)
+  %res = or disjoint i32 %v0_red, %v1_red
+  ret i32 %res
+}