[llvm] [Matrix] Lower vector reductions using shape info (PR #142055)

Mon Jun 23 11:49:25 PDT 2025

================
@@ -0,0 +1,300 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s
+
+define i32 @reduce_add_4x2(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_4x2(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <4 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <4 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add <4 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 4, i1 1, i32 4, i32 2)
+  %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_add_8x1(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_8x1(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <8 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[COL_LOAD]])
+; CHECK-NEXT:    ret i32 [[REDUCE]]
+;
+  %inv = call <8 x i32> @llvm.matrix.column.major.load(ptr %in, i64 8, i1 1, i32 8, i32 1)
+  %reduce = call i32 @llvm.vector.reduce.add(<8 x i32> %inv)
+  ret i32 %reduce
+}
+
+define i32 @reduce_add_1x8(ptr %in, ptr %out) {
+; CHECK-LABEL: @reduce_add_1x8(
+; CHECK-NEXT:    [[COL_LOAD:%.*]] = load volatile <1 x i32>, ptr [[IN:%.*]], align 4
+; CHECK-NEXT:    [[VEC_GEP:%.*]] = getelementptr i32, ptr [[IN]], i64 1
+; CHECK-NEXT:    [[COL_LOAD1:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP]], align 4
+; CHECK-NEXT:    [[VEC_GEP2:%.*]] = getelementptr i32, ptr [[IN]], i64 2
+; CHECK-NEXT:    [[COL_LOAD3:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP2]], align 4
+; CHECK-NEXT:    [[VEC_GEP4:%.*]] = getelementptr i32, ptr [[IN]], i64 3
+; CHECK-NEXT:    [[COL_LOAD5:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP4]], align 4
+; CHECK-NEXT:    [[VEC_GEP6:%.*]] = getelementptr i32, ptr [[IN]], i64 4
+; CHECK-NEXT:    [[COL_LOAD7:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP6]], align 4
+; CHECK-NEXT:    [[VEC_GEP8:%.*]] = getelementptr i32, ptr [[IN]], i64 5
+; CHECK-NEXT:    [[COL_LOAD9:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP8]], align 4
+; CHECK-NEXT:    [[VEC_GEP10:%.*]] = getelementptr i32, ptr [[IN]], i64 6
+; CHECK-NEXT:    [[COL_LOAD11:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP10]], align 4
+; CHECK-NEXT:    [[VEC_GEP12:%.*]] = getelementptr i32, ptr [[IN]], i64 7
+; CHECK-NEXT:    [[COL_LOAD13:%.*]] = load volatile <1 x i32>, ptr [[VEC_GEP12]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = add <1 x i32> [[COL_LOAD]], [[COL_LOAD1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add <1 x i32> [[TMP1]], [[COL_LOAD3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add <1 x i32> [[TMP2]], [[COL_LOAD5]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <1 x i32> [[TMP3]], [[COL_LOAD7]]
+; CHECK-NEXT:    [[TMP5:%.*]] = add <1 x i32> [[TMP4]], [[COL_LOAD9]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add <1 x i32> [[TMP5]], [[COL_LOAD11]]
+; CHECK-NEXT:    [[TMP7:%.*]] = add <1 x i32> [[TMP6]], [[COL_LOAD13]]
+; CHECK-NEXT:    [[REDUCE:%.*]] = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> [[TMP7]])
----------------
jroelofs wrote:

I've rebased the patch stack to pseudo-pre-land the test, so that the diff in the test output is more obvious.

https://github.com/llvm/llvm-project/pull/142055/commits/aff9068ff4fbc3c1009729d66d8917a766bc5a4b#diff-6a09d32782efca5b9899b5ed357c5befd70d65f461a2ae64284f2504975f8948

Hm, yeah, that is bad: https://llvm.godbolt.org/z/7EbfGer3E

https://github.com/llvm/llvm-project/pull/142055