[llvm] 6042c25 - [GlobalISel] Add translation support for vector reduction intrinsics.

Amara Emerson via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 16 10:25:40 PDT 2020


Author: Amara Emerson
Date: 2020-10-16T10:17:53-07:00
New Revision: 6042c25b0a7a9d812ace6ffe164628af9a1e1259

URL: https://github.com/llvm/llvm-project/commit/6042c25b0a7a9d812ace6ffe164628af9a1e1259
DIFF: https://github.com/llvm/llvm-project/commit/6042c25b0a7a9d812ace6ffe164628af9a1e1259.diff

LOG: [GlobalISel] Add translation support for vector reduction intrinsics.

To prevent the ExpandReductions pass from expanding these intrinsics before
they reach instruction selection, I added a -disable-expand-reductions flag
for testing purposes.
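
As a minimal sketch of how the flag is exercised (this mirrors the RUN line
and one of the functions from the new test added below), the reduction
intrinsics now survive down to the IRTranslator:

    ; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -disable-expand-reductions -stop-after=irtranslator %s -o - | FileCheck %s

    declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

    define i32 @add(<4 x i32> %vec) {
      ; Expected to translate to G_VECREDUCE_ADD rather than being expanded
      ; into a shuffle sequence beforehand.
      %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
      ret i32 %res
    }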

Differential Revision: https://reviews.llvm.org/D89028

Added: 
    llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll

Modified: 
    llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
    llvm/lib/CodeGen/TargetPassConfig.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index a8f7059a6aed..8fec559bd55c 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -1720,6 +1720,29 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
       return TargetOpcode::G_PTRMASK;
     case Intrinsic::lrint:
       return TargetOpcode::G_INTRINSIC_LRINT;
+    // FADD/FMUL require checking the FMF, so are handled elsewhere.
+    case Intrinsic::vector_reduce_fmin:
+      return TargetOpcode::G_VECREDUCE_FMIN;
+    case Intrinsic::vector_reduce_fmax:
+      return TargetOpcode::G_VECREDUCE_FMAX;
+    case Intrinsic::vector_reduce_add:
+      return TargetOpcode::G_VECREDUCE_ADD;
+    case Intrinsic::vector_reduce_mul:
+      return TargetOpcode::G_VECREDUCE_MUL;
+    case Intrinsic::vector_reduce_and:
+      return TargetOpcode::G_VECREDUCE_AND;
+    case Intrinsic::vector_reduce_or:
+      return TargetOpcode::G_VECREDUCE_OR;
+    case Intrinsic::vector_reduce_xor:
+      return TargetOpcode::G_VECREDUCE_XOR;
+    case Intrinsic::vector_reduce_smax:
+      return TargetOpcode::G_VECREDUCE_SMAX;
+    case Intrinsic::vector_reduce_smin:
+      return TargetOpcode::G_VECREDUCE_SMIN;
+    case Intrinsic::vector_reduce_umax:
+      return TargetOpcode::G_VECREDUCE_UMAX;
+    case Intrinsic::vector_reduce_umin:
+      return TargetOpcode::G_VECREDUCE_UMIN;
   }
   return Intrinsic::not_intrinsic;
 }
@@ -2135,6 +2158,41 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
       EntryMBB.insert(EntryMBB.begin(), LocalEscape);
     }
 
+    return true;
+  }
+  case Intrinsic::vector_reduce_fadd:
+  case Intrinsic::vector_reduce_fmul: {
+    // Need to check for the reassoc flag to decide whether we want a
+    // sequential reduction opcode or not.
+    Register Dst = getOrCreateVReg(CI);
+    Register ScalarSrc = getOrCreateVReg(*CI.getArgOperand(0));
+    Register VecSrc = getOrCreateVReg(*CI.getArgOperand(1));
+    unsigned Opc = 0;
+    if (!CI.hasAllowReassoc()) {
+      // The sequential ordering case.
+      Opc = ID == Intrinsic::vector_reduce_fadd
+                ? TargetOpcode::G_VECREDUCE_SEQ_FADD
+                : TargetOpcode::G_VECREDUCE_SEQ_FMUL;
+      MIRBuilder.buildInstr(Opc, {Dst}, {ScalarSrc, VecSrc},
+                            MachineInstr::copyFlagsFromInstruction(CI));
+      return true;
+    }
+    // We split the operation into a separate G_FADD/G_FMUL + the reduce,
+    // since the associativity doesn't matter.
+    unsigned ScalarOpc;
+    if (ID == Intrinsic::vector_reduce_fadd) {
+      Opc = TargetOpcode::G_VECREDUCE_FADD;
+      ScalarOpc = TargetOpcode::G_FADD;
+    } else {
+      Opc = TargetOpcode::G_VECREDUCE_FMUL;
+      ScalarOpc = TargetOpcode::G_FMUL;
+    }
+    LLT DstTy = MRI->getType(Dst);
+    auto Rdx = MIRBuilder.buildInstr(
+        Opc, {DstTy}, {VecSrc}, MachineInstr::copyFlagsFromInstruction(CI));
+    MIRBuilder.buildInstr(ScalarOpc, {Dst}, {ScalarSrc, Rdx},
+                          MachineInstr::copyFlagsFromInstruction(CI));
+
     return true;
   }
 #define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC)  \

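For context, the ordering distinction the reassoc check encodes: without
reassoc, llvm.vector.reduce.fadd/fmul are strict, in-order reductions seeded
with the scalar start operand, which G_VECREDUCE_SEQ_FADD/FMUL preserve; with
reassoc the elements may be combined in any order, so the translator can emit
an unordered G_VECREDUCE_FADD/FMUL over the vector plus a single scalar
G_FADD/G_FMUL with the start value. A hand-scalarized equivalent of the strict
fadd case, purely for illustration (the function name is made up):

    define float @fadd_seq_scalarized(float %start, <4 x float> %vec) {
      ; Strict ordering: (((%start + v0) + v1) + v2) + v3
      %v0 = extractelement <4 x float> %vec, i32 0
      %a0 = fadd float %start, %v0
      %v1 = extractelement <4 x float> %vec, i32 1
      %a1 = fadd float %a0, %v1
      %v2 = extractelement <4 x float> %vec, i32 2
      %a2 = fadd float %a1, %v2
      %v3 = extractelement <4 x float> %vec, i32 3
      %a3 = fadd float %a2, %v3
      ret float %a3
    }
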
diff  --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index ef070ee7dbae..3a1a82fadaa5 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -218,6 +218,11 @@ static cl::opt<bool> EnableMachineFunctionSplitter(
     cl::desc("Split out cold blocks from machine functions based on profile "
              "information."));
 
+/// Disable the expand reductions pass for testing.
+static cl::opt<bool> DisableExpandReductions(
+    "disable-expand-reductions", cl::init(false), cl::Hidden,
+    cl::desc("Disable the expand reduction intrinsics pass from running"));
+
 /// Allow standard passes to be disabled by command line options. This supports
 /// simple binary flags that either suppress the pass or do nothing.
 /// i.e. -disable-mypass=false has no effect.
@@ -708,7 +713,9 @@ void TargetPassConfig::addIRPasses() {
   addPass(createScalarizeMaskedMemIntrinPass());
 
   // Expand reduction intrinsics into shuffle sequences if the target wants to.
-  addPass(createExpandReductionsPass());
+  // Allow disabling it for testing purposes.
+  if (!DisableExpandReductions)
+    addPass(createExpandReductionsPass());
 }
 
 /// Turn exception handling constructs into something the code generators can

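As a quick sanity check that the new guard really drops the pass, the legacy
pass structure can be dumped with and without the flag and compared; a rough
sketch only (the printed pass name and surrounding pipeline may differ by
target and optimization level, and reduce.ll / @reduce_add are made-up names):

    ; Compare:
    ;   llc -mtriple=aarch64 -debug-pass=Structure reduce.ll -o /dev/null
    ;   llc -mtriple=aarch64 -debug-pass=Structure -disable-expand-reductions reduce.ll -o /dev/null
    ; The "Expand reduction intrinsics" entry should only appear in the first run.
    declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

    define i32 @reduce_add(<4 x i32> %vec) {
      %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
      ret i32 %r
    }
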
diff  --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll
new file mode 100644
index 000000000000..7a6387416ad8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-reductions.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -disable-expand-reductions -stop-after=irtranslator %s -o - | FileCheck %s
+
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>)
+
+define float @fadd_seq(float %start, <4 x float> %vec) {
+  ; CHECK-LABEL: name: fadd_seq
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q1, $s0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+  ; CHECK:   [[VECREDUCE_SEQ_FADD:%[0-9]+]]:_(s32) = G_VECREDUCE_SEQ_FADD [[COPY]](s32), [[COPY1]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_SEQ_FADD]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec)
+  ret float %res
+}
+
+define float @fadd_fast(float %start, <4 x float> %vec) {
+  ; CHECK-LABEL: name: fadd_fast
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q1, $s0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s32) = COPY $s0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+  ; CHECK:   [[VECREDUCE_FADD:%[0-9]+]]:_(s32) = reassoc G_VECREDUCE_FADD [[COPY1]](<4 x s32>)
+  ; CHECK:   [[FADD:%[0-9]+]]:_(s32) = reassoc G_FADD [[COPY]], [[VECREDUCE_FADD]]
+  ; CHECK:   $s0 = COPY [[FADD]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call reassoc float @llvm.vector.reduce.fadd.v4f32(float %start, <4 x float> %vec)
+  ret float %res
+}
+
+define double @fmul_seq(double %start, <4 x double> %vec) {
+  ; CHECK-LABEL: name: fmul_seq
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $d0, $q1, $q2
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>)
+  ; CHECK:   [[VECREDUCE_SEQ_FMUL:%[0-9]+]]:_(s64) = G_VECREDUCE_SEQ_FMUL [[COPY]](s64), [[CONCAT_VECTORS]](<4 x s64>)
+  ; CHECK:   $d0 = COPY [[VECREDUCE_SEQ_FMUL]](s64)
+  ; CHECK:   RET_ReallyLR implicit $d0
+  %res = call double @llvm.vector.reduce.fmul.v4f64(double %start, <4 x double> %vec)
+  ret double %res
+}
+
+define double @fmul_fast(double %start, <4 x double> %vec) {
+  ; CHECK-LABEL: name: fmul_fast
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $d0, $q1, $q2
+  ; CHECK:   [[COPY:%[0-9]+]]:_(s64) = COPY $d0
+  ; CHECK:   [[COPY1:%[0-9]+]]:_(<2 x s64>) = COPY $q1
+  ; CHECK:   [[COPY2:%[0-9]+]]:_(<2 x s64>) = COPY $q2
+  ; CHECK:   [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s64>) = G_CONCAT_VECTORS [[COPY1]](<2 x s64>), [[COPY2]](<2 x s64>)
+  ; CHECK:   [[VECREDUCE_FMUL:%[0-9]+]]:_(s64) = reassoc G_VECREDUCE_FMUL [[CONCAT_VECTORS]](<4 x s64>)
+  ; CHECK:   [[FMUL:%[0-9]+]]:_(s64) = reassoc G_FMUL [[COPY]], [[VECREDUCE_FMUL]]
+  ; CHECK:   $d0 = COPY [[FMUL]](s64)
+  ; CHECK:   RET_ReallyLR implicit $d0
+  %res = call reassoc double @llvm.vector.reduce.fmul.v4f64(double %start, <4 x double> %vec)
+  ret double %res
+}
+
+declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
+declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
+
+define float @fmax(<4 x float> %vec) {
+  ; CHECK-LABEL: name: fmax
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_FMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_FMAX [[COPY]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_FMAX]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %vec)
+  ret float %res
+}
+
+define float @fmin(<4 x float> %vec) {
+  ; CHECK-LABEL: name: fmin
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_FMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
+  ret float %res
+}
+
+define float @fmin_nnan(<4 x float> %vec) {
+  ; CHECK-LABEL: name: fmin_nnan
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_FMIN:%[0-9]+]]:_(s32) = nnan G_VECREDUCE_FMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $s0 = COPY [[VECREDUCE_FMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $s0
+  %res = call nnan float @llvm.vector.reduce.fmin.v4f32(<4 x float> %vec)
+  ret float %res
+}
+
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+
+define i32 @add(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: add
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_ADD:%[0-9]+]]:_(s32) = G_VECREDUCE_ADD [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_ADD]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)
+
+define i32 @mul(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: mul
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_MUL:%[0-9]+]]:_(s32) = G_VECREDUCE_MUL [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_MUL]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+
+define i32 @and(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: and
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_AND:%[0-9]+]]:_(s32) = G_VECREDUCE_AND [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_AND]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+
+define i32 @or(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: or
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_OR:%[0-9]+]]:_(s32) = G_VECREDUCE_OR [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_OR]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
+
+define i32 @xor(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: xor
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_XOR:%[0-9]+]]:_(s32) = G_VECREDUCE_XOR [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_XOR]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
+
+define i32 @smax(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: smax
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_SMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_SMAX [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_SMAX]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+define i32 @smin(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: smin
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_SMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_SMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_SMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+define i32 @umax(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: umax
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_UMAX:%[0-9]+]]:_(s32) = G_VECREDUCE_UMAX [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_UMAX]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}
+
+define i32 @umin(<4 x i32> %vec) {
+  ; CHECK-LABEL: name: umin
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK:   liveins: $q0
+  ; CHECK:   [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+  ; CHECK:   [[VECREDUCE_UMIN:%[0-9]+]]:_(s32) = G_VECREDUCE_UMIN [[COPY]](<4 x s32>)
+  ; CHECK:   $w0 = COPY [[VECREDUCE_UMIN]](s32)
+  ; CHECK:   RET_ReallyLR implicit $w0
+  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %vec)
+  ret i32 %res
+}

More information about the llvm-commits mailing list