[llvm] cca9b59 - [AArch64] Add Machine InstCombiner patterns for FMUL indexed variant
Andrew Savonichev via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 12 06:15:36 PDT 2021
Author: Andrew Savonichev
Date: 2021-04-12T16:08:39+03:00
New Revision: cca9b5985c0c7e3c34da7f2db7cc8e7e707b0e2e
URL: https://github.com/llvm/llvm-project/commit/cca9b5985c0c7e3c34da7f2db7cc8e7e707b0e2e
DIFF: https://github.com/llvm/llvm-project/commit/cca9b5985c0c7e3c34da7f2db7cc8e7e707b0e2e.diff
LOG: [AArch64] Add Machine InstCombiner patterns for FMUL indexed variant
This patch adds a DUP+FMUL => FMUL_indexed pattern to the Machine
InstCombiner. FMUL_indexed is normally selected during instruction
selection, but that does not work when the DUP and the FMUL are in
different basic blocks.
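For example (a simplified sketch based on the indexed_2s MIR test added
below; register numbers and the lane index are illustrative), when the
DUP lives in a predecessor block the combiner rewrites

  bb.0.entry:
    %0:fpr64 = DUPv2i32lane %7, 0
  bb.1.for.cond:
    %9:fpr64 = FMULv2f32 %5, %0

into

  bb.1.for.cond:
    %9:fpr64 = FMULv2i32_indexed %5, %7, 0

reading the lane directly from the DUP's source register. The DUP itself
is not deleted by this combine; it can be cleaned up later if it has no
remaining uses.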
Differential Revision: https://reviews.llvm.org/D99662
Added:
llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir
Modified:
llvm/include/llvm/CodeGen/MachineCombinerPattern.h
llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
index ac0cc70744d18..67544779f34c6 100644
--- a/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
+++ b/llvm/include/llvm/CodeGen/MachineCombinerPattern.h
@@ -153,7 +153,18 @@ enum class MachineCombinerPattern {
FMLSv4f32_OP1,
FMLSv4f32_OP2,
FMLSv4i32_indexed_OP1,
- FMLSv4i32_indexed_OP2
+ FMLSv4i32_indexed_OP2,
+
+ FMULv2i32_indexed_OP1,
+ FMULv2i32_indexed_OP2,
+ FMULv2i64_indexed_OP1,
+ FMULv2i64_indexed_OP2,
+ FMULv4i16_indexed_OP1,
+ FMULv4i16_indexed_OP2,
+ FMULv4i32_indexed_OP1,
+ FMULv4i32_indexed_OP2,
+ FMULv8i16_indexed_OP1,
+ FMULv8i16_indexed_OP2,
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 64adc973beeb8..94a6f4dd45b7f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -4525,6 +4525,55 @@ static bool getFMAPatterns(MachineInstr &Root,
return Found;
}
+static bool getFMULPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) {
+ MachineBasicBlock &MBB = *Root.getParent();
+ bool Found = false;
+
+ auto Match = [&](unsigned Opcode, int Operand,
+ MachineCombinerPattern Pattern) -> bool {
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineOperand &MO = Root.getOperand(Operand);
+ MachineInstr *MI = nullptr;
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
+ MI = MRI.getUniqueVRegDef(MO.getReg());
+ if (MI && MI->getOpcode() == Opcode) {
+ Patterns.push_back(Pattern);
+ return true;
+ }
+ return false;
+ };
+
+ typedef MachineCombinerPattern MCP;
+
+ switch (Root.getOpcode()) {
+ default:
+ return false;
+ case AArch64::FMULv2f32:
+ Found = Match(AArch64::DUPv2i32lane, 1, MCP::FMULv2i32_indexed_OP1);
+ Found |= Match(AArch64::DUPv2i32lane, 2, MCP::FMULv2i32_indexed_OP2);
+ break;
+ case AArch64::FMULv2f64:
+ Found = Match(AArch64::DUPv2i64lane, 1, MCP::FMULv2i64_indexed_OP1);
+ Found |= Match(AArch64::DUPv2i64lane, 2, MCP::FMULv2i64_indexed_OP2);
+ break;
+ case AArch64::FMULv4f16:
+ Found = Match(AArch64::DUPv4i16lane, 1, MCP::FMULv4i16_indexed_OP1);
+ Found |= Match(AArch64::DUPv4i16lane, 2, MCP::FMULv4i16_indexed_OP2);
+ break;
+ case AArch64::FMULv4f32:
+ Found = Match(AArch64::DUPv4i32lane, 1, MCP::FMULv4i32_indexed_OP1);
+ Found |= Match(AArch64::DUPv4i32lane, 2, MCP::FMULv4i32_indexed_OP2);
+ break;
+ case AArch64::FMULv8f16:
+ Found = Match(AArch64::DUPv8i16lane, 1, MCP::FMULv8i16_indexed_OP1);
+ Found |= Match(AArch64::DUPv8i16lane, 2, MCP::FMULv8i16_indexed_OP2);
+ break;
+ }
+
+ return Found;
+}
+
/// Return true when a code sequence can improve throughput. It
/// should be called only for instructions in loops.
/// \param Pattern - combiner pattern
@@ -4588,6 +4637,16 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2:
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2:
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2:
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2:
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2:
case MachineCombinerPattern::MULADDv8i8_OP1:
case MachineCombinerPattern::MULADDv8i8_OP2:
case MachineCombinerPattern::MULADDv16i8_OP1:
@@ -4644,6 +4703,8 @@ bool AArch64InstrInfo::getMachineCombinerPatterns(
if (getMaddPatterns(Root, Patterns))
return true;
// Floating point patterns
+ if (getFMULPatterns(Root, Patterns))
+ return true;
if (getFMAPatterns(Root, Patterns))
return true;
@@ -4732,6 +4793,34 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
return MUL;
}
+static MachineInstr *genIndexedMultiply(
+ MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxDupOp, unsigned MulOpc, const TargetRegisterClass *RC) {
+ assert(IdxDupOp == 1 || IdxDupOp == 2);
+
+ MachineFunction &MF = *Root.getMF();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+
+ MachineInstr *Dup =
+ MF.getRegInfo().getUniqueVRegDef(Root.getOperand(IdxDupOp).getReg());
+ Register DupSrcReg = Dup->getOperand(1).getReg();
+ Register DupSrcLane = Dup->getOperand(2).getImm();
+
+ unsigned IdxMulOp = IdxDupOp == 1 ? 2 : 1;
+ MachineOperand &MulOp = Root.getOperand(IdxMulOp);
+
+ Register ResultReg = Root.getOperand(0).getReg();
+
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(MF, Root.getDebugLoc(), TII->get(MulOpc), ResultReg)
+ .add(MulOp)
+ .addReg(DupSrcReg)
+ .addImm(DupSrcLane);
+
+ InsInstrs.push_back(MIB);
+ return &Root;
+}
+
/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
/// instructions.
///
@@ -5690,12 +5779,53 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
+ case MachineCombinerPattern::FMULv2i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i32_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv2i32_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i32_indexed,
+ &AArch64::FPR64RegClass);
+ break;
+ }
+ case MachineCombinerPattern::FMULv2i64_indexed_OP1:
+ case MachineCombinerPattern::FMULv2i64_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv2i64_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv2i64_indexed,
+ &AArch64::FPR128RegClass);
+ break;
+ }
+ case MachineCombinerPattern::FMULv4i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i16_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv4i16_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i16_indexed,
+ &AArch64::FPR64RegClass);
+ break;
+ }
+ case MachineCombinerPattern::FMULv4i32_indexed_OP1:
+ case MachineCombinerPattern::FMULv4i32_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv4i32_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv4i32_indexed,
+ &AArch64::FPR128RegClass);
+ break;
+ }
+ case MachineCombinerPattern::FMULv8i16_indexed_OP1:
+ case MachineCombinerPattern::FMULv8i16_indexed_OP2: {
+ unsigned IdxDupOp =
+ (Pattern == MachineCombinerPattern::FMULv8i16_indexed_OP1) ? 1 : 2;
+ genIndexedMultiply(Root, InsInstrs, IdxDupOp, AArch64::FMULv8i16_indexed,
+ &AArch64::FPR128RegClass);
+ break;
+ }
} // end switch (Pattern)
// Record MUL and ADD/SUB for deletion
// FIXME: This assertion fails in CodeGen/AArch64/tailmerging_in_mbp.ll and
// CodeGen/AArch64/urem-seteq-nonzero.ll.
// assert(MUL && "MUL was never set");
- DelInstrs.push_back(MUL);
+ if (MUL)
+ DelInstrs.push_back(MUL);
DelInstrs.push_back(&Root);
}
diff --git a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
index 95ef0f90d2315..1768314a97a1b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s
+; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -mattr=+fullfp16 -enable-unsafe-fp-math | FileCheck %s
define void @foo_2d(double* %src) {
; CHECK-LABEL: %entry
; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
@@ -134,3 +134,128 @@ for.body: ; preds = %for.body, %entry
for.end: ; preds = %for.body
ret void
}
+
+define void @indexed_2s(<2 x float> %shuf, <2 x float> %add,
+ <2 x float>* %pmul, <2 x float>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.2s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+ %shuffle = shufflevector <2 x float> %shuf, <2 x float> undef, <2 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+ %pmul_i = getelementptr inbounds <2 x float>, <2 x float>* %pmul, i64 %i
+ %pret_i = getelementptr inbounds <2 x float>, <2 x float>* %pret, i64 %i
+
+ %mul_i = load <2 x float>, <2 x float>* %pmul_i
+
+ %mul = fmul fast <2 x float> %mul_i, %shuffle
+ %muladd = fadd fast <2 x float> %mul, %add
+
+ store <2 x float> %muladd, <2 x float>* %pret_i, align 16
+ %inext = add i64 %i, 1
+ br label %for.body
+}
+
+define void @indexed_2d(<2 x double> %shuf, <2 x double> %add,
+ <2 x double>* %pmul, <2 x double>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.2d {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+ %shuffle = shufflevector <2 x double> %shuf, <2 x double> undef, <2 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+ %pmul_i = getelementptr inbounds <2 x double>, <2 x double>* %pmul, i64 %i
+ %pret_i = getelementptr inbounds <2 x double>, <2 x double>* %pret, i64 %i
+
+ %mul_i = load <2 x double>, <2 x double>* %pmul_i
+
+ %mul = fmul fast <2 x double> %mul_i, %shuffle
+ %muladd = fadd fast <2 x double> %mul, %add
+
+ store <2 x double> %muladd, <2 x double>* %pret_i, align 16
+ %inext = add i64 %i, 1
+ br label %for.body
+}
+
+define void @indexed_4s(<4 x float> %shuf, <4 x float> %add,
+ <4 x float>* %pmul, <4 x float>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.4s {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+ %shuffle = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+ %pmul_i = getelementptr inbounds <4 x float>, <4 x float>* %pmul, i64 %i
+ %pret_i = getelementptr inbounds <4 x float>, <4 x float>* %pret, i64 %i
+
+ %mul_i = load <4 x float>, <4 x float>* %pmul_i
+
+ %mul = fmul fast <4 x float> %mul_i, %shuffle
+ %muladd = fadd fast <4 x float> %mul, %add
+
+ store <4 x float> %muladd, <4 x float>* %pret_i, align 16
+ %inext = add i64 %i, 1
+ br label %for.body
+}
+
+define void @indexed_4h(<4 x half> %shuf, <4 x half> %add,
+ <4 x half>* %pmul, <4 x half>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.4h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+ %shuffle = shufflevector <4 x half> %shuf, <4 x half> undef, <4 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+ %pmul_i = getelementptr inbounds <4 x half>, <4 x half>* %pmul, i64 %i
+ %pret_i = getelementptr inbounds <4 x half>, <4 x half>* %pret, i64 %i
+
+ %mul_i = load <4 x half>, <4 x half>* %pmul_i
+
+ %mul = fmul fast <4 x half> %mul_i, %shuffle
+ %muladd = fadd fast <4 x half> %mul, %add
+
+ store <4 x half> %muladd, <4 x half>* %pret_i, align 16
+ %inext = add i64 %i, 1
+ br label %for.body
+}
+
+define void @indexed_8h(<8 x half> %shuf, <8 x half> %add,
+ <8 x half>* %pmul, <8 x half>* %pret) {
+; CHECK-LABEL: %entry
+; CHECK: for.body
+; CHECK: fmla.8h {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}[0]
+;
+entry:
+ %shuffle = shufflevector <8 x half> %shuf, <8 x half> undef, <8 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %i = phi i64 [ 0, %entry ], [ %inext, %for.body ]
+ %pmul_i = getelementptr inbounds <8 x half>, <8 x half>* %pmul, i64 %i
+ %pret_i = getelementptr inbounds <8 x half>, <8 x half>* %pret, i64 %i
+
+ %mul_i = load <8 x half>, <8 x half>* %pmul_i
+
+ %mul = fmul fast <8 x half> %mul_i, %shuffle
+ %muladd = fadd fast <8 x half> %mul, %add
+
+ store <8 x half> %muladd, <8 x half>* %pret_i, align 16
+ %inext = add i64 %i, 1
+ br label %for.body
+}
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir b/llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir
new file mode 100644
index 0000000000000..29c8f38f83a9c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-fmul-dup.mir
@@ -0,0 +1,378 @@
+# RUN: llc -run-pass=machine-combiner -o - -simplify-mir -mtriple=aarch64-unknown-linux-gnu -mattr=+fullfp16 %s | FileCheck %s
+--- |
+ ; ModuleID = 'lit.ll'
+ source_filename = "lit.ll"
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+ target triple = "aarch64-unknown-linux-gnu"
+
+ define void @indexed_2s(<2 x float> %shuf, <2 x float> %mu, <2 x float> %ad, <2 x float>* %ret) #0 {
+ entry:
+ %shuffle = shufflevector <2 x float> %shuf, <2 x float> undef, <2 x i32> zeroinitializer
+ br label %for.cond
+
+ for.cond: ; preds = %for.cond, %entry
+ %mul = fmul <2 x float> %mu, %shuffle
+ %add = fadd <2 x float> %mul, %ad
+ store <2 x float> %add, <2 x float>* %ret, align 16
+ br label %for.cond
+ }
+
+ define void @indexed_2s_rev(<2 x float> %shuf, <2 x float> %mu, <2 x float> %ad, <2 x float>* %ret) #0 {
+ entry:
+ %shuffle = shufflevector <2 x float> %shuf, <2 x float> undef, <2 x i32> zeroinitializer
+ br label %for.cond
+
+ for.cond: ; preds = %for.cond, %entry
+ %mul = fmul <2 x float> %shuffle, %mu
+ %add = fadd <2 x float> %mul, %ad
+ store <2 x float> %add, <2 x float>* %ret, align 16
+ br label %for.cond
+ }
+
+ define void @indexed_2d(<2 x double> %shuf, <2 x double> %mu, <2 x double> %ad, <2 x double>* %ret) #0 {
+ entry:
+ %shuffle = shufflevector <2 x double> %shuf, <2 x double> undef, <2 x i32> zeroinitializer
+ br label %for.cond
+
+ for.cond: ; preds = %for.cond, %entry
+ %mul = fmul <2 x double> %mu, %shuffle
+ %add = fadd <2 x double> %mul, %ad
+ store <2 x double> %add, <2 x double>* %ret, align 16
+ br label %for.cond
+ }
+
+ define void @indexed_4s(<4 x float> %shuf, <4 x float> %mu, <4 x float> %ad, <4 x float>* %ret) #0 {
+ entry:
+ %shuffle = shufflevector <4 x float> %shuf, <4 x float> undef, <4 x i32> zeroinitializer
+ br label %for.cond
+
+ for.cond: ; preds = %for.cond, %entry
+ %mul = fmul <4 x float> %mu, %shuffle
+ %add = fadd <4 x float> %mul, %ad
+ store <4 x float> %add, <4 x float>* %ret, align 16
+ br label %for.cond
+ }
+
+ define void @indexed_4h(<4 x half> %shuf, <4 x half> %mu, <4 x half> %ad, <4 x half>* %ret) #0 {
+ entry:
+ %shuffle = shufflevector <4 x half> %shuf, <4 x half> undef, <4 x i32> zeroinitializer
+ br label %for.cond
+
+ for.cond:
+ %mul = fmul <4 x half> %mu, %shuffle
+ %add = fadd <4 x half> %mul, %ad
+ store <4 x half> %add, <4 x half>* %ret, align 16
+ br label %for.cond
+ }
+
+ define void @indexed_8h(<8 x half> %shuf, <8 x half> %mu, <8 x half> %ad, <8 x half>* %ret) #0 {
+ entry:
+ %shuffle = shufflevector <8 x half> %shuf, <8 x half> undef, <8 x i32> zeroinitializer
+ br label %for.cond
+
+ for.cond:
+ %mul = fmul <8 x half> %mu, %shuffle
+ %add = fadd <8 x half> %mul, %ad
+ store <8 x half> %add, <8 x half>* %ret, align 16
+ br label %for.cond
+ }
+
+ attributes #0 = { "target-cpu"="cortex-a57" }
+
+...
+# CHECK-LABEL: name: indexed_2s
+# CHECK: [[OP1COPY:%.*]]:fpr64 = COPY $d1
+# CHECK: [[OP2COPY:%.*]]:fpr64 = COPY $d0
+# CHECK: [[UNDEF:%.*]]:fpr128 = IMPLICIT_DEF
+# CHECK: [[OP2:%.*]]:fpr128 = INSERT_SUBREG [[UNDEF]], [[OP2COPY]], %subreg.dsub
+# CHECK: [[OP1:%.*]]:fpr64 = COPY [[OP1COPY]]
+# CHECK-NOT: FMULv2f32
+# CHECK: :fpr64 = FMULv2i32_indexed [[OP1]], [[OP2]], 0
+---
+name: indexed_2s
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fpr64 }
+ - { id: 1, class: fpr64 }
+ - { id: 2, class: fpr64 }
+ - { id: 3, class: fpr64 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: fpr64 }
+ - { id: 6, class: fpr64 }
+ - { id: 7, class: fpr128 }
+ - { id: 8, class: fpr128 }
+ - { id: 9, class: fpr64 }
+ - { id: 10, class: fpr64 }
+liveins:
+ - { reg: '$d0', virtual-reg: '%1' }
+ - { reg: '$d1', virtual-reg: '%2' }
+ - { reg: '$d2', virtual-reg: '%3' }
+ - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $d0, $d1, $d2, $x0
+
+ %4:gpr64common = COPY $x0
+ %3:fpr64 = COPY $d2
+ %2:fpr64 = COPY $d1
+ %1:fpr64 = COPY $d0
+ %8:fpr128 = IMPLICIT_DEF
+ %7:fpr128 = INSERT_SUBREG %8, %1, %subreg.dsub
+ %6:fpr64 = COPY %3
+ %5:fpr64 = COPY %2
+ %0:fpr64 = DUPv2i32lane killed %7, 0
+
+ bb.1.for.cond:
+ %9:fpr64 = FMULv2f32 %5, %0
+ %10:fpr64 = FADDv2f32 killed %9, %6
+ STRDui killed %10, %4, 0 :: (store 8 into %ir.ret, align 16)
+ B %bb.1
+
+...
+# CHECK-LABEL: name: indexed_2s_rev
+# CHECK: [[OP2COPY:%.*]]:fpr64 = COPY $d1
+# CHECK: [[OP1COPY:%.*]]:fpr64 = COPY $d0
+# CHECK: [[UNDEF:%.*]]:fpr128 = IMPLICIT_DEF
+# CHECK: [[OP1:%.*]]:fpr128 = INSERT_SUBREG [[UNDEF]], [[OP1COPY]], %subreg.dsub
+# CHECK: [[OP2:%.*]]:fpr64 = COPY [[OP2COPY]]
+# CHECK-NOT: FMULv2f32
+# CHECK: :fpr64 = FMULv2i32_indexed [[OP2]], [[OP1]], 0
+---
+name: indexed_2s_rev
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fpr64 }
+ - { id: 1, class: fpr64 }
+ - { id: 2, class: fpr64 }
+ - { id: 3, class: fpr64 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: fpr64 }
+ - { id: 6, class: fpr64 }
+ - { id: 7, class: fpr128 }
+ - { id: 8, class: fpr128 }
+ - { id: 9, class: fpr64 }
+ - { id: 10, class: fpr64 }
+liveins:
+ - { reg: '$d0', virtual-reg: '%1' }
+ - { reg: '$d1', virtual-reg: '%2' }
+ - { reg: '$d2', virtual-reg: '%3' }
+ - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $d0, $d1, $d2, $x0
+
+ %4:gpr64common = COPY $x0
+ %3:fpr64 = COPY $d2
+ %2:fpr64 = COPY $d1
+ %1:fpr64 = COPY $d0
+ %8:fpr128 = IMPLICIT_DEF
+ %7:fpr128 = INSERT_SUBREG %8, %1, %subreg.dsub
+ %6:fpr64 = COPY %3
+ %5:fpr64 = COPY %2
+ %0:fpr64 = DUPv2i32lane killed %7, 0
+
+ bb.1.for.cond:
+ %9:fpr64 = FMULv2f32 %0, %5
+ %10:fpr64 = FADDv2f32 killed %9, %6
+ STRDui killed %10, %4, 0 :: (store 8 into %ir.ret, align 16)
+ B %bb.1
+
+...
+# CHECK-LABEL: name: indexed_2d
+# CHECK: [[OP1COPY:%.*]]:fpr128 = COPY $q1
+# CHECK: [[OP2:%.*]]:fpr128 = COPY $q0
+# CHECK: [[OP1:%.*]]:fpr128 = COPY [[OP1COPY]]
+# CHECK-NOT: FMULv2f64
+# CHECK: :fpr128 = FMULv2i64_indexed [[OP1]], [[OP2]], 0
+---
+name: indexed_2d
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fpr128 }
+ - { id: 1, class: fpr128 }
+ - { id: 2, class: fpr128 }
+ - { id: 3, class: fpr128 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: fpr128 }
+ - { id: 6, class: fpr128 }
+ - { id: 7, class: fpr128 }
+ - { id: 8, class: fpr128 }
+liveins:
+ - { reg: '$q0', virtual-reg: '%1' }
+ - { reg: '$q1', virtual-reg: '%2' }
+ - { reg: '$q2', virtual-reg: '%3' }
+ - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $q0, $q1, $q2, $x0
+
+ %4:gpr64common = COPY $x0
+ %3:fpr128 = COPY $q2
+ %2:fpr128 = COPY $q1
+ %1:fpr128 = COPY $q0
+ %6:fpr128 = COPY %3
+ %5:fpr128 = COPY %2
+ %0:fpr128 = DUPv2i64lane %1, 0
+
+ bb.1.for.cond:
+ %7:fpr128 = FMULv2f64 %5, %0
+ %8:fpr128 = FADDv2f64 killed %7, %6
+ STRQui killed %8, %4, 0 :: (store 16 into %ir.ret)
+ B %bb.1
+
+...
+# CHECK-LABEL: name: indexed_4s
+# CHECK: [[OP1COPY:%.*]]:fpr128 = COPY $q1
+# CHECK: [[OP2:%.*]]:fpr128 = COPY $q0
+# CHECK: [[OP1:%.*]]:fpr128 = COPY [[OP1COPY]]
+# CHECK-NOT: FMULv4f32
+# CHECK: :fpr128 = FMULv4i32_indexed [[OP1]], [[OP2]], 0
+---
+name: indexed_4s
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fpr128 }
+ - { id: 1, class: fpr128 }
+ - { id: 2, class: fpr128 }
+ - { id: 3, class: fpr128 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: fpr128 }
+ - { id: 6, class: fpr128 }
+ - { id: 7, class: fpr128 }
+ - { id: 8, class: fpr128 }
+liveins:
+ - { reg: '$q0', virtual-reg: '%1' }
+ - { reg: '$q1', virtual-reg: '%2' }
+ - { reg: '$q2', virtual-reg: '%3' }
+ - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $q0, $q1, $q2, $x0
+
+ %4:gpr64common = COPY $x0
+ %3:fpr128 = COPY $q2
+ %2:fpr128 = COPY $q1
+ %1:fpr128 = COPY $q0
+ %6:fpr128 = COPY %3
+ %5:fpr128 = COPY %2
+ %0:fpr128 = DUPv4i32lane %1, 0
+
+ bb.1.for.cond:
+ %7:fpr128 = FMULv4f32 %5, %0
+ %8:fpr128 = FADDv4f32 killed %7, %6
+ STRQui killed %8, %4, 0 :: (store 16 into %ir.ret)
+ B %bb.1
+
+...
+# CHECK-LABEL: name: indexed_4h
+# CHECK: [[OP1:%.*]]:fpr64 = COPY $d1
+# CHECK: [[OP2COPY:%.*]]:fpr64 = COPY $d0
+# CHECK: [[UNDEF:%.*]]:fpr128 = IMPLICIT_DEF
+# CHECK: [[OP2:%.*]]:fpr128 = INSERT_SUBREG [[UNDEF]], [[OP2COPY]], %subreg.dsub
+# CHECK-NOT: FMULv4f16
+# CHECK: :fpr64 = FMULv4i16_indexed [[OP1]], [[OP2]], 0
+---
+name: indexed_4h
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fpr64 }
+ - { id: 1, class: fpr64 }
+ - { id: 2, class: fpr64 }
+ - { id: 3, class: fpr64 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: fpr128 }
+ - { id: 6, class: fpr128 }
+ - { id: 7, class: fpr64 }
+ - { id: 8, class: fpr64 }
+liveins:
+ - { reg: '$d0', virtual-reg: '%1' }
+ - { reg: '$d1', virtual-reg: '%2' }
+ - { reg: '$d2', virtual-reg: '%3' }
+ - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $d0, $d1, $d2, $x0
+
+ %4:gpr64common = COPY $x0
+ %3:fpr64 = COPY $d2
+ %2:fpr64 = COPY $d1
+ %1:fpr64 = COPY $d0
+ %6:fpr128 = IMPLICIT_DEF
+ %5:fpr128 = INSERT_SUBREG %6, %1, %subreg.dsub
+ %0:fpr64 = DUPv4i16lane killed %5, 0
+
+ bb.1.for.cond:
+ %7:fpr64 = FMULv4f16 %2, %0
+ %8:fpr64 = FADDv4f16 killed %7, %3
+ STRDui killed %8, %4, 0 :: (store 8 into %ir.ret, align 16)
+ B %bb.1
+
+...
+# CHECK-LABEL: name: indexed_8h
+# CHECK: [[OP1:%.*]]:fpr128 = COPY $q1
+# CHECK: [[OP2:%.*]]:fpr128 = COPY $q0
+# CHECK-NOT: FMULv8f16
+# CHECK: :fpr128 = FMULv8i16_indexed [[OP1]], [[OP2]], 0
+---
+name: indexed_8h
+alignment: 16
+tracksRegLiveness: true
+registers:
+ - { id: 0, class: fpr128 }
+ - { id: 1, class: fpr128 }
+ - { id: 2, class: fpr128 }
+ - { id: 3, class: fpr128 }
+ - { id: 4, class: gpr64common }
+ - { id: 5, class: fpr128 }
+ - { id: 6, class: fpr128 }
+liveins:
+ - { reg: '$q0', virtual-reg: '%1' }
+ - { reg: '$q1', virtual-reg: '%2' }
+ - { reg: '$q2', virtual-reg: '%3' }
+ - { reg: '$x0', virtual-reg: '%4' }
+frameInfo:
+ maxAlignment: 1
+ maxCallFrameSize: 0
+machineFunctionInfo: {}
+body: |
+ bb.0.entry:
+ liveins: $q0, $q1, $q2, $x0
+
+ %4:gpr64common = COPY $x0
+ %3:fpr128 = COPY $q2
+ %2:fpr128 = COPY $q1
+ %1:fpr128 = COPY $q0
+ %0:fpr128 = DUPv8i16lane %1, 0
+
+ bb.1.for.cond:
+ %5:fpr128 = FMULv8f16 %2, %0
+ %6:fpr128 = FADDv8f16 killed %5, %3
+ STRQui killed %6, %4, 0 :: (store 16 into %ir.ret)
+ B %bb.1
+
+...