[llvm] [RISCV] Convert implicit_def tuples to noreg in post-isel peephole (PR #91173)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Tue May 7 01:10:13 PDT 2024


https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/91173

>From f14c46d1d06856653f9a875710394b6c6cf85c9b Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 6 May 2024 16:35:06 +0800
Subject: [PATCH 1/3] [RISCV] Convert implicit_def tuples to noreg in post-isel
 peephole

If a segmented load has an undefined passthru, it will be selected as a reg_sequence with implicit_def operands, which currently slips through the implicit_def -> noreg peephole.

This patch handles that case so we're able to infer the undef flag for vlseg passthrus, which will help with moving RISCVInsertVSETVLI to LiveIntervals in #70549.
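
For illustration (a sketch based on the vleff-vlseg2ff-output.ll test updated below, with the trailing operands abbreviated), a vlseg2ff with an undefined passthru is currently selected as

  %0:vr = IMPLICIT_DEF
  %1:vr = IMPLICIT_DEF
  %2:vrn2m1 = REG_SEQUENCE %0, %subreg.sub_vrm1_0, %1, %subreg.sub_vrm1_1
  %3:vrn2m1, %4:gpr = PseudoVLSEG2E8FF_V_M1 %2, ...

With the peephole extended to look through reg_sequences of implicit_defs, the passthru operand becomes $noreg and the now-dead reg_sequence/implicit_defs disappear:

  %3:vrn2m1, %4:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, ...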
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp   | 11 +++++-
 .../CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll   | 38 +++----------------
 .../CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll   | 38 +++----------------
 .../RISCV/rvv/vleff-vlseg2ff-output.ll        |  7 +---
 4 files changed, 20 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index dc3ad5ac5908c..e73a3af92af6f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3478,8 +3478,15 @@ static bool usesAllOnesMask(SDNode *N, unsigned MaskOpIdx) {
 }
 
 static bool isImplicitDef(SDValue V) {
-  return V.isMachineOpcode() &&
-         V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
+  if (!V.isMachineOpcode())
+    return false;
+  if (V.getMachineOpcode() == TargetOpcode::REG_SEQUENCE) {
+    for (unsigned I = 1; I < V.getNumOperands(); I += 2)
+      if (!isImplicitDef(V.getOperand(I)))
+        return false;
+    return true;
+  }
+  return V.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF;
 }
 
 // Optimize masked RVV pseudo instructions with a known all-ones mask to their
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
index 407c782d3377a..e7913fc53df0f 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv32-spill-zvlsseg.ll
@@ -13,13 +13,8 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -95,13 +90,8 @@ define <vscale x 2 x i32> @spill_zvlsseg_nxv2i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -177,13 +167,8 @@ define <vscale x 4 x i32> @spill_zvlsseg_nxv4i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v12m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v12
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -262,13 +247,8 @@ define <vscale x 8 x i32> @spill_zvlsseg_nxv8i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 2
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # implicit-def: $v16m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # kill: def $v8m4 killed $v8m4 def $v8m4_v12m4
-; SPILL-O0-NEXT:    vmv4r.v v12, v16
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m4_v12m4
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv4r.v v8, v12
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -347,16 +327,8 @@ define <vscale x 4 x i32> @spill_zvlsseg3_nxv4i32(ptr %base, i32 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v16m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v14m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2_v12m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v16
-; SPILL-O0-NEXT:    vmv2r.v v12, v14
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2_v12m2
 ; SPILL-O0-NEXT:    vlseg3e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
index 1c1544b4efa0b..dd575b3fceb55 100644
--- a/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/rv64-spill-zvlsseg.ll
@@ -13,13 +13,8 @@ define <vscale x 1 x i32> @spill_zvlsseg_nxv1i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, mf2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -95,13 +90,8 @@ define <vscale x 2 x i32> @spill_zvlsseg_nxv2i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # implicit-def: $v10
-; SPILL-O0-NEXT:    # implicit-def: $v9
-; SPILL-O0-NEXT:    # kill: def $v8 killed $v8 def $v8_v9
-; SPILL-O0-NEXT:    vmv1r.v v9, v10
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8_v9
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv1r.v v8, v9
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -177,13 +167,8 @@ define <vscale x 4 x i32> @spill_zvlsseg_nxv4i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v12m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v12
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -262,13 +247,8 @@ define <vscale x 8 x i32> @spill_zvlsseg_nxv8i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 2
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # implicit-def: $v16m4
-; SPILL-O0-NEXT:    # implicit-def: $v12m4
-; SPILL-O0-NEXT:    # kill: def $v8m4 killed $v8m4 def $v8m4_v12m4
-; SPILL-O0-NEXT:    vmv4r.v v12, v16
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m4, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m4_v12m4
 ; SPILL-O0-NEXT:    vlseg2e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv4r.v v8, v12
 ; SPILL-O0-NEXT:    addi a0, sp, 16
@@ -347,16 +327,8 @@ define <vscale x 4 x i32> @spill_zvlsseg3_nxv4i32(ptr %base, i64 %vl) nounwind {
 ; SPILL-O0-NEXT:    csrr a2, vlenb
 ; SPILL-O0-NEXT:    slli a2, a2, 1
 ; SPILL-O0-NEXT:    sub sp, sp, a2
-; SPILL-O0-NEXT:    # implicit-def: $v8m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v16m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # implicit-def: $v14m2
-; SPILL-O0-NEXT:    # implicit-def: $v10m2
-; SPILL-O0-NEXT:    # kill: def $v8m2 killed $v8m2 def $v8m2_v10m2_v12m2
-; SPILL-O0-NEXT:    vmv2r.v v10, v16
-; SPILL-O0-NEXT:    vmv2r.v v12, v14
 ; SPILL-O0-NEXT:    vsetvli zero, a1, e32, m2, ta, ma
+; SPILL-O0-NEXT:    # implicit-def: $v8m2_v10m2_v12m2
 ; SPILL-O0-NEXT:    vlseg3e32.v v8, (a0)
 ; SPILL-O0-NEXT:    vmv2r.v v8, v10
 ; SPILL-O0-NEXT:    addi a0, sp, 16
diff --git a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
index 15cb42bacf173..390647fd9e6c6 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vleff-vlseg2ff-output.ll
@@ -66,12 +66,7 @@ define i64 @test_vlseg2ff_nxv8i8(ptr %base, i64 %vl, ptr %outvl) {
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprnox0 = COPY $x11
   ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gpr = COPY $x10
-  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF2:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[DEF3:%[0-9]+]]:vr = IMPLICIT_DEF
-  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:vrn2m1 = REG_SEQUENCE [[DEF]], %subreg.sub_vrm1_0, [[DEF2]], %subreg.sub_vrm1_1
-  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 [[REG_SEQUENCE]], [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
+  ; CHECK-NEXT:   [[PseudoVLSEG2E8FF_V_M1_:%[0-9]+]]:vrn2m1, [[PseudoVLSEG2E8FF_V_M1_1:%[0-9]+]]:gpr = PseudoVLSEG2E8FF_V_M1 $noreg, [[COPY1]], [[COPY]], 3 /* e8 */, 2 /* tu, ma */, implicit-def dead $vl :: (load unknown-size from %ir.base, align 1)
   ; CHECK-NEXT:   $x10 = COPY [[PseudoVLSEG2E8FF_V_M1_1]]
   ; CHECK-NEXT:   PseudoRET implicit $x10
 entry:

>From 351223f8ae529d21e09d7c6797d5cd055f768429 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 7 May 2024 16:05:57 +0800
Subject: [PATCH 2/3] Simplify hasUndefinedMergeOp

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 28 +++-----------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index eaebdc2e54bef..a9c1b9c95724a 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -182,32 +182,10 @@ static bool hasUndefinedMergeOp(const MachineInstr &MI,
     // lanes are undefined.
     return true;
 
-  // If the tied operand is NoReg, an IMPLICIT_DEF, or a REG_SEQEUENCE whose
-  // operands are solely IMPLICIT_DEFS, then the pass through lanes are
-  // undefined.
+  // All undefined passthrus should be $noreg: see
+  // RISCVDAGToDAGISel::doPeepholeNoRegPassThru
   const MachineOperand &UseMO = MI.getOperand(UseOpIdx);
-  if (UseMO.getReg() == RISCV::NoRegister)
-    return true;
-
-  if (UseMO.isUndef())
-    return true;
-  if (UseMO.getReg().isPhysical())
-    return false;
-
-  if (MachineInstr *UseMI = MRI.getVRegDef(UseMO.getReg())) {
-    if (UseMI->isImplicitDef())
-      return true;
-
-    if (UseMI->isRegSequence()) {
-      for (unsigned i = 1, e = UseMI->getNumOperands(); i < e; i += 2) {
-        MachineInstr *SourceMI = MRI.getVRegDef(UseMI->getOperand(i).getReg());
-        if (!SourceMI || !SourceMI->isImplicitDef())
-          return false;
-      }
-      return true;
-    }
-  }
-  return false;
+  return UseMO.getReg() == RISCV::NoRegister || UseMO.isUndef();
 }
 
 /// Which subfields of VL or VTYPE have values we need to preserve?

>From aefb3f1e0fdb281481d41da1a6cc5ebc139c5b09 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 7 May 2024 16:09:52 +0800
Subject: [PATCH 3/3] Remove unused MRI argument

---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index a9c1b9c95724a..13ab4824b8b83 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -173,8 +173,7 @@ static bool isMaskRegOp(const MachineInstr &MI) {
 /// Note that this is different from "agnostic" as defined by the vector
 /// specification.  Agnostic requires each lane to either be undisturbed, or
 /// take the value -1; no other value is allowed.
-static bool hasUndefinedMergeOp(const MachineInstr &MI,
-                                const MachineRegisterInfo &MRI) {
+static bool hasUndefinedMergeOp(const MachineInstr &MI) {
 
   unsigned UseOpIdx;
   if (!MI.isRegTiedToUseOperand(0, &UseOpIdx))
@@ -404,7 +403,7 @@ DemandedFields getDemanded(const MachineInstr &MI,
     // this for any tail agnostic operation, but we can't as TA requires
     // tail lanes to either be the original value or -1.  We are writing
     // unknown bits to the lanes here.
-    if (hasUndefinedMergeOp(MI, *MRI)) {
+    if (hasUndefinedMergeOp(MI)) {
       if (isFloatScalarMoveOrScalarSplatInstr(MI) && !ST->hasVInstructionsF64())
         Res.SEW = DemandedFields::SEWGreaterThanOrEqualAndLessThan64;
       else
@@ -888,7 +887,7 @@ static VSETVLIInfo computeInfoForInstr(const MachineInstr &MI, uint64_t TSFlags,
 
   bool TailAgnostic = true;
   bool MaskAgnostic = true;
-  if (!hasUndefinedMergeOp(MI, *MRI)) {
+  if (!hasUndefinedMergeOp(MI)) {
     // Start with undisturbed.
     TailAgnostic = false;
     MaskAgnostic = false;
@@ -1083,7 +1082,7 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
   // * The LMUL1 restriction is for machines whose latency may depend on VL.
   // * As above, this is only legal for tail "undefined" not "agnostic".
   if (isVSlideInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
-      isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+      isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI)) {
     Used.VLAny = false;
     Used.VLZeroness = true;
     Used.LMUL = false;
@@ -1095,8 +1094,9 @@ bool RISCVInsertVSETVLI::needVSETVLI(const MachineInstr &MI,
   // immediate form of vmv.s.x, and thus frequently use vmv.v.i in it's place.
   // Since a splat is non-constant time in LMUL, we do need to be careful to not
   // increase the number of active vector registers (unlike for vmv.s.x.)
-  if (isScalarSplatInstr(MI) && Require.hasAVLImm() && Require.getAVLImm() == 1 &&
-      isLMUL1OrSmaller(CurInfo.getVLMUL()) && hasUndefinedMergeOp(MI, *MRI)) {
+  if (isScalarSplatInstr(MI) && Require.hasAVLImm() &&
+      Require.getAVLImm() == 1 && isLMUL1OrSmaller(CurInfo.getVLMUL()) &&
+      hasUndefinedMergeOp(MI)) {
     Used.LMUL = false;
     Used.SEWLMULRatio = false;
     Used.VLAny = false;


