[llvm] [AArch64][CodeGen] Add patterns for small negative VScale const (PR #89607)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Apr 23 20:16:53 PDT 2024
https://github.com/vfdff updated https://github.com/llvm/llvm-project/pull/89607
>From c1a935bbf4bacf0c8ff9f74d6c745b03665298a6 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Tue, 9 Apr 2024 05:25:16 -0400
Subject: [PATCH 1/2] [AArch64][CodeGen] Add patterns for small negative VScale
const
On AArch64, rdvl accepts a negative immediate, while cntd/cntw/cnth do not.
Because we support vscale with a negative multiplier, we do not restrict the
negative case and instead take the hit of the extra patterns.
Also add NoUseScalarIncVL so these patterns do not interfere with the patterns
selected under -mattr=+use-scalar-inc-vl.
Fixes https://github.com/llvm/llvm-project/issues/84620
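For illustration, here is a minimal IR sketch of the case this change targets
(the function name is hypothetical; the sve-vl-arith.ll tests below are the
authoritative examples). Adding vscale * -24, which cnth can express as
"all, mul #3", now selects cnth + sub on the NO_SCALAR_INC path instead of
cnth + neg + add:

  ; Sketch only: add a negative multiple of vscale (a decrement by vscale * 24).
  define i64 @dec_by_negative_vscale(i64 %a) {
    %vscale = call i64 @llvm.vscale.i64()
    %mul = mul i64 %vscale, -24       ; -24 = -(8 * 3), in range for cnth
    %res = add i64 %a, %mul
    ret i64 %res
  }
  declare i64 @llvm.vscale.i64()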
---
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 2 ++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 21 ++++++++++++++++
...plex-deinterleaving-reductions-scalable.ll | 24 +++++++++----------
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 18 +++++---------
.../vscale-and-sve-cnt-demandedbits.ll | 10 ++++----
5 files changed, 46 insertions(+), 29 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index be53be782077b7..2c6bfdd6c97456 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -310,6 +310,8 @@ def UseNegativeImmediates
def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;
+def NoUseScalarIncVL : Predicate<"!Subtarget->useScalarIncVL()">;
+
def UseSVEFPLD1R : Predicate<"!Subtarget->noSVEFPLD1R()">;
def IsNeonAvailable : Predicate<"Subtarget->isNeonAvailable()">;
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 6972acd985cb9a..d50182cc8f1e4d 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2583,6 +2583,27 @@ let Predicates = [HasSVEorSME] in {
sub_32)>;
}
+ // Add NoUseScalarIncVL to avoid conflicting with the patterns guarded by UseScalarIncVL
+ let Predicates = [NoUseScalarIncVL] in {
+ def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
+ (ADDXrs GPR64:$op, (RDVLI_XI $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTH_XPiI 31, $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTW_XPiI 31, $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTD_XPiI 31, $imm), 0)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
+ (ADDSWrr GPR32:$op, (EXTRACT_SUBREG (RDVLI_XI $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTH_XPiI 31, $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTW_XPiI 31, $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTD_XPiI 31, $imm), sub_32))>;
+ }
+
// FIXME: BigEndian requires an additional REV instruction to satisfy the
// constraint that none of the bits change when stored to memory as one
// type, and reloaded as another type.
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 664d99a3627b58..5bef95910d9060 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -17,11 +17,11 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: cntd x9
; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: neg x9, x9
-; CHECK-NEXT: mov w10, #100 // =0x64
+; CHECK-NEXT: neg x10, x9
+; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x9, x10
+; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
@@ -33,7 +33,7 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -106,12 +106,12 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: cntd x9
; CHECK-NEXT: fmov d2, #2.00000000
; CHECK-NEXT: ptrue p0.d, vl1
-; CHECK-NEXT: neg x9, x9
+; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: mov w10, #100 // =0x64
+; CHECK-NEXT: mov w11, #100 // =0x64
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
-; CHECK-NEXT: and x10, x9, x10
+; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: rdvl x11, #2
; CHECK-NEXT: mov z1.d, p0/m, z2.d
; CHECK-NEXT: ptrue p0.d
@@ -125,7 +125,7 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x11
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
@@ -191,13 +191,13 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov z1.d, #0 // =0x0
; CHECK-NEXT: cntw x9
-; CHECK-NEXT: mov w10, #1000 // =0x3e8
-; CHECK-NEXT: neg x9, x9
+; CHECK-NEXT: mov w11, #1000 // =0x3e8
+; CHECK-NEXT: neg x10, x9
; CHECK-NEXT: rdvl x12, #2
; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: mov x8, xzr
-; CHECK-NEXT: and x10, x9, x10
+; CHECK-NEXT: and x10, x10, x11
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
; CHECK-NEXT: add x11, x1, x12
@@ -219,7 +219,7 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl]
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
-; CHECK-NEXT: adds x10, x10, x9
+; CHECK-NEXT: subs x10, x10, x9
; CHECK-NEXT: add x8, x8, x13
; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index dd4294c8d3bdcc..98d96da427c4f0 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -204,8 +204,7 @@ define i64 @dech_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: dech_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cnth x8, all, mul #3
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: dech_scalar_i64:
@@ -222,8 +221,7 @@ define i64 @decw_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: decw_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntw x8, all, mul #3
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decw_scalar_i64:
@@ -240,8 +238,7 @@ define i64 @decd_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: decd_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntd x8, all, mul #3
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decd_scalar_i64:
@@ -367,8 +364,7 @@ define i32 @dech_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: dech_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cnth x8
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: dech_scalar_i32:
@@ -389,8 +385,7 @@ define i32 @decw_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: decw_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntw x8
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decw_scalar_i32:
@@ -411,8 +406,7 @@ define i32 @decd_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: decd_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
; NO_SCALAR_INC-NEXT: cntd x8
-; NO_SCALAR_INC-NEXT: neg x8, x8
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decd_scalar_i32:
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
index dbdab799c83522..9572778484f8d3 100644
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -194,7 +194,7 @@ define i32 @vscale_with_multiplier() vscale_range(1,16) {
; CHECK-LABEL: vscale_with_multiplier:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov w9, #5
+; CHECK-NEXT: mov w9, #5 // =0x5
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: mul x8, x8, x9
; CHECK-NEXT: and w9, w8, #0x3f
@@ -212,7 +212,7 @@ define i32 @vscale_with_negative_multiplier() vscale_range(1,16) {
; CHECK-LABEL: vscale_with_negative_multiplier:
; CHECK: // %bb.0:
; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: mov x9, #-5
+; CHECK-NEXT: mov x9, #-5 // =0xfffffffffffffffb
; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: mul x8, x8, x9
; CHECK-NEXT: and w9, w8, #0xffffffc0
@@ -230,9 +230,9 @@ define i32 @pow2_vscale_with_negative_multiplier() vscale_range(1,16) {
; CHECK-LABEL: pow2_vscale_with_negative_multiplier:
; CHECK: // %bb.0:
; CHECK-NEXT: cntd x8
-; CHECK-NEXT: neg x8, x8
-; CHECK-NEXT: orr w9, w8, #0xfffffff0
-; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: neg x9, x8
+; CHECK-NEXT: orr w9, w9, #0xfffffff0
+; CHECK-NEXT: sub w0, w9, w8
; CHECK-NEXT: ret
%vscale = call i32 @llvm.vscale.i32()
%mul = mul i32 %vscale, -2
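The two lowering paths above can be compared by toggling the feature on the
llc command line; the exact RUN lines live in the test files, so treat this
invocation as a sketch rather than the tests' actual flags:

  # NO_SCALAR_INC path: negative vscale multiples select cnt[hwd] + sub.
  llc -mtriple=aarch64 -mattr=+sve,-use-scalar-inc-vl sve-vl-arith.ll -o -
  # use-scalar-inc-vl path: the dech/decw/decd forms are selected instead.
  llc -mtriple=aarch64 -mattr=+sve,+use-scalar-inc-vl sve-vl-arith.ll -o -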
>From f0326e27c0e26fe24526254cb3689c645a9fd971 Mon Sep 17 00:00:00 2001
From: zhongyunde 00443407 <zhongyunde at huawei.com>
Date: Tue, 23 Apr 2024 22:29:58 -0400
Subject: [PATCH 2/2] [CodeGen] fix comment
---
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 38 +++++++++----------
llvm/test/CodeGen/AArch64/sve-vl-arith.ll | 8 ++--
2 files changed, 21 insertions(+), 25 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d50182cc8f1e4d..1ca474e563247b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -2517,6 +2517,23 @@ let Predicates = [HasSVEorSME] in {
def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
}
+ // Add NoUseScalarIncVL to avoid conflicting with the patterns guarded by UseScalarIncVL
+ let Predicates = [HasSVEorSME, NoUseScalarIncVL] in {
+ def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTH_XPiI 31, $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTW_XPiI 31, $imm), 0)>;
+ def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
+ (SUBXrs GPR64:$op, (CNTD_XPiI 31, $imm), 0)>;
+
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTH_XPiI 31, $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTW_XPiI 31, $imm), sub_32))>;
+ def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
+ (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTD_XPiI 31, $imm), sub_32))>;
+ }
+
let AddedComplexity = 5 in {
def : Pat<(nxv8i16 (add ZPR:$op, (nxv8i16 (splat_vector (i32 (trunc (vscale (sve_cnth_imm i32:$imm)))))))),
(INCH_ZPiI ZPR:$op, 31, $imm)>;
@@ -2583,27 +2600,6 @@ let Predicates = [HasSVEorSME] in {
sub_32)>;
}
- // Add NoUseScalarIncVL to avoid conflicting with the patterns guarded by UseScalarIncVL
- let Predicates = [NoUseScalarIncVL] in {
- def : Pat<(add GPR64:$op, (vscale (sve_rdvl_imm i32:$imm))),
- (ADDXrs GPR64:$op, (RDVLI_XI $imm), 0)>;
- def : Pat<(add GPR64:$op, (vscale (sve_cnth_imm_neg i32:$imm))),
- (SUBXrs GPR64:$op, (CNTH_XPiI 31, $imm), 0)>;
- def : Pat<(add GPR64:$op, (vscale (sve_cntw_imm_neg i32:$imm))),
- (SUBXrs GPR64:$op, (CNTW_XPiI 31, $imm), 0)>;
- def : Pat<(add GPR64:$op, (vscale (sve_cntd_imm_neg i32:$imm))),
- (SUBXrs GPR64:$op, (CNTD_XPiI 31, $imm), 0)>;
-
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_rdvl_imm i32:$imm))))),
- (ADDSWrr GPR32:$op, (EXTRACT_SUBREG (RDVLI_XI $imm), sub_32))>;
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cnth_imm_neg i32:$imm))))),
- (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTH_XPiI 31, $imm), sub_32))>;
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntw_imm_neg i32:$imm))))),
- (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTW_XPiI 31, $imm), sub_32))>;
- def : Pat<(add GPR32:$op, (i32 (trunc (vscale (sve_cntd_imm_neg i32:$imm))))),
- (SUBSWrr GPR32:$op, (EXTRACT_SUBREG (CNTD_XPiI 31, $imm), sub_32))>;
- }
-
// FIXME: BigEndian requires an additional REV instruction to satisfy the
// constraint that none of the bits change when stored to memory as one
// type, and reloaded as another type.
diff --git a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
index 98d96da427c4f0..de2af590acd1e2 100644
--- a/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vl-arith.ll
@@ -186,8 +186,8 @@ define i64 @incd_scalar_i64(i64 %a) {
define i64 @decb_scalar_i64(i64 %a) {
; NO_SCALAR_INC-LABEL: decb_scalar_i64:
; NO_SCALAR_INC: // %bb.0:
-; NO_SCALAR_INC-NEXT: rdvl x8, #-2
-; NO_SCALAR_INC-NEXT: add x0, x0, x8
+; NO_SCALAR_INC-NEXT: cnth x8, all, mul #4
+; NO_SCALAR_INC-NEXT: sub x0, x0, x8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decb_scalar_i64:
@@ -342,8 +342,8 @@ define i32 @incd_scalar_i32(i32 %a) {
define i32 @decb_scalar_i32(i32 %a) {
; NO_SCALAR_INC-LABEL: decb_scalar_i32:
; NO_SCALAR_INC: // %bb.0:
-; NO_SCALAR_INC-NEXT: rdvl x8, #-4
-; NO_SCALAR_INC-NEXT: add w0, w0, w8
+; NO_SCALAR_INC-NEXT: cnth x8, all, mul #8
+; NO_SCALAR_INC-NEXT: sub w0, w0, w8
; NO_SCALAR_INC-NEXT: ret
;
; CHECK-LABEL: decb_scalar_i32: