[clang] [llvm] [ARM] Fix instruction selection for MVE vsbciq intrinsic (PR #118284)
Oliver Stannard via cfe-commits
cfe-commits at lists.llvm.org
Mon Dec 2 03:26:18 PST 2024
https://github.com/ostannard created https://github.com/llvm/llvm-project/pull/118284
There were two bugs in the implementation of the MVE vsbciq (whole-vector subtract with carry, with the carry-in implicitly set) intrinsics:
* The VSBCI instruction behaves as if the carry-in is always set, but we were selecting it when the carry-in is clear.
* The vsbciq intrinsics should generate IR with the carry-in set, but they were leaving it clear.
These two bugs almost cancelled each other out, but they resulted in incorrect code when the vsbcq intrinsics (which take an explicit carry-in) were used and the carry-in was a compile-time constant.
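For reference, the semantics these intrinsics implement can be modelled in plain C. This is a minimal sketch, assuming VSBC's lane-to-lane carry propagation and the FPSCR carry position (bit 29); the names vsbcq_ref and FPSCR_C_BIT are invented for illustration and are not part of the ACLE:

  #include <stdint.h>

  #define FPSCR_C_BIT 29

  /* Reference model of vsbcq: subtract with carry chained across the four
     32-bit lanes, with carry-in and carry-out in FPSCR format (bit 29).
     For subtraction, a set carry means "no borrow": a - b - !c is
     computed as a + ~b + c. */
  static void vsbcq_ref(const uint32_t a[4], const uint32_t b[4],
                        uint32_t out[4], uint32_t *fpscr) {
      uint32_t carry = (*fpscr >> FPSCR_C_BIT) & 1;
      for (int lane = 0; lane < 4; ++lane) {
          uint64_t sum = (uint64_t)a[lane] + (uint32_t)~b[lane] + carry;
          out[lane] = (uint32_t)sum;
          carry = (uint32_t)(sum >> 32);
      }
      *fpscr = carry << FPSCR_C_BIT;
  }

Under this model, vsbciq is vsbcq_ref called with *fpscr equal to 1 << 29 on entry (carry set, i.e. no initial borrow), which is exactly the carry-in the fixed intrinsics now encode.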
From e29a0823d22656beace962ddb25ee91aef3e1321 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Fri, 29 Nov 2024 16:56:00 +0000
Subject: [PATCH 1/2] [ARM] Fix MVE VSBCI instruction selection
The MVE VSBCI instruction should be selected when the carry-in is set,
unlike the VADCI instruction, which should be selected when the carry-in
is zero. The selection code already implemented this distinction, but
SelectMVE_VADCSBC was always called with Add=1, even for the subtract
intrinsics.
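As a hedged illustration of the decision this flag controls (the enum and helper below are hypothetical stand-ins, not the real SelectMVE_VADCSBC signature), assuming the carry operand is a compile-time constant in FPSCR format:

  #include <stdbool.h>
  #include <stdint.h>

  enum Opcode { VADC, VADCI, VSBC, VSBCI };

  /* The "I" variants have a fixed implicit carry-in: clear for VADCI,
     set for VSBCI. They may only be selected when the constant carry-in
     matches that implicit value. */
  static enum Opcode select_opcode(bool Add, uint32_t carry_const) {
      bool carry_set = (carry_const >> 29) & 1;
      if (Add)
          return carry_set ? VADC : VADCI;
      return carry_set ? VSBCI : VSBC;   /* previously took the Add path */
  }

Passing Add=1 for the subtract intrinsics inverted this choice, so VSBCI was selected exactly when it must not be.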
---
llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp | 2 +-
.../CodeGen/Thumb2/mve-intrinsics/vadc.ll | 92 ++++++++++++++++++-
.../CodeGen/Thumb2/mve-vadc-vsbc-spill.ll | 2 +-
3 files changed, 92 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 73ee8cf81adcd6..9714370ec7cc8e 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -5229,7 +5229,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
return;
case Intrinsic::arm_mve_vsbc:
case Intrinsic::arm_mve_vsbc_predicated:
- SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
+ SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, false,
IntNo == Intrinsic::arm_mve_vsbc_predicated);
return;
case Intrinsic::arm_mve_vshlc:
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
index 967e8a94900e16..2b4c478dfe3e6a 100644
--- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vadc.ll
@@ -108,7 +108,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_s32(<4 x i32> %a, <4 x i32> %b, pt
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
entry:
- %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
%2 = lshr i32 %1, 29
%3 = and i32 %2, 1
@@ -125,6 +125,46 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_u32(<4 x i32> %a, <4 x i32> %b, pt
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 536870912)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 1
+ %2 = lshr i32 %1, 29
+ %3 = and i32 %2, 1
+ store i32 %3, ptr %carry_out, align 4
+ %4 = extractvalue { <4 x i32>, i32 } %0, 0
+ ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_s32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_s32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT: vsbc.i32 q0, q0, q1
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
+ %1 = extractvalue { <4 x i32>, i32 } %0, 1
+ %2 = lshr i32 %1, 29
+ %3 = and i32 %2, 1
+ store i32 %3, ptr %carry_out, align 4
+ %4 = extractvalue { <4 x i32>, i32 } %0, 0
+ ret <4 x i32> %4
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_u32_carry_in_zero(<4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out) {
+; CHECK-LABEL: test_vsbcq_u32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r1
+; CHECK-NEXT: vsbc.i32 q0, q0, q1
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a, <4 x i32> %b, i32 0)
%1 = extractvalue { <4 x i32>, i32 } %0, 1
@@ -196,7 +236,7 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_s32(<4 x i32> %inactive, <4 x i3
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
- %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 1
%4 = lshr i32 %3, 29
%5 = and i32 %4, 1
@@ -215,6 +255,54 @@ define arm_aapcs_vfpcc <4 x i32> @test_vsbciq_m_u32(<4 x i32> %inactive, <4 x i3
; CHECK-NEXT: ubfx r1, r1, #29, #1
; CHECK-NEXT: str r1, [r0]
; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 536870912, <4 x i1> %1)
+ %3 = extractvalue { <4 x i32>, i32 } %2, 1
+ %4 = lshr i32 %3, 29
+ %5 = and i32 %4, 1
+ store i32 %5, ptr %carry_out, align 4
+ %6 = extractvalue { <4 x i32>, i32 } %2, 0
+ ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_s32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_s32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsbct.i32 q0, q1, q2
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, i32 0, <4 x i1> %1)
+ %3 = extractvalue { <4 x i32>, i32 } %2, 1
+ %4 = lshr i32 %3, 29
+ %5 = and i32 %4, 1
+ store i32 %5, ptr %carry_out, align 4
+ %6 = extractvalue { <4 x i32>, i32 } %2, 0
+ ret <4 x i32> %6
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vsbcq_m_u32_carry_in_zero(<4 x i32> %inactive, <4 x i32> %a, <4 x i32> %b, ptr nocapture %carry_out, i16 zeroext %p) {
+; CHECK-LABEL: test_vsbcq_m_u32_carry_in_zero:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: movs r2, #0
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vmsr fpscr_nzcvqc, r2
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vsbct.i32 q0, q1, q2
+; CHECK-NEXT: vmrs r1, fpscr_nzcvqc
+; CHECK-NEXT: ubfx r1, r1, #29, #1
+; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
diff --git a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
index 7fe8e94589a0c5..8a0134a4c5ff29 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vadc-vsbc-spill.ll
@@ -59,7 +59,7 @@ define void @sub_256(<4 x i32> %a_low, <4 x i32> %a_high, <4 x i32> %b_low, <4 x
; CHECK-NEXT: pop.w {r7, lr}
; CHECK-NEXT: b use_int32x4_t
entry:
- %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 0)
+ %adc_low = tail call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> %a_low, <4 x i32> %b_low, i32 536870912)
%carry = extractvalue { <4 x i32>, i32 } %adc_low, 1
%result_low = extractvalue { <4 x i32>, i32 } %adc_low, 0
tail call void @use_int32x4_t(<4 x i32> %result_low)
From 3f643adff83fb6c69f90a3da86a8795355697127 Mon Sep 17 00:00:00 2001
From: Oliver Stannard <oliver.stannard at arm.com>
Date: Fri, 29 Nov 2024 17:46:20 +0000
Subject: [PATCH 2/2] [ARM] Fix clang codegen for MVE vsbciq intrinsic
The VSBCI instruction behaves as if the carry flag input is set, unlike
the VADCI instruction, which behaves as if it is clear. The IR
intrinsics take and return the carry flag in the format used by FPSCR,
with the carry flag in bit 29, so clang needs to emit vsbciq's initial
carry in that format.
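Since the carry flag sits in bit 29, "carry-in set" in the IR is the immediate 1 << 29 = 0x20000000 = 536870912, which is the constant that now appears in the updated tests. A one-line sanity check of that arithmetic:

  #include <assert.h>

  int main(void) {
      /* FPSCR.C is bit 29, so a set carry flag in FPSCR format is: */
      assert((1u << 29) == 536870912u);   /* 0x20000000 */
      return 0;
  }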
---
clang/include/clang/Basic/arm_mve.td | 10 +++++-----
clang/test/CodeGen/arm-mve-intrinsics/vadc.c | 8 ++++----
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
index 93abbc47c54dd5..6dd8c52ddfd772 100644
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -1270,13 +1270,13 @@ defm sqrshr: ScalarSaturatingShiftReg<s32, s64>;
def lsll: LongScalarShift<u64, (args s32:$sh), (IRInt<"lsll"> $lo, $hi, $sh)>;
def asrl: LongScalarShift<s64, (args s32:$sh), (IRInt<"asrl"> $lo, $hi, $sh)>;
-multiclass vadcsbc {
+multiclass vadcsbc<dag initial_carry_in> {
def q: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
(seq (IRInt<NAME, [Vector]> $a, $b, (shl (load $carry), 29)):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
def iq: Intrinsic<Vector, (args Vector:$a, Vector:$b, Ptr<uint>:$carry),
- (seq (IRInt<NAME, [Vector]> $a, $b, 0):$pair,
+ (seq (IRInt<NAME, [Vector]> $a, $b, initial_carry_in):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
def q_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
@@ -1288,13 +1288,13 @@ multiclass vadcsbc {
def iq_m: Intrinsic<Vector, (args Vector:$inactive, Vector:$a, Vector:$b,
Ptr<uint>:$carry, Predicate:$pred),
(seq (IRInt<NAME # "_predicated", [Vector, Predicate]> $inactive, $a, $b,
- 0, $pred):$pair,
+ initial_carry_in, $pred):$pair,
(store (and 1, (lshr (xval $pair, 1), 29)), $carry),
(xval $pair, 0))>;
}
let params = T.Int32 in {
- defm vadc: vadcsbc;
- defm vsbc: vadcsbc;
+ defm vadc: vadcsbc<(u32 0)>;
+ defm vsbc: vadcsbc<(shl 1, 29)>;
}
let params = T.Int in {
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
index 21087b83300c88..29e067d65c6c89 100644
--- a/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
+++ b/clang/test/CodeGen/arm-mve-intrinsics/vadc.c
@@ -92,7 +92,7 @@ int32x4_t test_vadcq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsigne
// CHECK-LABEL: @test_vsbciq_s32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -110,7 +110,7 @@ int32x4_t test_vsbciq_s32(int32x4_t a, int32x4_t b, unsigned *carry_out) {
// CHECK-LABEL: @test_vsbciq_u32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0)
+// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.v4i32(<4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912)
// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 1
// CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 29
// CHECK-NEXT: [[TMP3:%.*]] = and i32 1, [[TMP2]]
@@ -170,7 +170,7 @@ uint32x4_t test_vsbcq_u32(uint32x4_t a, uint32x4_t b, unsigned *carry) {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]
@@ -190,7 +190,7 @@ int32x4_t test_vsbciq_m_s32(int32x4_t inactive, int32x4_t a, int32x4_t b, unsign
// CHECK-NEXT: entry:
// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
-// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 0, <4 x i1> [[TMP1]])
+// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vsbc.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], i32 536870912, <4 x i1> [[TMP1]])
// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = lshr i32 [[TMP3]], 29
// CHECK-NEXT: [[TMP5:%.*]] = and i32 1, [[TMP4]]