[llvm] [RISCV] Implement cross basic block VXRM write insertion. (PR #70382)
Philip Reames via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 2 10:57:10 PDT 2023
================
@@ -0,0 +1,549 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+v \
+; RUN: -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8>,
+ <vscale x 1 x i8>,
+ <vscale x 1 x i8>,
+ iXLen, iXLen);
+declare <vscale x 1 x i8> @llvm.riscv.vasub.nxv1i8.nxv1i8(
+ <vscale x 1 x i8>,
+ <vscale x 1 x i8>,
+ <vscale x 1 x i8>,
+ iXLen, iXLen);
+
+; Test same rounding mode in one block.
+define <vscale x 1 x i8> @test1(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: test1:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v8, v8, v9
+; CHECK-NEXT: vaadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x i8> %1,
+ iXLen 0, iXLen %3)
+ %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %a,
+ <vscale x 1 x i8> %2,
+ iXLen 0, iXLen %3)
+
+ ret <vscale x 1 x i8> %b
+}
+
+; Test different rounding mode.
+define <vscale x 1 x i8> @test2(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: test2:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: csrwi vxrm, 2
+; CHECK-NEXT: vaadd.vv v8, v8, v9
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x i8> %1,
+ iXLen 2, iXLen %3)
+ %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %a,
+ <vscale x 1 x i8> %2,
+ iXLen 0, iXLen %3)
+
+ ret <vscale x 1 x i8> %b
+}
+
+declare <vscale x 1 x i8> @foo(<vscale x 1 x i8>)
+
+; Test same vxrm with call in between which may invalidate vxrm.
+define <vscale x 1 x i8> @test3(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; RV32-LABEL: test3:
+; RV32: # %bb.0: # %entry
+; RV32-NEXT: addi sp, sp, -32
+; RV32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: sub sp, sp, a1
+; RV32-NEXT: mv s0, a0
+; RV32-NEXT: addi a1, sp, 16
+; RV32-NEXT: vs1r.v v10, (a1) # Unknown-size Folded Spill
+; RV32-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: vaadd.vv v8, v8, v9
+; RV32-NEXT: call foo@plt
+; RV32-NEXT: vsetvli zero, s0, e8, mf8, ta, ma
+; RV32-NEXT: csrwi vxrm, 0
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
+; RV32-NEXT: vaadd.vv v8, v8, v9
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 32
+; RV32-NEXT: ret
+;
+; RV64-LABEL: test3:
+; RV64: # %bb.0: # %entry
+; RV64-NEXT: addi sp, sp, -32
+; RV64-NEXT: sd ra, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: sub sp, sp, a1
+; RV64-NEXT: mv s0, a0
+; RV64-NEXT: addi a1, sp, 16
+; RV64-NEXT: vs1r.v v10, (a1) # Unknown-size Folded Spill
+; RV64-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: vaadd.vv v8, v8, v9
+; RV64-NEXT: call foo@plt
+; RV64-NEXT: vsetvli zero, s0, e8, mf8, ta, ma
+; RV64-NEXT: csrwi vxrm, 0
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
+; RV64-NEXT: vaadd.vv v8, v8, v9
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 32
+; RV64-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x i8> %1,
+ iXLen 0, iXLen %3)
+ %b = call <vscale x 1 x i8> @foo(<vscale x 1 x i8> %a)
+ %c = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %b,
+ <vscale x 1 x i8> %2,
+ iXLen 0, iXLen %3)
+
+ ret <vscale x 1 x i8> %c
+}
+
+; Test same vxrm with asm in between which may invalidate vxrm.
+define <vscale x 1 x i8> @test4(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3) nounwind {
+; CHECK-LABEL: test4:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v8, v8, v9
+; CHECK-NEXT: #APP
+; CHECK-NEXT: #NO_APP
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v8, v8, v10
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x i8> %1,
+ iXLen 0, iXLen %3)
+ %b = call <vscale x 1 x i8> asm "", "=^vr,0"(<vscale x 1 x i8> %a)
+ %c = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %b,
+ <vscale x 1 x i8> %2,
+ iXLen 0, iXLen %3)
+
+ ret <vscale x 1 x i8> %c
+}
+
+; Test same rounding mode in triangle.
+define <vscale x 1 x i8> @test5(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test5:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: vaadd.vv v8, v8, v9
+; CHECK-NEXT: beqz a1, .LBB4_2
+; CHECK-NEXT: # %bb.1: # %condblock
+; CHECK-NEXT: vaadd.vv v8, v8, v10
+; CHECK-NEXT: .LBB4_2: # %mergeblock
+; CHECK-NEXT: ret
+entry:
+ %a = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %0,
+ <vscale x 1 x i8> %1,
+ iXLen 0, iXLen %3)
+ br i1 %cond, label %condblock, label %mergeblock
+
+condblock:
+ %b = call <vscale x 1 x i8> @llvm.riscv.vaadd.nxv1i8.nxv1i8(
+ <vscale x 1 x i8> undef,
+ <vscale x 1 x i8> %a,
+ <vscale x 1 x i8> %2,
+ iXLen 0, iXLen %3)
+ br label %mergeblock
+
+mergeblock:
+ %c = phi <vscale x 1 x i8> [%a, %entry], [%b, %condblock]
+
+ ret <vscale x 1 x i8> %c
+}
+
+; Test same rounding mode in diamond with no dominating vxrm.
+define <vscale x 1 x i8> @test6(<vscale x 1 x i8> %0, <vscale x 1 x i8> %1, <vscale x 1 x i8> %2, iXLen %3, i1 %cond) nounwind {
+; CHECK-LABEL: test6:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: andi a1, a1, 1
+; CHECK-NEXT: csrwi vxrm, 0
+; CHECK-NEXT: beqz a1, .LBB5_2
+; CHECK-NEXT: # %bb.1: # %trueblock
+; CHECK-NEXT: vsetvli zero, a0, e8, mf8, ta, ma
----------------
preames wrote:
Pure aside, but this test case demonstrates that we could probably benefit from an anticipation-based extension to vsetvli insertion, and maybe from more aggressive flattening or tail commoning.
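
As a rough illustration only (not output of this patch), an anticipation-based placement for a diamond like test6, where both arms need the same e8/mf8 configuration and rounding mode, might hoist the whole configuration into the entry block; the exact labels and ordering below are hypothetical:

    # hypothetical codegen sketch, assuming both successors need the same vl/vtype and vxrm
    andi    a1, a1, 1
    vsetvli zero, a0, e8, mf8, ta, ma   # anticipated: required on every path out of entry
    csrwi   vxrm, 0
    beqz    a1, .LBB5_2
    # ... each arm would then contain only its vaadd.vv, with no per-block vsetvli/csrwi

Whether that is done by teaching vsetvli insertion anticipation or by flattening/tail-commoning the arms first is an open question; this sketch only shows the shape of the potential saving.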
https://github.com/llvm/llvm-project/pull/70382