[llvm] [RISCV] Optimize divide by constant for VP intrinsics (PR #125991)
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 19 10:59:11 PST 2025
================
@@ -0,0 +1,1627 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s
+
+declare <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.sdiv.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.sdiv.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.sdiv.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.srem.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.srem.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.srem.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.srem.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+declare <vscale x 8 x i8> @llvm.vp.shl.nxv8i8(<vscale x 8 x i8>, <vscale x 8 x i8>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i16> @llvm.vp.shl.nxv4i16(<vscale x 4 x i16>, <vscale x 4 x i16>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i32> @llvm.vp.shl.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32>, <vscale x 2 x i1>, i32)
+declare <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64>, <vscale x 1 x i64>, <vscale x 1 x i1>, i32)
+
+
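+; Division by the all-ones value: x udiv UINT_MAX is 1 when x equals UINT_MAX
+; and 0 otherwise, so it lowers to a masked compare-with-(-1) plus a 0/1 merge.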
+define <vscale x 8 x i8> @vpudiv_by_max_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 255), <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_max_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> splat (i16 65535), <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_max_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> splat (i32 4294967295), <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_max_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_max_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmseq.vi v0, v8, -1, v0.t
+; CHECK-NEXT: vmv.v.i v8, 0
+; CHECK-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> splat (i64 18446744073709551615), <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
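+; udiv and urem by the same power of two share work: the quotient is a single
+; vsrl, and the remainder is computed as x - (q << k).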
+define <vscale x 8 x i8> @fold_vpudiv_vpurem_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 7, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 7, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 128), <vscale x 8 x i1> %m, i32 %evl)
+ %u = call <vscale x 8 x i8> @llvm.vp.urem.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 128), <vscale x 8 x i1> %m, i32 %evl)
+ %x = add <vscale x 8 x i8> %v, %u
+ ret <vscale x 8 x i8> %x
+}
+
+define <vscale x 4 x i16> @fold_vpudiv_vpurem_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> splat (i16 16384), <vscale x 4 x i1> %m, i32 %evl)
+ %u = call <vscale x 4 x i16> @llvm.vp.urem.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> splat (i16 16384), <vscale x 4 x i1> %m, i32 %evl)
+ %x = add <vscale x 4 x i16> %v, %u
+ ret <vscale x 4 x i16> %x
+}
+
+define <vscale x 2 x i32> @fold_vpudiv_vpurem_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> splat (i32 16384), <vscale x 2 x i1> %m, i32 %evl)
+ %u = call <vscale x 2 x i32> @llvm.vp.urem.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> splat (i32 16384), <vscale x 2 x i1> %m, i32 %evl)
+ %x = add <vscale x 2 x i32> %v, %u
+ ret <vscale x 2 x i32> %x
+}
+
+define <vscale x 1 x i64> @fold_vpudiv_vpurem_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: fold_vpudiv_vpurem_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vsrl.vi v9, v8, 14, v0.t
+; CHECK-NEXT: vsll.vi v10, v9, 14, v0.t
+; CHECK-NEXT: vsub.vv v8, v8, v10, v0.t
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vadd.vv v8, v9, v8
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> splat (i64 16384), <vscale x 1 x i1> %m, i32 %evl)
+ %u = call <vscale x 1 x i64> @llvm.vp.urem.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> splat (i64 16384), <vscale x 1 x i1> %m, i32 %evl)
+ %x = add <vscale x 1 x i64> %v, %u
+ ret <vscale x 1 x i64> %x
+}
+
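+; A splatted (2 << b) divisor is always a power of two, so the udiv folds to a
+; scalar-shift vsrl.vx by b+1.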
+define <vscale x 8 x i8> @vpudiv_by_shl2_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 255
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i8 2, %b
+ %vec = insertelement <vscale x 8 x i8> poison, i8 %sh, i32 0
+ %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %splat, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_shl2_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: slli a0, a0, 48
+; CHECK-NEXT: srli a0, a0, 48
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i16 2, %b
+ %vec = insertelement <vscale x 4 x i16> poison, i16 %sh, i32 0
+ %splat = shufflevector <vscale x 4 x i16> %vec, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %splat, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_shl2_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i32 2, %b
+ %vec = insertelement <vscale x 2 x i32> poison, i32 %sh, i32 0
+ %splat = shufflevector <vscale x 2 x i32> %vec, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %splat, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_shl2_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_shl2_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi a0, a0, 1
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vsrl.vx v8, v8, a0, v0.t
+; CHECK-NEXT: ret
+ %sh = shl i64 2, %b
+ %vec = insertelement <vscale x 1 x i64> poison, i64 %sh, i32 0
+ %splat = shufflevector <vscale x 1 x i64> %vec, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %splat, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
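+; The same fold applies through a vp.shl divisor: (4 << b) yields a vsrl.vv by
+; b+2, with the add of 2 performed under the mask.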
+define <vscale x 8 x i8> @vpudiv_by_vpshl2_nxv8i8(<vscale x 8 x i8> %va, i8 %b, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec2 = insertelement <vscale x 8 x i8> poison, i8 %b, i32 0
+ %splat2 = shufflevector <vscale x 8 x i8> %vec2, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+ %sh = call <vscale x 8 x i8> @llvm.vp.shl.nxv8i8(<vscale x 8 x i8> splat (i8 4), <vscale x 8 x i8> %splat2, <vscale x 8 x i1> %m, i32 %evl)
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> %sh, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_vpshl2_nxv4i16(<vscale x 4 x i16> %va, i16 %b, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec2 = insertelement <vscale x 4 x i16> poison, i16 %b, i32 0
+ %splat2 = shufflevector <vscale x 4 x i16> %vec2, <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer
+ %sh = call <vscale x 4 x i16> @llvm.vp.shl.nxv4i16(<vscale x 4 x i16> splat (i16 4), <vscale x 4 x i16> %splat2, <vscale x 4 x i1> %m, i32 %evl)
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> %sh, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_vpshl2_nxv2i32(<vscale x 2 x i32> %va, i32 %b, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec2 = insertelement <vscale x 2 x i32> poison, i32 %b, i32 0
+ %splat2 = shufflevector <vscale x 2 x i32> %vec2, <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
+ %sh = call <vscale x 2 x i32> @llvm.vp.shl.nxv2i32(<vscale x 2 x i32> splat (i32 4), <vscale x 2 x i32> %splat2, <vscale x 2 x i1> %m, i32 %evl)
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> %sh, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_vpshl2_nxv1i64(<vscale x 1 x i64> %va, i64 %b, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_vpshl2_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a1, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.x v9, a0
+; CHECK-NEXT: vadd.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vsrl.vv v8, v8, v9, v0.t
+; CHECK-NEXT: ret
+ %vec2 = insertelement <vscale x 1 x i64> poison, i64 %b, i32 0
+ %splat2 = shufflevector <vscale x 1 x i64> %vec2, <vscale x 1 x i64> poison, <vscale x 1 x i32> zeroinitializer
+ %sh = call <vscale x 1 x i64> @llvm.vp.shl.nxv1i64(<vscale x 1 x i64> splat (i64 4), <vscale x 1 x i64> %splat2, <vscale x 1 x i1> %m, i32 %evl)
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> %sh, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
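+; Division by 5 uses the magic-multiply lowering: vmulhu with the precomputed
+; constant followed by a logical shift right by 2.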
+define <vscale x 8 x i8> @vpudiv_by_const_no_add_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, -51
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 5), <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_const_no_add_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 1048573
+; CHECK-NEXT: addi a1, a1, -819
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> splat (i16 5), <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_const_no_add_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 838861
+; CHECK-NEXT: addi a1, a1, -819
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmv.v.i v10, 5
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> splat (i32 5), <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_const_no_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_no_add_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmv.v.i v9, 5
+; CHECK-NEXT: lui a0, 838861
+; CHECK-NEXT: vmseq.vi v9, v9, 1, v0.t
+; CHECK-NEXT: addiw a0, a0, -819
+; CHECK-NEXT: slli a1, a0, 32
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vmulhu.vx v10, v8, a0, v0.t
+; CHECK-NEXT: vsrl.vi v10, v10, 2, v0.t
+; CHECK-NEXT: vmv.v.v v0, v9
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> splat (i64 5), <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
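+; Division by 7 needs the longer fixup sequence (vmulhu, vsub, vmulhu, vadd)
+; because the magic constant does not fit in the element width.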
+define <vscale x 8 x i8> @vpudiv_by_const_with_add_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a1, 37
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: li a0, -128
+; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmv.v.i v10, 7
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x i8> @llvm.vp.udiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 7), <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x i8> %v
+}
+
+define <vscale x 4 x i16> @vpudiv_by_const_with_add_nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 2
+; CHECK-NEXT: addi a1, a1, 1171
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 1048568
+; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmv.v.i v10, 7
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x i16> @llvm.vp.udiv.nxv4i16(<vscale x 4 x i16> %va, <vscale x 4 x i16> splat (i16 7), <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x i16> %v
+}
+
+define <vscale x 2 x i32> @vpudiv_by_const_with_add_nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, 149797
+; CHECK-NEXT: addi a1, a1, -1755
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmv.v.i v10, 7
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x i32> @llvm.vp.udiv.nxv2i32(<vscale x 2 x i32> %va, <vscale x 2 x i32> splat (i32 7), <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x i32> %v
+}
+
+define <vscale x 1 x i64> @vpudiv_by_const_with_add_nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpudiv_by_const_with_add_nxv1i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a1, %hi(.LCPI23_0)
+; CHECK-NEXT: ld a1, %lo(.LCPI23_0)(a1)
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmulhu.vx v9, v8, a1, v0.t
+; CHECK-NEXT: li a0, -1
+; CHECK-NEXT: slli a0, a0, 63
+; CHECK-NEXT: vsub.vv v10, v8, v9, v0.t
+; CHECK-NEXT: vmulhu.vx v10, v10, a0, v0.t
+; CHECK-NEXT: vadd.vv v9, v10, v9, v0.t
+; CHECK-NEXT: vmv.v.i v10, 7
+; CHECK-NEXT: vsrl.vi v9, v9, 2, v0.t
+; CHECK-NEXT: vmseq.vi v0, v10, 1, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x i64> @llvm.vp.udiv.nxv1i64(<vscale x 1 x i64> %va, <vscale x 1 x i64> splat (i64 7), <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x i64> %v
+}
+
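+; sdiv by -1 is negation, so it lowers to a masked vrsub.vi from zero.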
+define <vscale x 8 x i8> @vpsdiv_by_neg1_nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vpsdiv_by_neg1_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vrsub.vi v8, v8, 0, v0.t
+; CHECK-NEXT: ret
+ %vec = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
----------------
topperc wrote:
Used `splat`
https://github.com/llvm/llvm-project/pull/125991
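
For readers of the archive: the `splat` note refers to replacing the explicit
insertelement/shufflevector splat idiom in test inputs like the one quoted
above with the `splat` constant shorthand already used elsewhere in this file.
A minimal sketch of the two equivalent forms (the sdiv call line is
illustrative, not quoted from the final patch):

  ; two-instruction splat idiom
  %vec = insertelement <vscale x 8 x i8> poison, i8 -1, i32 0
  %splat = shufflevector <vscale x 8 x i8> %vec, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer

  ; equivalent inline splat constant
  %v = call <vscale x 8 x i8> @llvm.vp.sdiv.nxv8i8(<vscale x 8 x i8> %va, <vscale x 8 x i8> splat (i8 -1), <vscale x 8 x i1> %m, i32 %evl)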