[llvm] [Hexagon] Add HVX patterns for vector arithmetic (PR #170704)

Sat Dec 6 09:05:46 PST 2025

================
@@ -565,12 +565,142 @@ define <32 x i32> @test_2h(<32 x i32> %v0, <32 x i32> %v1, <32 x i32> %v2) #0 {
 ; CHECK-LABEL: test_2i:
 ; CHECK: q[[Q2I0:[0-3]]] ^= vcmp.gt(v0.uw,v1.uw)
 ; CHECK: v0 = vmux(q[[Q2I0]],v0,v1)
-define <32 x i32> @test_2i(<32 x i32> %v0, <32 x i32> %v1, <32 x i32> %v2) #0 {
+define <32 x i32> @test_2i(<32 x i32> %v0, <32 x i32> %v1, <32 x i32> %v2) {
   %q0 = icmp ugt <32 x i32> %v0, %v1
   %q1 = trunc <32 x i32> %v2 to <32 x i1>
   %q2 = xor <32 x i1> %q0, %q1
   %t1 = select <32 x i1> %q2, <32 x i32> %v0, <32 x i32> %v1
   ret <32 x i32> %t1
 }
 
-attributes #0 = { nounwind readnone "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-length128b" }
+; --- Float32
+
+; CHECK-LABEL: test_2j:
+; CHECK: q[[Q2J0:[0-3]]] = vcmp.eq(v0.w,v1.w)
+; CHECK: v0 = vmux(q[[Q2J0]],v0,v1)
+define <32 x float> @test_2j(<32 x float> %v0, <32 x float> %v1) {
+  %t0 = fcmp oeq <32 x float> %v0, %v1
+  %t1 = select <32 x i1> %t0, <32 x float> %v0, <32 x float> %v1
+  ret <32 x float> %t1
+}
+
+; CHECK-LABEL: test_2k:
+; CHECK: q[[Q2K0:[0-3]]] = vcmp.eq(v0.w,v1.w)
+; CHECK: v0 = vmux(q[[Q2K0]],v1,v0)
+define <32 x float> @test_2k(<32 x float> %v0, <32 x float> %v1) {
+  %t0 = fcmp one <32 x float> %v0, %v1
+  %t1 = select <32 x i1> %t0, <32 x float> %v0, <32 x float> %v1
+  ret <32 x float> %t1
+}
+
+; CHECK-LABEL: test_2l:
+; CHECK: v0.sf = vmin(v1.sf,v0.sf)
+define <32 x float> @test_2l(<32 x float> %v0, <32 x float> %v1) {
+  %t0 = fcmp olt <32 x float> %v0, %v1
+  %t1 = select <32 x i1> %t0, <32 x float> %v0, <32 x float> %v1
+  ret <32 x float> %t1
+}
+
+; CHECK-LABEL: test_2m:
+; CHECK: q[[Q2M0:[0-3]]] = vcmp.gt(v0.sf,v1.sf)
+; CHECK: v0 = vmux(q[[Q2M0]],v1,v0)
+define <32 x float> @test_2m(<32 x float> %v0, <32 x float> %v1) {
+  %t0 = fcmp ole <32 x float> %v0, %v1
+  %t1 = select <32 x i1> %t0, <32 x float> %v0, <32 x float> %v1
+  ret <32 x float> %t1
+}
+
+; CHECK-LABEL: test_2n:
+; CHECK: v0.sf = vmax(v0.sf,v1.sf)
+define <32 x float> @test_2n(<32 x float> %v0, <32 x float> %v1) {
+  %t0 = fcmp ogt <32 x float> %v0, %v1
+  %t1 = select <32 x i1> %t0, <32 x float> %v0, <32 x float> %v1
+  ret <32 x float> %t1
+}
+
+; CHECK-LABEL: test_2o:
+; CHECK: q[[Q2O0:[0-3]]] = vcmp.gt(v1.sf,v0.sf)
+; CHECK: v0 = vmux(q[[Q2O0]],v1,v0)
+define <32 x float> @test_2o(<32 x float> %v0, <32 x float> %v1) {
+  %t0 = fcmp oge <32 x float> %v0, %v1
+  %t1 = select <32 x i1> %t0, <32 x float> %v0, <32 x float> %v1
+  ret <32 x float> %t1
+}
+
+; CHECK-LABEL: test_2p:
+; CHECK: r[[R2P0:[0-9]*]] = ##16843009
----------------
kparzysz wrote:

If you want to do a truncation from v32i32 to v32i1, using 0x01010101 is still wrong.

A Q register always has 128 bits.  A value of type v32i1 is represented as 32 groups of 4 bits each.  Within each group all 4 bits have to be 0 or all have to be 1.

If v2 has 0x00000001 repeated 32 times, using vand with 0x01010101 will produce groups of bits in Q that are 0x0000 or 0x0001.  This will cause the final vmux to pick only the lowest _byte_ from v0 instead the whole float.  To get a proper truncation from v32i32 to v32i1, do vandv(v2, vsplatw(0x00000001)) first, then vand(result, -1).

https://github.com/llvm/llvm-project/pull/170704