[llvm] 505d574 - [Hexagon] Improve BUILD_VECTOR codegen

Wed Dec 29 10:23:11 PST 2021

Author: Joshua Herrera
Date: 2021-12-29T10:18:21-08:00
New Revision: 505d57486e57eb61e29bed6517de5152d208fede

URL: https://github.com/llvm/llvm-project/commit/505d57486e57eb61e29bed6517de5152d208fede
DIFF: https://github.com/llvm/llvm-project/commit/505d57486e57eb61e29bed6517de5152d208fede.diff

LOG: [Hexagon] Improve BUILD_VECTOR codegen

For vectors with repeating values, the old codegen would rotate and insert
every duplicate element. This patch instead starts from a splat of the most
common element, so vinsert/vror are emitted only where needed (for elements
that differ from the splat value).

Added: 
    llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll
    llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll
    llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll

Modified: 
    llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
old mode 100644
new mode 100755
index a3a9097378e7..569ad8b337db

--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -656,22 +656,66 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
     }
   }
 
-  // Construct two halves in parallel, then or them together.
+  // Find most common element to initialize vector with. This is to avoid
+  // unnecessary vinsert/valign for cases where the same value is present
+  // many times. Creates a histogram of the vector's elements to find the
+  // most common element n.
   assert(4*Words.size() == Subtarget.getVectorLength());
-  SDValue HalfV0 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
-  SDValue HalfV1 = getInstr(Hexagon::V6_vd0, dl, VecTy, {}, DAG);
-  SDValue S = DAG.getConstant(4, dl, MVT::i32);
-  for (unsigned i = 0; i != NumWords/2; ++i) {
-    SDValue N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
-                            {HalfV0, Words[i]});
-    SDValue M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
-                            {HalfV1, Words[i+NumWords/2]});
-    HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, S});
-    HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, S});
+  int VecHist[32];
+  int n = 0;
+  for (unsigned i = 0; i != NumWords; ++i) {
+    VecHist[i] = 0;
+    if (Words[i].isUndef())
+      continue;
+    for (unsigned j = i; j != NumWords; ++j)
+      if (Words[i] == Words[j])
+        VecHist[i]++;
+
+    if (VecHist[i] > VecHist[n])
+      n = i;
   }
 
-  HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy,
-                       {HalfV0, DAG.getConstant(HwLen/2, dl, MVT::i32)});
+  SDValue HalfV = getZero(dl, VecTy, DAG);
+  if (VecHist[n] > 1) {
+    SDValue SplatV = DAG.getNode(ISD::SPLAT_VECTOR, dl, VecTy, Words[n]);
+    HalfV = DAG.getNode(HexagonISD::VALIGN, dl, VecTy,
+                       {HalfV, SplatV, DAG.getConstant(HwLen/2, dl, MVT::i32)});
+  }
+  SDValue HalfV0 = HalfV;
+  SDValue HalfV1 = HalfV;
+
+  // Construct two halves in parallel, then or them together. Rn and Rm count
+  // number of rotations needed before the next element. One last rotation is
+  // performed post-loop to position the last element.
+  int Rn = 0, Rm = 0;
+  SDValue Sn, Sm;
+  SDValue N = HalfV0;
+  SDValue M = HalfV1;
+  for (unsigned i = 0; i != NumWords/2; ++i) {
+
+    // Rotate by element count since last insertion.
+    if (Words[i] != Words[n] || VecHist[n] <= 1) {
+      Sn = DAG.getConstant(Rn, dl, MVT::i32);
+      HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn});
+      N = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
+                      {HalfV0, Words[i]});
+      Rn = 0;
+    }
+    if (Words[i+NumWords/2] != Words[n] || VecHist[n] <= 1) {
+      Sm = DAG.getConstant(Rm, dl, MVT::i32);
+      HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm});
+      M = DAG.getNode(HexagonISD::VINSERTW0, dl, VecTy,
+                      {HalfV1, Words[i+NumWords/2]});
+      Rm = 0;
+    }
+    Rn += 4;
+    Rm += 4;
+  }
+  // Perform last rotation.
+  Sn = DAG.getConstant(Rn+HwLen/2, dl, MVT::i32);
+  Sm = DAG.getConstant(Rm, dl, MVT::i32);
+  HalfV0 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {N, Sn});
+  HalfV1 = DAG.getNode(HexagonISD::VROR, dl, VecTy, {M, Sm});
 
   SDValue T0 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV0);
   SDValue T1 = DAG.getBitcast(tyVector(VecTy, MVT::i32), HalfV1);

diff  --git a/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll
new file mode 100644
index 000000000000..102ebd26c825
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-128b.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that vector is produced with vxor
+; CHECK: v{{[0-9]*}} = vxor
+define <32 x i32> @f0(i32 %x) #0 {
+  %vect = insertelement <32 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %x, i32 0
+  ret <32 x i32> %vect
+}
+
+; Check that vector is produced with vsplat
+; CHECK: v{{[0-9]*}} = vsplat
+define <32 x i32> @f1(i32 %x) #0 {
+  %vect = insertelement <32 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 0
+  ret <32 x i32> %vect
+}
+
+; Check that a vror by 120 bytes is generated when only element 2 differs
+; CHECK: [[REG0:r([0-9]+)]] = #120
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <32 x i32> @f2(i32 %x) #0 {
+  %vect = insertelement <32 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 2
+  ret <32 x i32> %vect
+}
+
+; Check that a vror by 12 bytes is generated when only element 29 differs
+; CHECK: [[REG0:r([0-9]+)]] = #12
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <32 x i32> @f3(i32 %x) #0 {
+  %vect = insertelement <32 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 undef, i32 1, i32 1>, i32 %x, i32 29
+  ret <32 x i32> %vect
+}
+
+attributes #0 = { readnone nounwind "target-cpu"="hexagonv62" "target-features"="+hvx,+hvx-length128b" }
+

diff  --git a/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll
new file mode 100644
index 000000000000..85a7872b8a61
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/build-vector-i32-64b.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that vector is produced with vxor
+; CHECK: v{{[0-9]*}} = vxor
+define <16 x i32> @f0(i32 %x) #0 {
+  %vect = insertelement <16 x i32> <i32 undef, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>, i32 %x, i32 0
+  ret <16 x i32> %vect
+}
+
+; Check that vector is produced with vsplat
+; CHECK: v{{[0-9]*}} = vsplat
+define <16 x i32> @f1(i32 %x) #0 {
+  %vect = insertelement <16 x i32> <i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 0
+  ret <16 x i32> %vect
+}
+
+; Check that a vror by 56 bytes is generated when only element 2 differs
+; CHECK: [[REG0:r([0-9]+)]] = #56
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <16 x i32> @f2(i32 %x) #0 {
+  %vect = insertelement <16 x i32> <i32 1, i32 1, i32 undef, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i32 %x, i32 2
+  ret <16 x i32> %vect
+}
+
+; Check that a vror by 12 bytes is generated when only element 13 differs
+; CHECK: [[REG0:r([0-9]+)]] = #12
+; CHECK: vror(v{{[0-9]+}},[[REG0]])
+define <16 x i32> @f3(i32 %x) #0 {
+  %vect = insertelement <16 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 undef, i32 1, i32 1>, i32 %x, i32 13
+  ret <16 x i32> %vect
+}
+
+attributes #0 = { readnone nounwind "target-cpu"="hexagonv62" "target-features"="+hvx,+hvx-length64b" }
+

diff  --git a/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll
new file mode 100644
index 000000000000..e6b8445f5121
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/isel-build-vector.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+define <32 x i32> @fred(i32 %a0) #0 {
+; CHECK-LABEL: fred:
+; CHECK:         .cfi_startproc
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r3:2 = combine(#20,#9)
+; CHECK-NEXT:     v0 = vxor(v0,v0)
+; CHECK-NEXT:     r1 = #24
+; CHECK-NEXT:     r4 = #12
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v1 = vror(v0,r1)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v1.w = vinsert(r2)
+; CHECK-NEXT:     r4 = #7
+; CHECK-NEXT:     r2 = #116
+; CHECK-NEXT:     v0 = vror(v0,r4)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v0.w = vinsert(r4)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v1 = vror(v1,r3)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v1.w = vinsert(r0)
+; CHECK-NEXT:     v0 = vror(v0,r2)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v1 = vror(v1,r3)
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v0 = vor(v0,v1)
+; CHECK-NEXT:     jumpr r31
+; CHECK-NEXT:    }
+  %v0 = insertelement <32 x i32> undef, i32 undef, i32 0
+  %v1 = insertelement <32 x i32> %v0, i32 undef, i32 1
+  %v2 = insertelement <32 x i32> %v1, i32 undef, i32 2
+  %v3 = insertelement <32 x i32> %v2, i32 7, i32 3
+  %v4 = insertelement <32 x i32> %v3, i32 undef, i32 4
+  %v5 = insertelement <32 x i32> %v4, i32 undef, i32 5
+  %v6 = insertelement <32 x i32> %v5, i32 undef, i32 6
+  %v7 = insertelement <32 x i32> %v6, i32 undef, i32 7
+  %v8 = insertelement <32 x i32> %v7, i32 undef, i32 8
+  %v9 = insertelement <32 x i32> %v8, i32 undef, i32 9
+  %v10 = insertelement <32 x i32> %v9, i32 undef, i32 10
+  %v11 = insertelement <32 x i32> %v10, i32 undef, i32 11
+  %v12 = insertelement <32 x i32> %v11, i32 undef, i32 12
+  %v13 = insertelement <32 x i32> %v12, i32 undef, i32 13
+  %v14 = insertelement <32 x i32> %v13, i32 undef, i32 14
+  %v15 = insertelement <32 x i32> %v14, i32 undef, i32 15
+  %v16 = insertelement <32 x i32> %v15, i32 undef, i32 16
+  %v17 = insertelement <32 x i32> %v16, i32 undef, i32 17
+  %v18 = insertelement <32 x i32> %v17, i32 undef, i32 18
+  %v19 = insertelement <32 x i32> %v18, i32 undef, i32 19
+  %v20 = insertelement <32 x i32> %v19, i32 undef, i32 20
+  %v21 = insertelement <32 x i32> %v20, i32 undef, i32 21
+  %v22 = insertelement <32 x i32> %v21, i32 9, i32 22
+  %v23 = insertelement <32 x i32> %v22, i32 undef, i32 23
+  %v24 = insertelement <32 x i32> %v23, i32 undef, i32 24
+  %v25 = insertelement <32 x i32> %v24, i32 undef, i32 25
+  %v26 = insertelement <32 x i32> %v25, i32 undef, i32 26
+  %v27 = insertelement <32 x i32> %v26, i32 %a0, i32 27
+  %v28 = insertelement <32 x i32> %v27, i32 undef, i32 28
+  %v29 = insertelement <32 x i32> %v28, i32 undef, i32 29
+  %v30 = insertelement <32 x i32> %v29, i32 undef, i32 30
+  %v31 = insertelement <32 x i32> %v30, i32 undef, i32 31
+  ret <32 x i32> %v31
+}
+
+attributes #0 = { "target-cpu"="hexagonv66" "target-features"="+hvx,+hvx-length128b" }
+