[llvm] 1dc1a3f - [PowerPC] Implement low-order Vector Multiply, Modulus and Divide Instructions

Thu Jul 23 15:18:54 PDT 2020

Author: Amy Kwan
Date: 2020-07-23T17:18:36-05:00
New Revision: 1dc1a3fb0c51527d46ca47ae794ace16c8ec2fd2

URL: https://github.com/llvm/llvm-project/commit/1dc1a3fb0c51527d46ca47ae794ace16c8ec2fd2
DIFF: https://github.com/llvm/llvm-project/commit/1dc1a3fb0c51527d46ca47ae794ace16c8ec2fd2.diff

LOG: [PowerPC] Implement low-order Vector Multiply, Modulus and Divide Instructions

This patch implements the low-order vector multiply, divide and modulo
instructions available on Power10.

The patch involves legalizing the ISD node MUL for the v2i64 vector type, and
the ISD nodes UDIV, SDIV, UREM and SREM for the v2i64 and v4i32 vector types,
in order to utilize the following instructions:
- Vector Multiply Low Doubleword: vmulld
- Vector Modulus Word/Doubleword: vmodsw, vmoduw, vmodsd, vmodud
- Vector Divide Word/Doubleword: vdivsw, vdivsd, vdivuw, vdivud

Differential Revision: https://reviews.llvm.org/D82510

Added: 
    llvm/test/CodeGen/PowerPC/p10-vector-divide.ll
    llvm/test/CodeGen/PowerPC/p10-vector-modulo.ll
    llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll

Modified: 
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCInstrPrefix.td

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 5db829f8d470..edc23b2673f3 100644

--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -809,6 +809,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
     else
       setOperationAction(ISD::MUL, MVT::v4i32, Custom);
 
+    if (Subtarget.isISA3_1()) {
+      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
+      setOperationAction(ISD::UDIV, MVT::v2i64, Legal);
+      setOperationAction(ISD::SDIV, MVT::v2i64, Legal);
+      setOperationAction(ISD::UDIV, MVT::v4i32, Legal);
+      setOperationAction(ISD::SDIV, MVT::v4i32, Legal);
+      setOperationAction(ISD::UREM, MVT::v2i64, Legal);
+      setOperationAction(ISD::SREM, MVT::v2i64, Legal);
+      setOperationAction(ISD::UREM, MVT::v4i32, Legal);
+      setOperationAction(ISD::SREM, MVT::v4i32, Legal);
+    }
+
     setOperationAction(ISD::MUL, MVT::v8i16, Legal);
     setOperationAction(ISD::MUL, MVT::v16i8, Custom);
 

diff  --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
index 832c66158f10..b468a8f318ee 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -973,7 +973,8 @@ let Predicates = [IsISA3_1] in {
                          [(set v16i8:$vD,
                                (int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>;
   def VMULLD : VXForm_1<457, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmulld $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmulld $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (mul v2i64:$vA, v2i64:$vB))]>;
   def VMULHSW : VXForm_1<905, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                          "vmulhsw $vD, $vA, $vB", IIC_VecGeneral, []>;
   def VMULHUW : VXForm_1<649, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
@@ -983,21 +984,29 @@ let Predicates = [IsISA3_1] in {
   def VMULHUD : VXForm_1<713, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                          "vmulhud $vD, $vA, $vB", IIC_VecGeneral, []>;
   def VMODSW : VXForm_1<1931, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmodsw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmodsw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (srem v4i32:$vA, v4i32:$vB))]>;
   def VMODUW : VXForm_1<1675, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmoduw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmoduw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (urem v4i32:$vA, v4i32:$vB))]>;
   def VMODSD : VXForm_1<1995, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmodsd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmodsd $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (srem v2i64:$vA, v2i64:$vB))]>;
   def VMODUD : VXForm_1<1739, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vmodud $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vmodud $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (urem v2i64:$vA, v2i64:$vB))]>;
   def VDIVSW : VXForm_1<395, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivsw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivsw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (sdiv v4i32:$vA, v4i32:$vB))]>;
   def VDIVUW : VXForm_1<139, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivuw $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivuw $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v4i32:$vD, (udiv v4i32:$vA, v4i32:$vB))]>;
   def VDIVSD : VXForm_1<459, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivsd $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivsd $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (sdiv v2i64:$vA, v2i64:$vB))]>;
   def VDIVUD : VXForm_1<203, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
-                        "vdivud $vD, $vA, $vB", IIC_VecGeneral, []>;
+                        "vdivud $vD, $vA, $vB", IIC_VecGeneral,
+                        [(set v2i64:$vD, (udiv v2i64:$vA, v2i64:$vB))]>;
   def VDIVESW : VXForm_1<907, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                          "vdivesw $vD, $vA, $vB", IIC_VecGeneral, []>;
   def VDIVEUW : VXForm_1<651, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),

diff  --git a/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll
new file mode 100644
index 000000000000..4ecc3a17fedb
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-divide.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; This test case aims to test the vector divide instructions on Power10.
+; This covers the low-order versions of vector divide that operate on
+; signed and unsigned words and doublewords.
+
+define <2 x i64> @test_vdivud(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vdivud:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivud v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = udiv <2 x i64> %a, %b
+  ret <2 x i64> %div
+}
+
+define <2 x i64> @test_vdivsd(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vdivsd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivsd v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = sdiv <2 x i64> %a, %b
+  ret <2 x i64> %div
+}
+
+define <4 x i32> @test_vdivuw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vdivuw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivuw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = udiv <4 x i32> %a, %b
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @test_vdivsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vdivsw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vdivsw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %div = sdiv <4 x i32> %a, %b
+  ret <4 x i32> %div
+}

diff  --git a/llvm/test/CodeGen/PowerPC/p10-vector-modulo.ll b/llvm/test/CodeGen/PowerPC/p10-vector-modulo.ll
new file mode 100644
index 000000000000..e4ef0380ae8b
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-modulo.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; This test case aims to test the vector modulo instructions on Power10.
+; The vector modulo instructions operate on signed and unsigned words
+; and doublewords.
+
+define <2 x i64> @test_vmodud(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodud:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodud v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <2 x i64> %a, %b
+  ret <2 x i64> %rem
+}
+
+define <2 x i64> @test_vmodsd(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodsd:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsd v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <2 x i64> %a, %b
+  ret <2 x i64> %rem
+}
+
+define <4 x i32> @test_vmoduw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmoduw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmoduw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <4 x i32> %a, %b
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @test_vmodsw(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmodsw:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsw v2, v2, v3
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <4 x i32> %a, %b
+  ret <4 x i32> %rem
+}
+
+define <2 x i64> @test_vmodud_with_div(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodud_with_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodud v4, v2, v3
+; CHECK-NEXT:    vdivud v2, v2, v3
+; CHECK-NEXT:    vaddudm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <2 x i64> %a, %b
+  %div = udiv <2 x i64> %a, %b
+  %add = add <2 x i64> %rem, %div
+  ret <2 x i64> %add
+}
+
+define <2 x i64> @test_vmodsd_with_div(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmodsd_with_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsd v4, v2, v3
+; CHECK-NEXT:    vdivsd v2, v2, v3
+; CHECK-NEXT:    vaddudm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <2 x i64> %a, %b
+  %div = sdiv <2 x i64> %a, %b
+  %add = add <2 x i64> %rem, %div
+  ret <2 x i64> %add
+}
+
+define <4 x i32> @test_vmoduw_with_div(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmoduw_with_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmoduw v4, v2, v3
+; CHECK-NEXT:    vdivuw v2, v2, v3
+; CHECK-NEXT:    vadduwm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = urem <4 x i32> %a, %b
+  %div = udiv <4 x i32> %a, %b
+  %add = add <4 x i32> %rem, %div
+  ret <4 x i32> %add
+}
+
+define <4 x i32> @test_vmodsw_div(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_vmodsw_div:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmodsw v4, v2, v3
+; CHECK-NEXT:    vdivsw v2, v2, v3
+; CHECK-NEXT:    vadduwm v2, v4, v2
+; CHECK-NEXT:    blr
+entry:
+  %rem = srem <4 x i32> %a, %b
+  %div = sdiv <4 x i32> %a, %b
+  %add = add <4 x i32> %rem, %div
+  ret <4 x i32> %add
+}

diff  --git a/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
new file mode 100644
index 000000000000..e8f77574f66c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/p10-vector-multiply.ll
@@ -0,0 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu \
+; RUN:   -mcpu=pwr10 -ppc-asm-full-reg-names -ppc-vsr-nums-as-vr < %s | \
+; RUN:   FileCheck %s
+
+; This test case aims to test the vector multiply instructions on Power10.
+
+define <2 x i64> @test_vmulld(<2 x i64> %a, <2 x i64> %b) {
+; CHECK-LABEL: test_vmulld:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vmulld v2, v3, v2
+; CHECK-NEXT:    blr
+entry:
+  %mul = mul <2 x i64> %b, %a
+  ret <2 x i64> %mul
+}