[llvm] d058262 - [SystemZ] Support i128 inline asm operands.

Wed May 26 08:10:01 PDT 2021

Author: Jonas Paulsson
Date: 2021-05-26T10:08:32-05:00
New Revision: d058262b1471c80577924fc988ee86e175e3fc16

URL: https://github.com/llvm/llvm-project/commit/d058262b1471c80577924fc988ee86e175e3fc16
DIFF: https://github.com/llvm/llvm-project/commit/d058262b1471c80577924fc988ee86e175e3fc16.diff

LOG: [SystemZ] Support i128 inline asm operands.

Support virtual, physical and tied i128 register operands in inline assembly.

i128 is on SystemZ not really supported and is not a legal type and generally
such a value will be split into two i64 parts. There are however some
instructions that require a pair of two GPR64 registers contained in the GR128
bit reg class, which is untyped.

For inline assmebly operands, it proved to be very cumbersome to first follow
the general behavior of splitting an i128 operand into two parts and then
later rebuild the INLINEASM MI to have one GR128 register. Instead, some
minor common code changes were made to SelectionDAGBUilder to only create one
GR128 register part to begin with. In particular:

- getNumRegisters() now has an optional parameter "RegisterVT" which is
  passed by AddInlineAsmOperands() and GetRegistersForValue().

- The bitcasting in GetRegistersForValue is not performed if RegVT is
  Untyped.

- The RC for a tied use in AddInlineAsmOperands() is now computed either from
  the tied def (virtual register), or by getMinimalPhysRegClass() (physical
  register).

- InstrEmitter.cpp:EmitCopyFromReg() has been fixed so that the register
  class (DstRC) can also be computed for an illegal type.

In the SystemZ backend getNumRegisters(), splitValueIntoRegisterParts() and
joinRegisterPartsIntoValue() have been implemented to handle i128 operands.

Differential Revision: https://reviews.llvm.org/D100788

Review: Ulrich Weigand

Added: 
    llvm/test/CodeGen/SystemZ/inline-asm-i128.ll

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
    llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
    llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
    llvm/lib/Target/SystemZ/SystemZISelLowering.h

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 13079b8f17fc1..31f3d1e5a68a2 100644

--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1472,7 +1472,12 @@ class TargetLoweringBase {
   /// like i140, which are first promoted then expanded, it is the number of
   /// registers needed to hold all the bits of the original type.  For an i140
   /// on a 32 bit machine this means 5 registers.
-  unsigned getNumRegisters(LLVMContext &Context, EVT VT) const {
+  ///
+  /// RegisterVT may be passed as a way to override the default settings, for
+  /// instance with i128 inline assembly operands on SystemZ.
+  virtual unsigned
+  getNumRegisters(LLVMContext &Context, EVT VT,
+                  Optional<MVT> RegisterVT = None) const {
     if (VT.isSimple()) {
       assert((unsigned)VT.getSimpleVT().SimpleTy <
                 array_lengthof(NumRegistersForVT));

diff  --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 84a92a02bb03b..691cdc9662c98 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -166,9 +166,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
     assert(TRI->isTypeLegalForClass(*UseRC, VT) &&
            "Incompatible phys register def and uses!");
     DstRC = UseRC;
-  } else {
-    DstRC = TLI->getRegClassFor(VT, Node->isDivergent());
-  }
+  } else
+    DstRC = SrcRC;
 
   // If all uses are reading from the src physical register and copying the
   // register is either impossible or very expensive, then don't create a copy.

diff  --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 92d52138a777e..b581b9a929ad1 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -988,8 +988,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
   }
 
   for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
-    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
     MVT RegisterVT = RegVTs[Value];
+    unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value],
+                                           RegisterVT);
     for (unsigned i = 0; i != NumRegs; ++i) {
       assert(Reg < Regs.size() && "Mismatch in # registers expected");
       unsigned TheReg = Regs[Reg++];
@@ -8241,7 +8242,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
   // remember that AX is actually i16 to get the right extension.
   const MVT RegVT = *TRI.legalclasstypes_begin(*RC);
 
-  if (OpInfo.ConstraintVT != MVT::Other) {
+  if (OpInfo.ConstraintVT != MVT::Other && RegVT != MVT::Untyped) {
     // If this is an FP operand in an integer register (or visa versa), or more
     // generally if the operand value disagrees with the register class we plan
     // to stick it in, fix the operand type.
@@ -8288,7 +8289,7 @@ static void GetRegistersForValue(SelectionDAG &DAG, const SDLoc &DL,
   // Initialize NumRegs.
   unsigned NumRegs = 1;
   if (OpInfo.ConstraintVT != MVT::Other)
-    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);
+    NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT, RegVT);
 
   // If this is a constraint for a specific physical register, like {r17},
   // assign it now.
@@ -8621,21 +8622,18 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
             return;
           }
 
-          MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType();
           SmallVector<unsigned, 4> Regs;
-
-          if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT)) {
-            unsigned NumRegs = InlineAsm::getNumOperandRegisters(OpFlag);
-            MachineRegisterInfo &RegInfo =
-                DAG.getMachineFunction().getRegInfo();
-            for (unsigned i = 0; i != NumRegs; ++i)
-              Regs.push_back(RegInfo.createVirtualRegister(RC));
-          } else {
-            emitInlineAsmError(Call,
-                               "inline asm error: This value type register "
-                               "class is not natively supported!");
-            return;
-          }
+          MachineFunction &MF = DAG.getMachineFunction();
+          MachineRegisterInfo &MRI = MF.getRegInfo();
+          const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+          RegisterSDNode *R = dyn_cast<RegisterSDNode>(AsmNodeOperands[CurOp+1]);
+          Register TiedReg = R->getReg();
+          MVT RegVT = R->getSimpleValueType(0);
+          const TargetRegisterClass *RC = TiedReg.isVirtual() ?
+            MRI.getRegClass(TiedReg) : TRI.getMinimalPhysRegClass(TiedReg);
+          unsigned NumRegs = InlineAsm::getNumOperandRegisters(OpFlag);
+          for (unsigned i = 0; i != NumRegs; ++i)
+            Regs.push_back(MRI.createVirtualRegister(RC));
 
           RegsForValue MatchedRegs(Regs, RegVT, InOperandVal.getValueType());
 

diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index 5c1f3dbc9afec..53a6ee9398c70 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1368,6 +1368,55 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, const SDLoc &DL,
   }
 }
 
+static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
+  SDLoc DL(In);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
+                           DAG.getIntPtrConstant(0, DL));
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
+                           DAG.getIntPtrConstant(1, DL));
+  SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
+                                    MVT::Untyped, Hi, Lo);
+  return SDValue(Pair, 0);
+}
+
+static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
+  SDLoc DL(In);
+  SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
+                                          DL, MVT::i64, In);
+  SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
+                                          DL, MVT::i64, In);
+  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
+}
+
+bool SystemZTargetLowering::splitValueIntoRegisterParts(
+    SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+    unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+  EVT ValueVT = Val.getValueType();
+  assert((ValueVT != MVT::i128 ||
+          ((NumParts == 1 && PartVT == MVT::Untyped) ||
+           (NumParts == 2 && PartVT == MVT::i64))) &&
+         "Unknown handling of i128 value.");
+  if (ValueVT == MVT::i128 && NumParts == 1) {
+    // Inline assembly operand.
+    Parts[0] = lowerI128ToGR128(DAG, Val);
+    return true;
+  }
+  return false;
+}
+
+SDValue SystemZTargetLowering::joinRegisterPartsIntoValue(
+    SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+    MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+  assert((ValueVT != MVT::i128 ||
+          ((NumParts == 1 && PartVT == MVT::Untyped) ||
+           (NumParts == 2 && PartVT == MVT::i64))) &&
+         "Unknown handling of i128 value.");
+  if (ValueVT == MVT::i128 && NumParts == 1)
+    // Inline assembly operand.
+    return lowerGR128ToI128(DAG, Parts[0]);
+  return SDValue();
+}
+
 SDValue SystemZTargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
@@ -5489,27 +5538,6 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
 
 // Lower operations with invalid operand or result types (currently used
 // only for 128-bit integer types).
-
-static SDValue lowerI128ToGR128(SelectionDAG &DAG, SDValue In) {
-  SDLoc DL(In);
-  SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
-                           DAG.getIntPtrConstant(0, DL));
-  SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64, In,
-                           DAG.getIntPtrConstant(1, DL));
-  SDNode *Pair = DAG.getMachineNode(SystemZ::PAIR128, DL,
-                                    MVT::Untyped, Hi, Lo);
-  return SDValue(Pair, 0);
-}
-
-static SDValue lowerGR128ToI128(SelectionDAG &DAG, SDValue In) {
-  SDLoc DL(In);
-  SDValue Hi = DAG.getTargetExtractSubreg(SystemZ::subreg_h64,
-                                          DL, MVT::i64, In);
-  SDValue Lo = DAG.getTargetExtractSubreg(SystemZ::subreg_l64,
-                                          DL, MVT::i64, In);
-  return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, Lo, Hi);
-}
-
 void
 SystemZTargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,

diff  --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index 21b3e3a1f67fd..557d066a149b7 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -422,6 +422,15 @@ class SystemZTargetLowering : public TargetLowering {
       return TypeWidenVector;
     return TargetLoweringBase::getPreferredVectorAction(VT);
   }
+  unsigned
+  getNumRegisters(LLVMContext &Context, EVT VT,
+                  Optional<MVT> RegisterVT) const override {
+    // i128 inline assembly operand.
+    if (VT == MVT::i128 &&
+        RegisterVT.hasValue() && RegisterVT.getValue() == MVT::Untyped)
+      return 1;
+    return TargetLowering::getNumRegisters(Context, VT);
+  }
   bool isCheapToSpeculateCtlz() const override { return true; }
   bool preferZeroCompareBranch() const override { return true; }
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &,
@@ -518,6 +527,15 @@ class SystemZTargetLowering : public TargetLowering {
   const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
   bool allowTruncateForTailCall(Type *, Type *) const override;
   bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+  bool splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL,
+                                   SDValue Val, SDValue *Parts,
+                                   unsigned NumParts, MVT PartVT,
+                                   Optional<CallingConv::ID> CC) const override;
+  SDValue
+  joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+                             const SDValue *Parts, unsigned NumParts,
+                             MVT PartVT, EVT ValueVT,
+                             Optional<CallingConv::ID> CC) const override;
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,

diff  --git a/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll b/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll
new file mode 100644
index 0000000000000..d79e1d875db7f
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/inline-asm-i128.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=s390x-linux-gnu -no-integrated-as < %s | FileCheck %s
+;
+; Test i128 (tied) operands.
+
+define i32 @fun0(i8* %p1, i32 signext %l1, i8* %p2, i32 signext %l2, i8 zeroext %pad) {
+; CHECK-LABEL: fun0:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lgr %r0, %r5
+; CHECK-NEXT:    # kill: def $r4d killed $r4d def $r4q
+; CHECK-NEXT:    lgr %r1, %r3
+; CHECK-NEXT:    # kill: def $r2d killed $r2d def $r2q
+; CHECK-NEXT:    sllg %r5, %r6, 24
+; CHECK-NEXT:    rosbg %r5, %r0, 40, 63, 0
+; CHECK-NEXT:    risbg %r3, %r1, 40, 191, 0
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    clcl %r2, %r4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    ogr %r3, %r5
+; CHECK-NEXT:    risbg %r0, %r3, 40, 191, 0
+; CHECK-NEXT:    ipm %r2
+; CHECK-NEXT:    afi %r2, -268435456
+; CHECK-NEXT:    srl %r2, 31
+; CHECK-NEXT:    br %r14
+entry:
+  %0 = ptrtoint i8* %p1 to i64
+  %1 = ptrtoint i8* %p2 to i64
+  %and5 = and i32 %l2, 16777215
+  %2 = zext i32 %and5 to i64
+  %conv7 = zext i8 %pad to i64
+  %shl = shl nuw nsw i64 %conv7, 24
+  %or = or i64 %shl, %2
+  %u1.sroa.0.0.insert.ext = zext i64 %0 to i128
+  %u1.sroa.0.0.insert.shift = shl nuw i128 %u1.sroa.0.0.insert.ext, 64
+  %3 = and i32 %l1, 16777215
+  %u1.sroa.0.0.insert.mask = zext i32 %3 to i128
+  %u1.sroa.0.0.insert.insert = or i128 %u1.sroa.0.0.insert.shift, %u1.sroa.0.0.insert.mask
+  %u2.sroa.5.0.insert.ext = zext i64 %or to i128
+  %u2.sroa.0.0.insert.ext = zext i64 %1 to i128
+  %u2.sroa.0.0.insert.shift = shl nuw i128 %u2.sroa.0.0.insert.ext, 64
+  %u2.sroa.0.0.insert.insert = or i128 %u2.sroa.0.0.insert.shift, %u2.sroa.5.0.insert.ext
+  %4 = tail call { i128, i128 } asm "clcl $0, $1", "=r,=r,0,1"(i128 %u1.sroa.0.0.insert.insert, i128 %u2.sroa.0.0.insert.insert)
+  %asmresult = extractvalue { i128, i128 } %4, 0
+  %asmresult11 = extractvalue { i128, i128 } %4, 1
+  %5 = or i128 %asmresult, %asmresult11
+  %6 = and i128 %5, 16777215
+  %7 = icmp eq i128 %6, 0
+  %land.ext = zext i1 %7 to i32
+  ret i32 %land.ext
+}
+
+; Test a phys-reg def.
+define void @fun1(i128* %Src, i128* %Dst) {
+; CHECK-LABEL: fun1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    BLA %r4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    stg %r5, 8(%r3)
+; CHECK-NEXT:    stg %r4, 0(%r3)
+; CHECK-NEXT:    br %r14
+entry:
+   %IAsm = call i128 asm "BLA $0", "={r4}"()
+  store volatile i128 %IAsm, i128* %Dst
+  ret void
+}
+
+; Test a phys-reg use.
+define void @fun2(i128* %Src, i128* %Dst) {
+; CHECK-LABEL: fun2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lg %r5, 8(%r2)
+; CHECK-NEXT:    lg %r4, 0(%r2)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    BLA %r4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    br %r14
+entry:
+  %L = load i128, i128* %Src
+  call void asm "BLA $0", "{r4}"(i128 %L)
+  ret void
+}
+
+; Test phys-reg use and phys-reg def.
+define void @fun3(i128* %Src, i128* %Dst) {
+; CHECK-LABEL: fun3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lg %r1, 8(%r2)
+; CHECK-NEXT:    lg %r0, 0(%r2)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    BLA %r4, %r0
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    stg %r5, 8(%r3)
+; CHECK-NEXT:    stg %r4, 0(%r3)
+; CHECK-NEXT:    br %r14
+entry:
+  %L = load i128, i128* %Src
+  %IAsm = call i128 asm "BLA $0, $1", "={r4},{r0}"(i128 %L)
+  store volatile i128 %IAsm, i128* %Dst
+  ret void
+}
+
+; Test a tied phys-reg.
+define void @fun4(i128* %Src, i128* %Dst) {
+; CHECK-LABEL: fun4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    lg %r5, 8(%r2)
+; CHECK-NEXT:    lg %r4, 0(%r2)
+; CHECK-NEXT:    #APP
+; CHECK-NEXT:    BLA %r4, %r4
+; CHECK-NEXT:    #NO_APP
+; CHECK-NEXT:    stg %r5, 8(%r3)
+; CHECK-NEXT:    stg %r4, 0(%r3)
+; CHECK-NEXT:    br %r14
+entry:
+  %L = load i128, i128* %Src
+  %IAsm = call i128 asm "BLA $0, $1", "={r4},0"(i128 %L)
+  store volatile i128 %IAsm, i128* %Dst
+  ret void
+}