[PATCH] Add a LOAD_SEQUENCE_POINT ISDOpcode

Tom Stellard tom at stellard.net
Wed Nov 13 11:17:25 PST 2013


On Wed, Nov 13, 2013 at 09:19:17AM -0800, Richard Sandiford wrote:
> One unusual feature of the z architecture is that the result of a previous load can be reused indefinitely for subsequent loads, even if a cache-coherent store to that location is performed by another CPU.  A special serialising instruction must be used if you want to force a load to be reattempted.  To quote the architecture manual (where MVI is MOVE IMMEDIATE and CLI is COMPARE LOGICAL IMMEDIATE):

We have a very similar 'feature' for the VLIW targets in the R600 backend.
The result of a load from LDS (Local memory in OpenCL) is stored in the
'output queue'.  When an ALU instruction wants to use the result of a
load, it has two options: 1. It can read the value from the top of the
queue and then pop it off.  2. It can read the value and leave it on
the queue.  If instructions use option 2, then the result of the load
can be used indefinitely.
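For anyone unfamiliar with the R600 behaviour, here is a rough
single-threaded C model of the two options.  This is not actual backend
code; the queue type and helper names are invented purely for
illustration:

```c
#include <assert.h>

/* Hypothetical model of the R600 LDS "output queue" described above.
   Option 1 pops the value after reading it; option 2 peeks and leaves
   it on the queue, so later reads keep seeing the same (possibly
   stale) load result. */
typedef struct {
    int values[8];
    int head, tail;
} OutputQueue;

static void queue_push(OutputQueue *q, int v) {
    q->values[q->tail++ % 8] = v;
}

/* Option 1: read the value from the top of the queue and pop it off. */
static int queue_read_pop(OutputQueue *q) {
    return q->values[q->head++ % 8];
}

/* Option 2: read the value and leave it on the queue; the load result
   stays reusable indefinitely. */
static int queue_peek(const OutputQueue *q) {
    return q->values[q->head % 8];
}
```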

> 
>   Following is an example showing the effects of serialization. Location
>   A initially contains FF hex.
> 
>   CPU 1                  CPU 2
>   MVI A,X'00'       G    CLI A,X'00'
>   BCR 15,0               BNE G
> 
>   The BCR 15,0 instruction executed by CPU 1 is a serializing
>   instruction that ensures that the store by CPU 1 at location A is
>   completed. However, CPU 2 may loop indefinitely, or until the next
>   interruption on CPU 2, because CPU 2 may already have fetched from
>   location A for every execution of the CLI instruction. A serializing
>   instruction must be in the CPU-2 loop to ensure that CPU 2 will again
>   fetch from location A.
> 
> Since volatile loads are not supposed to be omitted in this way, we should insert a serialising instruction before each such load.  The same goes for atomic loads.
> 
> This patch adds a new ISDOpcode for this situation.  It's emitted in a similar way to ATOMIC_FENCE, but in different circumstances and for different reasons:
> 
>     /// Marks a point before a volatile or atomic load, to ensure that
>     /// subsequent loads are attempted.  This exists for architectures
>     /// like SystemZ that allow results from previous loads to be reused
>     /// indefinitely.  For example, the architecture may treat a loop:
>     ///
>     ///   while (*i == 0);
>     ///
>     /// as:
>     ///
>     ///   while (*i == 0) spin-until-interrupt;
>     ///
>     /// omitting all but the first load in each time slice (even if a
>     /// cache-coherent store is performed by another CPU).  Inserting
>     /// this operation forces each iteration of the loop to attempt a load.
>     ///
>     /// Note that this is not an ordering fence per se.  It simply ensures
>     /// that a sequence of N loads is not collapsed into 1 load.
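To make the intent of the comment concrete, the loop can be written with
C11 atomics.  This is only an illustrative sketch, not how the backend
lowers the node: the atomic_thread_fence here stands in for the
serialization that LOAD_SEQUENCE_POINT requests (on SystemZ the patch
emits a "bcr" instead):

```c
#include <assert.h>
#include <stdatomic.h>

/* Sketch of the spin loop from the quoted doc comment.  Without some
   serialization point, an architecture like SystemZ may reuse the
   first load's result on every iteration; the fence forces each
   iteration to attempt a fresh load. */
static _Atomic int flag = 1;  /* pre-set so this sketch terminates */

int spin_until_nonzero(void) {
    int v;
    do {
        /* Plays the role of LOAD_SEQUENCE_POINT before the load. */
        atomic_thread_fence(memory_order_seq_cst);
        v = atomic_load_explicit(&flag, memory_order_relaxed);
    } while (v == 0);
    return v;
}
```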

What would cause N loads to be collapsed into 1 load?  Is this something
the Legalizer might do?

-Tom

>     LOAD_SEQUENCE_POINT,
> 
> I'm certainly open to better names than LOAD_SEQUENCE_POINT though.
> 
> http://llvm-reviews.chandlerc.com/D2171
> 
> Files:
>   include/llvm/CodeGen/ISDOpcodes.h
>   include/llvm/Target/TargetLowering.h
>   include/llvm/Target/TargetSelectionDAG.td
>   lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
>   lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
>   lib/CodeGen/TargetLoweringBase.cpp
>   lib/Target/SystemZ/SystemZAsmPrinter.cpp
>   lib/Target/SystemZ/SystemZISelLowering.cpp
>   lib/Target/SystemZ/SystemZISelLowering.h
>   lib/Target/SystemZ/SystemZInstrInfo.td
>   lib/Target/SystemZ/SystemZProcessors.td
>   lib/Target/SystemZ/SystemZSubtarget.cpp
>   lib/Target/SystemZ/SystemZSubtarget.h
>   test/CodeGen/SystemZ/Large/branch-range-01.py
>   test/CodeGen/SystemZ/Large/branch-range-02.py
>   test/CodeGen/SystemZ/Large/branch-range-03.py
>   test/CodeGen/SystemZ/Large/branch-range-04.py
>   test/CodeGen/SystemZ/Large/branch-range-05.py
>   test/CodeGen/SystemZ/Large/branch-range-06.py
>   test/CodeGen/SystemZ/Large/branch-range-09.py
>   test/CodeGen/SystemZ/Large/branch-range-10.py
>   test/CodeGen/SystemZ/Large/branch-range-11.py
>   test/CodeGen/SystemZ/Large/branch-range-12.py
>   test/CodeGen/SystemZ/frame-13.ll
>   test/CodeGen/SystemZ/frame-14.ll
>   test/CodeGen/SystemZ/serialize-01.ll
>   test/CodeGen/SystemZ/spill-01.ll

> Index: include/llvm/CodeGen/ISDOpcodes.h
> ===================================================================
> --- include/llvm/CodeGen/ISDOpcodes.h
> +++ include/llvm/CodeGen/ISDOpcodes.h
> @@ -632,6 +632,25 @@
>      /// is the chain and the second operand is the alloca pointer.
>      LIFETIME_START, LIFETIME_END,
>  
> +    /// Marks a point before a volatile or atomic load, to ensure that
> +    /// subsequent loads are attempted.  This exists for architectures
> +    /// like SystemZ that allow results from previous loads to be reused
> +    /// indefinitely.  For example, the architecture may treat a loop:
> +    ///
> +    ///   while (*i == 0);
> +    ///
> +    /// as:
> +    ///
> +    ///   while (*i == 0) spin-until-interrupt;
> +    ///
> +    /// omitting all but the first load in each time slice (even if a
> +    /// cache-coherent store is performed by another CPU).  Inserting
> +    /// this operation forces each iteration of the loop to attempt a load.
> +    ///
> +    /// Note that this is not an ordering fence per se.  It simply ensures
> +    /// that a sequence of N loads is not collapsed into 1 load.
> +    LOAD_SEQUENCE_POINT,
> +
>      /// BUILTIN_OP_END - This must be the last enum value in this list.
>      /// The target-specific pre-isel opcode values start here.
>      BUILTIN_OP_END
> Index: include/llvm/Target/TargetLowering.h
> ===================================================================
> --- include/llvm/Target/TargetLowering.h
> +++ include/llvm/Target/TargetLowering.h
> @@ -813,6 +813,12 @@
>      return InsertFencesForAtomic;
>    }
>  
> +  /// Return whether the DAG builder should automatically insert
> +  /// LOAD_SEQUENCE_POINTs before volatile and atomic loads.
> +  bool getInsertLoadSequencePoints() const {
> +    return InsertLoadSequencePoints;
> +  }
> +
>    /// Return true if the target stores stack protector cookies at a fixed offset
>    /// in some non-standard address space, and populates the address space and
>    /// offset as appropriate.
> @@ -1091,6 +1097,12 @@
>      InsertFencesForAtomic = fence;
>    }
>  
> +  /// Set whether the DAG builder should automatically insert
> +  /// LOAD_SEQUENCE_POINTs before volatile and atomic loads.
> +  void setInsertLoadSequencePoints(bool ILSP) {
> +    InsertLoadSequencePoints = ILSP;
> +  }
> +
>  public:
>    //===--------------------------------------------------------------------===//
>    // Addressing mode description hooks (used by LSR etc).
> @@ -1386,6 +1398,10 @@
>    /// weak memory ordering.)
>    bool InsertFencesForAtomic;
>  
> +  /// Whether the DAG builder should automatically insert
> +  /// LOAD_SEQUENCE_POINTs before volatile and atomic loads.
> +  bool InsertLoadSequencePoints;
> +
>    /// If set to a physical register, this specifies the register that
>    /// llvm.savestack/llvm.restorestack should save and restore.
>    unsigned StackPointerRegisterToSaveRestore;
> Index: include/llvm/Target/TargetSelectionDAG.td
> ===================================================================
> --- include/llvm/Target/TargetSelectionDAG.td
> +++ include/llvm/Target/TargetSelectionDAG.td
> @@ -448,6 +448,9 @@
>  def atomic_store     : SDNode<"ISD::ATOMIC_STORE", SDTAtomicStore,
>                      [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
>  
> +def load_sequence_point : SDNode<"ISD::LOAD_SEQUENCE_POINT" , SDTNone,
> +                                 [SDNPHasChain, SDNPSideEffect]>;
> +
>  // Do not use ld, st directly. Use load, extload, sextload, zextload, store,
>  // and truncst (see below).
>  def ld         : SDNode<"ISD::LOAD"       , SDTLoad,
> Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> ===================================================================
> --- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> +++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
> @@ -3383,7 +3383,7 @@
>  
>    SDValue Root;
>    bool ConstantMemory = false;
> -  if (I.isVolatile() || NumValues > MaxParallelChains)
> +  if (isVolatile || NumValues > MaxParallelChains)
>      // Serialize volatile loads with other side effects.
>      Root = getRoot();
>    else if (AA->pointsToConstantMemory(
> @@ -3396,6 +3396,11 @@
>      Root = DAG.getRoot();
>    }
>  
> +  const TargetLowering *TLI = TM.getTargetLowering();
> +  if (isVolatile && TLI->getInsertLoadSequencePoints())
> +    Root = DAG.getNode(ISD::LOAD_SEQUENCE_POINT, getCurSDLoc(),
> +                       MVT::Other, Root);
> +
>    SmallVector<SDValue, 4> Values(NumValues);
>    SmallVector<SDValue, 4> Chains(std::min(unsigned(MaxParallelChains),
>                                            NumValues));
> @@ -3620,6 +3625,10 @@
>    if (I.getAlignment() < VT.getSizeInBits() / 8)
>      report_fatal_error("Cannot generate unaligned atomic load");
>  
> +  if (TLI->getInsertLoadSequencePoints())
> +    InChain = DAG.getNode(ISD::LOAD_SEQUENCE_POINT, getCurSDLoc(),
> +                          MVT::Other, InChain);
> +
>    SDValue L =
>      DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
>                    getValue(I.getPointerOperand()),
> Index: lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
> ===================================================================
> --- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
> +++ lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
> @@ -69,6 +69,7 @@
>    case ISD::ATOMIC_LOAD_UMAX:           return "AtomicLoadUMax";
>    case ISD::ATOMIC_LOAD:                return "AtomicLoad";
>    case ISD::ATOMIC_STORE:               return "AtomicStore";
> +  case ISD::LOAD_SEQUENCE_POINT:        return "LoadSequencePoint";
>    case ISD::PCMARKER:                   return "PCMarker";
>    case ISD::READCYCLECOUNTER:           return "ReadCycleCounter";
>    case ISD::SRCVALUE:                   return "SrcValue";
> Index: lib/CodeGen/TargetLoweringBase.cpp
> ===================================================================
> --- lib/CodeGen/TargetLoweringBase.cpp
> +++ lib/CodeGen/TargetLoweringBase.cpp
> @@ -687,6 +687,7 @@
>    PrefLoopAlignment = 0;
>    MinStackArgumentAlignment = 1;
>    InsertFencesForAtomic = false;
> +  InsertLoadSequencePoints = false;
>    SupportJumpTables = true;
>    MinimumJumpTableEntries = 4;
>  
> Index: lib/Target/SystemZ/SystemZAsmPrinter.cpp
> ===================================================================
> --- lib/Target/SystemZ/SystemZAsmPrinter.cpp
> +++ lib/Target/SystemZ/SystemZAsmPrinter.cpp
> @@ -137,6 +137,15 @@
>  
>  #undef LOWER_HIGH
>  
> +  case SystemZ::LoadSequencePoint:
> +    if (Subtarget->hasFastSerialization())
> +      LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
> +        .addImm(14).addReg(SystemZ::R0D);
> +    else
> +      LoweredMI = MCInstBuilder(SystemZ::AsmBCR)
> +        .addImm(15).addReg(SystemZ::R0D);
> +    break;
> +
>    default:
>      Lower.lower(MI, LoweredMI);
>      break;
> Index: lib/Target/SystemZ/SystemZISelLowering.cpp
> ===================================================================
> --- lib/Target/SystemZ/SystemZISelLowering.cpp
> +++ lib/Target/SystemZ/SystemZISelLowering.cpp
> @@ -277,6 +277,10 @@
>    setOperationAction(ISD::VACOPY,  MVT::Other, Custom);
>    setOperationAction(ISD::VAEND,   MVT::Other, Expand);
>  
> +  // Force a serialization instruction to be inserted before volatile
> +  // and atomic loads.
> +  setInsertLoadSequencePoints(true);
> +
>    // We want to use MVC in preference to even a single load/store pair.
>    MaxStoresPerMemcpy = 0;
>    MaxStoresPerMemcpyOptSize = 0;
> Index: lib/Target/SystemZ/SystemZISelLowering.h
> ===================================================================
> --- lib/Target/SystemZ/SystemZISelLowering.h
> +++ lib/Target/SystemZ/SystemZISelLowering.h
> @@ -274,6 +274,7 @@
>    SDValue lowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG,
>                             unsigned Opcode) const;
>    SDValue lowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
> +  SDValue lowerLOAD_SEQUENCE_POINT(SDValue Op, SelectionDAG &DAG) const;
>    SDValue lowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
>    SDValue lowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const;
>    SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
> Index: lib/Target/SystemZ/SystemZInstrInfo.td
> ===================================================================
> --- lib/Target/SystemZ/SystemZInstrInfo.td
> +++ lib/Target/SystemZ/SystemZInstrInfo.td
> @@ -1194,6 +1194,8 @@
>  // Atomic operations
>  //===----------------------------------------------------------------------===//
>  
> +def LoadSequencePoint : Alias<2, (outs), (ins), [(load_sequence_point)]>;
> +
>  def ATOMIC_SWAPW        : AtomicLoadWBinaryReg<z_atomic_swapw>;
>  def ATOMIC_SWAP_32      : AtomicLoadBinaryReg32<atomic_swap_32>;
>  def ATOMIC_SWAP_64      : AtomicLoadBinaryReg64<atomic_swap_64>;
> Index: lib/Target/SystemZ/SystemZProcessors.td
> ===================================================================
> --- lib/Target/SystemZ/SystemZProcessors.td
> +++ lib/Target/SystemZ/SystemZProcessors.td
> @@ -36,11 +36,16 @@
>    "Assume that the floating-point extension facility is installed"
>  >;
>  
> +def FeatureFastSerialization : SystemZFeature<
> +  "fast-serialization", "FastSerialization",
> +  "Assume that the fast-serialization facility is installed"
> +>;
> +
>  def : Processor<"generic", NoItineraries, []>;
>  def : Processor<"z10", NoItineraries, []>;
>  def : Processor<"z196", NoItineraries,
>                  [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
> -                 FeatureFPExtension]>;
> +                 FeatureFPExtension, FeatureFastSerialization]>;
>  def : Processor<"zEC12", NoItineraries,
>                  [FeatureDistinctOps, FeatureLoadStoreOnCond, FeatureHighWord,
> -                 FeatureFPExtension]>;
> +                 FeatureFPExtension, FeatureFastSerialization]>;
> Index: lib/Target/SystemZ/SystemZSubtarget.cpp
> ===================================================================
> --- lib/Target/SystemZ/SystemZSubtarget.cpp
> +++ lib/Target/SystemZ/SystemZSubtarget.cpp
> @@ -23,7 +23,7 @@
>                                     const std::string &FS)
>    : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false),
>      HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false),
> -    TargetTriple(TT) {
> +    HasFastSerialization(false), TargetTriple(TT) {
>    std::string CPUName = CPU;
>    if (CPUName.empty())
>      CPUName = "generic";
> Index: lib/Target/SystemZ/SystemZSubtarget.h
> ===================================================================
> --- lib/Target/SystemZ/SystemZSubtarget.h
> +++ lib/Target/SystemZ/SystemZSubtarget.h
> @@ -31,6 +31,7 @@
>    bool HasLoadStoreOnCond;
>    bool HasHighWord;
>    bool HasFPExtension;
> +  bool HasFastSerialization;
>  
>  private:
>    Triple TargetTriple;
> @@ -57,6 +58,9 @@
>    // Return true if the target has the floating-point extension facility.
>    bool hasFPExtension() const { return HasFPExtension; }
>  
> +  // Return true if the target has the fast-serialization facility.
> +  bool hasFastSerialization() const { return HasFastSerialization; }
> +
>    // Return true if GV can be accessed using LARL for reloc model RM
>    // and code model CM.
>    bool isPC32DBLSymbol(const GlobalValue *GV, Reloc::Model RM,
> Index: test/CodeGen/SystemZ/Large/branch-range-01.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-01.py
> +++ test/CodeGen/SystemZ/Large/branch-range-01.py
> @@ -79,7 +79,7 @@
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
>      print '  %%bstop%d = getelementptr i32 *%%stop, i64 %d' % (i, i)
> -    print '  %%bcur%d = load volatile i32 *%%bstop%d' % (i, i)
> +    print '  %%bcur%d = load i32 *%%bstop%d' % (i, i)
>      print '  %%btest%d = icmp eq i32 %%limit, %%bcur%d' % (i, i)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
>      print ''
> @@ -95,7 +95,7 @@
>  
>  for i in xrange(branch_blocks):
>      print '  %%astop%d = getelementptr i32 *%%stop, i64 %d' % (i, i + 25)
> -    print '  %%acur%d = load volatile i32 *%%astop%d' % (i, i)
> +    print '  %%acur%d = load i32 *%%astop%d' % (i, i)
>      print '  %%atest%d = icmp eq i32 %%limit, %%acur%d' % (i, i)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
>      print ''
> Index: test/CodeGen/SystemZ/Large/branch-range-02.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-02.py
> +++ test/CodeGen/SystemZ/Large/branch-range-02.py
> @@ -72,7 +72,7 @@
>      print 'b%d:' % i
>      print '  store volatile i8 %d, i8 *%%base' % value
>      print '  %%astop%d = getelementptr i32 *%%stop, i64 %d' % (i, i)
> -    print '  %%acur%d = load volatile i32 *%%astop%d' % (i, i)
> +    print '  %%acur%d = load i32 *%%astop%d' % (i, i)
>      print '  %%atest%d = icmp eq i32 %%limit, %%acur%d' % (i, i)
>      print '  br i1 %%atest%d, label %%%s, label %%%s' % (i, other, next)
>  
> Index: test/CodeGen/SystemZ/Large/branch-range-03.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-03.py
> +++ test/CodeGen/SystemZ/Large/branch-range-03.py
> @@ -79,7 +79,7 @@
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
>      print '  %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
> -    print '  %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
> +    print '  %%bcur%d = load i8 *%%bstop%d' % (i, i)
>      print '  %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
>      print '  %%btest%d = icmp eq i32 %%limit, %%bext%d' % (i, i)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -96,7 +96,7 @@
>  
>  for i in xrange(branch_blocks):
>      print '  %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
> -    print '  %%acur%d = load volatile i8 *%%astop%d' % (i, i)
> +    print '  %%acur%d = load i8 *%%astop%d' % (i, i)
>      print '  %%aext%d = sext i8 %%acur%d to i32' % (i, i)
>      print '  %%atest%d = icmp eq i32 %%limit, %%aext%d' % (i, i)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-04.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-04.py
> +++ test/CodeGen/SystemZ/Large/branch-range-04.py
> @@ -83,7 +83,7 @@
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
>      print '  %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
> -    print '  %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
> +    print '  %%bcur%d = load i8 *%%bstop%d' % (i, i)
>      print '  %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
>      print '  %%btest%d = icmp eq i64 %%limit, %%bext%d' % (i, i)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -100,7 +100,7 @@
>  
>  for i in xrange(branch_blocks):
>      print '  %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
> -    print '  %%acur%d = load volatile i8 *%%astop%d' % (i, i)
> +    print '  %%acur%d = load i8 *%%astop%d' % (i, i)
>      print '  %%aext%d = sext i8 %%acur%d to i64' % (i, i)
>      print '  %%atest%d = icmp eq i64 %%limit, %%aext%d' % (i, i)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-05.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-05.py
> +++ test/CodeGen/SystemZ/Large/branch-range-05.py
> @@ -82,7 +82,7 @@
>  for i in xrange(branch_blocks):
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
> -    print '  %%bcur%d = load volatile i8 *%%stop' % i
> +    print '  %%bcur%d = load i8 *%%stop' % i
>      print '  %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
>      print '  %%btest%d = icmp slt i32 %%bext%d, %d' % (i, i, i + 50)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -98,7 +98,7 @@
>      print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
>  
>  for i in xrange(branch_blocks):
> -    print '  %%acur%d = load volatile i8 *%%stop' % i
> +    print '  %%acur%d = load i8 *%%stop' % i
>      print '  %%aext%d = sext i8 %%acur%d to i32' % (i, i)
>      print '  %%atest%d = icmp slt i32 %%aext%d, %d' % (i, i, i + 100)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-06.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-06.py
> +++ test/CodeGen/SystemZ/Large/branch-range-06.py
> @@ -82,7 +82,7 @@
>  for i in xrange(branch_blocks):
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
> -    print '  %%bcur%d = load volatile i8 *%%stop' % i
> +    print '  %%bcur%d = load i8 *%%stop' % i
>      print '  %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
>      print '  %%btest%d = icmp slt i64 %%bext%d, %d' % (i, i, i + 50)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -98,7 +98,7 @@
>      print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
>  
>  for i in xrange(branch_blocks):
> -    print '  %%acur%d = load volatile i8 *%%stop' % i
> +    print '  %%acur%d = load i8 *%%stop' % i
>      print '  %%aext%d = sext i8 %%acur%d to i64' % (i, i)
>      print '  %%atest%d = icmp slt i64 %%aext%d, %d' % (i, i, i + 100)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-09.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-09.py
> +++ test/CodeGen/SystemZ/Large/branch-range-09.py
> @@ -79,7 +79,7 @@
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
>      print '  %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
> -    print '  %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
> +    print '  %%bcur%d = load i8 *%%bstop%d' % (i, i)
>      print '  %%bext%d = sext i8 %%bcur%d to i32' % (i, i)
>      print '  %%btest%d = icmp ult i32 %%limit, %%bext%d' % (i, i)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -96,7 +96,7 @@
>  
>  for i in xrange(branch_blocks):
>      print '  %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
> -    print '  %%acur%d = load volatile i8 *%%astop%d' % (i, i)
> +    print '  %%acur%d = load i8 *%%astop%d' % (i, i)
>      print '  %%aext%d = sext i8 %%acur%d to i32' % (i, i)
>      print '  %%atest%d = icmp ult i32 %%limit, %%aext%d' % (i, i)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-10.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-10.py
> +++ test/CodeGen/SystemZ/Large/branch-range-10.py
> @@ -83,7 +83,7 @@
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
>      print '  %%bstop%d = getelementptr i8 *%%stop, i64 %d' % (i, i)
> -    print '  %%bcur%d = load volatile i8 *%%bstop%d' % (i, i)
> +    print '  %%bcur%d = load i8 *%%bstop%d' % (i, i)
>      print '  %%bext%d = sext i8 %%bcur%d to i64' % (i, i)
>      print '  %%btest%d = icmp ult i64 %%limit, %%bext%d' % (i, i)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -100,7 +100,7 @@
>  
>  for i in xrange(branch_blocks):
>      print '  %%astop%d = getelementptr i8 *%%stop, i64 %d' % (i, i + 25)
> -    print '  %%acur%d = load volatile i8 *%%astop%d' % (i, i)
> +    print '  %%acur%d = load i8 *%%astop%d' % (i, i)
>      print '  %%aext%d = sext i8 %%acur%d to i64' % (i, i)
>      print '  %%atest%d = icmp ult i64 %%limit, %%aext%d' % (i, i)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-11.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-11.py
> +++ test/CodeGen/SystemZ/Large/branch-range-11.py
> @@ -98,8 +98,8 @@
>  for i in xrange(branch_blocks):
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
> -    print '  %%bcur%da = load volatile i32 *%%stopa' % i
> -    print '  %%bcur%db = load volatile i32 *%%stopb' % i
> +    print '  %%bcur%da = load i32 *%%stopa' % i
> +    print '  %%bcur%db = load i32 *%%stopb' % i
>      print '  %%bsub%d = sub i32 %%bcur%da, %%bcur%db' % (i, i, i)
>      print '  %%btest%d = icmp ult i32 %%bsub%d, %d' % (i, i, i + 50)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -115,8 +115,8 @@
>      print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
>  
>  for i in xrange(branch_blocks):
> -    print '  %%acur%da = load volatile i32 *%%stopa' % i
> -    print '  %%acur%db = load volatile i32 *%%stopb' % i
> +    print '  %%acur%da = load i32 *%%stopa' % i
> +    print '  %%acur%db = load i32 *%%stopb' % i
>      print '  %%asub%d = sub i32 %%acur%da, %%acur%db' % (i, i, i)
>      print '  %%atest%d = icmp ult i32 %%asub%d, %d' % (i, i, i + 100)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/Large/branch-range-12.py
> ===================================================================
> --- test/CodeGen/SystemZ/Large/branch-range-12.py
> +++ test/CodeGen/SystemZ/Large/branch-range-12.py
> @@ -98,8 +98,8 @@
>  for i in xrange(branch_blocks):
>      next = 'before%d' % (i + 1) if i + 1 < branch_blocks else 'main'
>      print 'before%d:' % i
> -    print '  %%bcur%da = load volatile i64 *%%stopa' % i
> -    print '  %%bcur%db = load volatile i64 *%%stopb' % i
> +    print '  %%bcur%da = load i64 *%%stopa' % i
> +    print '  %%bcur%db = load i64 *%%stopb' % i
>      print '  %%bsub%d = sub i64 %%bcur%da, %%bcur%db' % (i, i, i)
>      print '  %%btest%d = icmp ult i64 %%bsub%d, %d' % (i, i, i + 50)
>      print '  br i1 %%btest%d, label %%after0, label %%%s' % (i, next)
> @@ -115,8 +115,8 @@
>      print '  store volatile i8 %d, i8 *%%ptr%d' % (value, i)
>  
>  for i in xrange(branch_blocks):
> -    print '  %%acur%da = load volatile i64 *%%stopa' % i
> -    print '  %%acur%db = load volatile i64 *%%stopb' % i
> +    print '  %%acur%da = load i64 *%%stopa' % i
> +    print '  %%acur%db = load i64 *%%stopb' % i
>      print '  %%asub%d = sub i64 %%acur%da, %%acur%db' % (i, i, i)
>      print '  %%atest%d = icmp ult i64 %%asub%d, %d' % (i, i, i + 100)
>      print '  br i1 %%atest%d, label %%main, label %%after%d' % (i, i)
> Index: test/CodeGen/SystemZ/frame-13.ll
> ===================================================================
> --- test/CodeGen/SystemZ/frame-13.ll
> +++ test/CodeGen/SystemZ/frame-13.ll
> @@ -243,8 +243,8 @@
>  
>  ; And again with maximum register pressure.  The only spill slots that the
>  ; NOFP case needs are the emergency ones, so the offsets are the same as for f2.
> -; However, the FP case uses %r11 as the frame pointer and must therefore
> -; spill a second register.  This leads to an extra displacement of 8.
> +; The FP case needs to spill an extra register and is too dependent on
> +; register allocation heuristics for a stable test.
>  define void @f11(i32 *%vptr) {
>  ; CHECK-NOFP-LABEL: f11:
>  ; CHECK-NOFP: stmg %r6, %r15,
> @@ -254,15 +254,6 @@
>  ; CHECK-NOFP: lg [[REGISTER]], [[OFFSET]](%r15)
>  ; CHECK-NOFP: lmg %r6, %r15,
>  ; CHECK-NOFP: br %r14
> -;
> -; CHECK-FP-LABEL: f11:
> -; CHECK-FP: stmg %r6, %r15,
> -; CHECK-FP: stg [[REGISTER:%r[1-9][0-4]?]], [[OFFSET:160|168]](%r11)
> -; CHECK-FP: lay [[REGISTER]], 4096(%r11)
> -; CHECK-FP: mvhi 8([[REGISTER]]), 42
> -; CHECK-FP: lg [[REGISTER]], [[OFFSET]](%r11)
> -; CHECK-FP: lmg %r6, %r15,
> -; CHECK-FP: br %r14
>    %i0 = load volatile i32 *%vptr
>    %i1 = load volatile i32 *%vptr
>    %i3 = load volatile i32 *%vptr
> Index: test/CodeGen/SystemZ/frame-14.ll
> ===================================================================
> --- test/CodeGen/SystemZ/frame-14.ll
> +++ test/CodeGen/SystemZ/frame-14.ll
> @@ -266,8 +266,8 @@
>  
>  ; And again with maximum register pressure.  The only spill slots that the
>  ; NOFP case needs are the emergency ones, so the offsets are the same as for f4.
> -; However, the FP case uses %r11 as the frame pointer and must therefore
> -; spill a second register.  This leads to an extra displacement of 8.
> +; The FP case needs to spill an extra register and is too dependent on
> +; register allocation heuristics for a stable test.
>  define void @f11(i32 *%vptr) {
>  ; CHECK-NOFP-LABEL: f11:
>  ; CHECK-NOFP: stmg %r6, %r15,
> @@ -278,16 +278,6 @@
>  ; CHECK-NOFP: lg [[REGISTER]], [[OFFSET]](%r15)
>  ; CHECK-NOFP: lmg %r6, %r15,
>  ; CHECK-NOFP: br %r14
> -;
> -; CHECK-FP-LABEL: f11:
> -; CHECK-FP: stmg %r6, %r15,
> -; CHECK-FP: stg [[REGISTER:%r[1-9][0-4]?]], [[OFFSET:160|168]](%r11)
> -; CHECK-FP: llilh [[REGISTER]], 8
> -; CHECK-FP: agr [[REGISTER]], %r11
> -; CHECK-FP: mvi 8([[REGISTER]]), 42
> -; CHECK-FP: lg [[REGISTER]], [[OFFSET]](%r11)
> -; CHECK-FP: lmg %r6, %r15,
> -; CHECK-FP: br %r14
>    %i0 = load volatile i32 *%vptr
>    %i1 = load volatile i32 *%vptr
>    %i3 = load volatile i32 *%vptr
> Index: test/CodeGen/SystemZ/serialize-01.ll
> ===================================================================
> --- /dev/null
> +++ test/CodeGen/SystemZ/serialize-01.ll
> @@ -0,0 +1,21 @@
> +; Test serialization instructions.
> +;
> +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z10 | \
> +; RUN:   FileCheck %s -check-prefix=CHECK-FULL
> +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z196 | \
> +; RUN:   FileCheck %s -check-prefix=CHECK-FAST
> +
> +; Check that volatile loads produce a serialisation.
> +define i32 @f1(i32 *%src) {
> +; CHECK-FULL-LABEL: f1:
> +; CHECK-FULL: bcr 15, %r0
> +; CHECK-FULL: l %r2, 0(%r2)
> +; CHECK-FULL: br %r14
> +;
> +; CHECK-FAST-LABEL: f1:
> +; CHECK-FAST: bcr 14, %r0
> +; CHECK-FAST: l %r2, 0(%r2)
> +; CHECK-FAST: br %r14
> +  %val = load volatile i32 *%src
> +  ret i32 %val
> +}
> Index: test/CodeGen/SystemZ/spill-01.ll
> ===================================================================
> --- test/CodeGen/SystemZ/spill-01.ll
> +++ test/CodeGen/SystemZ/spill-01.ll
> @@ -400,15 +400,15 @@
>  ; CHECK: stgrl [[REG]], h8
>  ; CHECK: br %r14
>  entry:
> +  %val8 = load volatile i64 *@h8
>    %val0 = load volatile i64 *@h0
>    %val1 = load volatile i64 *@h1
>    %val2 = load volatile i64 *@h2
>    %val3 = load volatile i64 *@h3
>    %val4 = load volatile i64 *@h4
>    %val5 = load volatile i64 *@h5
>    %val6 = load volatile i64 *@h6
>    %val7 = load volatile i64 *@h7
> -  %val8 = load volatile i64 *@h8
>    %val9 = load volatile i64 *@h9
>  
>    call void @foo()

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits



