Raw File
llvm-D28759-loopclearance.patch
From e3621af0115a851d0ed02f0b436deec62ec3e99c Mon Sep 17 00:00:00 2001
From: Keno Fischer <keno@juliacomputing.com>
Date: Sun, 15 Jan 2017 23:59:07 -0500
Subject: [PATCH] [ExecutionDepsFix] Improve clearance calculation for loops

In revision rL278321, ExecutionDepsFix learned how to pick a better
register for undef register reads, e.g. for instructions such as
`vcvtsi2sdq`. While this revision improved performance on a good number
of our benchmarks, it unfortunately also caused significant regressions
(up to 3x) on others. This regression turned out to be caused by loops
such as:

PH -> A -> B (xmm<Undef> -> xmm<Def>) -> C -> D -> EXIT
      ^                                  |
      +----------------------------------+

In the previous version of the clearance calculation, we would visit
the blocks in order, remembering for each whether there were any
incoming backedges from blocks that we hadn't processed yet and if
so queuing up the block to be re-processed. However, for loop structures
such as the above, this is clearly insufficient, since the block B
does not have any unknown backedges, so we do not see the false
dependency from the previous interation's Def of xmm registers in B.

To fix this, we need to consider all blocks that are part of the loop
and reprocess them one the correct clearance values are known. As
an optimization, we also want to avoid reprocessing any later blocks
that are not part of the loop.

In summary, the iteration order is as follows:
Before: PH A B C D A'
Corrected (Naive): PH A B C D A' B' C' D'
Corrected (w/ optimization): PH A B C A' B' C' D

To facilitate this optimization we introduce two new counters for each
basic block. The first counts how many of it's predecssors have
completed primary processing. The second counts how many of its
predecessors have completed all processing (we will call such a block
*done*. Now, the criteria to reprocess a block is as follows:
    - All Predecessors have completed primary processing
    - For x the number of predecessors that have completed primary
      processing *at the time of primary processing of this block*,
      the number of predecessors that are done has reached x.

The intuition behind this criterion is as follows:
We need to perform primary processing on all predecessors in order to
find out any direct defs in those predecessors. When predecessors are
done, we also know that we have information about indirect defs (e.g.
in block B though that were inherited through B->C->A->B). However,
we can't wait for all predecessors to be done, since that would
cause cyclic dependencies. However, it is guaranteed that all those
predecessors that are prior to us in reverse postorder will be done
before us. Since we iterate of the basic blocks in reverse postorder,
the number x above, is precisely the count of the number of predecessors
prior to us in reverse postorder.
---
 lib/CodeGen/ExecutionDepsFix.cpp    | 223 ++++++++++++++++++++++--------------
 test/CodeGen/X86/break-false-dep.ll |  57 +++++++++
 2 files changed, 197 insertions(+), 83 deletions(-)

diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index e7c6b03..6ac1db4 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -142,8 +142,26 @@ class ExeDepsFix : public MachineFunctionPass {
   std::vector<SmallVector<int, 1>> AliasMap;
   const unsigned NumRegs;
   LiveReg *LiveRegs;
-  typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
-  LiveOutMap LiveOuts;
+  struct MBBInfo {
+    // Keeps clearance and domain information for all registers. Not that this
+    // is different from the usual definition notion of liveness. The CPU
+    // doesn't care whether or not we consider a register killed.
+    LiveReg *OutRegs;
+
+    // Whether we have gotten to this block in primary processing yet.
+    bool PrimaryCompleted;
+
+    // The number of predecessors for which primary processing has completed
+    unsigned IncomingProcessed;
+
+    // The value of `IncomingProcessed` at the start of primary processing
+    unsigned PrimaryIncoming;
+
+    // The number of predecessors for which all processing steps are done.
+    unsigned IncomingCompleted;
+  };
+  typedef DenseMap<MachineBasicBlock *, MBBInfo> MBBInfoMap;
+  MBBInfoMap MBBInfos;
 
   /// List of undefined register reads in this block in forward order.
   std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
@@ -154,11 +172,6 @@ class ExeDepsFix : public MachineFunctionPass {
   /// Current instruction number.
   /// The first instruction in each basic block is 0.
   int CurInstr;
-
-  /// True when the current block has a predecessor that hasn't been visited
-  /// yet.
-  bool SeenUnknownBackEdge;
-
 public:
   ExeDepsFix(const TargetRegisterClass *rc)
     : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
@@ -180,7 +193,6 @@ public:
 private:
   iterator_range<SmallVectorImpl<int>::const_iterator>
   regIndices(unsigned Reg) const;
-
   // DomainValue allocation.
   DomainValue *alloc(int domain = -1);
   DomainValue *retain(DomainValue *DV) {
@@ -199,8 +211,11 @@ private:
 
   void enterBasicBlock(MachineBasicBlock*);
   void leaveBasicBlock(MachineBasicBlock*);
-  void visitInstr(MachineInstr*);
-  void processDefs(MachineInstr*, bool Kill);
+  bool isBlockDone(MachineBasicBlock *);
+  void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass, bool Done);
+  void updateSuccessors(MachineBasicBlock *MBB, bool Primary, bool Done);
+  bool visitInstr(MachineInstr *);
+  void processDefs(MachineInstr *, bool BlockDone, bool Kill);
   void visitSoftInstr(MachineInstr*, unsigned mask);
   void visitHardInstr(MachineInstr*, unsigned domain);
   void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
@@ -360,9 +375,6 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
 
 /// Set up LiveRegs by merging predecessor live-out values.
 void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
-  // Detect back-edges from predecessors we haven't processed yet.
-  SeenUnknownBackEdge = false;
-
   // Reset instruction counter in each basic block.
   CurInstr = 0;
 
@@ -397,18 +409,18 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
   // Try to coalesce live-out registers from predecessors.
   for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
        pe = MBB->pred_end(); pi != pe; ++pi) {
-    LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
-    if (fi == LiveOuts.end()) {
-      SeenUnknownBackEdge = true;
+    auto fi = MBBInfos.find(*pi);
+    assert(fi != MBBInfos.end());
+    LiveReg *Incoming = fi->second.OutRegs;
+    if (Incoming == nullptr) {
       continue;
     }
-    assert(fi->second && "Can't have NULL entries");
 
     for (unsigned rx = 0; rx != NumRegs; ++rx) {
       // Use the most recent predecessor def for each register.
-      LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def);
+      LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, Incoming[rx].Def);
 
-      DomainValue *pdv = resolve(fi->second[rx].Value);
+      DomainValue *pdv = resolve(Incoming[rx].Value);
       if (!pdv)
         continue;
       if (!LiveRegs[rx].Value) {
@@ -432,35 +444,33 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
         force(rx, pdv->getFirstDomain());
     }
   }
-  DEBUG(dbgs() << "BB#" << MBB->getNumber()
-        << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n"));
+  DEBUG(
+      dbgs() << "BB#" << MBB->getNumber()
+             << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
 }
 
 void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
   assert(LiveRegs && "Must enter basic block first.");
+  LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
   // Save live registers at end of MBB - used by enterBasicBlock().
   // Also use LiveOuts as a visited set to detect back-edges.
-  bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second;
-
-  if (First) {
-    // LiveRegs was inserted in LiveOuts.  Adjust all defs to be relative to
-    // the end of this block instead of the beginning.
-    for (unsigned i = 0, e = NumRegs; i != e; ++i)
-      LiveRegs[i].Def -= CurInstr;
-  } else {
-    // Insertion failed, this must be the second pass.
+  MBBInfos[MBB].OutRegs = LiveRegs;
+
+  // LiveRegs was inserted in LiveOuts.  Adjust all defs to be relative to
+  // the end of this block instead of the beginning.
+  for (unsigned i = 0, e = NumRegs; i != e; ++i)
+    LiveRegs[i].Def -= CurInstr;
+  if (OldOutRegs) {
+    // This must be the second pass.
     // Release all the DomainValues instead of keeping them.
     for (unsigned i = 0, e = NumRegs; i != e; ++i)
-      release(LiveRegs[i].Value);
-    delete[] LiveRegs;
+      release(OldOutRegs[i].Value);
+    delete[] OldOutRegs;
   }
   LiveRegs = nullptr;
 }
 
-void ExeDepsFix::visitInstr(MachineInstr *MI) {
-  if (MI->isDebugValue())
-    return;
-
+bool ExeDepsFix::visitInstr(MachineInstr *MI) {
   // Update instructions with explicit execution domains.
   std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
   if (DomP.first) {
@@ -470,9 +480,7 @@ void ExeDepsFix::visitInstr(MachineInstr *MI) {
       visitHardInstr(MI, DomP.first);
   }
 
-  // Process defs to track register ages, and kill values clobbered by generic
-  // instructions.
-  processDefs(MI, !DomP.first);
+  return !DomP.first;
 }
 
 /// \brief Helps avoid false dependencies on undef registers by updating the
@@ -542,14 +550,7 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
       DEBUG(dbgs() << ": Break dependency.\n");
       continue;
     }
-    // The current clearance seems OK, but we may be ignoring a def from a
-    // back-edge.
-    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
-      DEBUG(dbgs() << ": OK .\n");
-      return false;
-    }
-    // A def from an unprocessed back-edge may make us break this dependency.
-    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+    DEBUG(dbgs() << ": OK .\n");
     return false;
   }
   return true;
@@ -559,16 +560,21 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
 // If Kill is set, also kill off DomainValues clobbered by the defs.
 //
 // Also break dependencies on partial defs and undef uses.
-void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
+void ExeDepsFix::processDefs(MachineInstr *MI, bool BlockDone, bool Kill) {
   assert(!MI->isDebugValue() && "Won't process debug values");
 
   // Break dependence on undef uses. Do this before updating LiveRegs below.
   unsigned OpNum;
-  unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
-  if (Pref) {
-    pickBestRegisterForUndef(MI, OpNum, Pref);
-    if (shouldBreakDependence(MI, OpNum, Pref))
-      UndefReads.push_back(std::make_pair(MI, OpNum));
+  // If this block is not done, it makes little sense to make any decisions
+  // based on clearance information. We need to make a second pass anyway,
+  // and by then we'll have better information, so we can avoid this work now.
+  if (BlockDone) {
+    unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
+    if (Pref) {
+      pickBestRegisterForUndef(MI, OpNum, Pref);
+      if (shouldBreakDependence(MI, OpNum, Pref))
+        UndefReads.push_back(std::make_pair(MI, OpNum));
+    }
   }
   const MCInstrDesc &MCID = MI->getDesc();
   for (unsigned i = 0,
@@ -584,11 +590,13 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
       DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
                    << '\t' << *MI);
 
-      // Check clearance before partial register updates.
-      // Call breakDependence before setting LiveRegs[rx].Def.
-      unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
-      if (Pref && shouldBreakDependence(MI, i, Pref))
-        TII->breakPartialRegDependency(*MI, i, TRI);
+      if (BlockDone) {
+        // Check clearance before partial register updates.
+        // Call breakDependence before setting LiveRegs[rx].Def.
+        unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
+        if (Pref && shouldBreakDependence(MI, i, Pref))
+          TII->breakPartialRegDependency(*MI, i, TRI);
+      }
 
       // How many instructions since rx was last written?
       LiveRegs[rx].Def = CurInstr;
@@ -780,6 +788,45 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   }
 }
 
+void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass,
+                                   bool Done) {
+  enterBasicBlock(MBB);
+  for (MachineInstr &MI : *MBB) {
+    if (!MI.isDebugValue()) {
+      bool Kill = false;
+      if (PrimaryPass)
+        Kill = visitInstr(&MI);
+      processDefs(&MI, isBlockDone(MBB), Kill);
+    }
+  }
+  processUndefReads(MBB);
+  leaveBasicBlock(MBB);
+}
+
+bool ExeDepsFix::isBlockDone(MachineBasicBlock *MBB) {
+  return MBBInfos[MBB].PrimaryCompleted &&
+         MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
+         MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
+}
+
+void ExeDepsFix::updateSuccessors(MachineBasicBlock *MBB, bool Primary,
+                                  bool Done) {
+  for (auto *Succ : MBB->successors()) {
+    if (!isBlockDone(Succ)) {
+      if (Primary) {
+        MBBInfos[Succ].IncomingProcessed++;
+      }
+      if (Done) {
+        MBBInfos[Succ].IncomingCompleted++;
+      }
+      if (isBlockDone(Succ)) {
+        processBasicBlock(Succ, false, true);
+        updateSuccessors(Succ, false, true);
+      }
+    }
+  }
+}
+
 bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
   if (skipFunction(*mf.getFunction()))
     return false;
@@ -816,44 +863,54 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
         AliasMap[*AI].push_back(i);
   }
 
+  // Initialize the MMBInfos
+  for (auto &MBB : mf) {
+    MBBInfo InitialInfo{nullptr, false, 0, 0, 0};
+    MBBInfos.insert(std::make_pair(&MBB, InitialInfo));
+  }
+
   MachineBasicBlock *Entry = &*MF->begin();
   ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
-  SmallVector<MachineBasicBlock*, 16> Loops;
   for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
          MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
     MachineBasicBlock *MBB = *MBBI;
-    enterBasicBlock(MBB);
-    if (SeenUnknownBackEdge)
-      Loops.push_back(MBB);
-    for (MachineInstr &MI : *MBB)
-      visitInstr(&MI);
-    processUndefReads(MBB);
-    leaveBasicBlock(MBB);
-  }
-
-  // Visit all the loop blocks again in order to merge DomainValues from
-  // back-edges.
-  for (MachineBasicBlock *MBB : Loops) {
-    enterBasicBlock(MBB);
-    for (MachineInstr &MI : *MBB)
-      if (!MI.isDebugValue())
-        processDefs(&MI, false);
-    processUndefReads(MBB);
-    leaveBasicBlock(MBB);
+    MBBInfos[MBB].PrimaryCompleted = true;
+    MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
+    bool PrimaryDone = isBlockDone(MBB);
+    processBasicBlock(MBB, true, PrimaryDone);
+    updateSuccessors(MBB, true, PrimaryDone);
+  }
+
+  // We need to go through again and finalize any blocks that are not done yet.
+  // This is possible if blocks have dead predecessors, so we didn't visit them
+  // above. N.B.: The reason we update succesors immidately above, rather than
+  // doing everything in one go here, is to avoid having to do two passes on
+  // basic block between loops (with the scheme above, the whole loop will be
+  // completed before moving on to the blocks after it).
+  for (ReversePostOrderTraversal<MachineBasicBlock *>::rpo_iterator
+           MBBI = RPOT.begin(),
+           MBBE = RPOT.end();
+       MBBI != MBBE; ++MBBI) {
+    MachineBasicBlock *MBB = *MBBI;
+    if (!isBlockDone(MBB)) {
+      processBasicBlock(MBB, false, true);
+      // Don't update successors here. We'll get to them anyway through this
+      // loop.
+    }
   }
 
   // Clear the LiveOuts vectors and collapse any remaining DomainValues.
   for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
          MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
-    LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI);
-    if (FI == LiveOuts.end() || !FI->second)
+    auto FI = MBBInfos.find(*MBBI);
+    if (FI == MBBInfos.end() || !FI->second.OutRegs)
       continue;
     for (unsigned i = 0, e = NumRegs; i != e; ++i)
-      if (FI->second[i].Value)
-        release(FI->second[i].Value);
-    delete[] FI->second;
+      if (FI->second.OutRegs[i].Value)
+        release(FI->second.OutRegs[i].Value);
+    delete[] FI->second.OutRegs;
   }
-  LiveOuts.clear();
+  MBBInfos.clear();
   UndefReads.clear();
   Avail.clear();
   Allocator.DestroyAll();
diff --git a/test/CodeGen/X86/break-false-dep.ll b/test/CodeGen/X86/break-false-dep.ll
index 4c5e747..0ba1825 100644
--- a/test/CodeGen/X86/break-false-dep.ll
+++ b/test/CodeGen/X86/break-false-dep.ll
@@ -277,3 +277,60 @@ ret:
 ;AVX: vcvtsi2sdq {{.*}}, [[XMM4_7:%xmm[4-7]]], {{%xmm[0-9]+}}
 ;AVX-NOT: [[XMM4_7]]
 }
+
+; Make sure we are making a smart choice regarding undef registers even for more
+; complicated loop structures. This example is the inner loop from
+; julia> a = falses(10000); a[1:4:end] = true
+; julia> linspace(1.0,2.0,10000)[a]
+define void @loopclearance2(double* nocapture %y, i64* %x, double %c1, double %c2, double %c3, double %c4, i64 %size) {
+entry:
+  tail call void asm sideeffect "", "~{xmm7},~{dirflag},~{fpsr},~{flags}"()
+  tail call void asm sideeffect "", "~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{dirflag},~{fpsr},~{flags}"()
+  tail call void asm sideeffect "", "~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{dirflag},~{fpsr},~{flags}"()
+  br label %loop
+
+loop:
+  %phi_i = phi i64 [ 1, %entry ], [ %nexti, %loop_end ]
+  %phi_j = phi i64 [ 1, %entry ], [ %nextj, %loop_end ]
+  %phi_k = phi i64 [ 0, %entry ], [ %nextk, %loop_end ]
+  br label %inner_loop
+
+inner_loop:
+  %phi = phi i64 [ %phi_k, %loop ], [ %nextk, %inner_loop ]
+  %idx = lshr i64 %phi, 6
+  %inputptr = getelementptr i64, i64* %x, i64 %idx
+  %input = load i64, i64* %inputptr, align 8
+  %masked = and i64 %phi, 63
+  %shiftedmasked = shl i64 1, %masked
+  %maskedinput = and i64 %input, %shiftedmasked
+  %cmp = icmp eq i64 %maskedinput, 0
+  %nextk = add i64 %phi, 1
+  br i1 %cmp, label %inner_loop, label %loop_end
+
+loop_end:
+  %nexti = add i64 %phi_i, 1
+  %nextj = add i64 %phi_j, 1
+  ; Register use, plus us clobbering 7-15 above, basically forces xmm7 here as
+  ; the only reasonable choice. The primary thing we care about is that it's
+  ; not one of the registers used in the loop (e.g. not the output reg here)
+;AVX-NOT: %xmm6
+;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
+;AVX-NOT: %xmm6
+  %nexti_f = sitofp i64 %nexti to double
+  %sub = fsub double %c1, %nexti_f
+  %mul = fmul double %sub, %c2
+;AVX: vcvtsi2sdq {{.*}}, %xmm6, {{%xmm[0-9]+}}
+;AVX-NOT: %xmm6
+  %phi_f = sitofp i64 %phi to double
+  %mul2 = fmul double %phi_f, %c3
+  %add2 = fadd double %mul, %mul2
+  %div = fdiv double %add2, %c4
+  %prev_j = add i64 %phi_j, -1
+  %outptr = getelementptr double, double* %y, i64 %prev_j
+  store double %div, double* %outptr, align 8
+  %done = icmp slt i64 %size, %nexti
+  br i1 %done, label %loopdone, label %loop
+
+loopdone:
+  ret void
+}
-- 
2.9.3
back to top