Skip to content

Commit 2330fd2

Browse files
authored
[LoopPeel] Add new option to peeling loops to convert PHI into IV (#121104)
LoopPeel currently considers PHI nodes that become loop invariants through peeling. However, in some cases, peeling transforms PHI nodes into induction variables (IVs), potentially enabling further optimizations such as loop vectorization. For example:
```c
// TSVC s292
int im = N-1;
for (int i=0; i<N; i++) {
  a[i] = b[i] + b[im];
  im = i;
}
```
In this case, peeling one iteration converts `im` into an IV, allowing it to be handled by the loop vectorizer. This patch adds a new feature that peels loops when doing so converts PHIs into IVs. At the moment this feature is disabled by default. Enabling it allows the above example to be vectorized. I have measured on neoverse-v2 and observed a speedup of more than 60% (options: `-O3 -ffast-math -mcpu=neoverse-v2 -mllvm -enable-peeling-for-iv`). This PR is taken over from #94900. Related: #81851
1 parent 8b2028c commit 2330fd2

File tree

3 files changed

+568
-28
lines changed

3 files changed

+568
-28
lines changed

llvm/lib/Transforms/Utils/LoopPeel.cpp

Lines changed: 171 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,10 @@ static cl::opt<bool> DisableAdvancedPeeling(
8181
cl::desc(
8282
"Disable advance peeling. Issues for convergent targets (D134803)."));
8383

84+
static cl::opt<bool> EnablePeelingForIV(
85+
"enable-peeling-for-iv", cl::init(false), cl::Hidden,
86+
cl::desc("Enable peeling to convert Phi nodes into IVs"));
87+
8488
static const char *PeeledCountMetaData = "llvm.loop.peeled.count";
8589

8690
// Check whether we are capable of peeling this loop.
@@ -155,45 +159,170 @@ namespace {
155159
// corresponding calls to g are determined and the code for computing
156160
// x, y, and a can be removed.
157161
//
162+
// Similarly, there are cases where peeling makes Phi nodes loop-inductions
163+
// (i.e., the value is increased or decreased by a fixed amount on every
164+
// iteration). For example, consider the following function.
165+
//
166+
// #define N 100
167+
// void f(int a[], int b[]) {
168+
// int im = N - 1;
169+
// for (int i = 0; i < N; i++) {
170+
// a[i] = b[i] + b[im];
171+
// im = i;
172+
// }
173+
// }
174+
//
175+
// The IR of the loop will look something like the following.
176+
//
177+
// %i = phi i32 [ 0, %entry ], [ %i.next, %for.body ]
178+
// %im = phi i32 [ 99, %entry ], [ %i, %for.body ]
179+
// ...
180+
// %i.next = add nuw nsw i32 %i, 1
181+
// ...
182+
//
183+
// In this case, %im becomes a loop-induction variable by peeling 1 iteration,
184+
// because %i is a loop-induction one. The peeling count can be determined by
185+
// the same algorithm as in the loop-invariant case. Such peeling is profitable for
186+
// loop-vectorization.
187+
//
158188
// The PhiAnalyzer class calculates how many times a loop should be
159189
// peeled based on the above analysis of the phi nodes in the loop while
160190
// respecting the maximum specified.
161191
class PhiAnalyzer {
162192
public:
163-
PhiAnalyzer(const Loop &L, unsigned MaxIterations);
193+
PhiAnalyzer(const Loop &L, unsigned MaxIterations, bool PeelForIV);
164194

165195
// Calculate the sufficient minimum number of iterations of the loop to peel
166196
// such that phi instructions become determined (subject to allowable limits)
167197
std::optional<unsigned> calculateIterationsToPeel();
168198

169199
protected:
170-
using PeelCounter = std::optional<unsigned>;
200+
enum class PeelCounterType {
201+
Invariant,
202+
Induction,
203+
};
204+
205+
using PeelCounterValue = std::pair<unsigned, PeelCounterType>;
206+
using PeelCounter = std::optional<PeelCounterValue>;
171207
const PeelCounter Unknown = std::nullopt;
172208

173209
// Add 1 respecting Unknown and return Unknown if result over MaxIterations
174210
PeelCounter addOne(PeelCounter PC) const {
175211
if (PC == Unknown)
176212
return Unknown;
177-
return (*PC + 1 <= MaxIterations) ? PeelCounter{*PC + 1} : Unknown;
213+
auto [Val, Ty] = *PC;
214+
return (Val + 1 <= MaxIterations) ? PeelCounter({Val + 1, Ty}) : Unknown;
178215
}
179216

180-
// Calculate the number of iterations after which the given value
181-
// becomes an invariant.
217+
// Return a value representing zero for the given counter type.
218+
PeelCounter makeZero(PeelCounterType Ty) const {
219+
return PeelCounter({0, Ty});
220+
}
221+
222+
// Calculate the number of iterations after which the given value becomes an
223+
// invariant or an induction.
182224
PeelCounter calculate(const Value &);
183225

226+
// Auxiliary function to calculate the number of iterations for a comparison
227+
// instruction or a binary operator.
228+
PeelCounter mergeTwoCounter(const Instruction &CmpOrBinaryOp,
229+
const PeelCounterValue &LHS,
230+
const PeelCounterValue &RHS) const;
231+
232+
// Returns true if the \p Phi is an induction in the target loop. This is a
233+
// lightweight check that can detect an IV only in some cases.
234+
bool isInductionPHI(const PHINode *Phi) const;
235+
184236
const Loop &L;
185237
const unsigned MaxIterations;
238+
const bool PeelForIV;
186239

187-
// Map of Values to number of iterations to invariance
188-
SmallDenseMap<const Value *, PeelCounter> IterationsToInvariance;
240+
// Map of Values to number of iterations to invariance or induction
241+
SmallDenseMap<const Value *, PeelCounter> IterationsToInvarianceOrInduction;
189242
};
190243

191-
PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations)
192-
: L(L), MaxIterations(MaxIterations) {
244+
PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations, bool PeelForIV)
245+
: L(L), MaxIterations(MaxIterations), PeelForIV(PeelForIV) {
193246
assert(canPeel(&L) && "loop is not suitable for peeling");
194247
assert(MaxIterations > 0 && "no peeling is allowed?");
195248
}
196249

250+
/// Test whether \p Phi is an induction variable. Although this can be
251+
/// determined using SCEV analysis, it is expensive to compute here. Instead,
252+
/// we perform cheaper checks that may not detect complex cases but are
253+
/// sufficient for some situations.
254+
bool PhiAnalyzer::isInductionPHI(const PHINode *Phi) const {
255+
// Currently we only support a loop that has single latch.
256+
BasicBlock *Latch = L.getLoopLatch();
257+
if (Latch == nullptr)
258+
return false;
259+
260+
Value *Cur = Phi->getIncomingValueForBlock(Latch);
261+
SmallPtrSet<Value *, 4> Visited;
262+
bool VisitBinOp = false;
263+
264+
// Starting from the incoming value of the Phi, we follow the use-def chain.
265+
// We consider Phi to be an IV if we can reach it again by traversing only
266+
// add, sub, or cast instructions.
267+
while (true) {
268+
if (Cur == Phi)
269+
break;
270+
271+
// Avoid infinite loop.
272+
if (Visited.contains(Cur))
273+
return false;
274+
275+
auto *I = dyn_cast<Instruction>(Cur);
276+
if (!I || !L.contains(I))
277+
return false;
278+
279+
Visited.insert(Cur);
280+
281+
if (auto *Cast = dyn_cast<CastInst>(I)) {
282+
Cur = Cast->getOperand(0);
283+
} else if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
284+
if (BinOp->getOpcode() != Instruction::Add &&
285+
BinOp->getOpcode() != Instruction::Sub)
286+
return false;
287+
if (!isa<ConstantInt>(BinOp->getOperand(1)))
288+
return false;
289+
290+
VisitBinOp = true;
291+
Cur = BinOp->getOperand(0);
292+
} else {
293+
return false;
294+
}
295+
}
296+
297+
// Ignore cases where no binary operations are visited.
298+
return VisitBinOp;
299+
}
300+
301+
/// When either \p LHS or \p RHS is an IV, the result of \p CmpOrBinaryOp is
302+
/// considered an IV only if it is an addition or a subtraction. Otherwise the
303+
/// result can be a value that is neither a loop-invariant nor an IV.
304+
///
305+
/// If both \p LHS and \p RHS are loop-invariants, then the result of
306+
/// \p CmpOrBinaryOp is also a loop-invariant.
307+
PhiAnalyzer::PeelCounter
308+
PhiAnalyzer::mergeTwoCounter(const Instruction &CmpOrBinaryOp,
309+
const PeelCounterValue &LHS,
310+
const PeelCounterValue &RHS) const {
311+
auto &[LVal, LTy] = LHS;
312+
auto &[RVal, RTy] = RHS;
313+
unsigned NewVal = std::max(LVal, RVal);
314+
315+
if (LTy == PeelCounterType::Induction || RTy == PeelCounterType::Induction) {
316+
if (const auto *BinOp = dyn_cast<BinaryOperator>(&CmpOrBinaryOp)) {
317+
if (BinOp->getOpcode() == Instruction::Add ||
318+
BinOp->getOpcode() == Instruction::Sub)
319+
return PeelCounter({NewVal, PeelCounterType::Induction});
320+
}
321+
return Unknown;
322+
}
323+
return PeelCounter({NewVal, PeelCounterType::Invariant});
324+
}
325+
197326
// This function calculates the number of iterations after which the value
198327
// becomes an invariant or an induction. The pre-calculated values are memoized in a map.
199328
// N.B. This number will be Unknown or <= MaxIterations.
@@ -212,25 +341,34 @@ PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
212341
// If we already know the answer, take it from the map.
213342
// Otherwise, place Unknown to map to avoid infinite recursion. Such
214343
// cycles can never stop on an invariant.
215-
auto [I, Inserted] = IterationsToInvariance.try_emplace(&V, Unknown);
344+
auto [I, Inserted] =
345+
IterationsToInvarianceOrInduction.try_emplace(&V, Unknown);
216346
if (!Inserted)
217347
return I->second;
218348

219349
if (L.isLoopInvariant(&V))
220350
// Loop invariant so known at start.
221-
return (IterationsToInvariance[&V] = 0);
351+
return (IterationsToInvarianceOrInduction[&V] =
352+
makeZero(PeelCounterType::Invariant));
222353
if (const PHINode *Phi = dyn_cast<PHINode>(&V)) {
223354
if (Phi->getParent() != L.getHeader()) {
224355
// Phi is not in header block so Unknown.
225-
assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
356+
assert(IterationsToInvarianceOrInduction[&V] == Unknown &&
357+
"unexpected value saved");
226358
return Unknown;
227359
}
360+
361+
// If Phi is an induction, register it as a starting point.
362+
if (PeelForIV && isInductionPHI(Phi))
363+
return (IterationsToInvarianceOrInduction[&V] =
364+
makeZero(PeelCounterType::Induction));
365+
228366
// We need to analyze the input from the back edge and add 1.
229367
Value *Input = Phi->getIncomingValueForBlock(L.getLoopLatch());
230368
PeelCounter Iterations = calculate(*Input);
231-
assert(IterationsToInvariance[Input] == Iterations &&
369+
assert(IterationsToInvarianceOrInduction[Input] == Iterations &&
232370
"unexpected value saved");
233-
return (IterationsToInvariance[Phi] = addOne(Iterations));
371+
return (IterationsToInvarianceOrInduction[Phi] = addOne(Iterations));
234372
}
235373
if (const Instruction *I = dyn_cast<Instruction>(&V)) {
236374
if (isa<CmpInst>(I) || I->isBinaryOp()) {
@@ -241,26 +379,30 @@ PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) {
241379
PeelCounter RHS = calculate(*I->getOperand(1));
242380
if (RHS == Unknown)
243381
return Unknown;
244-
return (IterationsToInvariance[I] = {std::max(*LHS, *RHS)});
382+
return (IterationsToInvarianceOrInduction[I] =
383+
mergeTwoCounter(*I, *LHS, *RHS));
245384
}
246385
if (I->isCast())
247386
// Cast instructions get the value of the operand.
248-
return (IterationsToInvariance[I] = calculate(*I->getOperand(0)));
387+
return (IterationsToInvarianceOrInduction[I] =
388+
calculate(*I->getOperand(0)));
249389
}
250390
// TODO: handle more expressions
251391

252392
// Everything else is Unknown.
253-
assert(IterationsToInvariance[&V] == Unknown && "unexpected value saved");
393+
assert(IterationsToInvarianceOrInduction[&V] == Unknown &&
394+
"unexpected value saved");
254395
return Unknown;
255396
}
256397

257398
std::optional<unsigned> PhiAnalyzer::calculateIterationsToPeel() {
258399
unsigned Iterations = 0;
259400
for (auto &PHI : L.getHeader()->phis()) {
260-
PeelCounter ToInvariance = calculate(PHI);
261-
if (ToInvariance != Unknown) {
262-
assert(*ToInvariance <= MaxIterations && "bad result in phi analysis");
263-
Iterations = std::max(Iterations, *ToInvariance);
401+
PeelCounter ToInvarianceOrInduction = calculate(PHI);
402+
if (ToInvarianceOrInduction != Unknown) {
403+
unsigned Val = ToInvarianceOrInduction->first;
404+
assert(Val <= MaxIterations && "bad result in phi analysis");
405+
Iterations = std::max(Iterations, Val);
264406
if (Iterations == MaxIterations)
265407
break;
266408
}
@@ -654,14 +796,15 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
654796
// in TTI.getPeelingPreferences or by the flag -unroll-peel-count.
655797
unsigned DesiredPeelCount = TargetPeelCount;
656798

657-
// Here we try to get rid of Phis which become invariants after 1, 2, ..., N
658-
// iterations of the loop. For this we compute the number for iterations after
659-
// which every Phi is guaranteed to become an invariant, and try to peel the
660-
// maximum number of iterations among these values, thus turning all those
661-
// Phis into invariants.
799+
// Here we try to get rid of Phis which become invariants or inductions after
800+
// 1, 2, ..., N iterations of the loop. For this we compute the number of
801+
// iterations after which every Phi is guaranteed to become an invariant or an
802+
// induction, and try to peel the maximum number of iterations among these
803+
// values, thus turning all those Phis into invariants or inductions.
662804
if (MaxPeelCount > DesiredPeelCount) {
663805
// Check how many iterations are useful for resolving Phis
664-
auto NumPeels = PhiAnalyzer(*L, MaxPeelCount).calculateIterationsToPeel();
806+
auto NumPeels = PhiAnalyzer(*L, MaxPeelCount, EnablePeelingForIV)
807+
.calculateIterationsToPeel();
665808
if (NumPeels)
666809
DesiredPeelCount = std::max(DesiredPeelCount, *NumPeels);
667810
}
@@ -680,7 +823,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
680823
if (DesiredPeelCount + AlreadyPeeled <= UnrollPeelMaxCount) {
681824
LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
682825
<< " iteration(s) to turn"
683-
<< " some Phis into invariants.\n");
826+
<< " some Phis into invariants or inductions.\n");
684827
PP.PeelCount = DesiredPeelCount;
685828
PP.PeelProfiledIterations = false;
686829
PP.PeelLast = false;

0 commit comments

Comments
 (0)