1/*
2 * Copyright (C) 2019 Apple Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#include "config.h"
27#include "B3ReduceLoopStrength.h"
28
29#if ENABLE(B3_JIT)
30
31#include "B3BasicBlockInlines.h"
32#include "B3BlockInsertionSet.h"
33#include "B3ConstPtrValue.h"
34#include "B3EnsureLoopPreHeaders.h"
35#include "B3InsertionSet.h"
36#include "B3NaturalLoops.h"
37#include "B3PhaseScope.h"
38#include "B3ProcedureInlines.h"
39#include "B3ValueInlines.h"
40#include "B3Variable.h"
41#include "B3VariableValue.h"
42#include <wtf/GraphNodeWorklist.h>
43#include <wtf/IndexSet.h>
44#include <wtf/SmallPtrSet.h>
45#include <wtf/Vector.h>
46
47namespace JSC { namespace B3 {
48
49#if CPU(X86_64)
50/*
51 * The semantics of B3 require that loads and stores are not reduced in granularity.
52 * If we would use system memcpy, then it is possible that we would get a byte copy loop,
53 * and this could cause GC tearing.
54 */
55void fastForwardCopy32(uint32_t* dst, const uint32_t* src, size_t size)
56{
57 uint64_t i;
58 int64_t counter;
59 uint64_t tmp, tmp2;
60
61 asm volatile(
62 "test %q[size], %q[size]\t\n"
63 "jz 900f\t\n" // exit
64 "xorq %q[i], %q[i]\t\n"
65 // if size < 8
66 "cmp $8, %q[size]\t\n"
67 "jl 100f\t\n" // backporch
68 // if (dst + size*4 <= src || src + size*4 <= dst)
69 "leaq (%q[src], %q[size], 4), %q[tmp]\t\n"
70 "cmp %q[tmp], %q[dst]\t\n"
71 "jae 500f\t\n" // simd no overlap
72 "leaq (%q[dst], %q[size], 4), %q[tmp]\t\n"
73 "cmp %q[tmp], %q[src]\t\n"
74 "jae 500f\t\n" // simd no overlap
75 "jmp 100f\t\n" // Backporch
76
77 // Backporch (assumes we can write at least one int)
78 "100:\t\n"
79 // counter = (size - i)%4
80 "mov %q[size], %q[counter]\t\n"
81 "subq %q[i], %q[counter]\t\n"
82
83 // If we are a multiple of 4, unroll the loop
84 // We already know we are nonzero
85 "and $3, %q[counter]\t\n"
86 "jz 200f\t\n" // unrolled loop
87
88 "negq %q[counter]\t\n"
89 ".balign 32\t\n"
90 "101:\t\n"
91 "mov (%q[src], %q[i], 4), %k[tmp]\t\n"
92 "mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
93 "incq %q[i]\t\n"
94 "incq %q[counter]\t\n"
95 "jnz 101b\t\n"
96 // Do we still have anything left?
97 "cmpq %q[size], %q[i]\t\n"
98 "je 900f\t\n" // exit
99 // Then go to the unrolled loop
100 "jmp 200f\t\n"
101
102 // Unrolled loop (assumes multiple of 4, can do one iteration)
103 "200:\t\n"
104 "shr $2, %q[counter]\t\n"
105 "negq %q[counter]\t\n"
106 ".balign 32\t\n"
107 "201:\t\n"
108 "mov (%q[src], %q[i], 4), %k[tmp]\t\n"
109 "mov %k[tmp], (%q[dst], %q[i], 4)\t\n"
110 "mov 4(%q[src], %q[i], 4), %k[tmp2]\t\n"
111 "mov %k[tmp2], 4(%q[dst], %q[i], 4)\t\n"
112 "mov 8(%q[src], %q[i], 4), %k[tmp2]\t\n"
113 "mov %k[tmp2], 8(%q[dst], %q[i], 4)\t\n"
114 "mov 12(%q[src], %q[i], 4), %k[tmp]\t\n"
115 "mov %k[tmp], 12(%q[dst], %q[i], 4)\t\n"
116 "addq $4, %q[i]\t\n"
117 "cmpq %q[size], %q[i]\t\n"
118 "jb 201b\t\n"
119 "jmp 900f\t\n" // exit
120
121 // simd no overlap
122 // counter = -(size/8);
123 "500:\t\n"
124 "mov %q[size], %q[counter]\t\n"
125 "shrq $3, %q[counter]\t\n"
126 "negq %q[counter]\t\n"
127 ".balign 32\t\n"
128 "501:\t\n"
129 "movups (%q[src], %q[i], 4), %%xmm0\t\n"
130 "movups 16(%q[src], %q[i], 4), %%xmm1\t\n"
131 "movups %%xmm0, (%q[dst], %q[i], 4)\t\n"
132 "movups %%xmm1, 16(%q[dst], %q[i], 4)\t\n"
133 "addq $8, %q[i]\t\n"
134 "incq %q[counter]\t\n"
135 "jnz 501b\t\n"
136 // if (i == size)
137 "cmpq %q[size], %q[i]\t\n"
138 "je 900f\t\n" // end
139 "jmp 100b\t\n" // backporch
140 "900:\t\n"
141 : [i] "=&c" (i), [counter] "=&r" (counter), [tmp] "=&a" (tmp), [tmp2] "=&r" (tmp2)
142 : [dst] "D" (dst), [src] "S" (src), [size] "r" (size)
143 : "xmm0", "xmm1");
144}
145#endif
146
// Pattern-matches simple word-copy loops of the form
//     for (i = 0; i < bound; ++i) dst[i] = src[i];   // 32-bit elements
// and replaces the loop body with a single call to fastForwardCopy32, which
// performs the copy without reducing access granularity (see the comment on
// fastForwardCopy32 above).
class ReduceLoopStrength {
    static constexpr bool verbose = false;

    // Describes one side (load or store) of the copy: the address expression
    // decomposed into base pointer + loop-index component.
    struct AddrInfo {
        // Re-anchors `addr` in `block` by appending an Identity use of it,
        // then returns the original value.
        Value* appendAddr(Procedure& proc, BasicBlock* block, Value* addr)
        {
            block->appendNew<Value>(proc, Identity, addr->origin(), addr);
            return addr;
        }

        // The full Add node computing the address inside the loop.
        Value* insideLoopAddr;

        // Base pointer operand of the address computation.
        Value* arrayBase;
        // The loop-index Phi feeding the scaled offset.
        Value* index;
    };

public:
    ReduceLoopStrength(Procedure& proc)
        : m_proc(proc)
        , m_insertionSet(proc)
        , m_blockInsertionSet(proc)
        , m_root(proc.at(0))
        , m_phiChildren(proc)
    {
    }

#if CPU(X86_64)
    // Attempts the match rooted at m_value (the candidate Store in m_block).
    // Bails out (plain `return`) at the first structural mismatch; on success
    // it rewires the CFG so the loop body is bypassed by a memcpy block.
    void reduceByteCopyLoopsToMemcpy()
    {
        // Every value that is part of the matched copy pattern; anything else
        // inside the loop must be effect-free (checked near the end).
        HashSet<Value*> patternMatchedValues;

        Value* store = m_value;
        if (store->opcode() != Store)
            return;
        patternMatchedValues.add(store);

        // Only 32-bit element copies are handled (fastForwardCopy32 copies
        // 4-byte words).
        Value* load = store->child(0);
        uint32_t elemSize = sizeofType(load->type());
        if (load->opcode() != Load || elemSize != 4)
            return;
        patternMatchedValues.add(load);

        if (verbose)
            dataLogLn("Found store/load:", *store);

        // Decompose an address expression (optionally wrapped in ZExt32) into
        // base + (index << log2(elemSize)). Returns nullopt on mismatch.
        auto extractArrayInfo = [&] (Value* value) -> Optional<AddrInfo> {
            patternMatchedValues.add(value);

            Optional<AddrInfo> addr = { AddrInfo() };

            Value* sum = value;
            if (value->opcode() == ZExt32)
                sum = sum->child(0);

            if (sum->opcode() != Add || sum->numChildren() != 2)
                return { };

            addr->insideLoopAddr = sum;
            auto extractLoopIndexInSubtree = [&] (Value* value) -> Value* {
                // Match arrBase[i*elemSize]
                if (value->opcode() == Shl
                    && value->child(0)->opcode() == Phi
                    && value->child(1)->isInt(WTF::fastLog2(elemSize)))
                    return value->child(0);
                // Same, but with the Phi zero-extended before the shift.
                if (value->opcode() == Shl
                    && value->child(0)->opcode() == ZExt32
                    && value->child(0)->child(0)->opcode() == Phi
                    && value->child(1)->isInt(WTF::fastLog2(elemSize)))
                    return value->child(0)->child(0);
                return nullptr;
            };

            // Try to find a known pattern applied to a single phi, which we can assume is our loop index.
            // Exactly one of the two Add operands must be the scaled index;
            // the other is taken to be the base.
            Value* left = extractLoopIndexInSubtree(sum->child(0));
            Value* right = extractLoopIndexInSubtree(sum->child(1));
            if (left && !right) {
                addr->index = left;
                addr->arrayBase = sum->child(1);
            } else if (right && !left) {
                addr->index = right;
                addr->arrayBase = sum->child(0);
            } else
                return { };

            patternMatchedValues.add(addr->index);
            patternMatchedValues.add(addr->arrayBase);

            return addr;
        };

        auto destination = extractArrayInfo(store->child(1));
        auto source = extractArrayInfo(load->child(0));

        // Both sides must index with the very same Phi.
        if (!source || !destination || source->index != destination->index)
            return;

        if (destination->arrayBase->type() != Int64 || source->arrayBase->type() != Int64)
            return;

        if (verbose)
            // NOTE(review): the copy performed is destination[i] = source[i],
            // but this log prints source on the left-hand side — the argument
            // order appears swapped (log-only; verify against upstream).
            dataLogLn("Found array info: ", *source->arrayBase, "[", *source->index, "] = ", *destination->arrayBase, "[", *destination->index, "]");

        // Find the loop header, footer and inner blocks.
        // Verify that there is one entrance to and one exit from the loop.
        auto* loop = m_proc.naturalLoops().innerMostLoopOf(m_block);
        if (!loop)
            return;
        BasicBlock* loopHead = loop->header();
        BasicBlock* loopFoot = nullptr;
        BasicBlock* loopPreheader = nullptr;
        BasicBlock* loopPostfooter = nullptr;
        SmallPtrSet<BasicBlock*> loopInnerBlocks;

        {
            for (unsigned i = 0; i < loop->size(); ++i)
                loopInnerBlocks.add(loop->at(i));

            // The matched load and store must both live inside this loop.
            if (!loopInnerBlocks.contains(load->owner))
                return;

            if (!loopInnerBlocks.contains(store->owner))
                return;

            // Collect all blocks entering/exiting the loop region.
            SmallPtrSet<BasicBlock*> loopEntrances;
            SmallPtrSet<BasicBlock*> loopExits;
            for (auto* block : loopInnerBlocks) {
                for (auto successor : block->successors()) {
                    if (!loopInnerBlocks.contains(successor.block()))
                        loopExits.add(successor.block());
                }
                for (auto* predecessor : block->predecessors()) {
                    if (!loopInnerBlocks.contains(predecessor))
                        loopEntrances.add(predecessor);
                }
            }

            if (loopExits.size() != 1) {
                if (verbose) {
                    dataLog("Loop has incorrect number of exits: ");
                    for (BasicBlock* block : loopExits)
                        dataLog(*block, " ");
                    dataLogLn();
                }
                return;
            }
            loopPostfooter = *loopExits.begin();

            if (loopEntrances.size() != 1) {
                if (verbose) {
                    dataLog("Loop has incorrect number of entrances: ");
                    for (BasicBlock* block : loopEntrances)
                        dataLog(*block, " ");
                    dataLogLn();
                }
                return;
            }
            loopPreheader = *loopEntrances.begin();

            if (!loopHead->predecessors().contains(loopPreheader)) {
                if (verbose)
                    dataLogLn("Loop head does not follow preheader");
                return;
            }

            // The footer is the unique in-loop predecessor of the post-footer.
            for (BasicBlock* predecessor : loopPostfooter->predecessors()) {
                if (loopInnerBlocks.contains(predecessor)) {
                    // There is only one loop exit.
                    RELEASE_ASSERT(!loopFoot);
                    loopFoot = predecessor;
                }
            }
        }

        if (verbose) {
            dataLog("Found loop head:", *loopHead, " foot:", *loopFoot, " prehead ", *loopPreheader, " postfoot ", *loopPostfooter, " inner blocks: ");
            for (BasicBlock* block : loopInnerBlocks)
                dataLog(*block, " ");
            dataLogLn();
        }

        // Verify that the index is a monotonic inductive variable.
        // Required shape: Phi with exactly two Upsilons — one before the loop
        // setting it to 0, one inside the loop setting it to index + 1.
        Value* startingIndex = nullptr;
        {
            if (m_phiChildren.at(source->index).size() != 2)
                return;

            Value* updateIndex = nullptr;
            for (Value* up : m_phiChildren.at(source->index)) {
                // The Upsilon dominating the preheader is the initializer;
                // the other must be the per-iteration update.
                if (m_proc.dominators().dominates(up->owner, loopPreheader))
                    startingIndex = up;
                else
                    updateIndex = up;
            }

            if (!updateIndex || !startingIndex || !loopInnerBlocks.contains(updateIndex->owner))
                return;
            patternMatchedValues.add(updateIndex);
            patternMatchedValues.add(startingIndex);

            // Unwrap the Upsilons to the values they write.
            updateIndex = updateIndex->child(0);
            startingIndex = startingIndex->child(0);
            if (updateIndex->opcode() != Add || !updateIndex->child(1)->isInt(1) || updateIndex->child(0) != source->index)
                return;

            if (!startingIndex->isInt(0))
                return;
        }

        if (verbose)
            dataLogLn("Found starting index:", *startingIndex);

        // Identify the loop exit condition.
        Value* exitCondition = nullptr;
        // Is this an exit condition or a continuation condition?
        bool conditionInverted = false;
        // Do we check the index before or after using it?
        bool checkBeforeWrite = false;
        {
            Value* branch = loopFoot->last();
            if (branch->opcode() != Branch)
                return;
            patternMatchedValues.add(branch);
            exitCondition = branch->child(0);

            conditionInverted = loopPostfooter != loopFoot->successor(0).block();
            checkBeforeWrite = m_proc.dominators().strictlyDominates(loopFoot, m_block);
        }

        if (verbose)
            dataLogLn("Found exit condition: ", *exitCondition, " inverted: ", conditionInverted, " checks before the first write: ", checkBeforeWrite);

        // Idenfity the loop bound by matching loop bound expressions.
        Value* loopBound = nullptr;
        {
            auto extractLoopBound = [&] (Value* index, Value* bound) -> Value* {
                // Match i+1 when we do a write first followed by the check for the next iteration
                if (!checkBeforeWrite && index->opcode() == Add && index->child(0) == source->index && index->child(1)->isInt(1))
                    return bound;
                return nullptr;
            };

            if (exitCondition->opcode() == GreaterThan && conditionInverted) {
                // enter loop again if size > i
                loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0));
            } else if (exitCondition->opcode() == LessThan && !conditionInverted) {
                // enter loop again if size < i
                loopBound = extractLoopBound(exitCondition->child(1), exitCondition->child(0));
            }

            if (!loopBound) {
                if (verbose)
                    dataLogLn("Unable to extract bound from exit condition ", *exitCondition);
                return;
            }
        }

        // Make sure there are no other effectful things happening.
        // This also guarantees that we found the only branch in the loop. This means that our
        // load/store must happen iff our index update and branch do.
        for (BasicBlock* block : loopInnerBlocks) {
            for (unsigned index = 0; index < block->size(); ++index) {
                Value* value = block->at(index);
                if (patternMatchedValues.contains(value) || value->opcode() == Jump)
                    continue;

                Effects effects = value->effects();
                effects.readsLocalState = false;
                if (effects != Effects::none()) {
                    if (verbose)
                        dataLogLn("Not byte copy loop, node ", *value, " has effects");
                    return;
                }
            }
        }

        // Dry-run the hoists first (canMutate = false) so we never mutate the
        // procedure on a path that then bails out.
        if (!hoistValue(loopPreheader, source->arrayBase, false)
            || !hoistValue(loopPreheader, destination->arrayBase, false)
            || !hoistValue(loopPreheader, loopBound, false))
            return;

        if (verbose)
            // NOTE(review): same apparent source/destination swap as the
            // earlier "Found array info" log (log-only).
            dataLogLn("Found byte copy loop: ", *source->arrayBase, "[", *source->index, "] = ", *destination->arrayBase, "[", *destination->index, "] from index ", *startingIndex, " to max index ", *loopBound, " stride: ", elemSize);

        // Now actually hoist; the dry run above guarantees success.
        bool hoistResult = hoistValue(loopPreheader, source->arrayBase);
        hoistResult &= hoistValue(loopPreheader, destination->arrayBase);
        hoistResult &= hoistValue(loopPreheader, loopBound);
        ASSERT_UNUSED(hoistResult, hoistResult);

        auto origin = loopPostfooter->at(0)->origin();

        // Splice in a new block that performs the whole copy via a C call,
        // and route preheader -> memcpy -> postfooter around the loop body.
        BasicBlock* memcpy = m_blockInsertionSet.insertBefore(loopPostfooter, loopPostfooter->frequency());
        memcpy->setSuccessors(FrequentedBlock(loopPostfooter));
        memcpy->addPredecessor(loopPreheader);
        // NOTE(review): this loop removes predecessors from the same list it
        // iterates — confirm BasicBlock::predecessors()/removePredecessor
        // tolerate concurrent mutation here.
        for (BasicBlock* pred : loopPostfooter->predecessors())
            loopPostfooter->removePredecessor(pred);
        loopPostfooter->addPredecessor(memcpy);
        loopPreheader->setSuccessors(memcpy);

        Effects effects = Effects::forCall();
        memcpy->appendNew<CCallValue>(m_proc, B3::Void, origin, effects,
            memcpy->appendNew<ConstPtrValue>(m_proc, origin, tagCFunctionPtr<void*>(fastForwardCopy32, B3CCallPtrTag)),
            destination->appendAddr(m_proc, memcpy, destination->arrayBase),
            source->appendAddr(m_proc, memcpy, source->arrayBase),
            loopBound);

        memcpy->appendNew<Value>(m_proc, Jump, origin);
    }

    // Moves `value` (and, recursively, its children) into `into`, leaving a
    // Nop in its old slot. Fails (returns false) if the value has effects and
    // does not already dominate `into`. With canMutate = false this is a pure
    // feasibility check that performs no mutation.
    bool hoistValue(BasicBlock* into, Value* value, bool canMutate = true)
    {
        // Already available at `into`; nothing to move.
        if (m_proc.dominators().dominates(value->owner, into))
            return true;

        Effects effects = value->effects();
        if (effects != Effects::none()) {
            if (verbose)
                dataLogLn("Cannot hoist ", *value, " into ", *into, " since it has effects");
            return false;
        }

        // Dry-run the children first so we fail before mutating anything.
        for (Value* child : value->children()) {
            if (!hoistValue(into, child, false))
                return false;
        }

        if (canMutate) {
            for (Value* child : value->children())
                hoistValue(into, child);

            // Replace the old slot with a Nop and append the value to `into`.
            value->owner->at(value->index()) = m_proc.add<Value>(Nop, Void, value->origin());
            into->appendNonTerminal(value);
        }

        return true;
    }

    // Runs the phase to fixpoint. After any successful rewrite the CFG is
    // invalidated, so we restart the block walk from scratch.
    bool run()
    {
        if (verbose)
            dataLogLn("ReduceLoopStrength examining proc: ", m_proc);
        bool result = false;

        do {
            m_changed = false;

            for (BasicBlock* block : m_proc.blocksInPreOrder()) {
                m_block = block;

                for (m_index = 0; m_index < block->size(); ++m_index) {
                    m_value = m_block->at(m_index);
                    m_value->performSubstitution();
                    reduceByteCopyLoopsToMemcpy();
                    m_insertionSet.execute(m_block);
                    m_changed |= m_blockInsertionSet.execute();

                    if (m_changed) {
                        // The CFG was edited; recompute analyses and restart.
                        m_proc.resetReachability();
                        m_proc.invalidateCFG();
                        m_proc.resetValueOwners();
                        result = true;
                        break;
                    }
                }

                if (m_changed)
                    break;
            }
        } while (m_changed && m_proc.optLevel() >= 2);

        if (result && verbose)
            dataLogLn("ReduceLoopStrength produced proc: ", m_proc);

        return result;
    }
#else
    // Non-x86_64 targets: fastForwardCopy32 is unavailable, so do nothing.
    bool run()
    {
        return false;
    }
#endif

    Procedure& m_proc;
    InsertionSet m_insertionSet;
    BlockInsertionSet m_blockInsertionSet;
    BasicBlock* m_root { nullptr }; // Entry block of the procedure.
    BasicBlock* m_block { nullptr }; // Block currently being scanned.
    unsigned m_index { 0 }; // Index of m_value within m_block.
    Value* m_value { nullptr }; // Candidate value for pattern matching.
    bool m_changed { false }; // Set when a rewrite happened this pass.
    PhiChildren m_phiChildren;
};
538
539bool reduceLoopStrength(Procedure& proc)
540{
541 PhaseScope phaseScope(proc, "reduceLoopStrength");
542 return ReduceLoopStrength(proc).run();
543}
544
545} } // namespace JSC::B3
546
547#endif // ENABLE(B3_JIT)
548