Skip to content

Commit 08e52f9

Browse files
authored
Bysyncify: optimize better by coalescing before instrumenting control flow (#2183)
This results in better code sizes on many testcases, sometimes much better. For example, on SQLite the 150K function has only 27 locals instead of 3,874 which it had before (!). This also reduces total code size on SQLite by 15%. The key issue is that after instrumenting control flow we have a lot bigger live ranges. This must be done rather carefully, as we need to introduce some temp locals early on (for breaking up ifs, for call return values, etc.).
1 parent e63fe0b commit 08e52f9

6 files changed

Lines changed: 1848 additions & 927 deletions

src/passes/Bysyncify.cpp

Lines changed: 138 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -236,9 +236,54 @@ enum class DataOffset { BStackPos = 0, BStackEnd = 4 };
236236

237237
const auto STACK_ALIGN = 4;
238238

239+
// A helper class for managing fake global names. Creates the globals and
240+
// provides mappings for using them.
241+
class GlobalHelper {
242+
Module& module;
243+
244+
public:
245+
GlobalHelper(Module& module) : module(module) {
246+
map[i32] = "bysyncify_fake_call_global_i32";
247+
map[i64] = "bysyncify_fake_call_global_i64";
248+
map[f32] = "bysyncify_fake_call_global_f32";
249+
map[f64] = "bysyncify_fake_call_global_f64";
250+
Builder builder(module);
251+
for (auto& pair : map) {
252+
auto type = pair.first;
253+
auto name = pair.second;
254+
rev[name] = type;
255+
module.addGlobal(builder.makeGlobal(
256+
name, type, LiteralUtils::makeZero(type, module), Builder::Mutable));
257+
}
258+
}
259+
260+
~GlobalHelper() {
261+
for (auto& pair : map) {
262+
auto name = pair.second;
263+
module.removeGlobal(name);
264+
}
265+
}
266+
267+
Name getName(Type type) { return map.at(type); }
268+
269+
Type getTypeOrNone(Name name) {
270+
auto iter = rev.find(name);
271+
if (iter != rev.end()) {
272+
return iter->second;
273+
}
274+
return none;
275+
}
276+
277+
private:
278+
std::map<Type, Name> map;
279+
std::map<Name, Type> rev;
280+
};
281+
239282
// Analyze the entire module to see which calls may change the state, that
240283
// is, start an unwind or rewind), either in itself or in something called
241284
// by it.
285+
// Handles global module management, needed from the various parts of this
286+
// transformation.
242287
class ModuleAnalyzer {
243288
Module& module;
244289
bool canIndirectChangeState;
@@ -268,7 +313,8 @@ class ModuleAnalyzer {
268313
ModuleAnalyzer(Module& module,
269314
std::function<bool(Name, Name)> canImportChangeState,
270315
bool canIndirectChangeState)
271-
: module(module), canIndirectChangeState(canIndirectChangeState) {
316+
: module(module), canIndirectChangeState(canIndirectChangeState),
317+
globals(module) {
272318
// Scan to see which functions can directly change the state.
273319
// Also handle the bysyncify imports, removing them (as we will implement
274320
// them later), and replace calls to them with calls to the later proper
@@ -429,6 +475,8 @@ class ModuleAnalyzer {
429475
}
430476
return walker.canChangeState;
431477
}
478+
479+
GlobalHelper globals;
432480
};
433481

434482
// Checks if something performs a call: either a direct or indirect call,
@@ -487,8 +535,9 @@ struct BysyncifyFlow : public Pass {
487535
BysyncifyFlow(ModuleAnalyzer* analyzer) : analyzer(analyzer) {}
488536

489537
void
490-
runOnFunction(PassRunner* runner, Module* module_, Function* func) override {
538+
runOnFunction(PassRunner* runner, Module* module_, Function* func_) override {
491539
module = module_;
540+
func = func_;
492541
// If the function cannot change our state, we have nothing to do -
493542
// we will never unwind or rewind the stack here.
494543
if (!analyzer->needsInstrumentation(func)) {
@@ -518,6 +567,7 @@ struct BysyncifyFlow : public Pass {
518567
std::unique_ptr<BysyncifyBuilder> builder;
519568

520569
Module* module;
570+
Function* func;
521571

522572
// Each call in the function has an index, noted during unwind and checked
523573
// during rewind.
@@ -527,12 +577,15 @@ struct BysyncifyFlow : public Pass {
527577
if (!analyzer->canChangeState(curr)) {
528578
return makeMaybeSkip(curr);
529579
}
530-
// The IR is in flat form, which makes this much simpler. We basically
531-
// need to add skips to avoid code when rewinding, and checks around calls
532-
// with unwinding/rewinding support.
580+
// The IR is in flat form, which makes this much simpler: there are no
581+
// unnecessarily nested side effects or control flow, so we can add
582+
// skips for rewinding in an easy manner, putting a single if around
583+
// whole chunks of code. Also calls are separated out so that it is easy
584+
// to add the necessary checks for them for unwinding/rewinding support.
533585
//
534586
// Aside from that, we must "linearize" all control flow so that we can
535-
// reach the right part when unwinding. For example, for an if we do this:
587+
// reach the right part when rewinding, which is done by always skipping
588+
// forward. For example, for an if we do this:
536589
//
537590
// if (condition()) {
538591
// side1();
@@ -595,25 +648,28 @@ struct BysyncifyFlow : public Pass {
595648
// must be in one of the children.
596649
assert(!analyzer->canChangeState(iff->condition));
597650
// We must linearize this, which means we pass through both arms if we
598-
// are rewinding. This is pretty simple as in flat form the if condition
599-
// is either a const or a get, so easily copyable.
600-
// Start with the first arm, for which we reuse the original if.
601-
auto* otherArm = iff->ifFalse;
602-
iff->ifFalse = nullptr;
603-
auto* originalCondition = iff->condition;
651+
// are rewinding.
652+
if (!iff->ifFalse) {
653+
iff->condition = builder->makeBinary(
654+
OrInt32, iff->condition, builder->makeStateCheck(State::Rewinding));
655+
iff->ifTrue = process(iff->ifTrue);
656+
iff->finalize();
657+
return iff;
658+
}
659+
auto conditionTemp = builder->addVar(func, i32);
660+
iff->condition = builder->makeLocalTee(conditionTemp, iff->condition);
604661
iff->condition = builder->makeBinary(
605-
OrInt32, originalCondition, builder->makeStateCheck(State::Rewinding));
662+
OrInt32, iff->condition, builder->makeStateCheck(State::Rewinding));
606663
iff->ifTrue = process(iff->ifTrue);
664+
auto* otherArm = iff->ifFalse;
665+
iff->ifFalse = nullptr;
607666
iff->finalize();
608-
if (!otherArm) {
609-
return iff;
610-
}
611667
// Add support for the second arm as well.
612668
auto* otherIf = builder->makeIf(
613669
builder->makeBinary(
614670
OrInt32,
615-
builder->makeUnary(
616-
EqZInt32, ExpressionManipulator::copy(originalCondition, *module)),
671+
builder->makeUnary(EqZInt32,
672+
builder->makeLocalGet(conditionTemp, i32)),
617673
builder->makeStateCheck(State::Rewinding)),
618674
process(otherArm));
619675
otherIf->finalize();
@@ -640,27 +696,48 @@ struct BysyncifyFlow : public Pass {
640696
// change the state
641697
assert(doesCall(curr));
642698
assert(curr->type == none);
699+
// The case of a set is tricky: we must *not* execute the set when
700+
// unwinding, since at that point we have a fake value for the return,
701+
// and if we applied it to the local, it would end up saved and then
702+
// potentially used later - and with coalescing, that may interfere
703+
// with other things. Note also that at this point in the process we
704+
// have not yet written the load saving/loading code, so the optimizer
705+
// doesn't see that locals will have another use at the beginning and
706+
// end of the function. We don't do that yet because we don't want it
707+
// to force the final number of locals to be too high, but we also
708+
// must be careful to never touch a local unnecessarily to avoid bugs.
709+
// To implement this, write to a fake global; we'll fix it up after
710+
// BysyncifyLocals locals adds local saving/restoring.
711+
auto* set = curr->dynCast<LocalSet>();
712+
if (set) {
713+
auto name = analyzer->globals.getName(set->value->type);
714+
curr = builder->makeGlobalSet(name, set->value);
715+
set->value = builder->makeGlobalGet(name, set->value->type);
716+
}
717+
// Instrument the call itself (or, if a drop, the drop+call).
643718
auto index = callIndex++;
644719
// Execute the call, if either normal execution, or if rewinding and this
645720
// is the right call to go into.
646721
// TODO: we can read the next call index once in each function (but should
647722
// avoid saving/restoring that local later)
648-
return builder->makeIf(
723+
curr = builder->makeIf(
649724
builder->makeIf(builder->makeStateCheck(State::Normal),
650725
builder->makeConst(Literal(int32_t(1))),
651726
makeCallIndexPeek(index)),
652-
builder->makeSequence(curr, makePossibleUnwind(index)));
727+
builder->makeSequence(curr, makePossibleUnwind(index, set)));
728+
return curr;
653729
}
654730

655-
Expression* makePossibleUnwind(Index index) {
731+
Expression* makePossibleUnwind(Index index, Expression* ifNotUnwinding) {
656732
// In this pass we emit an "unwind" as a call to a fake intrinsic. We
657733
// will implement it in the later pass. (We can't create the unwind block
658734
// target here, as the optimizer would remove it later; we can only add
659735
// it when we add its contents, later.)
660736
return builder->makeIf(
661737
builder->makeStateCheck(State::Unwinding),
662738
builder->makeCall(
663-
BYSYNCIFY_UNWIND, {builder->makeConst(Literal(int32_t(index)))}, none));
739+
BYSYNCIFY_UNWIND, {builder->makeConst(Literal(int32_t(index)))}, none),
740+
ifNotUnwinding);
664741
}
665742

666743
Expression* makeCallIndexPeek(Index index) {
@@ -707,6 +784,29 @@ struct BysyncifyLocals : public WalkerPass<PostWalker<BysyncifyLocals>> {
707784
}
708785
}
709786

787+
void visitGlobalSet(GlobalSet* curr) {
788+
auto type = analyzer->globals.getTypeOrNone(curr->name);
789+
if (type != none) {
790+
replaceCurrent(
791+
builder->makeLocalSet(getFakeCallLocal(type), curr->value));
792+
}
793+
}
794+
795+
void visitGlobalGet(GlobalGet* curr) {
796+
auto type = analyzer->globals.getTypeOrNone(curr->name);
797+
if (type != none) {
798+
replaceCurrent(builder->makeLocalGet(getFakeCallLocal(type), type));
799+
}
800+
}
801+
802+
Index getFakeCallLocal(Type type) {
803+
auto iter = fakeCallLocals.find(type);
804+
if (iter != fakeCallLocals.end()) {
805+
return iter->second;
806+
}
807+
return fakeCallLocals[type] = builder->addVar(getFunction(), type);
808+
}
809+
710810
void doWalkFunction(Function* func) {
711811
// If the function cannot change our state, we have nothing to do -
712812
// we will never unwind or rewind the stack here.
@@ -763,6 +863,7 @@ struct BysyncifyLocals : public WalkerPass<PostWalker<BysyncifyLocals>> {
763863

764864
Index rewindIndex;
765865
Index numPreservableLocals;
866+
std::map<Type, Index> fakeCallLocals;
766867

767868
Expression* makeLocalLoading() {
768869
if (numPreservableLocals == 0) {
@@ -884,13 +985,22 @@ struct Bysyncify : public Pass {
884985
PassRunner runner(module);
885986
runner.add("flatten");
886987
// Dce is useful here, since BysyncifyFlow makes control flow conditional,
887-
// which may make unreachable code look reachable. We also do some other
888-
// minimal optimization here in an unconditional way here, to counteract
889-
// the flattening.
988+
// which may make unreachable code look reachable. It also lets us ignore
989+
// unreachable code here.
890990
runner.add("dce");
891-
runner.add("simplify-locals-nonesting");
892-
runner.add("merge-blocks");
893-
runner.add("vacuum");
991+
if (optimize) {
992+
// Optimizing before BsyncifyFlow is crucial, especially coalescing,
993+
// because the flow changes add many branches, break up if-elses, etc.,
994+
// all of which extend the live ranges of locals. In other words, it is
995+
// not possible to coalesce well afterwards.
996+
runner.add("simplify-locals-nonesting");
997+
runner.add("reorder-locals");
998+
runner.add("coalesce-locals");
999+
runner.add("simplify-locals-nonesting");
1000+
runner.add("reorder-locals");
1001+
runner.add("merge-blocks");
1002+
runner.add("vacuum");
1003+
}
8941004
runner.add<BysyncifyFlow>(&analyzer);
8951005
runner.setIsNested(true);
8961006
runner.setValidateGlobally(false);

0 commit comments

Comments
 (0)