// Demonstration of loop customization in C++. John Rose, 2/2015 #include #include // To run, use these commands: // $ c++ -O splitvtbl.cpp -o splitvtbl && ./splitvtbl // $ c++ -O splitvtbl.cpp -S -o - -fno-rtti | c++filt > splitvtbl.s // Basic ideas: // - loops come in multiple shapes (this is a virtual call, on IntLoop) // - loops obtain successive values various ways // - loops accumulate their results in various ways (another virtual, on IntSink) // - the shape of a loop and its disposal operation are independently variable // - for any specific shape and sink, we want fully devirtualized loop code // The independent variability of loop components (shape, sink) creates // a requirement for *multiple dispatch* and *customization* of loops. // If this is done right (as shown below), loop logic can be integrated // with sink logic producing a fully customized code for both at once. // For example, running a RepeatingIntLoop on an AddingIntSink repeatedly // adds a constant value to an accumulator. If this loop is customized // properly, the optimizer can replace the loop by a single multiplication // (constant loop value times loop trip count, added into the sink). // Simplifying assumptions, for this demonstration: // - a loop iterates over a sequence of int values and return an int // - a loop's data source comes from the loop itself (no arrays, etc.) // - there is no separately pluggable "filter" operation // - thus, a loop is characterized only by (loop shape, data sink) // Key techniques: // - each loop type defines its behavior as a function (run_impl) // - each sink type defines its behavior as functions (accept_impl, result_impl) // - in order to customize each run_impl to each sink type, run_impl is a template // - in order to get virtual access to run_impl it is wired to *Loop::run_virt // - macros make it easy to wire up every *_impl to its *_virt and *_api // Naming conventions: // In the various design patterns used here, methods have three roles, // a private implementation, a public API point, and a protected virtual // override point. We clearly separate these roles by using the suffixes // "impl", "api", and "virt", respectively. // Probably the "api" suffix would be dropped in real use. // In some versions of these patterns, the implementation and/or virtual // method could be identified (merged) with the API point method. // In the simplest uses of C++, all three roles can be played by // one method. #define non_virtual /*for documentation purposes only*/ // Extensions and improvements, left as exercises for the reader: // 1. The name "Int" can be replaced uniformly by a template parameter. // 2. The name "Int" can be replaced for some sink types by a template // parameter and the various *Loop::run_impl function templates would // carry that parameter. The vtable splitting scheme would have to // choose a finite number of template instances for the various sinks. // 3. The *Loop::run_impl function templates can take additional // arguments, such as a filter (e.g., IntPredicate). In that case, // the variable splitting scheme would have to choose a cross // product (or other set) of selected pairs of sink/filter types. // 4. The actually loop/sink pattern is an awkward toy. // In reality, either the loop should reset the sink and extract // the result value, or else the caller of the loop does both steps. // 5. If C++ supported virtual template functions, the boilerplate // macros could go away, and there would be little or no need for // a predetermined set of types favored for customization. // 6. These techniques can be applied to Java heuristically, // since the JVM manages virtuality under the covers. // 7. Vtable splitting is not just for loop customization. // These patterns can be adapted to support generic arithmetic, // where a binary operator (like addition) needs to take into // account both of its operand types, and devirtualization // on both argument types is needed for reasonable performance. // Separable demonstration of RTTI-like feature. // This adds complexity, but allows customization // to be recovered in some cases where types are weak. #ifndef IntSink_USE_TYPE_ENUM #define IntSink_USE_TYPE_ENUM 1 #endif bool allow_tripwire = false; bool verbose = false; // abstract loop component: a sink // (could have template over Int) // In order to build split vtables, we need an up-front enumeration // of all subtypes of IntSink which we intend to optimize. // These are the sink types we will integrate into customized loops. #define IntSink_EACH_OPTIMIZABLE_SUBTYPE(macro) \ macro(AddingIntSink) \ macro(OringIntSink) \ macro(MaxingIntSink) \ /*end*/ // end of sink types class IntSink { protected: // We do not want clients calling these directly because they are slow. // The public API points will be non-virtual. virtual void accept_virt(int x) = 0; virtual int result_virt() = 0; private: static void tripwire(const char* what) { printf("tripwire: slow virtual call for %s\n", what); if (!allow_tripwire) abort(); } public: // Allow templates to use untyped IntSink, accessing "*_impl" methods via virtuals. non_virtual void accept_api(int x) { tripwire("accept_virt"); accept_virt(x); } non_virtual int result_api() { tripwire("result_virt"); return result_virt(); } // Each subclass of IntLoop must present its "*_impl" methods by calling this macro. #define IntSink_IMPLEMENT_VIRTUALS() \ virtual void accept_virt(int x) { accept_impl(x); } \ virtual int result_virt() { return result_impl(); } \ /* also implement the API points, for regularity */ \ non_virtual void accept_api(int x) { accept_impl(x); } \ non_virtual int result_api() { return result_impl(); } \ /*end*/ #define IntSink_MAYBE_IMPLEMENT_TYPE_ID(IntSink_SUBCLASS) /*empty*/ #if IntSink_USE_TYPE_ENUM // The following RTTI-like stuff is optional: #define IntSink_ENUM(IntSink_SUBCLASS) \ IntSink_SUBCLASS ## _EnumValue #define IntSink_DEFINE_ENUM(IntSink_SUBCLASS) \ IntSink_SUBCLASS ## _EnumValue , enum TypeEnum { Uncustomized = 0, IntSink_EACH_OPTIMIZABLE_SUBTYPE(IntSink_DEFINE_ENUM) }; #undef IntSink_DEFINE_ENUM virtual TypeEnum type_enum() { return Uncustomized; } #undef IntSink_MAYBE_IMPLEMENT_TYPE_ID #define IntSink_MAYBE_IMPLEMENT_TYPE_ID(IntSink_SUBCLASS) \ virtual TypeEnum type_enum() { return IntSink_ENUM(IntSink_SUBCLASS); } #endif //IntSink_USE_TYPE_ENUM }; // three implementations of IntSink class AddingIntSink : public IntSink { int acc; non_virtual void accept_impl(int x) { acc += x; } non_virtual int result_impl() { return acc; } public: AddingIntSink() { acc = 0; } IntSink_IMPLEMENT_VIRTUALS(); IntSink_MAYBE_IMPLEMENT_TYPE_ID(AddingIntSink); }; class OringIntSink : public IntSink { int acc; non_virtual void accept_impl(int x) { acc |= x; } non_virtual int result_impl() { return acc; } public: OringIntSink() { acc = 0; } IntSink_IMPLEMENT_VIRTUALS(); IntSink_MAYBE_IMPLEMENT_TYPE_ID(OringIntSink); }; class MaxingIntSink : public IntSink { int acc; non_virtual void accept_impl(int x) { if (acc < x) acc = x; } non_virtual int result_impl() { return acc; } public: MaxingIntSink() { acc = 0; } IntSink_IMPLEMENT_VIRTUALS(); IntSink_MAYBE_IMPLEMENT_TYPE_ID(MaxingIntSink); }; // this guy is not mentioned in IntSink_EACH_OPTIMIZABLE_SUBTYPE class UncustomizedIntSink : public IntSink { AddingIntSink sink; // delegate to this real one non_virtual void accept_impl(int x) { sink.accept_api(x); } non_virtual int result_impl() { return sink.result_api(); } public: IntSink_IMPLEMENT_VIRTUALS(); }; // abstract loop, defining a loop shape with some data source // (could have template over Int) class IntLoop { protected: // Define abstract virtual methods for all sink versions: #define IntLoop_DECLARE_ABSTRACT_RUN(IntSink_SUBCLASS) \ virtual int run_virt(IntSink_SUBCLASS& sink) = 0; IntSink_EACH_OPTIMIZABLE_SUBTYPE(IntLoop_DECLARE_ABSTRACT_RUN) IntLoop_DECLARE_ABSTRACT_RUN(IntSink); //declare a slow generic version #undef IntLoop_DECLARE_ABSTRACT_RUN public: // Use dynamic typing when available to avoid tripwire. non_virtual int run_api(IntSink& sink) { #if IntSink_USE_TYPE_ENUM if (verbose) printf("testing run-time type of loop sink...\n"); switch (sink.type_enum()) { #define IntLoop_NARROW_SINK_CASE(IntSink_SUBCLASS) \ case IntSink::IntSink_ENUM(IntSink_SUBCLASS): \ return run_virt((IntSink_SUBCLASS&)sink); IntSink_EACH_OPTIMIZABLE_SUBTYPE(IntLoop_NARROW_SINK_CASE); #undef IntLoop_NARROW_SINK_CASE default: case IntSink::Uncustomized: // to avoid warning printf("run-time type test failed; using uncustomized loop...\n"); break; } #endif //IntSink_USE_TYPE_ENUM return run_virt(sink); // this will hit the tripwire } // support sink subtypes directly: template int run_api(IntSink_SUBCLASS& sink) { return run_virt(sink); } // Each subclass of IntLoop must define a template function like this: /* --- private: template int run_impl(IntSink_SUBCLASS& sink) { for (each iteration of loop) { int value = (compute a loop value); sink.accept_impl(value); } return sink.result_api(); } --- */ // Each subclass of IntLoop must wire up its virtuals by calling this macro. #define IntLoop_IMPLEMENT_VIRTUALS() \ IntLoop_DEFINE_RUN(IntSink) \ IntSink_EACH_OPTIMIZABLE_SUBTYPE(IntLoop_DEFINE_RUN) \ /*end*/ // Private helper macro: #define IntLoop_DEFINE_RUN(IntSink_SUBCLASS) \ virtual int run_virt(IntSink_SUBCLASS& sink) { return run_impl(sink); } }; // concrete loop types: class RepeatingIntLoop : public IntLoop { int count; int value; template int run_impl(IntSink_SUBCLASS& sink) { for (int i = 0; i < count; i++) { sink.accept_api(value); } return sink.result_api(); } IntLoop_IMPLEMENT_VIRTUALS(); public: RepeatingIntLoop(int count, int value) { this->count = count; this->value = value; } }; class CountingIntLoop : public IntLoop { int count; template int run_impl(IntSink_SUBCLASS& sink) { for (int i = 0; i < count; i++) { sink.accept_api(i); } return sink.result_api(); } IntLoop_IMPLEMENT_VIRTUALS(); public: CountingIntLoop(int count) { this->count = count; } }; class ArithmeticSequenceIntLoop : public IntLoop { int count; int step; int start; template int run_impl(IntSink_SUBCLASS& sink) { int value = start; for (int i = 0; i < count; i++) { sink.accept_api(value); value += step; } return sink.result_api(); } IntLoop_IMPLEMENT_VIRTUALS(); public: ArithmeticSequenceIntLoop(int count, int step, int start) { this->count = count; this->step = step; this->start = start; } }; // OK, let's use a more realistic data source... class ArrayOfIntLoop : public IntLoop { int count; int* array; template int run_impl(IntSink_SUBCLASS& sink) { for (int i = 0; i < count; i++) { sink.accept_api(array[i]); } return sink.result_api(); } IntLoop_IMPLEMENT_VIRTUALS(); public: ArrayOfIntLoop(int count, int* array) { this->count = count; this->array = array; } }; // end of concrete loop types // matrix testing... // test one element of the matrix template void test_loop(IntLoop& loop, IntSink_SUBCLASS& sink, int exp_res) { int act_res = loop.run_api(sink); bool fail = (act_res != exp_res); if (fail | verbose) { printf("%d = %d\n", exp_res, act_res); } if (fail) { printf("unexpected return value; aborting...\n"); abort(); } } // test one row of the matrix void test_loop_with_add_or_max(IntLoop& loop, int add_res, int or_res, int max_res) { for (int dynamic_typing = 0; dynamic_typing <= 1; dynamic_typing++) { #if !IntSink_USE_TYPE_ENUM if (dynamic_typing) break; // just kidding #endif { AddingIntSink sink; if (!dynamic_typing) test_loop(loop, sink, add_res); else test_loop(loop, (IntSink&)sink, add_res); } { OringIntSink sink; if (!dynamic_typing) test_loop(loop, sink, or_res); else test_loop(loop, (IntSink&)sink, or_res); } { MaxingIntSink sink; if (!dynamic_typing) test_loop(loop, sink, max_res); else test_loop(loop, (IntSink&)sink, max_res); } } } // test the whole matrix void test_all_loops() { { RepeatingIntLoop loop(5, 3); test_loop_with_add_or_max(loop, 15, 3, 3); } { CountingIntLoop loop(5); test_loop_with_add_or_max(loop, 10, 7, 4); } { ArithmeticSequenceIntLoop loop(5, 4, 1); test_loop_with_add_or_max(loop, 45, 29, 17); } { int a[] = { 1, 5, 4, 17, 16 }; ArrayOfIntLoop loop(5, a); test_loop_with_add_or_max(loop, 43, 21, 17); } } int main(int ac, char** av) { test_all_loops(); printf("fast loop tests ran successfully\n"); // test an uncustomized loop: printf("testing slow loop; stand by for tripwire noise...\n"); allow_tripwire = true; { int array[] = { 1000, 42 }; ArrayOfIntLoop loop(2, array); UncustomizedIntSink sink; test_loop(loop, sink, 1000 + 42); /* --- expected noise: --- tripwire: slow virtual call for accept_virt tripwire: slow virtual call for accept_virt tripwire: slow virtual call for result_virt */ } #if !IntSink_USE_TYPE_ENUM printf("testing customization works even if disabled by a weak type...\n"); { int array[] = { 1028, 4 }; ArrayOfIntLoop loop(2, array); OringIntSink sink; test_loop(loop, (IntSink&)sink, 1028 | 4); /* --- expected noise: --- tripwire: slow virtual call for accept_virt tripwire: slow virtual call for accept_virt tripwire: slow virtual call for result_virt */ } #endif //IntSink_USE_TYPE_ENUM return 0; }