LoopNest.h
/** This file defines the LoopNest, which is our
 * representation of a Halide schedule, and contains methods to
 * generate candidates for scheduling as well as extract a
 * featurization that can be used to cost each candidate. */

#ifndef LOOP_NEST_H
#define LOOP_NEST_H

#include "FunctionDAG.h"
#include "PerfectHashMap.h"
#include <map>
#include <set>
#include <utility>
#include <vector>

namespace Halide {
namespace Internal {
namespace Autoscheduler {

template<typename T>
using NodeMap = PerfectHashMap<FunctionDAG::Node, T>;

template<typename T>
using StageMap = PerfectHashMap<FunctionDAG::Node::Stage, T>;

bool may_subtile();

// Given a multi-dimensional box of dimensionality d, generate a list
// of candidate tile sizes for it, logarithmically spacing the sizes
// using the given factor. If 'allow_splits' is false, every dimension
// must either be one, or the full extent of the box. This function is
// used to generate candidate tilings when tiling for
// producer-consumer fusion, or tiling for parallelism.
std::vector<std::vector<int64_t>> generate_tilings(const vector<int64_t> &s, int d, int factor, bool allow_splits);

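// Illustrative sketch (not part of this header): assuming, as elsewhere in
// this autoscheduler, that callers pass the index of the last dimension as
// 'd', candidate tilings for a 2-D box of extents {128, 3} could be
// enumerated like so:
//
//   std::vector<int64_t> extents = {128, 3};
//   auto tilings = generate_tilings(extents, (int)extents.size() - 1,
//                                   /*factor=*/2, /*allow_splits=*/true);
//   // Each entry of 'tilings' is one candidate, with one value per
//   // dimension, roughly logarithmically spaced by the given factor.
//   // With allow_splits = false, each dimension is either 1 or its
//   // full extent.
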
struct LoopNest {
    mutable RefCount ref_count;

    // The extents of this loop. Put another way, the number of tiles,
    // not the size of each tile.
    std::vector<int64_t> size;

    // The nodes inside the loop body
    std::vector<IntrusivePtr<const LoopNest>> children;

    // Funcs inlined into this inner loop, and the number of times
    // each is called. Only valid if children is empty.
    NodeMap<int64_t> inlined;

    // Funcs stored inside this loop
    std::set<const FunctionDAG::Node *> store_at;

    // The total bounds required of any given Func over all iterations
    // of this loop. In the paper, this is represented using the
    // little boxes to the left of the loop nest tree figures.
    mutable NodeMap<Bound> bounds;

    // The Func this loop nest belongs to
    const FunctionDAG::Node *node = nullptr;

    // The stage of the Func
    const FunctionDAG::Node::Stage *stage = nullptr;

    // Is this the innermost loop of this func (the SIMD loop)?
    bool innermost = false;

    // Are we permitted to tile this loop?
    bool tileable = false;

    // Is this the parallel outer loop?
    bool parallel = false;

    // What dimension is this Func vectorized over, in terms of the pure args of the Func?
    int vector_dim = -1;

    // Which loop corresponds to the innermost storage dimension and will be vectorized. -1 means none of them.
    int vectorized_loop_index = -1;

    void copy_from(const LoopNest &n);

    static void hash_combine(uint64_t &h, uint64_t next) {
        // From boost
        h ^= (next + 0x9e3779b9 + (h << 6) + (h >> 2));
    }
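
    // Illustrative sketch (not part of this header): a hash over several
    // values is built by folding each one in with hash_combine, e.g.:
    //
    //   uint64_t h = 0;
    //   hash_combine(h, size.size());
    //   for (int64_t s : size) {
    //       hash_combine(h, (uint64_t)s);
    //   }
    //
    // Each call mixes 'next' into 'h' using the boost-style golden-ratio
    // constant 0x9e3779b9.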

    // Hash the loop structure and sizes up to a fixed depth. This is
    // used as the hash function for the coarse-to-fine beam search in
    // the paper.
    void structural_hash(uint64_t &h, int depth) const;

    // How many funcs are scheduled inside this loop level. Used in
    // the structural hash.
    size_t funcs_realized_or_inlined() const {
        size_t count = inlined.size() + store_at.size();
        for (const auto &c : children) {
            count += c->funcs_realized_or_inlined();
        }
        return count;
    }

    // All of a stage's interesting locations in the loop nest. Used to help compute the featurization of a stage.
    struct Sites {
        const LoopNest *compute = nullptr;   // Its containing compute_at site
        const LoopNest *store = nullptr;     // Its containing store_at site
        const LoopNest *produce = nullptr;   // Its own outermost node
        const LoopNest *innermost = nullptr; // Its innermost node - usually a SIMD loop
        const LoopNest *task = nullptr;      // The parallel for loop it belongs to
        bool inlined = false;                // Is the Func inlined?

        // Used for caching features/feature intermediates.
        uint64_t hash_of_producers_stored_at_root;
    };

    // Compute all the sites of interest for each pipeline stage
    void get_sites(StageMap<Sites> &sites,
                   const LoopNest *task = nullptr,
                   const LoopNest *parent = nullptr) const;
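
    // Illustrative sketch (not part of this header): a caller holding the
    // root LoopNest might populate the sites map and then look up a stage.
    // Real callers may also need to pre-size the StageMap; that detail is
    // omitted here.
    //
    //   StageMap<Sites> sites;
    //   root->get_sites(sites);
    //   const Sites &s = sites.get(stage_ptr);  // stage_ptr: a FunctionDAG::Node::Stage*
    //   // s.compute and s.store are the compute_at and store_at loops;
    //   // s.inlined says whether the stage was inlined away instead.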

    // A helper for the working_set_at_task feature. Most features are
    // computed in the recursive pass 'compute_features' below, but
    // this one must be done in a second separate recursive pass.
    void set_working_set_at_task_feature(int64_t working_set,
                                         StageMap<ScheduleFeatures> *features) const {
        for (const auto &c : children) {
            c->set_working_set_at_task_feature(working_set, features);
            features->get(c->stage).working_set_at_task = working_set;
        }
    }

    // Do a recursive walk over the loop nest computing features to feed the cost model.
    void compute_features(const FunctionDAG &dag,
                          const MachineParams &params,
                          const StageMap<Sites> &sites,
                          int64_t instances,
                          int64_t parallelism,
                          const LoopNest *parent,
                          const LoopNest *grandparent,
                          const LoopNest &root,
                          int64_t *working_set,
                          StageMap<ScheduleFeatures> *features,
                          bool use_cached_features) const;

    bool is_root() const {
        // The root is the sole node without a Func associated with
        // it.
        return node == nullptr;
    }

    // Set the region required of a Func at this site.
    const Bound &set_bounds(const FunctionDAG::Node *f, BoundContents *b) const {
        return bounds.emplace(f, b);
    }

    // Get the region required of a Func at this site, from which we
    // know what region would be computed if it were scheduled here,
    // and what its loop nest would be.
    const Bound &get_bounds(const FunctionDAG::Node *f) const;

    // Recursively print a loop nest representation to stderr
    void dump(string prefix, const LoopNest *parent) const;

    // Does this loop nest access the given Func?
    bool calls(const FunctionDAG::Node *f) const;

    // The maximum number of inlined calls to a Func that occur within
    // this loop. Used to prune states that would generate too much code.
    int64_t max_inlined_calls() const;

    // Does this loop nest access an input buffer? Used to select
    // tail strategies when splitting loops. We don't want to read
    // out of bounds on inputs, even if we don't intend to use the
    // values read. It could create annoying assertion failures for
    // the user. It's OK to read out of range of the values computed
    // on internal Funcs though. Allocation bounds inference just pads
    // out the bounds so that it won't fault.
    bool accesses_input_buffer() const;

    // Does this loop nest contain a computation of the given Func?
    bool computes(const FunctionDAG::Node *f) const;

    // Above here most methods query the loop nest. Below we have
    // methods that mutate the loop nest.

    // Inline a Func into all consumers within this loop.
    void inline_func(const FunctionDAG::Node *f);

    // Compute a Func at this site.
    void compute_here(const FunctionDAG::Node *f, bool tileable, int v);

    // Parallelize this loop according to the given tiling.
    IntrusivePtr<const LoopNest> parallelize_in_tiles(const MachineParams &params,
                                                      const vector<int64_t> &tiling,
                                                      const LoopNest *parent) const;

    // Return all possible ways to compute f in tiles somewhere within
    // this loop nest.
    std::vector<IntrusivePtr<const LoopNest>> compute_in_tiles(const FunctionDAG::Node *f,
                                                               const LoopNest *parent,
                                                               const MachineParams &params,
                                                               int v,
                                                               bool in_realization) const;

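    // Illustrative sketch (not part of this header): a search over schedules
    // might enumerate every legal placement of a Func 'f', vectorized over
    // pure dimension 'v', by asking the root loop nest:
    //
    //   std::vector<IntrusivePtr<const LoopNest>> options =
    //       root->compute_in_tiles(f, nullptr, params, v, /*in_realization=*/false);
    //   // Each option is a copy of the nest with 'f' computed (and possibly
    //   // tiled) at some site; each can then be featurized and costed.
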
    // Below here we have methods that apply a schedule to a Halide pipeline.

    // A model of the state of the loop nest of a Func while applying
    // Halide's scheduling directives.

    // Note that StageScheduleState is movable-but-not-copyable thanks
    // to its ostringstream member.
    struct StageScheduleState {
        // How much parallelism do we need to exploit with this Func?
        double num_cores = 0;

        // Which storage dimension is vectorized? We need to reorder it innermost.
        int vector_dim = -1;
        int vectorized_loop_index = -1;

        // The various Vars and RVars used for scheduling a Func.
        struct FuncVar {
            // The top-level var or rvar this was split off from
            VarOrRVar orig;

            // This var.
            VarOrRVar var;

            // Source code to access this Var/RVar. Used for printing
            // valid Halide source for this schedule.
            string accessor;

            // Our estimate of the extent of this var. This is exact
            // when the constant_extent flag is true.
            int64_t extent = 0;

            // Which index in the symbolic loop nest does this var
            // belong to.
            size_t index = 0;

            // Some flags.
            bool innermost_pure_dim = false,
                 outermost = false,
                 parallel = false,
                 exists = false,
                 pure = false,
                 constant_extent = false;

            FuncVar()
                : orig(Var()), var(Var()) {
            }
        };

        // In order from innermost to outermost. Each group of d vars is one tiling level.
        std::vector<FuncVar> vars;

        std::ostringstream schedule_source;
    };

    // Apply the schedule represented by this loop nest to a Halide pipeline.
    void apply(LoopLevel here,
               StageMap<std::unique_ptr<StageScheduleState>> &state_map,
               double num_cores,
               int depth,
               const LoopNest *parent,
               const LoopNest *compute_site) const;
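
    // Illustrative sketch (not part of this header): apply() is expected to
    // issue ordinary Halide scheduling calls (split, reorder, vectorize,
    // parallel, compute_at, ...) for each stage, while appending equivalent
    // source text to that stage's StageScheduleState::schedule_source. The
    // accumulated text for one Func might read roughly like:
    //
    //   f.split(x, x_o, x_i, 8)
    //       .reorder(x_i, y, x_o)
    //       .vectorize(x_i)
    //       .parallel(x_o)
    //       .compute_root();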

    // The below are two feature caches.
    // hash of producers -> StageMap
    mutable std::map<uint64_t, StageMap<StageMap<FeatureIntermediates>>> feature_intermediates_cache;
    // hash of producers -> StageMap
    mutable std::map<uint64_t, StageMap<ScheduleFeatures>> features_cache;

    // Same as copy_from (above) but also copies the two caches.
    void copy_from_including_features(const LoopNest &n);

    // Loops through inlined Funcs and caches the points_computed_minimum (pcm)
    // values found in 'features' into memoized_features.
    void memoize_points_computed_minimum(StageMap<ScheduleFeatures> &memoized_features,
                                         const StageMap<ScheduleFeatures> *features) const;

    // Merges features_to_insert into memoized_features if it does not already exist there.
    void memoize_features(StageMap<ScheduleFeatures> &memoized_features,
                          const StageMap<ScheduleFeatures> *features_to_insert) const;

    // Recalculates working_set from cached features.
    void compute_working_set_from_features(int64_t *working_set,
                                           const StageMap<ScheduleFeatures> *features) const;

    // Features need to be recomputed for inlined Funcs.
    void recompute_inlined_features(const StageMap<Sites> &sites,
                                    StageMap<ScheduleFeatures> *features) const;

    // Create a (hopefully) unique hash of the producers.
    uint64_t compute_hash_of_producers_stored_at_root(const StageMap<Sites> &sites) const;

    // Gather all stages that are producers for any Func in this LoopNest.
    std::vector<std::pair<int, int>> collect_producers(const StageMap<Sites> &sites) const;

    // Collect all stages referenced in this LoopNest.
    void collect_stages(std::set<const FunctionDAG::Node::Stage *> &stages) const;
};

// Find the deepest common ancestor of `a` and `b`.
// `parents` is a map from loop nest to (parent, depth) tuples.
// Assumes that `a` and `b` are found in `parents`, otherwise errors.
const LoopNest *deepest_common_ancestor(const std::map<const LoopNest *, std::pair<const LoopNest *, int>> &parents,
                                        const LoopNest *a, const LoopNest *b);

// Compute the parent and depth of every loop nest node.
// For each child of `here`, stores in `parents` a tuple of (here, depth).
// Recurses on all children of `here`.
void compute_loop_nest_parents(std::map<const LoopNest *, std::pair<const LoopNest *, int>> &parents,
                               const LoopNest *here, int depth);
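
// Illustrative sketch (not part of this header): to relate two loops within
// the same tree, first build the parent map from the root, then query it.
// 'root' here is assumed to be an IntrusivePtr<const LoopNest>, and 'a', 'b'
// pointers to loops somewhere inside it:
//
//   std::map<const LoopNest *, std::pair<const LoopNest *, int>> parents;
//   compute_loop_nest_parents(parents, root.get(), 0);
//   const LoopNest *lca = deepest_common_ancestor(parents, a, b);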

}  // namespace Autoscheduler
}  // namespace Internal
}  // namespace Halide

#endif  // LOOP_NEST_H