Halide 16.0.0
Halide compiler and libraries
Func.h
Go to the documentation of this file.
1#ifndef HALIDE_FUNC_H
2#define HALIDE_FUNC_H
3
4/** \file
5 *
6 * Defines Func - the front-end handle on a halide function, and related classes.
7 */
8
9#include "Argument.h"
10#include "Expr.h"
11#include "JITModule.h"
12#include "Module.h"
13#include "Param.h"
14#include "Pipeline.h"
15#include "RDom.h"
16#include "Target.h"
17#include "Tuple.h"
18#include "Var.h"
19
20#include <map>
21#include <utility>
22
23namespace Halide {
24
25class OutputImageParam;
26class ParamMap;
27
28/** A class that can represent Vars or RVars. Used for reorder calls
29 * which can accept a mix of either. */
30struct VarOrRVar {
31 VarOrRVar(const std::string &n, bool r)
32 : var(n), rvar(n), is_rvar(r) {
33 }
34 VarOrRVar(const Var &v)
35 : var(v), is_rvar(false) {
36 }
37 VarOrRVar(const RVar &r)
38 : rvar(r), is_rvar(true) {
39 }
40 VarOrRVar(const RDom &r)
41 : rvar(RVar(r)), is_rvar(true) {
42 }
43 template<int N>
44 VarOrRVar(const ImplicitVar<N> &u)
45 : var(u), is_rvar(false) {
46 }
47
48 const std::string &name() const {
49 if (is_rvar) {
50 return rvar.name();
51 } else {
52 return var.name();
53 }
54 }
55
56 Var var;
57 RVar rvar;
58 bool is_rvar;
59};
60
61class ImageParam;
62
63namespace Internal {
64class Function;
65struct Split;
66struct StorageDim;
67} // namespace Internal
68
69/** A single definition of a Func. May be a pure or update definition. */
70class Stage {
71 /** Reference to the Function this stage (or definition) belongs to. */
72 Internal::Function function;
73 Internal::Definition definition;
74 /** Indicate which stage the definition belongs to (0 for initial
75 * definition, 1 for first update, etc.). */
76 size_t stage_index;
77 /** Pure Vars of the Function (from the init definition). */
78 std::vector<Var> dim_vars;
79
80 void set_dim_type(const VarOrRVar &var, Internal::ForType t);
81 void set_dim_device_api(const VarOrRVar &var, DeviceAPI device_api);
82 void split(const std::string &old, const std::string &outer, const std::string &inner,
83 const Expr &factor, bool exact, TailStrategy tail);
84 void remove(const std::string &var);
85 Stage &purify(const VarOrRVar &old_name, const VarOrRVar &new_name);
86
87 const std::vector<Internal::StorageDim> &storage_dims() const {
88 return function.schedule().storage_dims();
89 }
90
91 Stage &compute_with(LoopLevel loop_level, const std::map<std::string, LoopAlignStrategy> &align);
92
93public:
94 Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
95 : function(std::move(f)), definition(std::move(d)), stage_index(stage_index) {
96 internal_assert(definition.defined());
97
98 dim_vars.reserve(function.args().size());
99 for (const auto &arg : function.args()) {
100 dim_vars.emplace_back(arg);
101 }
102 internal_assert(definition.args().size() == dim_vars.size());
103 }
104
105 /** Return the current StageSchedule associated with this Stage. For
106 * introspection only: to modify schedule, use the Func interface. */
107 const Internal::StageSchedule &get_schedule() const {
108 return definition.schedule();
109 }
110
111 /** Return a string describing the current var list taking into
112 * account all the splits, reorders, and tiles. */
113 std::string dump_argument_list() const;
114
115 /** Return the name of this stage, e.g. "f.update(2)" */
116 std::string name() const;
117
118 /** Calling rfactor() on an associative update definition of a Func will split
119 * the update into an intermediate which computes the partial results and
120 * replaces the current update definition with a new definition which merges
121 * the partial results. If called on an init/pure definition, this will
122 * throw an error. rfactor() will automatically infer the associative reduction
123 * operator and identity of the operator. If it can't prove the operation
124 * is associative or if it cannot find an identity for that operator, this
125 * will throw an error. In addition, commutativity of the operator is required
126 * if rfactor() is called on the inner dimension but excluding the outer
127 * dimensions.
128 *
129 * rfactor() takes as input 'preserved', which is a list of <RVar, Var> pairs.
130 * The rvars not listed in 'preserved' are removed from the original Func and
131 * are lifted to the intermediate Func. The remaining rvars (the ones in
132 * 'preserved') are made pure in the intermediate Func. The intermediate Func's
133 * update definition inherits all scheduling directives (e.g. split, fuse, etc.)
134 * applied to the original Func's update definition. The loop order of the
135 * intermediate Func's update definition is the same as the original, although
136 * the RVars in 'preserved' are replaced by the new pure Vars. The loop order of the
137 * intermediate Func's init definition from innermost to outermost is the args'
138 * order of the original Func's init definition followed by the new pure Vars.
139 *
140 * The intermediate Func also inherits storage order from the original Func
141 * with the new pure Vars added to the outermost.
142 *
143 * For example, f.update(0).rfactor({{r.y, u}}) would rewrite a pipeline like this:
144 \code
145 f(x, y) = 0;
146 f(x, y) += g(r.x, r.y);
147 \endcode
148 * into a pipeline like this:
149 \code
150 f_intm(x, y, u) = 0;
151 f_intm(x, y, u) += g(r.x, u);
152
153 f(x, y) = 0;
154 f(x, y) += f_intm(x, y, r.y);
155 \endcode
156 *
157 * This has a variety of uses. You can use it to split computation of an associative reduction:
158 \code
159 f(x, y) = 10;
160 RDom r(0, 96);
161 f(x, y) = max(f(x, y), g(x, y, r.x));
162 f.update(0).split(r.x, rxo, rxi, 8).reorder(y, x).parallel(x);
163 f.update(0).rfactor({{rxo, u}}).compute_root().parallel(u).update(0).parallel(u);
164 \endcode
165 *
166 * which is equivalent to:
167 \code
168 parallel for u = 0 to 11:
169 for y:
170 for x:
171 f_intm(x, y, u) = -inf
172 parallel for x:
173 for y:
174 parallel for u = 0 to 11:
175 for rxi = 0 to 7:
176 f_intm(x, y, u) = max(f_intm(x, y, u), g(x, y, 8*u + rxi))
177 for y:
178 for x:
179 f(x, y) = 10
180 parallel for x:
181 for y:
182 for rxo = 0 to 11:
183 f(x, y) = max(f(x, y), f_intm(x, y, rxo))
184 \endcode
185 *
186 */
187 // @{
188 Func rfactor(std::vector<std::pair<RVar, Var>> preserved);
189 Func rfactor(const RVar &r, const Var &v);
190 // @}
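// Illustrative usage sketch for rfactor(): a minimal parallel max-reduction,
// mirroring the example above. The Funcs f and g, Vars x, y, z, u and RVars
// rxo, rxi are hypothetical names chosen for this sketch.
//
//     Var x("x"), y("y"), z("z"), u("u");
//     Func g("g");
//     g(x, y, z) = x + y + z;
//     RDom r(0, 96);
//     Func f("f");
//     f(x, y) = 10;
//     f(x, y) = max(f(x, y), g(x, y, r.x));
//     RVar rxo("rxo"), rxi("rxi");
//     f.update(0).split(r.x, rxo, rxi, 8);
//     Func intm = f.update(0).rfactor(rxo, u);
//     intm.compute_root().update(0).parallel(u);
//     f.realize({64, 64});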
191
192 /** Schedule the iteration over this stage to be fused with another
193 * stage 's' from outermost loop to a given LoopLevel. 'this' stage will
194 * be computed AFTER 's' in the innermost fused dimension. There should not
195 * be any dependencies between those two fused stages. If either of the
196 * stages being fused is a stage of an extern Func, this will throw an error.
197 *
198 * Note that the two stages that are fused together should have the same
199 * exact schedule from the outermost to the innermost fused dimension, and
200 * the stage we are calling compute_with on should not have specializations,
201 * e.g. f2.compute_with(f1, x) is allowed only if f2 has no specializations.
202 *
203 * Also, if a producer is desired to be computed at the fused loop level,
204 * the function passed to the compute_at() needs to be the "parent". Consider
205 * the following code:
206 \code
207 input(x, y) = x + y;
208 f(x, y) = input(x, y);
209 f(x, y) += 5;
210 g(x, y) = x - y;
211 g(x, y) += 10;
212 f.compute_with(g, y);
213 f.update().compute_with(g.update(), y);
214 \endcode
215 *
216 * To compute 'input' at the fused loop level at dimension y, we specify
217 * input.compute_at(g, y) instead of input.compute_at(f, y) since 'g' is
218 * the "parent" for this fused loop (i.e. 'g' is computed first before 'f'
219 * is computed). On the other hand, to compute 'input' at the innermost
220 * dimension of 'f', we specify input.compute_at(f, x) instead of
221 * input.compute_at(g, x) since the x dimension of 'f' is not fused
222 * (only the y dimension is).
223 *
224 * Given the constraints, this has a variety of uses. Consider the
225 * following code:
226 \code
227 f(x, y) = x + y;
228 g(x, y) = x - y;
229 h(x, y) = f(x, y) + g(x, y);
230 f.compute_root();
231 g.compute_root();
232 f.split(x, xo, xi, 8);
233 g.split(x, xo, xi, 8);
234 g.compute_with(f, xo);
235 \endcode
236 *
237 * This is equivalent to:
238 \code
239 for y:
240 for xo:
241 for xi:
242 f(8*xo + xi) = (8*xo + xi) + y
243 for xi:
244 g(8*xo + xi) = (8*xo + xi) - y
245 for y:
246 for x:
247 h(x, y) = f(x, y) + g(x, y)
248 \endcode
249 *
250 * The sizes of the dimensions of the stages computed_with do not have
251 * to match. Consider the following code where 'g' is half the size of 'f':
252 \code
253 Image<int> f_im(size, size), g_im(size/2, size/2);
254 input(x, y) = x + y;
255 f(x, y) = input(x, y);
256 g(x, y) = input(2*x, 2*y);
257 g.compute_with(f, y);
258 input.compute_at(f, y);
259 Pipeline({f, g}).realize({f_im, g_im});
260 \endcode
261 *
262 * This is equivalent to:
263 \code
264 for y = 0 to size-1:
265 for x = 0 to size-1:
266 input(x, y) = x + y;
267 for x = 0 to size-1:
268 f(x, y) = input(x, y)
269 for x = 0 to size/2-1:
270 if (y < size/2):
271 g(x, y) = input(2*x, 2*y)
272 \endcode
273 *
274 * 'align' specifies how the loop iteration of each dimension of the
275 * two stages being fused should be aligned in the fused loop nests
276 * (see LoopAlignStrategy for options). Consider the following loop nests:
277 \code
278 for z = f_min_z to f_max_z:
279 for y = f_min_y to f_max_y:
280 for x = f_min_x to f_max_x:
281 f(x, y, z) = x + y + z
282 for z = g_min_z to g_max_z:
283 for y = g_min_y to g_max_y:
284 for x = g_min_x to g_max_x:
285 g(x, y, z) = x - y - z
286 \endcode
287 *
288 * If no alignment strategy is specified, the following loop nest will be
289 * generated:
290 \code
291 for z = min(f_min_z, g_min_z) to max(f_max_z, g_max_z):
292 for y = min(f_min_y, g_min_y) to max(f_max_y, g_max_y):
293 for x = f_min_x to f_max_x:
294 if (f_min_z <= z <= f_max_z):
295 if (f_min_y <= y <= f_max_y):
296 f(x, y, z) = x + y + z
297 for x = g_min_x to g_max_x:
298 if (g_min_z <= z <= g_max_z):
299 if (g_min_y <= y <= g_max_y):
300 g(x, y, z) = x - y - z
301 \endcode
302 *
303 * Instead, these alignment strategies:
304 \code
305 g.compute_with(f, y, {{z, LoopAlignStrategy::AlignStart}, {y, LoopAlignStrategy::AlignEnd}});
306 \endcode
307 * will produce the following loop nest:
308 \code
309 f_loop_min_z = f_min_z
310 f_loop_max_z = max(f_max_z, (f_min_z - g_min_z) + g_max_z)
311 for z = f_min_z to f_loop_max_z:
312 f_loop_min_y = min(f_min_y, (f_max_y - g_max_y) + g_min_y)
313 f_loop_max_y = f_max_y
314 for y = f_loop_min_y to f_loop_max_y:
315 for x = f_min_x to f_max_x:
316 if (f_loop_min_z <= z <= f_loop_max_z):
317 if (f_loop_min_y <= y <= f_loop_max_y):
318 f(x, y, z) = x + y + z
319 for x = g_min_x to g_max_x:
320 g_shift_z = g_min_z - f_loop_min_z
321 g_shift_y = g_max_y - f_loop_max_y
322 if (g_min_z <= (z + g_shift_z) <= g_max_z):
323 if (g_min_y <= (y + g_shift_y) <= g_max_y):
324 g(x, y + g_shift_y, z + g_shift_z) = x - (y + g_shift_y) - (z + g_shift_z)
325 \endcode
326 *
327 * LoopAlignStrategy::AlignStart on dimension z will shift the loop iteration
328 * of 'g' at dimension z so that its starting value matches that of 'f'.
329 * Likewise, LoopAlignStrategy::AlignEnd on dimension y will shift the loop
330 * iteration of 'g' at dimension y so that its end value matches that of 'f'.
331 */
332 // @{
333 Stage &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
335 Stage &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
337 // @}
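// Illustrative usage sketch for compute_with(), following the f/g/h example
// above; all Func and Var names here are hypothetical.
//
//     Var x("x"), y("y"), xo("xo"), xi("xi");
//     Func f("f"), g("g"), h("h");
//     f(x, y) = x + y;
//     g(x, y) = x - y;
//     h(x, y) = f(x, y) + g(x, y);
//     f.compute_root().split(x, xo, xi, 8);
//     g.compute_root().split(x, xo, xi, 8);
//     g.compute_with(f, xo);
//     h.realize({128, 128});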
338
339 /** Scheduling calls that control how the domain of this stage is
340 * traversed. See the documentation for Func for the meanings. */
341 // @{
342
343 Stage &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
344 Stage &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
345 Stage &serial(const VarOrRVar &var);
348 Stage &unroll(const VarOrRVar &var);
350 Stage &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
351 Stage &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
352 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
353 const VarOrRVar &xo, const VarOrRVar &yo,
354 const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor,
356 Stage &tile(const VarOrRVar &x, const VarOrRVar &y,
357 const VarOrRVar &xi, const VarOrRVar &yi,
358 const Expr &xfactor, const Expr &yfactor,
360 Stage &tile(const std::vector<VarOrRVar> &previous,
361 const std::vector<VarOrRVar> &outers,
362 const std::vector<VarOrRVar> &inners,
363 const std::vector<Expr> &factors,
364 const std::vector<TailStrategy> &tails);
365 Stage &tile(const std::vector<VarOrRVar> &previous,
366 const std::vector<VarOrRVar> &outers,
367 const std::vector<VarOrRVar> &inners,
368 const std::vector<Expr> &factors,
370 Stage &tile(const std::vector<VarOrRVar> &previous,
371 const std::vector<VarOrRVar> &inners,
372 const std::vector<Expr> &factors,
374 Stage &reorder(const std::vector<VarOrRVar> &vars);
375
376 template<typename... Args>
377 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Stage &>::type
378 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
379 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
380 return reorder(collected_args);
381 }
382
384 Stage specialize(const Expr &condition);
385 void specialize_fail(const std::string &message);
386
390
392
394
398
401 const VarOrRVar &thread_x, const VarOrRVar &thread_y,
402 DeviceAPI device_api = DeviceAPI::Default_GPU);
404 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z,
405 DeviceAPI device_api = DeviceAPI::Default_GPU);
406
407 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
409 DeviceAPI device_api = DeviceAPI::Default_GPU);
410
411 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
413 DeviceAPI device_api = DeviceAPI::Default_GPU);
414 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
415 const VarOrRVar &bx, const VarOrRVar &by,
416 const VarOrRVar &tx, const VarOrRVar &ty,
417 const Expr &x_size, const Expr &y_size,
419 DeviceAPI device_api = DeviceAPI::Default_GPU);
420
421 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
422 const VarOrRVar &tx, const VarOrRVar &ty,
423 const Expr &x_size, const Expr &y_size,
425 DeviceAPI device_api = DeviceAPI::Default_GPU);
426
427 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
428 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
429 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
430 const Expr &x_size, const Expr &y_size, const Expr &z_size,
432 DeviceAPI device_api = DeviceAPI::Default_GPU);
433 Stage &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
434 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
435 const Expr &x_size, const Expr &y_size, const Expr &z_size,
437 DeviceAPI device_api = DeviceAPI::Default_GPU);
438
441
443
444 Stage &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
446 Stage &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
448 template<typename T>
449 Stage &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
451 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
452 }
453 // @}
454
455 /** Attempt to get the source file and line where this stage was
456 * defined by parsing the process's own debug symbols. Returns an
457 * empty string if no debug symbols were found or the debug
458 * symbols were not understood. Works on OS X and Linux only. */
459 std::string source_location() const;
460
461 /** Assert that this stage has intentionally been given no schedule, and
462 * suppress the warning about unscheduled update definitions that would
463 * otherwise fire. This counts as a schedule, so calling this twice on the
464 * same Stage will fail the assertion. */
466};
467
468// For backwards compatibility, keep the ScheduleHandle name.
469typedef Stage ScheduleHandle;
470
472
473/** A fragment of front-end syntax of the form f(x, y, z), where x, y,
474 * z are Vars or Exprs. It could be the left hand side of a definition or
475 * an update definition, or it could be a call to a function. We don't know
476 * until we see how this object gets used.
477 */
478class FuncRef {
479 Internal::Function func;
480 int implicit_placeholder_pos;
481 int implicit_count;
482 std::vector<Expr> args;
483 std::vector<Expr> args_with_implicit_vars(const std::vector<Expr> &e) const;
484
485 /** Helper for function update by Tuple. If the function does not
486 * already have a pure definition, init_val will be used as RHS of
487 * each tuple element in the initial function definition. */
488 template<typename BinaryOp>
489 Stage func_ref_update(const Tuple &e, int init_val);
490
491 /** Helper for function update by Expr. If the function does not
492 * already have a pure definition, init_val will be used as RHS in
493 * the initial function definition. */
494 template<typename BinaryOp>
495 Stage func_ref_update(Expr e, int init_val);
496
497public:
498 FuncRef(const Internal::Function &, const std::vector<Expr> &,
499 int placeholder_pos = -1, int count = 0);
500 FuncRef(Internal::Function, const std::vector<Var> &,
501 int placeholder_pos = -1, int count = 0);
502
503 /** Use this as the left-hand-side of a definition or an update definition
504 * (see \ref RDom).
505 */
507
508 /** Use this as the left-hand-side of a definition or an update definition
509 * for a Func with multiple outputs. */
511
512 /** Define a stage that adds the given expression to this Func. If the
513 * expression refers to some RDom, this performs a sum reduction of the
514 * expression over the domain. If the function does not already have a
515 * pure definition, this sets it to zero.
516 */
517 // @{
521 // @}
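// Illustrative usage sketch for operator+=: a windowed sum reduction. The
// Funcs f and g and Var x are hypothetical names for this sketch.
//
//     Var x("x");
//     Func g("g"), f("f");
//     g(x) = x;
//     RDom r(0, 10);
//     f(x) += g(x + r);  // sums g over [x, x+9]; the pure definition
//                        // of f is implicitly initialized to 0
//     Buffer<int> result = f.realize({8});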
522
523 /** Define a stage that adds the negative of the given expression to this
524 * Func. If the expression refers to some RDom, this performs a sum reduction
525 * of the negative of the expression over the domain. If the function does
526 * not already have a pure definition, this sets it to zero.
527 */
528 // @{
532 // @}
533
534 /** Define a stage that multiplies this Func by the given expression. If the
535 * expression refers to some RDom, this performs a product reduction of the
536 * expression over the domain. If the function does not already have a pure
537 * definition, this sets it to 1.
538 */
539 // @{
543 // @}
544
545 /** Define a stage that divides this Func by the given expression.
546 * If the expression refers to some RDom, this performs a product
547 * reduction of the inverse of the expression over the domain. If the
548 * function does not already have a pure definition, this sets it to 1.
549 */
550 // @{
554 // @}
555
556 /* Override the usual assignment operator, so that
557 * f(x, y) = g(x, y) defines f.
558 */
560
561 /** Use this as a call to the function, and not the left-hand-side
562 * of a definition. Only works for single-output Funcs. */
563 operator Expr() const;
564
565 /** When a FuncRef refers to a function that provides multiple
566 * outputs, you can access each output as an Expr using
567 * operator[].
568 */
570
571 /** How many outputs does the function this refers to produce. */
572 size_t size() const;
573
574 /** What function is this calling? */
575 Internal::Function function() const {
576 return func;
577 }
578};
579
580/** Explicit overloads of min and max for FuncRef. These exist to
581 * disambiguate calls to min on FuncRefs when a user has pulled both
582 * Halide::min and std::min into their namespace. */
583// @{
584inline Expr min(const FuncRef &a, const FuncRef &b) {
585 return min(Expr(a), Expr(b));
586}
587inline Expr max(const FuncRef &a, const FuncRef &b) {
588 return max(Expr(a), Expr(b));
589}
590// @}
591
592/** A fragment of front-end syntax of the form f(x, y, z)[index], where x, y,
593 * z are Vars or Exprs. It could be the left hand side of an update
594 * definition, or it could be a call to a function. We don't know
595 * until we see how this object gets used.
596 */
597class FuncTupleElementRef {
598 FuncRef func_ref;
599 std::vector<Expr> args; // args to the function
600 int idx; // Index to function outputs
601
602 /** Helper function that generates a Tuple where element at 'idx' is set
603 * to 'e' and the rest are undef. */
604 Tuple values_with_undefs(const Expr &e) const;
605
606public:
607 FuncTupleElementRef(const FuncRef &ref, const std::vector<Expr> &args, int idx);
608
609 /** Use this as the left-hand-side of an update definition of Tuple
610 * component 'idx' of a Func (see \ref RDom). The function must
611 * already have an initial definition.
612 */
614
615 /** Define a stage that adds the given expression to Tuple component 'idx'
616 * of this Func. The other Tuple components are unchanged. If the expression
617 * refers to some RDom, this performs a sum reduction of the expression over
618 * the domain. The function must already have an initial definition.
619 */
621
622 /** Define a stage that adds the negative of the given expression to Tuple
623 * component 'idx' of this Func. The other Tuple components are unchanged.
624 * If the expression refers to some RDom, this performs a sum reduction of
625 * the negative of the expression over the domain. The function must already
626 * have an initial definition.
627 */
629
630 /** Define a stage that multiplies Tuple component 'idx' of this Func by
631 * the given expression. The other Tuple components are unchanged. If the
632 * expression refers to some RDom, this performs a product reduction of
633 * the expression over the domain. The function must already have an
634 * initial definition.
635 */
637
638 /** Define a stage that divides Tuple component 'idx' of this Func by
639 * the given expression. The other Tuple components are unchanged.
640 * If the expression refers to some RDom, this performs a product
641 * reduction of the inverse of the expression over the domain. The function
642 * must already have an initial definition.
643 */
645
646 /* Override the usual assignment operator, so that
647 * f(x, y)[index] = g(x, y) defines f.
648 */
650
651 /** Use this as a call to Tuple component 'idx' of a Func, and not the
652 * left-hand-side of a definition. */
653 operator Expr() const;
654
655 /** What function is this calling? */
656 Internal::Function function() const {
657 return func_ref.function();
658 }
659
660 /** Return index to the function outputs. */
661 int index() const {
662 return idx;
663 }
664};
665
666namespace Internal {
667class IRMutator;
668} // namespace Internal
669
670/** Helper class for identifying purpose of an Expr passed to memoize.
671 */
672class EvictionKey {
673protected:
674 Expr key;
675 friend class Func;
676
677public:
678 explicit EvictionKey(const Expr &expr = Expr())
679 : key(expr) {
680 }
681};
682
683/** A halide function. This class represents one stage in a Halide
684 * pipeline, and is the unit by which we schedule things. By default
685 * they are aggressively inlined, so you are encouraged to make lots
686 * of little functions, rather than storing things in Exprs. */
687class Func {
688
689 /** A handle on the internal halide function that this
690 * represents */
691 Internal::Function func;
692
693 /** When you make a reference to this function with fewer
694 * arguments than it has dimensions, the argument list is bulked
695 * up with 'implicit' vars with canonical names. This lets you
696 * pass around partially applied Halide functions. */
697 // @{
698 std::pair<int, int> add_implicit_vars(std::vector<Var> &) const;
699 std::pair<int, int> add_implicit_vars(std::vector<Expr> &) const;
700 // @}
701
702 /** The imaging pipeline that outputs this Func alone. */
703 Pipeline pipeline_;
704
705 /** Get the imaging pipeline that outputs this Func alone,
706 * creating it (and freezing the Func) if necessary. */
707 Pipeline pipeline();
708
709 // Helper function for recursive reordering support
710 Func &reorder_storage(const std::vector<Var> &dims, size_t start);
711
712 void invalidate_cache();
713
714public:
715 /** Declare a new undefined function with the given name */
716 explicit Func(const std::string &name);
717
718 /** Declare a new undefined function with the given name.
719 * The function will be constrained to represent Exprs of required_type.
720 * If required_dims is not AnyDims, the function will be constrained to exactly
721 * that many dimensions. */
722 explicit Func(const Type &required_type, int required_dims, const std::string &name);
723
724 /** Declare a new undefined function with the given name.
725 * If required_types is not empty, the function will be constrained to represent
726 * Tuples of the same arity and types. (If required_types is empty, there is no constraint.)
727 * If required_dims is not AnyDims, the function will be constrained to exactly
728 * that many dimensions. */
729 explicit Func(const std::vector<Type> &required_types, int required_dims, const std::string &name);
730
731 /** Declare a new undefined function with an
732 * automatically-generated unique name */
734
735 /** Declare a new function with an automatically-generated unique
736 * name, and define it to return the given expression (which may
737 * not contain free variables). */
738 explicit Func(const Expr &e);
739
740 /** Construct a new Func to wrap an existing, already-defined
741 * Function object. */
743
744 /** Construct a new Func to wrap a Buffer. */
745 template<typename T, int Dims>
747 : Func() {
748 (*this)(_) = im(_);
749 }
750
751 /** Evaluate this function over some rectangular domain and return
752 * the resulting buffer or buffers. Performs compilation if the
753 * Func has not previously been realized and compile_jit has not
754 * been called. If the final stage of the pipeline is on the GPU,
755 * data is copied back to the host before being returned. The
756 * returned Realization should probably be instantly converted to
757 * a Buffer class of the appropriate type. That is, do this:
758 *
759 \code
760 f(x) = sin(x);
761 Buffer<float> im = f.realize(...);
762 \endcode
763 *
764 * If your Func has multiple values, because you defined it using
765 * a Tuple, then casting the result of a realize call to a buffer
766 * or image will produce a run-time error. Instead you should do the
767 * following:
768 *
769 \code
770 f(x) = Tuple(x, sin(x));
771 Realization r = f.realize(...);
772 Buffer<int> im0 = r[0];
773 Buffer<float> im1 = r[1];
774 \endcode
775 *
776 * In Halide formal arguments of a computation are specified using
777 * Param<T> and ImageParam objects in the expressions defining the
778 * computation. The param_map argument to realize allows
779 * specifying a set of per-call parameters to be used for a
780 * specific computation. This method is thread-safe where the
781 * globals used by Param<T> and ImageParam are not. Any parameters
782 * that are not in the param_map are taken from the global values,
783 * so those can continue to be used if they are not changing
784 * per-thread.
785 *
786 * One can explicitly construct a ParamMap and
787 * use its set method to insert Parameter to scalar or Buffer
788 * value mappings. (NOTE: ParamMap is deprecated in Halide 16 and
789 * will be removed in Halide 17. Callers requiring threadsafe JIT
790 * calls should migrate to use compile_to_callable() instead.)
791 *
792 \code
793 Param<int32_t> p(42);
794 ImageParam img(Int(32), 2);
795 f(x, y) = img(x, y) + p;
796
797 Buffer<int32_t> arg_img(10, 10);
798 <fill in arg_img...>
799 ParamMap params;
800 params.set(p, 17);
801 params.set(img, arg_img);
802
803 Target t = get_jit_target_from_environment();
804 Buffer<int32_t> result = f.realize({10, 10}, t, params);
805 \endcode
806 *
807 * Alternatively, an initializer list can be used
808 * directly in the realize call to pass this information:
809 *
810 \code
811 Param<int32_t> p(42);
812 ImageParam img(Int(32), 2);
813 f(x, y) = img(x, y) + p;
814
815 Buffer<int32_t> arg_img(10, 10);
816 <fill in arg_img...>
817
818 Target t = get_jit_target_from_environment();
819 Buffer<int32_t> result = f.realize({10, 10}, t, { { p, 17 }, { img, arg_img } });
820 \endcode
821 *
822 * If the Func cannot be realized into a buffer of the given size
823 * due to scheduling constraints on scattering update definitions,
824 * it will be realized into a larger buffer of the minimum size
825 * possible, and a cropped view at the requested size will be
826 * returned. It is thus not safe to assume the returned buffers
827 * are contiguous in memory. This behavior can be disabled with
828 * the NoBoundsQuery target flag, in which case an error about
829 * writing out of bounds on the output buffer will trigger
830 * instead.
831 *
832 */
833 Realization realize(std::vector<int32_t> sizes = {}, const Target &target = Target(),
835
836 /** Same as above, but takes a custom user-provided context to be
837 * passed to runtime functions. This can be used to pass state to
838 * runtime overrides in a thread-safe manner. A nullptr context is
839 * legal, and is equivalent to calling the variant of realize
840 * that does not take a context. */
842 std::vector<int32_t> sizes = {},
843 const Target &target = Target(),
845
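// Illustrative realize() sketch, assuming hypothetical Funcs f and g and
// Var x; single-valued results convert to a Buffer, Tuple-valued results
// are accessed through a Realization.
//
//     Var x("x");
//     Func f("f");
//     f(x) = sin(cast<float>(x));
//     Buffer<float> im = f.realize({100});
//
//     Func g("g");
//     g(x) = Tuple(x, sin(cast<float>(x)));
//     Realization r = g.realize({100});
//     Buffer<int> im0 = r[0];
//     Buffer<float> im1 = r[1];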
846 /** Evaluate this function into an existing allocated buffer or
847 * buffers. If the buffer is also one of the arguments to the
848 * function, strange things may happen, as the pipeline isn't
849 * necessarily safe to run in-place. If you pass multiple buffers,
850 * they must have matching sizes. This form of realize does *not*
851 * automatically copy data back from the GPU. */
854
855 /** Same as above, but takes a custom user-provided context to be
856 * passed to runtime functions. This can be used to pass state to
857 * runtime overrides in a thread-safe manner. A nullptr context is
858 * legal, and is equivalent to calling the variant of realize
859 * that does not take a context. */
860 void realize(JITUserContext *context,
862 const Target &target = Target(),
864
865 /** For a given size of output, or a given output buffer,
866 * determine the bounds required of all unbound ImageParams
867 * referenced. Communicates the result by allocating new buffers
868 * of the appropriate size and binding them to the unbound
869 * ImageParams.
870 *
871 * See the documentation for Func::realize regarding the
872 * ParamMap. There is one difference in that input Buffer<>
873 * arguments that are being inferred are specified as a pointer to
874 * the Buffer<> in the ParamMap. E.g.
875 *
876 \code
877 Param<int32_t> p(42);
878 ImageParam img(Int(32), 2);
879 f(x, y) = img(x, y) + p;
880
881 Target t = get_jit_target_from_environment();
882 Buffer<> in;
883 f.infer_input_bounds({10, 10}, t, { { img, &in } });
884 \endcode
885 * On return, in will be an allocated buffer of the correct size
886 * to evaluate f over a 10x10 region.
887 */
888 // @{
889 void infer_input_bounds(const std::vector<int32_t> &sizes,
890 const Target &target = get_jit_target_from_environment(),
893 const Target &target = get_jit_target_from_environment(),
895 // @}
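// Illustrative infer_input_bounds() sketch with a hypothetical ImageParam
// img and Func f; after the call, img is bound to a freshly allocated
// buffer large enough to cover the stencil (here 11x11 for a 10x10 output).
//
//     ImageParam img(Int(32), 2);
//     Var x("x"), y("y");
//     Func f("f");
//     f(x, y) = img(x, y) + img(x + 1, y + 1);
//     f.infer_input_bounds({10, 10});
//     Buffer<> bound = img.get();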
896
897 /** Versions of infer_input_bounds that take a custom user context
898 * to pass to runtime functions. */
899 // @{
901 const std::vector<int32_t> &sizes,
902 const Target &target = get_jit_target_from_environment(),
906 const Target &target = get_jit_target_from_environment(),
908 // @}
909 /** Statically compile this function to llvm bitcode, with the
910 * given filename (which should probably end in .bc), type
911 * signature, and C function name (which defaults to the same name
912 * as this halide function). */
913 //@{
914 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
915 const Target &target = get_target_from_environment());
916 void compile_to_bitcode(const std::string &filename, const std::vector<Argument> &,
917 const Target &target = get_target_from_environment());
918 // @}
919
920 /** Statically compile this function to llvm assembly, with the
921 * given filename (which should probably end in .ll), type
922 * signature, and C function name (which defaults to the same name
923 * as this halide function). */
924 //@{
925 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
926 const Target &target = get_target_from_environment());
927 void compile_to_llvm_assembly(const std::string &filename, const std::vector<Argument> &,
928 const Target &target = get_target_from_environment());
929 // @}
930
931 /** Statically compile this function to an object file, with the
932 * given filename (which should probably end in .o or .obj), type
933 * signature, and C function name (which defaults to the same name
934 * as this halide function). You probably don't want to use this
935 * directly; call compile_to_static_library or compile_to_file instead. */
936 //@{
937 void compile_to_object(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
938 const Target &target = get_target_from_environment());
939 void compile_to_object(const std::string &filename, const std::vector<Argument> &,
940 const Target &target = get_target_from_environment());
941 // @}
942
943 /** Emit a header file with the given filename for this
944 * function. The header will define a function with the type
945 * signature given by the second argument, and a name given by the
946 * third. The name defaults to the same name as this halide
947 * function. You don't actually have to have defined this function
948 * yet to call this. You probably don't want to use this directly;
949 * call compile_to_static_library or compile_to_file instead. */
950 void compile_to_header(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name = "",
951 const Target &target = get_target_from_environment());
952
953 /** Statically compile this function to text assembly equivalent
954 * to the object file generated by compile_to_object. This is
955 * useful for checking what Halide is producing without having to
956 * disassemble anything, or if you need to feed the assembly into
957 * some custom toolchain to produce an object file (e.g. iOS) */
958 //@{
959 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &, const std::string &fn_name,
960 const Target &target = get_target_from_environment());
961 void compile_to_assembly(const std::string &filename, const std::vector<Argument> &,
962 const Target &target = get_target_from_environment());
963 // @}
964
965 /** Statically compile this function to C source code. This is
966 * useful for providing fallback code paths that will compile on
967 * many platforms. Vectorization will fail, and parallelization
968 * will produce serial code. */
969 void compile_to_c(const std::string &filename,
970 const std::vector<Argument> &,
971 const std::string &fn_name = "",
972 const Target &target = get_target_from_environment());
973
974 /** Write out an internal representation of lowered code. Useful
975 * for analyzing and debugging scheduling. Can emit html or plain
976 * text. */
977 void compile_to_lowered_stmt(const std::string &filename,
978 const std::vector<Argument> &args,
980 const Target &target = get_target_from_environment());
981
982 /** Write out the loop nests specified by the schedule for this
983 * Function. Helpful for understanding what a schedule is
984 * doing. */
985 void print_loop_nest();
986
987 /** Compile to object file and header pair, with the given
988 * arguments. The name defaults to the same name as this halide
989 * function.
990 */
991 void compile_to_file(const std::string &filename_prefix, const std::vector<Argument> &args,
992 const std::string &fn_name = "",
993 const Target &target = get_target_from_environment());
994
995 /** Compile to static-library file and header pair, with the given
996 * arguments. The name defaults to the same name as this halide
997 * function.
998 */
999 void compile_to_static_library(const std::string &filename_prefix, const std::vector<Argument> &args,
1000 const std::string &fn_name = "",
1001 const Target &target = get_target_from_environment());
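// Illustrative ahead-of-time compilation sketch: a hypothetical brighten
// pipeline with an ImageParam input and a Param gain, compiled to a static
// library plus header named "brighten".
//
//     ImageParam input(UInt(8), 2, "input");
//     Param<float> gain("gain");
//     Var x("x"), y("y");
//     Func brighten("brighten");
//     brighten(x, y) = cast<uint8_t>(min(cast<float>(input(x, y)) * gain, 255.0f));
//     brighten.compile_to_static_library("brighten", {input, gain});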
1002
1003 /** Compile to static-library file and header pair once for each target;
1004 * each resulting function will be considered (in order) via halide_can_use_target_features()
1005 * at runtime, with the first appropriate match being selected for subsequent use.
1006 * This is typically useful for specializations that may vary unpredictably by machine
1007 * (e.g., SSE4.1/AVX/AVX2 on x86 desktop machines).
1008 * All targets must have identical arch-os-bits.
1009 */
1011 const std::vector<Argument> &args,
1012 const std::vector<Target> &targets);
1013
1014 /** Like compile_to_multitarget_static_library(), except that the object files
1015 * are all output as object files (rather than bundled into a static library).
1016 *
1017 * `suffixes` is an optional list of strings to use for as the suffix for each object
1018 * file. If nonempty, it must be the same length as `targets`. (If empty, Target::to_string()
1019 * will be used for each suffix.)
1020 *
1021 * Note that if `targets.size()` > 1, the wrapper code (to select the subtarget)
1022 * will be generated with the filename `${filename_prefix}_wrapper.o`
1023 *
1024 * Note that if `targets.size()` > 1 and `no_runtime` is not specified, the runtime
1025 * will be generated with the filename `${filename_prefix}_runtime.o`
1026 */
1028 const std::vector<Argument> &args,
1029 const std::vector<Target> &targets,
1030 const std::vector<std::string> &suffixes);
1031
1032 /** Store an internal representation of lowered code as a self
1033 * contained Module suitable for further compilation. */
1034 Module compile_to_module(const std::vector<Argument> &args, const std::string &fn_name = "",
1035 const Target &target = get_target_from_environment());
1036
1037 /** Compile and generate multiple target files with single call.
1038 * Deduces target files based on filenames specified in
1039 * output_files map.
1040 */
1041 void compile_to(const std::map<OutputFileType, std::string> &output_files,
1042 const std::vector<Argument> &args,
1043 const std::string &fn_name,
1044 const Target &target = get_target_from_environment());
1045
1046 /** Eagerly jit compile the function to machine code. This
1047 * normally happens on the first call to realize. If you're
1048 * running your halide pipeline inside time-sensitive code and
1049 * wish to avoid including the time taken to compile a pipeline,
1050 * then you can call this ahead of time. Default is to use the Target
1051 * returned from Halide::get_jit_target_from_environment()
1052 */
1054
1055 /** Get a struct containing the currently set custom functions
1056 * used by JIT. This can be mutated. Changes will take effect the
1057 * next time this Func is realized. */
1059
1060 /** Eagerly jit compile the function to machine code and return a callable
1061 * struct that behaves like a function pointer. The calling convention
1062 * will exactly match that of an AOT-compiled version of this Func
1063 * with the same Argument list.
1064 */
1065 Callable compile_to_callable(const std::vector<Argument> &args,
1066 const Target &target = get_jit_target_from_environment());
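// Illustrative compile_to_callable() sketch, reusing the hypothetical
// brighten pipeline from the sketch above (ImageParam input, Param<float>
// gain); the Callable is invoked like a function, inputs first, outputs last.
//
//     Callable c = brighten.compile_to_callable({input, gain});
//     Buffer<uint8_t> in(32, 32), out(32, 32);
//     c(in, 1.5f, out);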
1067
1068 /** Add a custom pass to be used during lowering. It is run after
1069 * all other lowering passes. Can be used to verify properties of
1070 * the lowered Stmt, instrument it with extra code, or otherwise
1071 * modify it. The Func takes ownership of the pass, and will call
1072 * delete on it when the Func goes out of scope. So don't pass a
1073 * stack object, or share pass instances between multiple
1074 * Funcs. */
1075 template<typename T>
1076 void add_custom_lowering_pass(T *pass) {
1077 // Template instantiate a custom deleter for this type, then
1078 // wrap in a lambda. The custom deleter lives in user code, so
1079 // that deletion is on the same heap as construction (I hate Windows).
1080 add_custom_lowering_pass(pass, [pass]() { delete_lowering_pass<T>(pass); });
1081 }
1082
1083 /** Add a custom pass to be used during lowering, with the
1084 * function that will be called to delete it also passed in. Set
1085 * it to nullptr if you wish to retain ownership of the object. */
1086 void add_custom_lowering_pass(Internal::IRMutator *pass, std::function<void()> deleter);
1087
1088 /** Remove all previously-set custom lowering passes */
1090
1091 /** Get the custom lowering passes. */
1092 const std::vector<CustomLoweringPass> &custom_lowering_passes();
1093
1094 /** When this function is compiled, include code that dumps its
1095 * values to a file after it is realized, for the purpose of
1096 * debugging.
1097 *
1098 * If filename ends in ".tif" or ".tiff" (case insensitive) the file
1099 * is in TIFF format and can be read by standard tools. Otherwise, the
1100 * file format is as follows:
1101 *
1102 * All data is in the byte-order of the target platform. First, a
1103 * 20-byte header containing four 32-bit ints, giving the extents
1104 * of the first four dimensions. Dimensions beyond four are
1105 * folded into the fourth. Then, a fifth 32-bit int giving the
1106 * data type of the function. The typecodes are given by: float =
1107 * 0, double = 1, uint8_t = 2, int8_t = 3, uint16_t = 4, int16_t =
1108 * 5, uint32_t = 6, int32_t = 7, uint64_t = 8, int64_t = 9. The
1109 * data follows the header, as a densely packed array of the given
1110 * size and the given type. If given the extension .tmp, this file
1111 * format can be natively read by the program ImageStack. */
1112 void debug_to_file(const std::string &filename);
1113
1114 /** The name of this function, either given during construction,
1115 * or automatically generated. */
1116 const std::string &name() const;
1117
1118 /** Get the pure arguments. */
1119 std::vector<Var> args() const;
1120
1121 /** The right-hand-side value of the pure definition of this
1122 * function. Causes an error if there's no pure definition, or if
1123 * the function is defined to return multiple values. */
1124 Expr value() const;
1125
1126 /** The values returned by this function. An error if the function
1127 * has not been defined. Returns a Tuple with one element for
1128 * functions defined to return a single value. */
1129 Tuple values() const;
1130
1131 /** Does this function have at least a pure definition? */
1132 bool defined() const;
1133
1134 /** Get the left-hand-side of the update definition. An empty
1135 * vector if there's no update definition. If there are
1136 * multiple update definitions for this function, use the
1137 * argument to select which one you want. */
1138 const std::vector<Expr> &update_args(int idx = 0) const;
1139
1140 /** Get the right-hand-side of an update definition. An error if
1141 * there's no update definition. If there are multiple
1142 * update definitions for this function, use the argument to
1143 * select which one you want. */
1144 Expr update_value(int idx = 0) const;
1145
1146 /** Get the right-hand-side of an update definition for
1147 * functions that returns multiple values. An error if there's no
1148 * update definition. Returns a Tuple with one element for
1149 * functions that return a single value. */
1150 Tuple update_values(int idx = 0) const;
1151
1152 /** Get the RVars of the reduction domain for an update definition, if there is
1153 * one. */
1154 std::vector<RVar> rvars(int idx = 0) const;
1155
1156 /** Does this function have at least one update definition? */
1157 bool has_update_definition() const;
1158
1159 /** How many update definitions does this function have? */
1160 int num_update_definitions() const;
1161
1162 /** Is this function an external stage? That is, was it defined
1163 * using define_extern? */
1164 bool is_extern() const;
1165
1166 /** Add an extern definition for this Func. This lets you define a
1167 * Func that represents an external pipeline stage. You can, for
1168 * example, use it to wrap a call to an extern library such as
1169 * fftw. */
1170 // @{
1171 void define_extern(const std::string &function_name,
1172 const std::vector<ExternFuncArgument> &params, Type t,
1173 int dimensionality,
1175 DeviceAPI device_api = DeviceAPI::Host) {
1176 define_extern(function_name, params, t,
1178 device_api);
1179 }
1180
1181 void define_extern(const std::string &function_name,
1182 const std::vector<ExternFuncArgument> &params,
1183 const std::vector<Type> &types, int dimensionality,
1185 define_extern(function_name, params, types,
1187 }
1188
1189 void define_extern(const std::string &function_name,
1190 const std::vector<ExternFuncArgument> &params,
1191 const std::vector<Type> &types, int dimensionality,
1193 DeviceAPI device_api = DeviceAPI::Host) {
1194 define_extern(function_name, params, types,
1196 device_api);
1197 }
1198
1199 void define_extern(const std::string &function_name,
1200 const std::vector<ExternFuncArgument> &params, Type t,
1201 const std::vector<Var> &arguments,
1203 DeviceAPI device_api = DeviceAPI::Host) {
1204 define_extern(function_name, params, std::vector<Type>{t}, arguments,
1205 mangling, device_api);
1206 }
1207
1208 void define_extern(const std::string &function_name,
1209 const std::vector<ExternFuncArgument> &params,
1210 const std::vector<Type> &types,
1211 const std::vector<Var> &arguments,
1213 DeviceAPI device_api = DeviceAPI::Host);
1214 // @}
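// Illustrative define_extern() sketch; "my_extern_stage" is a hypothetical
// C function assumed to follow the extern stage calling convention (roughly
// extern "C" int my_extern_stage(halide_buffer_t *in, halide_buffer_t *out)).
//
//     Buffer<uint8_t> input(32, 32);
//     std::vector<ExternFuncArgument> ext_args;
//     ext_args.emplace_back(input);
//     Func stage("stage");
//     stage.define_extern("my_extern_stage", ext_args, UInt(8), 2);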
1215
1216 /** Get the type(s) of the outputs of this Func.
1217 *
1218 * It is not legal to call type() unless the Func has non-Tuple elements.
1219 *
1220 * If the Func isn't yet defined, and was not specified with required types,
1221 * a runtime error will occur.
1222 *
1223 * If the Func isn't yet defined, but *was* specified with required types,
1224 * the requirements will be returned. */
1225 // @{
1226 const Type &type() const;
1227 const std::vector<Type> &types() const;
1228 // @}
1229
1230 /** Get the number of outputs of this Func. Corresponds to the
1231 * size of the Tuple this Func was defined to return.
1232 * If the Func isn't yet defined, but was specified with required types,
1233 * the number of outputs specified in the requirements will be returned. */
1234 int outputs() const;
1235
1236 /** Get the name of the extern function called for an extern
1237 * definition. */
1238 const std::string &extern_function_name() const;
1239
1240 /** The dimensionality (number of arguments) of this function.
1241 * If the Func isn't yet defined, but was specified with required dimensionality,
1242 * the dimensionality specified in the requirements will be returned. */
1243 int dimensions() const;
1244
1245 /** Construct either the left-hand-side of a definition, or a call
1246 * to a function that happens to only contain vars as
1247 * arguments. If the function has already been defined, and fewer
1248 * arguments are given than the function has dimensions, then
1249 * enough implicit vars are added to the end of the argument list
1250 * to make up the difference (see \ref Var::implicit) */
1251 // @{
1252 FuncRef operator()(std::vector<Var>) const;
1253
1254 template<typename... Args>
1256 operator()(Args &&...args) const {
1257 std::vector<Var> collected_args{std::forward<Args>(args)...};
1258 return this->operator()(collected_args);
1259 }
1260 // @}
1261
1262 /** Either calls to the function, or the left-hand-side of
1263 * an update definition (see \ref RDom). If the function has
1264 * already been defined, and fewer arguments are given than the
1265 * function has dimensions, then enough implicit vars are added to
1266 * the end of the argument list to make up the difference. (see
1267 * \ref Var::implicit)*/
1268 // @{
1269 FuncRef operator()(std::vector<Expr>) const;
1270
1271 template<typename... Args>
1273 operator()(const Expr &x, Args &&...args) const {
1274 std::vector<Expr> collected_args{x, std::forward<Args>(args)...};
1275 return (*this)(collected_args);
1276 }
1277 // @}
1278
1279 /** Creates and returns a new identity Func that wraps this Func. During
1280 * compilation, Halide replaces all calls to this Func done by 'f'
1281 * with calls to the wrapper. If this Func is already wrapped for
1282 * use in 'f', will return the existing wrapper.
1283 *
1284 * For example, g.in(f) would rewrite a pipeline like this:
1285 \code
1286 g(x, y) = ...
1287 f(x, y) = ... g(x, y) ...
1288 \endcode
1289 * into a pipeline like this:
1290 \code
1291 g(x, y) = ...
1292 g_wrap(x, y) = g(x, y)
1293 f(x, y) = ... g_wrap(x, y)
1294 \endcode
1295 *
1296 * This has a variety of uses. You can use it to schedule this
1297 * Func differently in the different places it is used:
1298 \code
1299 g(x, y) = ...
1300 f1(x, y) = ... g(x, y) ...
1301 f2(x, y) = ... g(x, y) ...
1302 g.in(f1).compute_at(f1, y).vectorize(x, 8);
1303 g.in(f2).compute_at(f2, x).unroll(x);
1304 \endcode
1305 *
1306 * You can also use it to stage loads from this Func via some
1307 * intermediate buffer (perhaps on the stack as in
1308 * test/performance/block_transpose.cpp, or in shared GPU memory
1309 * as in test/performance/wrap.cpp). In this case we compute the
1310 * wrapper at tiles of the consuming Funcs like so:
1311 \code
1312 g.compute_root()...
1313 g.in(f).compute_at(f, tiles)...
1314 \endcode
1315 *
1316 * Func::in() can also be used to compute pieces of a Func into a
1317 * smaller scratch buffer (perhaps on the GPU) and then copy them
1318 * into a larger output buffer one tile at a time. See
1319 * apps/interpolate/interpolate.cpp for an example of this. In
1320 * this case we compute the Func at tiles of its own wrapper:
1321 \code
1322 f.in(g).compute_root().gpu_tile(...)...
1323 f.compute_at(f.in(g), tiles)...
1324 \endcode
1325 *
1326 * A similar use of Func::in() wrapping Funcs with multiple update
1327 * stages in a pure wrapper. The following code:
1328 \code
1329 f(x, y) = x + y;
1330 f(x, y) += 5;
1331 g(x, y) = f(x, y);
1332 f.compute_root();
1333 \endcode
1334 *
1335 * Is equivalent to:
1336 \code
1337 for y:
1338 for x:
1339 f(x, y) = x + y;
1340 for y:
1341 for x:
1342 f(x, y) += 5
1343 for y:
1344 for x:
1345 g(x, y) = f(x, y)
1346 \endcode
1347 * Using Func::in(), we can write:
1348 \code
1349 f(x, y) = x + y;
1350 f(x, y) += 5;
1351 g(x, y) = f(x, y);
1352 f.in(g).compute_root();
1353 \endcode
1354 * which instead produces:
1355 \code
1356 for y:
1357 for x:
1358 f(x, y) = x + y;
1359 f(x, y) += 5
1360 f_wrap(x, y) = f(x, y)
1361 for y:
1362 for x:
1363 g(x, y) = f_wrap(x, y)
1364 \endcode
1365 */
1366 Func in(const Func &f);
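// Illustrative Func::in() sketch, assuming hypothetical Funcs g, f1, f2 and
// Vars x, y; each consumer gets its own, separately scheduled wrapper.
//
//     Var x("x"), y("y");
//     Func g("g"), f1("f1"), f2("f2");
//     g(x, y) = x + y;
//     f1(x, y) = g(x, y) * 2;
//     f2(x, y) = g(x, y) + 7;
//     g.compute_root();
//     g.in(f1).compute_at(f1, y).vectorize(x, 8);
//     g.in(f2).compute_at(f2, y).unroll(x, 2);
//     Pipeline({f1, f2}).realize({64, 64});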
1367
1368 /** Create and return an identity wrapper shared by all the Funcs in
1369 * 'fs'. If any of the Funcs in 'fs' already have a custom wrapper,
1370 * this will throw an error. */
1371 Func in(const std::vector<Func> &fs);
1372
1373 /** Create and return a global identity wrapper, which wraps all calls to
1374 * this Func by any other Func. If a global wrapper already exists,
1375 * returns it. The global identity wrapper is only used by callers for
1376 * which no custom wrapper has been specified.
1377 */
1379
1380 /** Similar to \ref Func::in; however, instead of replacing the call to
1381 * this Func with an identity Func that refers to it, this replaces the
1382 * call with a clone of this Func.
1383 *
1384 * For example, f.clone_in(g) would rewrite a pipeline like this:
1385 \code
1386 f(x, y) = x + y;
1387 g(x, y) = f(x, y) + 2;
1388 h(x, y) = f(x, y) - 3;
1389 \endcode
1390 * into a pipeline like this:
1391 \code
1392 f(x, y) = x + y;
1393 f_clone(x, y) = x + y;
1394 g(x, y) = f_clone(x, y) + 2;
1395 h(x, y) = f(x, y) - 3;
1396 \endcode
1397 *
1398 */
1399 //@{
1400 Func clone_in(const Func &f);
1401 Func clone_in(const std::vector<Func> &fs);
1402 //@}
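// Illustrative clone_in() sketch, assuming hypothetical Funcs f, g, h and
// Vars x, y; g reads a separately scheduled clone of f, while h keeps the
// original.
//
//     Var x("x"), y("y");
//     Func f("f"), g("g"), h("h");
//     f(x, y) = x + y;
//     g(x, y) = f(x, y) + 2;
//     h(x, y) = f(x, y) - 3;
//     f.compute_root();
//     f.clone_in(g).compute_root().vectorize(x, 8);
//     Pipeline({g, h}).realize({64, 64});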
1403
1404 /** Declare that this function should be implemented by a call to
1405 * halide_buffer_copy with the given target device API. Asserts
1406 * that the Func has a pure definition which is a simple call to a
1407 * single input, and no update definitions. The wrapper Funcs
1408 * returned by in() are suitable candidates. Consumes all pure
1409 * variables, and rewrites the Func to have an extern definition
1410 * that calls halide_buffer_copy. */
1412
1413 /** Declare that this function should be implemented by a call to
1414 * halide_buffer_copy with a NULL target device API. Equivalent to
1415 * copy_to_device(DeviceAPI::Host). Asserts that the Func has a
1416 * pure definition which is a simple call to a single input, and
1417 * no update definitions. The wrapper Funcs returned by in() are
1418 * suitable candidates. Consumes all pure variables, and rewrites
1419 * the Func to have an extern definition that calls
1420 * halide_buffer_copy.
1421 *
1422 * Note that if the source Func is already valid in host memory,
1423 * this compiles to code that does the minimum number of calls to
1424 * memcpy.
1425 */
1427
1428 /** Split a dimension into inner and outer subdimensions with the
1429 * given names, where the inner dimension iterates from 0 to
1430 * factor-1. The inner and outer subdimensions can then be dealt
1431 * with using the other scheduling calls. It's ok to reuse the old
1432 * variable name as either the inner or outer variable. The final
1433 * argument specifies how the tail should be handled if the split
1434 * factor does not provably divide the extent. */
1435 Func &split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
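// Illustrative split() sketch with a hypothetical Func f and Vars x, xo, xi;
// GuardWithIf handles an extent that the factor does not divide.
//
//     Var x("x"), xo("xo"), xi("xi");
//     Func f("f");
//     f(x) = x * 2;
//     f.split(x, xo, xi, 8, TailStrategy::GuardWithIf).parallel(xo);
//     f.realize({100});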
1436
1437 /** Join two dimensions into a single fused dimension. The fused
1438 * dimension covers the product of the extents of the inner and
1439 * outer dimensions given. */
1440 Func &fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused);
1441
1442 /** Mark a dimension to be traversed serially. This is the default. */
1443 Func &serial(const VarOrRVar &var);
1444
1445 /** Mark a dimension to be traversed in parallel */
1447
1448 /** Split a dimension by the given task_size, and then parallelize the
1449 * outer dimension. This creates parallel tasks that have size
1450 * task_size. After this call, var refers to the outer dimension of
1451 * the split. The inner dimension has a new anonymous name. If you
1452 * wish to mutate it, or schedule with respect to it, do the split
1453 * manually. */
1455
1456 /** Mark a dimension to be computed all-at-once as a single
1457 * vector. The dimension should have constant extent -
1458 * e.g. because it is the inner dimension following a split by a
1459 * constant factor. For most uses of vectorize you want the two
1460 * argument form. The variable to be vectorized should be the
1461 * innermost one. */
1463
1464 /** Mark a dimension to be completely unrolled. The dimension
1465 * should have constant extent - e.g. because it is the inner
1466 * dimension following a split by a constant factor. For most uses
1467 * of unroll you want the two-argument form. */
1468 Func &unroll(const VarOrRVar &var);
1469
1470 /** Split a dimension by the given factor, then vectorize the
1471 * inner dimension. This is how you vectorize a loop of unknown
1472 * size. The variable to be vectorized should be the innermost
1473 * one. After this call, var refers to the outer dimension of the
1474 * split. 'factor' must be an integer. */
1475 Func &vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1476
1477 /** Split a dimension by the given factor, then unroll the inner
1478 * dimension. This is how you unroll a loop of unknown size by
1479 * some constant factor. After this call, var refers to the outer
1480 * dimension of the split. 'factor' must be an integer. */
1481 Func &unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail = TailStrategy::Auto);
1482
1483 /** Statically declare that the range over which a function should
1484 * be evaluated is given by the second and third arguments. This
1485 * can let Halide perform some optimizations. E.g. if you know
1486 * there are going to be 4 color channels, you can completely
1487 * vectorize the color channel dimension without the overhead of
1488 * splitting it up. If bounds inference decides that it requires
1489 * more of this function than the bounds you have stated, a
1490 * runtime error will occur when you try to run your pipeline. */
1491 Func &bound(const Var &var, Expr min, Expr extent);
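// Illustrative bound() sketch with a hypothetical Func f and Vars x, y, c;
// fixing the channel extent to 4 allows the channel loop to be unrolled.
//
//     Var x("x"), y("y"), c("c");
//     Func f("f");
//     f(x, y, c) = x + y + c;
//     f.bound(c, 0, 4).unroll(c);
//     f.realize({64, 64, 4});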
1492
1493 /** Statically declare the range over which the function will be
1494 * evaluated in the general case. This provides a basis for the auto
1495 * scheduler to make trade-offs and scheduling decisions. The auto
1496 * generated schedules might break when the sizes of the dimensions are
1497 * very different from the estimates specified. These estimates are used
1498 * only by the auto scheduler if the function is a pipeline output. */
1499 Func &set_estimate(const Var &var, const Expr &min, const Expr &extent);
1500
1501 /** Set (min, extent) estimates for all dimensions in the Func
1502 * at once; this is equivalent to calling `set_estimate(args()[n], min, extent)`
1503 * repeatedly, but slightly terser. The size of the estimates vector
1504 * must match the dimensionality of the Func. */
1505 Func &set_estimates(const Region &estimates);
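For example (a sketch; the sizes are illustrative only):
\code
Func f;
Var x, y, c;
f(x, y, c) = x + y + c;
// Tell the auto-scheduler the output is typically about 1920x1080 with 3 channels.
f.set_estimates({{0, 1920}, {0, 1080}, {0, 3}});
\endcode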
1506
1507 /** Expand the region computed so that the min coordinate is
1508 * congruent to 'remainder' modulo 'modulus', and the extent is a
1509 * multiple of 'modulus'. For example, f.align_bounds(x, 2) forces
1510 * the min and extent realized to be even, and calling
1511 * f.align_bounds(x, 2, 1) forces the min to be odd and the extent
1512 * to be even. The region computed always contains the region that
1513 * would have been computed without this directive, so no
1514 * assertions are injected.
1515 */
1516 Func &align_bounds(const Var &var, Expr modulus, Expr remainder = 0);
1517
1518 /** Expand the region computed so that the extent is a
1519 * multiple of 'modulus'. For example, f.align_extent(x, 2) forces
1520 * the extent realized to be even. The region computed always contains the
1521 * region that would have been computed without this directive, so no
1522 * assertions are injected. (This is essentially equivalent to align_bounds(),
1523 * but always leaving the min untouched.)
1524 */
1525 Func &align_extent(const Var &var, Expr modulus);
1526
1527 /** Bound the extent of a Func's realization, but not its
1528 * min. This means the dimension can be unrolled or vectorized
1529 * even when its min is not fixed (for example because it is
1530 * compute_at tiles of another Func). This can also be useful for
1531 * forcing a function's allocation to be a fixed size, which often
1532 * means it can go on the stack. */
1533 Func &bound_extent(const Var &var, Expr extent);
1534
1535 /** Split two dimensions at once by the given factors, and then
1536 * reorder the resulting dimensions to be xi, yi, xo, yo from
1537 * innermost outwards. This gives a tiled traversal. */
1538 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1539 const VarOrRVar &xo, const VarOrRVar &yo,
1540 const VarOrRVar &xi, const VarOrRVar &yi,
1541 const Expr &xfactor, const Expr &yfactor,
1542 TailStrategy tail = TailStrategy::Auto);
1543
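A typical tiled schedule might look like this (a sketch with hypothetical names):
\code
Func f;
Var x, y, xo, yo, xi, yi;
f(x, y) = x + y;
// Traverse f in 64x64 tiles, parallelizing over rows of tiles and
// vectorizing along the bottom dimension of each tile.
f.tile(x, y, xo, yo, xi, yi, 64, 64)
    .parallel(yo)
    .vectorize(xi, 8);
\endcode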
1544 /** A shorter form of tile, which reuses the old variable names as
1545 * the new outer dimensions */
1546 Func &tile(const VarOrRVar &x, const VarOrRVar &y,
1547 const VarOrRVar &xi, const VarOrRVar &yi,
1548 const Expr &xfactor, const Expr &yfactor,
1549 TailStrategy tail = TailStrategy::Auto);
1550
1551 /** A more general form of tile, which defines tiles of any dimensionality. */
1552 Func &tile(const std::vector<VarOrRVar> &previous,
1553 const std::vector<VarOrRVar> &outers,
1554 const std::vector<VarOrRVar> &inners,
1555 const std::vector<Expr> &factors,
1556 const std::vector<TailStrategy> &tails);
1557
1558 /** The generalized tile, with a single tail strategy to apply to all vars. */
1559 Func &tile(const std::vector<VarOrRVar> &previous,
1560 const std::vector<VarOrRVar> &outers,
1561 const std::vector<VarOrRVar> &inners,
1562 const std::vector<Expr> &factors,
1563 TailStrategy tail = TailStrategy::Auto);
1564
1565 /** Generalized tiling, reusing the previous names as the outer names. */
1566 Func &tile(const std::vector<VarOrRVar> &previous,
1567 const std::vector<VarOrRVar> &inners,
1568 const std::vector<Expr> &factors,
1569 TailStrategy tail = TailStrategy::Auto);
1570
1571 /** Reorder variables to have the given nesting order, from
1572 * innermost out */
1573 Func &reorder(const std::vector<VarOrRVar> &vars);
1574
1575 template<typename... Args>
1576 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<VarOrRVar, Args...>::value, Func &>::type
1577 reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args) {
1578 std::vector<VarOrRVar> collected_args{x, y, std::forward<Args>(args)...};
1579 return reorder(collected_args);
1580 }
1581
1582 /** Rename a dimension. Equivalent to split with an inner size of one. */
1583 Func &rename(const VarOrRVar &old_name, const VarOrRVar &new_name);
1584
1585 /** Specify that race conditions are permitted for this Func,
1586 * which enables parallelizing over RVars even when Halide cannot
1587 * prove that it is safe to do so. Use this with great caution,
1588 * and only if you can prove to yourself that this is safe, as it
1589 * may result in a non-deterministic routine that returns
1590 * different values at different times or on different machines. */
1591 Func &allow_race_conditions();
1592
1593 /** Issue atomic updates for this Func. This allows parallelization
1594 * on associative RVars. The function throws a compile error when
1595 * Halide fails to prove associativity. Use override_associativity_test
1596 * to disable the associativity test if you believe the function is
1597 * associative or the order of reduction variable execution does not
1598 * matter.
1599 * Halide compiles this into hardware atomic operations whenever possible,
1600 * and falls back to a mutex lock per storage element if it is impossible
1601 * to atomically update.
1602 * There are three possible outcomes of the compiled code:
1603 * atomic add, compare-and-swap loop, and mutex lock.
1604 * For example:
1605 *
1606 * hist(x) = 0;
1607 * hist(im(r)) += 1;
1608 * hist.compute_root();
1609 * hist.update().atomic().parallel();
1610 *
1611 * will be compiled to atomic add operations.
1612 *
1613 * hist(x) = 0;
1614 * hist(im(r)) = min(hist(im(r)) + 1, 100);
1615 * hist.compute_root();
1616 * hist.update().atomic().parallel();
1617 *
1618 * will be compiled to compare-and-swap loops.
1619 *
1620 * arg_max() = {0, im(0)};
1621 * Expr old_index = arg_max()[0];
1622 * Expr old_max = arg_max()[1];
1623 * Expr new_index = select(old_max < im(r), r, old_index);
1624 * Expr new_max = max(im(r), old_max);
1625 * arg_max() = {new_index, new_max};
1626 * arg_max.compute_root();
1627 * arg_max.update().atomic().parallel();
1628 *
1629 * will be compiled to updates guarded by a mutex lock,
1630 * since it is impossible to atomically update two different locations.
1631 *
1632 * Currently the atomic operation is supported by x86, CUDA, and OpenCL backends.
1633 * Compiling to other backends results in a compile error.
1634 * If an operation is compiled into a mutex lock, and is vectorized or is
1635 * compiled to CUDA or OpenCL, it also results in a compile error,
1636 * since a per-element mutex lock on a vectorized operation leads to
1637 * deadlock.
1638 * Vectorization of predicated RVars (through rdom.where()) on the CPU
1639 * is also not yet supported (see https://github.com/halide/Halide/issues/4298).
1640 * 8-bit and 16-bit atomics on GPU are also not supported. */
1641 Func &atomic(bool override_associativity_test = false);
1642
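A compilable variant of the histogram example above might look like the following sketch (the 1-D 8-bit input im and its extent of 1024 are assumptions for illustration):
\code
ImageParam im(UInt(8), 1);
Func hist;
Var x;
RDom r(0, 1024);
hist(x) = 0;
hist(cast<int>(im(r))) += 1;  // cast the 8-bit value to use it as an index
hist.compute_root();
hist.update().atomic().parallel(r);
\endcode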
1643 /** Specialize a Func. This creates a special-case version of the
1644 * Func where the given condition is true. The most effective
1645 * conditions are those of the form param == value, and boolean
1646 * Params. Consider a simple example:
1647 \code
1648 f(x) = x + select(cond, 0, 1);
1649 f.compute_root();
1650 \endcode
1651 * This is equivalent to:
1652 \code
1653 for (int x = 0; x < width; x++) {
1654 f[x] = x + (cond ? 0 : 1);
1655 }
1656 \endcode
1657 * Adding the scheduling directive:
1658 \code
1659 f.specialize(cond)
1660 \endcode
1661 * makes it equivalent to:
1662 \code
1663 if (cond) {
1664 for (int x = 0; x < width; x++) {
1665 f[x] = x;
1666 }
1667 } else {
1668 for (int x = 0; x < width; x++) {
1669 f[x] = x + 1;
1670 }
1671 }
1672 \endcode
1673 * Note that the inner loops have been simplified. In the first
1674 * path Halide knows that cond is true, and in the second path
1675 * Halide knows that it is false.
1676 *
1677 * The specialized version gets its own schedule, which inherits
1678 * every directive made about the parent Func's schedule so far
1679 * except for its specializations. This method returns a handle to
1680 * the new schedule. If you wish to retrieve the specialized
1681 * sub-schedule again later, you can call this method with the
1682 * same condition. Consider the following example of scheduling
1683 * the specialized version:
1684 *
1685 \code
1686 f(x) = x;
1687 f.compute_root();
1688 f.specialize(width > 1).unroll(x, 2);
1689 \endcode
1690 * Assuming for simplicity that width is even, this is equivalent to:
1691 \code
1692 if (width > 1) {
1693 for (int x = 0; x < width/2; x++) {
1694 f[2*x] = 2*x;
1695 f[2*x + 1] = 2*x + 1;
1696 }
1697 } else {
1698 for (int x = 0; x < width; x++) {
1699 f[x] = x;
1700 }
1701 }
1702 \endcode
1703 * For this case, it may be better to schedule the un-specialized
1704 * case instead:
1705 \code
1706 f(x) = x;
1707 f.compute_root();
1708 f.specialize(width == 1); // Creates a copy of the schedule so far.
1709 f.unroll(x, 2); // Only applies to the unspecialized case.
1710 \endcode
1711 * This is equivalent to:
1712 \code
1713 if (width == 1) {
1714 f[0] = 0;
1715 } else {
1716 for (int x = 0; x < width/2; x++) {
1717 f[2*x] = 2*x;
1718 f[2*x + 1] = 2*x + 1;
1719 }
1720 }
1721 \endcode
1722 * This can be a good way to write a pipeline that splits,
1723 * vectorizes, or tiles, but can still handle small inputs.
1724 *
1725 * If a Func has several specializations, the first matching one
1726 * will be used, so the order in which you define specializations
1727 * is significant. For example:
1728 *
1729 \code
1730 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1731 f.specialize(cond1);
1732 f.specialize(cond2);
1733 \endcode
1734 * is equivalent to:
1735 \code
1736 if (cond1) {
1737 for (int x = 0; x < width; x++) {
1738 f[x] = x + a - (cond2 ? c : d);
1739 }
1740 } else if (cond2) {
1741 for (int x = 0; x < width; x++) {
1742 f[x] = x + b - c;
1743 }
1744 } else {
1745 for (int x = 0; x < width; x++) {
1746 f[x] = x + b - d;
1747 }
1748 }
1749 \endcode
1750 *
1751 * Specializations may in turn be specialized, which creates a
1752 * nested if statement in the generated code.
1753 *
1754 \code
1755 f(x) = x + select(cond1, a, b) - select(cond2, c, d);
1756 f.specialize(cond1).specialize(cond2);
1757 \endcode
1758 * This is equivalent to:
1759 \code
1760 if (cond1) {
1761 if (cond2) {
1762 for (int x = 0; x < width; x++) {
1763 f[x] = x + a - c;
1764 }
1765 } else {
1766 for (int x = 0; x < width; x++) {
1767 f[x] = x + a - d;
1768 }
1769 }
1770 } else {
1771 for (int x = 0; x < width; x++) {
1772 f[x] = x + b - (cond2 ? c : d);
1773 }
1774 }
1775 \endcode
1776 * To create a 4-way if statement that simplifies away all of the
1777 * ternary operators above, you could say:
1778 \code
1779 f.specialize(cond1).specialize(cond2);
1780 f.specialize(cond2);
1781 \endcode
1782 * or
1783 \code
1784 f.specialize(cond1 && cond2);
1785 f.specialize(cond1);
1786 f.specialize(cond2);
1787 \endcode
1788 *
1789 * Any prior Func which is compute_at some variable of this Func
1790 * gets separately included in all paths of the generated if
1791 * statement. The Var in the compute_at call must exist in all
1792 * paths, but it may have been generated via a different path of
1793 * splits, fuses, and renames. This can be used somewhat
1794 * creatively. Consider the following code:
1795 \code
1796 g(x, y) = 8*x;
1797 f(x, y) = g(x, y) + 1;
1798 f.compute_root().specialize(cond);
1799 Var g_loop;
1800 f.specialize(cond).rename(y, g_loop);
1801 f.rename(x, g_loop);
1802 g.compute_at(f, g_loop);
1803 \endcode
1804 * When cond is true, this is equivalent to g.compute_at(f,y).
1805 * When it is false, this is equivalent to g.compute_at(f,x).
1806 */
1807 Stage specialize(const Expr &condition);
1808
1809 /** Add a specialization to a Func that always terminates execution
1810 * with a call to halide_error(). By itself, this is of limited use,
1811 * but can be useful to terminate chains of specialize() calls where
1812 * no "default" case is expected (thus avoiding unnecessary code generation).
1813 *
1814 * For instance, say we want to optimize a pipeline to process images
1815 * in planar and interleaved format; we might typically do something like:
1816 \code
1817 ImageParam im(UInt(8), 3);
1818 Func f = do_something_with(im);
1819 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1820 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1821 \endcode
1822 * This code will vectorize along rows for the planar case, and across pixel
1823 * components for the interleaved case... but there is an implicit "else"
1824 * for the unhandled cases, which generates unoptimized code. If we never
1825 * anticipate passing any other sort of images to this, we can streamline
1826 * our code by adding specialize_fail():
1827 \code
1828 ImageParam im(UInt(8), 3);
1829 Func f = do_something_with(im);
1830 f.specialize(im.dim(0).stride() == 1).vectorize(x, 8); // planar
1831 f.specialize(im.dim(2).stride() == 1).reorder(c, x, y).vectorize(c); // interleaved
1832 f.specialize_fail("Unhandled image format");
1833 \endcode
1834 * Conceptually, this produces code like:
1835 \code
1836 if (im.dim(0).stride() == 1) {
1837 do_something_planar();
1838 } else if (im.dim(2).stride() == 1) {
1839 do_something_interleaved();
1840 } else {
1841 halide_error("Unhandled image format");
1842 }
1843 \endcode
1844 *
1845 * Note that calling specialize_fail() terminates the specialization chain
1846 * for a given Func; you cannot create new specializations for the Func
1847 * afterwards (though you can retrieve handles to previous specializations).
1848 */
1849 void specialize_fail(const std::string &message);
1850
1851 /** Tell Halide that the following dimensions correspond to GPU
1852 * thread indices. This is useful if you compute a producer
1853 * function within the block indices of a consumer function, and
1854 * want to control how that function's dimensions map to GPU
1855 * threads. If the selected target is not an appropriate GPU, this
1856 * just marks those dimensions as parallel. */
1857 // @{
1858 Func &gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1859 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1860 Func &gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1861 // @}
1862
1863 /** The given dimension corresponds to the lanes in a GPU
1864 * warp. GPU warp lanes are distinguished from GPU threads by the
1865 * fact that all warp lanes run together in lockstep, which
1866 * permits lightweight communication of data from one lane to
1867 * another. */
1868 Func &gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1869
1870 /** Tell Halide to run this stage using a single gpu thread and
1871 * block. This is not an efficient use of your GPU, but it can be
1872 * useful to avoid copy-back for intermediate update stages that
1873 * touch a very small part of your Func. */
1874 Func &gpu_single_thread(DeviceAPI device_api = DeviceAPI::Default_GPU);
1875
1876 /** Tell Halide that the following dimensions correspond to GPU
1877 * block indices. This is useful for scheduling stages that will
1878 * run serially within each GPU block. If the selected target is
1879 * not ptx, this just marks those dimensions as parallel. */
1880 // @{
1881 Func &gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1882 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1883 Func &gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1884 // @}
1885
1886 /** Tell Halide that the following dimensions correspond to GPU
1887 * block indices and thread indices. If the selected target is not
1888 * ptx, these just mark the given dimensions as parallel. The
1889 * dimensions are consumed by this call, so do all other
1890 * unrolling, reordering, etc first. */
1891 // @{
1892 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api = DeviceAPI::Default_GPU);
1893 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y,
1894 const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api = DeviceAPI::Default_GPU);
1895 Func &gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z,
1896 const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api = DeviceAPI::Default_GPU);
1897 // @}
1898
1899 /** Short-hand for tiling a domain and mapping the tile indices
1900 * to GPU block indices and the coordinates within each tile to
1901 * GPU thread indices. Consumes the variables given, so do all
1902 * other scheduling first. */
1903 // @{
1904 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size,
1905 TailStrategy tail = TailStrategy::Auto,
1906 DeviceAPI device_api = DeviceAPI::Default_GPU);
1907
1908 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size,
1909 TailStrategy tail = TailStrategy::Auto,
1910 DeviceAPI device_api = DeviceAPI::Default_GPU);
1911 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1912 const VarOrRVar &bx, const VarOrRVar &by,
1913 const VarOrRVar &tx, const VarOrRVar &ty,
1914 const Expr &x_size, const Expr &y_size,
1915 TailStrategy tail = TailStrategy::Auto,
1916 DeviceAPI device_api = DeviceAPI::Default_GPU);
1917
1918 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y,
1919 const VarOrRVar &tx, const VarOrRVar &ty,
1920 const Expr &x_size, const Expr &y_size,
1921 TailStrategy tail = TailStrategy::Auto,
1922 DeviceAPI device_api = DeviceAPI::Default_GPU);
1923
1924 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1925 const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz,
1926 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1927 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1928 TailStrategy tail = TailStrategy::Auto,
1929 DeviceAPI device_api = DeviceAPI::Default_GPU);
1930 Func &gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z,
1931 const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz,
1932 const Expr &x_size, const Expr &y_size, const Expr &z_size,
1933 TailStrategy tail = TailStrategy::Auto,
1934 DeviceAPI device_api = DeviceAPI::Default_GPU);
1935 // @}
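For example (a sketch with hypothetical names; requires a GPU-enabled target):
\code
Func f;
Var x, y, xi, yi;
f(x, y) = x + y;
// Launch f on the GPU in 16x16 tiles: blocks index the tiles and
// threads index the coordinates within each tile.
f.gpu_tile(x, y, xi, yi, 16, 16);
\endcode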
1936
1937 /** Schedule for execution on Hexagon. When a loop is marked with
1938 * Hexagon, that loop is executed on a Hexagon DSP. */
1939 Func &hexagon(const VarOrRVar &x = Var::outermost());
1940
1941 /** Prefetch data written to or read from a Func or an ImageParam by a
1942 * subsequent loop iteration, at an optionally specified iteration offset. You may
1943 * specify different vars for the location of the prefetch() instruction
1944 * vs. the location that is being prefetched:
1945 *
1946 * - the first var specified, 'at', indicates the loop in which the prefetch will be placed
1947 * - the second var specified, 'from', determines the var used to find the bounds to prefetch
1948 * (in conjunction with 'offset')
1949 *
1950 * If 'at' and 'from' are distinct vars, then 'from' must be at a nesting level outside 'at.'
1951 * Note that the value for 'offset' applies only to 'from', not 'at'.
1952 *
1953 * The final argument specifies how a prefetch of a region outside the bounds
1954 * should be handled.
1955 *
1956 * For example, consider this pipeline:
1957 \code
1958 Func f, g, h;
1959 Var x, y, z;
1960 f(x, y) = x + y;
1961 g(x, y) = 2 * f(x, y);
1962 h(x, y) = 3 * f(x, y);
1963 \endcode
1964 *
1965 * The following schedule:
1966 \code
1967 f.compute_root();
1968 g.prefetch(f, x, x, 2, PrefetchBoundStrategy::NonFaulting);
1969 h.prefetch(f, x, y, 2, PrefetchBoundStrategy::NonFaulting);
1970 \endcode
1971 *
1972 * will inject a prefetch call into the innermost loop of 'g' and 'h' and generate
1973 * the following loop nest:
1974 \code
1975 for y = ...
1976 for x = ...
1977 f(x, y) = x + y
1978 for y = ..
1979 for x = ...
1980 prefetch(&f[x + 2, y], 1, 16);
1981 g(x, y) = 2 * f(x, y)
1982 for y = ..
1983 for x = ...
1984 prefetch(&f[x, y + 2], 1, 16);
1985 h(x, y) = 3 * f(x, y)
1986 \endcode
1987 *
1988 * Note that the 'from' nesting level need not be adjacent to 'at':
1989 \code
1990 Func f, g;
1991 Var x, y, z, w;
1992 f(x, y, z, w) = x + y + z + w;
1993 g(x, y, z, w) = 2 * f(x, y, z, w);
1994 \endcode
1995 *
1996 * The following schedule:
1997 \code
1998 f.compute_root();
1999 g.prefetch(f, y, w, 2, PrefetchBoundStrategy::NonFaulting);
2000 \endcode
2001 *
2002 * will produce code that prefetches a tile of data:
2003 \code
2004 for w = ...
2005 for z = ...
2006 for y = ...
2007 for x = ...
2008 f(x, y, z, w) = x + y + z + w
2009 for w = ...
2010 for z = ...
2011 for y = ...
2012 for x0 = ...
2013 prefetch(&f[x0, y, z, w + 2], 1, 16);
2014 for x = ...
2015 g(x, y, z, w) = 2 * f(x, y, z, w)
2016 \endcode
2017 *
2018 * Note that calling prefetch() with the same var for both 'at' and 'from'
2019 * is equivalent to calling prefetch() with that var.
2020 */
2021 // @{
2022 Func &prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2023 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2024 Func &prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2025 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf);
2026 template<typename T>
2027 Func &prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset = 1,
2028 PrefetchBoundStrategy strategy = PrefetchBoundStrategy::GuardWithIf) {
2029 return prefetch(image.parameter(), at, from, std::move(offset), strategy);
2030 }
2031 // @}
2032
2033 /** Specify how the storage for the function is laid out. These
2034 * calls let you specify the nesting order of the dimensions. For
2035 * example, foo.reorder_storage(y, x) tells Halide to use
2036 * column-major storage for any realizations of foo, without
2037 * changing how you refer to foo in the code. You may want to do
2038 * this if you intend to vectorize across y. When representing
2039 * color images, foo.reorder_storage(c, x, y) specifies packed
2040 * storage (red, green, and blue values adjacent in memory), and
2041 * foo.reorder_storage(x, y, c) specifies planar storage (entire
2042 * red, green, and blue images one after the other in memory).
2043 *
2044 * If you leave out some dimensions, those remain in the same
2045 * positions in the nesting order while the specified variables
2046 * are reordered around them. */
2047 // @{
2048 Func &reorder_storage(const std::vector<Var> &dims);
2049
2050 Func &reorder_storage(const Var &x, const Var &y);
2051 template<typename... Args>
2052 HALIDE_NO_USER_CODE_INLINE typename std::enable_if<Internal::all_are_convertible<Var, Args...>::value, Func &>::type
2053 reorder_storage(const Var &x, const Var &y, Args &&...args) {
2054 std::vector<Var> collected_args{x, y, std::forward<Args>(args)...};
2055 return reorder_storage(collected_args);
2056 }
2057 // @}
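For example (a sketch with hypothetical names):
\code
Func f;
Var x, y, c;
f(x, y, c) = x + y + c;
// Interleaved (packed) storage: the c values for a pixel are adjacent in memory.
f.reorder_storage(c, x, y);
\endcode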
2058
2059 /** Pad the storage extent of a particular dimension of
2060 * realizations of this function up to be a multiple of the
2061 * specified alignment. This guarantees that the strides for the
2062 * dimensions stored outside of dim will be multiples of the
2063 * specified alignment, where the strides and alignment are
2064 * measured in numbers of elements.
2065 *
2066 * For example, to guarantee that a function foo(x, y, c)
2067 * representing an image has scanlines starting on offsets
2068 * aligned to multiples of 16, use foo.align_storage(x, 16). */
2069 Func &align_storage(const Var &dim, const Expr &alignment);
2070
2071 /** Store realizations of this function in a circular buffer of a
2072 * given extent. This is more efficient when the extent of the
2073 * circular buffer is a power of 2. If the fold factor is too
2074 * small, or the dimension is not accessed monotonically, the
2075 * pipeline will generate an error at runtime.
2076 *
2077 * The fold_forward option indicates that the new values of the
2078 * producer are accessed by the consumer in a monotonically
2079 * increasing order. Folding storage of producers is also
2080 * supported if the new values are accessed in a monotonically
2081 * decreasing order by setting fold_forward to false.
2082 *
2083 * For example, consider the pipeline:
2084 \code
2085 Func f, g;
2086 Var x, y;
2087 g(x, y) = x*y;
2088 f(x, y) = g(x, y) + g(x, y+1);
2089 \endcode
2090 *
2091 * If we schedule f like so:
2092 *
2093 \code
2094 g.compute_at(f, y).store_root().fold_storage(y, 2);
2095 \endcode
2096 *
2097 * Then g will be computed at each row of f and stored in a buffer
2098 * with an extent in y of 2, alternately storing each computed row
2099 * of g in row y=0 or y=1.
2100 */
2101 Func &fold_storage(const Var &dim, const Expr &extent, bool fold_forward = true);
2102
2103 /** Compute this function as needed for each unique value of the
2104 * given var for the given calling function f.
2105 *
2106 * For example, consider the simple pipeline:
2107 \code
2108 Func f, g;
2109 Var x, y;
2110 g(x, y) = x*y;
2111 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2112 \endcode
2113 *
2114 * If we schedule f like so:
2115 *
2116 \code
2117 g.compute_at(f, x);
2118 \endcode
2119 *
2120 * Then the C code equivalent to this pipeline will look like this
2121 *
2122 \code
2123
2124 int f[height][width];
2125 for (int y = 0; y < height; y++) {
2126 for (int x = 0; x < width; x++) {
2127 int g[2][2];
2128 g[0][0] = x*y;
2129 g[0][1] = (x+1)*y;
2130 g[1][0] = x*(y+1);
2131 g[1][1] = (x+1)*(y+1);
2132 f[y][x] = g[0][0] + g[1][0] + g[0][1] + g[1][1];
2133 }
2134 }
2135
2136 \endcode
2137 *
2138 * The allocation and computation of g is within f's loop over x,
2139 * and enough of g is computed to satisfy all that f will need for
2140 * that iteration. This has excellent locality - values of g are
2141 * used as soon as they are computed, but it does redundant
2142 * work. Each value of g ends up getting computed four times. If
2143 * we instead schedule f like so:
2144 *
2145 \code
2146 g.compute_at(f, y);
2147 \endcode
2148 *
2149 * The equivalent C code is:
2150 *
2151 \code
2152 int f[height][width];
2153 for (int y = 0; y < height; y++) {
2154 int g[2][width+1];
2155 for (int x = 0; x < width; x++) {
2156 g[0][x] = x*y;
2157 g[1][x] = x*(y+1);
2158 }
2159 for (int x = 0; x < width; x++) {
2160 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2161 }
2162 }
2163 \endcode
2164 *
2165 * The allocation and computation of g is within f's loop over y,
2166 * and enough of g is computed to satisfy all that f will need for
2167 * that iteration. This does less redundant work (each point in g
2168 * ends up being evaluated twice), but the locality is not quite
2169 * as good, and we have to allocate more temporary memory to store
2170 * g.
2171 */
2172 Func &compute_at(const Func &f, const Var &var);
2173
2174 /** Schedule a function to be computed within the iteration over
2175 * some dimension of an update domain. Produces equivalent code
2176 * to the version of compute_at that takes a Var. */
2177 Func &compute_at(const Func &f, const RVar &var);
2178
2179 /** Schedule a function to be computed within the iteration over
2180 * a given LoopLevel. */
2181 Func &compute_at(LoopLevel loop_level);
2182
2183 /** Schedule the iteration over the initial definition of this function
2184 * to be fused with another stage 's' from outermost loop to a
2185 * given LoopLevel. */
2186 // @{
2187 Func &compute_with(const Stage &s, const VarOrRVar &var, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2188 Func &compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2189 Func &compute_with(LoopLevel loop_level, const std::vector<std::pair<VarOrRVar, LoopAlignStrategy>> &align);
2190 Func &compute_with(LoopLevel loop_level, LoopAlignStrategy align = LoopAlignStrategy::Auto);
2191 // @}
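A minimal sketch of compute_with, assuming two root-level Funcs whose y loops can legally be fused (hypothetical names):
\code
Func f, g;
Var x, y;
f(x, y) = x + y;
g(x, y) = x * y;
f.compute_root();
g.compute_root();
// Fuse the loop over y of g's pure definition with f's loop over y.
g.compute_with(f, y);
\endcode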
2192 /** Compute all of this function once ahead of time. Reusing
2193 * the example in \ref Func::compute_at :
2194 *
2195 \code
2196 Func f, g;
2197 Var x, y;
2198 g(x, y) = x*y;
2199 f(x, y) = g(x, y) + g(x, y+1) + g(x+1, y) + g(x+1, y+1);
2200
2201 g.compute_root();
2202 \endcode
2203 *
2204 * is equivalent to
2205 *
2206 \code
2207 int f[height][width];
2208 int g[height+1][width+1];
2209 for (int y = 0; y < height+1; y++) {
2210 for (int x = 0; x < width+1; x++) {
2211 g[y][x] = x*y;
2212 }
2213 }
2214 for (int y = 0; y < height; y++) {
2215 for (int x = 0; x < width; x++) {
2216 f[y][x] = g[y][x] + g[y+1][x] + g[y][x+1] + g[y+1][x+1];
2217 }
2218 }
2219 \endcode
2220 *
2221 * g is computed once ahead of time, and enough is computed to
2222 * satisfy all uses of it. This does no redundant work (each point
2223 * in g is evaluated once), but has poor locality (values of g are
2224 * probably not still in cache when they are used by f), and
2225 * allocates lots of temporary memory to store g.
2226 */
2227 Func &compute_root();
2228
2229 /** Use the halide_memoization_cache_... interface to store a
2230 * computed version of this function across invocations of the
2231 * Func.
2232 *
2233 * If an eviction_key is provided, it must be constructed with
2234 * Expr of integer or handle type. The key Expr will be promoted
2235 * to a uint64_t and can be used with halide_memoization_cache_evict
2236 * to remove memoized entries using this eviction key from the
2237 * cache. Memoized computations that do not provide an eviction
2238 * key will never be evicted by this mechanism.
2239 */
2240 Func &memoize(const EvictionKey &eviction_key = EvictionKey());
2241
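For example (a sketch with hypothetical names):
\code
Func expensive, out;
Var x, y;
expensive(x, y) = sqrt(cast<float>(x * x + y * y));
out(x, y) = expensive(x, y) * 2.0f;
// Cache realizations of 'expensive' across invocations of the pipeline.
expensive.compute_root().memoize();
\endcode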
2242 /** Produce this Func asynchronously in a separate
2243 * thread. Consumers will be run by the task system when the
2244 * production is complete. If this Func's store level is different
2245 * to its compute level, consumers will be run concurrently,
2246 * blocking as necessary to prevent reading ahead of what the
2247 * producer has computed. If storage is folded, then the producer
2248 * will additionally not be permitted to run too far ahead of the
2249 * consumer, to avoid clobbering data that has not yet been
2250 * used.
2251 *
2252 * Take special care when combining this with custom thread pool
2253 * implementations, as avoiding deadlock with producer-consumer
2254 * parallelism requires a much more sophisticated parallel runtime
2255 * than with data parallelism alone. It is strongly recommended
2256 * you just use Halide's default thread pool, which guarantees no
2257 * deadlock and a bound on the number of threads launched.
2258 */
2259 Func &async();
2260
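For example (a sketch with hypothetical names):
\code
Func producer, consumer;
Var x, y;
producer(x, y) = x + y;
consumer(x, y) = producer(x, y) * 2;
// Compute rows of producer in a separate task, double-buffered so the
// consumer can overlap with production of the next row.
producer.store_root()
        .compute_at(consumer, y)
        .fold_storage(y, 2)
        .async();
\endcode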
2261 /** Bound the extent of a Func's storage, but not extent of its
2262 * compute. This can be useful for forcing a function's allocation
2263 * to be a fixed size, which often means it can go on the stack.
2264 * If bounds inference decides that it requires more storage for
2265 * this function than the allocation size you have stated, a runtime
2266 * error will occur when you try to run the pipeline. */
2267 Func &bound_storage(const Var &dim, const Expr &bound);
2268
2269 /** Allocate storage for this function within f's loop over
2270 * var. Scheduling storage is optional, and can be used to
2271 * separate the loop level at which storage occurs from the loop
2272 * level at which computation occurs to trade off between locality
2273 * and redundant work. This can open the door for two types of
2274 * optimization.
2275 *
2276 * Consider again the pipeline from \ref Func::compute_at :
2277 \code
2278 Func f, g;
2279 Var x, y;
2280 g(x, y) = x*y;
2281 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2282 \endcode
2283 *
2284 * If we schedule it like so:
2285 *
2286 \code
2287 g.compute_at(f, x).store_at(f, y);
2288 \endcode
2289 *
2290 * Then the computation of g takes place within the loop over x,
2291 * but the storage takes place within the loop over y:
2292 *
2293 \code
2294 int f[height][width];
2295 for (int y = 0; y < height; y++) {
2296 int g[2][width+1];
2297 for (int x = 0; x < width; x++) {
2298 g[0][x] = x*y;
2299 g[0][x+1] = (x+1)*y;
2300 g[1][x] = x*(y+1);
2301 g[1][x+1] = (x+1)*(y+1);
2302 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2303 }
2304 }
2305 \endcode
2306 *
2307 * Provided the for loop over x is serial, Halide then
2308 * automatically performs the following sliding window
2309 * optimization:
2310 *
2311 \code
2312 int f[height][width];
2313 for (int y = 0; y < height; y++) {
2314 int g[2][width+1];
2315 for (int x = 0; x < width; x++) {
2316 if (x == 0) {
2317 g[0][x] = x*y;
2318 g[1][x] = x*(y+1);
2319 }
2320 g[0][x+1] = (x+1)*y;
2321 g[1][x+1] = (x+1)*(y+1);
2322 f[y][x] = g[0][x] + g[1][x] + g[0][x+1] + g[1][x+1];
2323 }
2324 }
2325 \endcode
2326 *
2327 * Two of the assignments to g only need to be done when x is
2328 * zero. The rest of the time, those sites have already been
2329 * filled in by a previous iteration. This version has the
2330 * locality of compute_at(f, x), but allocates more memory and
2331 * does much less redundant work.
2332 *
2333 * Halide then further optimizes this pipeline like so:
2334 *
2335 \code
2336 int f[height][width];
2337 for (int y = 0; y < height; y++) {
2338 int g[2][2];
2339 for (int x = 0; x < width; x++) {
2340 if (x == 0) {
2341 g[0][0] = x*y;
2342 g[1][0] = x*(y+1);
2343 }
2344 g[0][(x+1)%2] = (x+1)*y;
2345 g[1][(x+1)%2] = (x+1)*(y+1);
2346 f[y][x] = g[0][x%2] + g[1][x%2] + g[0][(x+1)%2] + g[1][(x+1)%2];
2347 }
2348 }
2349 \endcode
2350 *
2351 * Halide has detected that it's possible to use a circular buffer
2352 * to represent g, and has reduced all accesses to g modulo 2 in
2353 * the x dimension. This optimization only triggers if the for
2354 * loop over x is serial, and if Halide can statically determine
2355 * some power of two large enough to cover the range needed. For
2356 * powers of two, the modulo operator compiles to more efficient
2357 * bit-masking. This optimization reduces memory usage, and also
2358 * improves locality by reusing recently-accessed memory instead
2359 * of pulling new memory into cache.
2360 *
2361 */
2362 Func &store_at(const Func &f, const Var &var);
2363
2364 /** Equivalent to the version of store_at that takes a Var, but
2365 * schedules storage within the loop over a dimension of a
2366 * reduction domain */
2367 Func &store_at(const Func &f, const RVar &var);
2368
2369 /** Equivalent to the version of store_at that takes a Var, but
2370 * schedules storage at a given LoopLevel. */
2371 Func &store_at(LoopLevel loop_level);
2372
2373 /** Equivalent to \ref Func::store_at, but schedules storage
2374 * outside the outermost loop. */
2375 Func &store_root();
2376
2377 /** Aggressively inline all uses of this function. This is the
2378 * default schedule, so you're unlikely to need to call this. For
2379 * a Func with an update definition, that means it gets computed
2380 * as close to the innermost loop as possible.
2381 *
2382 * Consider once more the pipeline from \ref Func::compute_at :
2383 *
2384 \code
2385 Func f, g;
2386 Var x, y;
2387 g(x, y) = x*y;
2388 f(x, y) = g(x, y) + g(x+1, y) + g(x, y+1) + g(x+1, y+1);
2389 \endcode
2390 *
2391 * Leaving g as inline, this compiles to code equivalent to the following C:
2392 *
2393 \code
2394 int f[height][width];
2395 for (int y = 0; y < height; y++) {
2396 for (int x = 0; x < width; x++) {
2397 f[y][x] = x*y + x*(y+1) + (x+1)*y + (x+1)*(y+1);
2398 }
2399 }
2400 \endcode
2401 */
2402 Func &compute_inline();
2403
2404 /** Get a handle on an update step for the purposes of scheduling
2405 * it. */
2406 Stage update(int idx = 0);
2407
2408 /** Set the type of memory this Func should be stored in. Controls
2409 * whether allocations go on the stack or the heap on the CPU, and
2410 * in global vs shared vs local on the GPU. See the documentation
2411 * on MemoryType for more detail. */
2412 Func &store_in(MemoryType memory_type);
2413
2414 /** Trace all loads from this Func by emitting calls to
2415 * halide_trace. If the Func is inlined, this has no
2416 * effect. */
2417 Func &trace_loads();
2418
2419 /** Trace all stores to the buffer backing this Func by emitting
2420 * calls to halide_trace. If the Func is inlined, this call
2421 * has no effect. */
2422 Func &trace_stores();
2423
2424 /** Trace all realizations of this Func by emitting calls to
2425 * halide_trace. */
2426 Func &trace_realizations();
2427
2428 /** Add a string of arbitrary text that will be passed thru to trace
2429 * inspection code if the Func is realized in trace mode. (Funcs that are
2430 * inlined won't have their tags emitted.) Ignored entirely if
2431 * tracing is not enabled for the Func (or globally).
2432 */
2433 Func &add_trace_tag(const std::string &trace_tag);
2434
2435 /** Get a handle on the internal halide function that this Func
2436 * represents. Useful if you want to do introspection on Halide
2437 * functions */
2438 Internal::Function function() const {
2439 return func;
2440 }
2441
2442 /** You can cast a Func to its pure stage for the purposes of
2443 * scheduling it. */
2444 operator Stage() const;
2445
2446 /** Get a handle on the output buffer for this Func. Only relevant
2447 * if this is the output Func in a pipeline. Useful for making
2448 * static promises about strides, mins, and extents. */
2449 // @{
2450 OutputImageParam output_buffer() const;
2451 std::vector<OutputImageParam> output_buffers() const;
2452 // @}
2453
2454 /** Use a Func as an argument to an external stage. */
2455 operator ExternFuncArgument() const;
2456
2457 /** Infer the arguments to the Func, sorted into a canonical order:
2458 * all buffers (sorted alphabetically by name), followed by all non-buffers
2459 * (sorted alphabetically by name).
2460 This lets you write things like:
2461 \code
2462 func.compile_to_assembly("/dev/stdout", func.infer_arguments());
2463 \endcode
2464 */
2465 std::vector<Argument> infer_arguments() const;
2466
2467 /** Get the source location of the pure definition of this
2468 * Func. See Stage::source_location() */
2469 std::string source_location() const;
2470
2471 /** Return the current StageSchedule associated with this initial
2472 * Stage of this Func. For introspection only: to modify schedule,
2473 * use the Func interface. */
2474 const Internal::StageSchedule &get_schedule() const {
2475 return Stage(*this).get_schedule();
2476 }
2477};
2478
2479namespace Internal {
2480
2481template<typename Last>
2482inline void check_types(const Tuple &t, int idx) {
2483 using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2484 user_assert(t[idx].type() == type_of<T>())
2485 << "Can't evaluate expression "
2486 << t[idx] << " of type " << t[idx].type()
2487 << " as a scalar of type " << type_of<T>() << "\n";
2488}
2489
2490template<typename First, typename Second, typename... Rest>
2491inline void check_types(const Tuple &t, int idx) {
2492 check_types<First>(t, idx);
2493 check_types<Second, Rest...>(t, idx + 1);
2494}
2495
2496template<typename Last>
2497inline void assign_results(Realization &r, int idx, Last last) {
2498 using T = typename std::remove_pointer<typename std::remove_reference<Last>::type>::type;
2499 *last = Buffer<T>(r[idx])();
2500}
2501
2502template<typename First, typename Second, typename... Rest>
2503inline void assign_results(Realization &r, int idx, First first, Second second, Rest &&...rest) {
2504 assign_results<First>(r, idx, first);
2505 assign_results<Second, Rest...>(r, idx + 1, second, rest...);
2506}
2507
2508} // namespace Internal
2509
2510/** JIT-Compile and run enough code to evaluate a Halide
2511 * expression. This can be thought of as a scalar version of
2512 * \ref Func::realize */
2513template<typename T>
2514 HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e) {
2515 user_assert(e.type() == type_of<T>())
2516 << "Can't evaluate expression "
2517 << e << " of type " << e.type()
2518 << " as a scalar of type " << type_of<T>() << "\n";
2519 Func f;
2520 f() = e;
2521 Buffer<T, 0> im = f.realize(ctx);
2522 return im();
2523}
2524
2525/** evaluate with a default user context */
2526template<typename T>
2527 HALIDE_NO_USER_CODE_INLINE T evaluate(const Expr &e) {
2528 return evaluate<T>(nullptr, e);
2529}
2530
2531/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2532template<typename First, typename... Rest>
2533 HALIDE_NO_USER_CODE_INLINE void evaluate(JITUserContext *ctx, Tuple t, First first, Rest &&...rest) {
2534 Internal::check_types<First, Rest...>(t, 0);
2535
2536 Func f;
2537 f() = t;
2538 Realization r = f.realize(ctx);
2539 Internal::assign_results(r, 0, first, rest...);
2540}
2541
2542/** JIT-compile and run enough code to evaluate a Halide Tuple. */
2543template<typename First, typename... Rest>
2544 HALIDE_NO_USER_CODE_INLINE void evaluate(Tuple t, First first, Rest &&...rest) {
2545 evaluate<First, Rest...>(nullptr, std::move(t), std::forward<First>(first), std::forward<Rest>(rest)...);
2546}
2547
2548namespace Internal {
2549
2550inline void schedule_scalar(Func f) {
2551 Target t = get_jit_target_from_environment();
2552 if (t.has_gpu_feature()) {
2553 f.gpu_single_thread();
2554 }
2555 if (t.has_feature(Target::HVX)) {
2556 f.hexagon();
2557 }
2558}
2559
2560} // namespace Internal
2561
2562/** JIT-Compile and run enough code to evaluate a Halide
2563 * expression. This can be thought of as a scalar version of
2564 * \ref Func::realize. Can use GPU if jit target from environment
2565 * specifies one.
2566 */
2567template<typename T>
2568 HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e) {
2569 user_assert(e.type() == type_of<T>())
2570 << "Can't evaluate expression "
2571 << e << " of type " << e.type()
2572 << " as a scalar of type " << type_of<T>() << "\n";
2573 Func f;
2574 f() = e;
2575 Internal::schedule_scalar(f);
2576 Buffer<T, 0> im = f.realize();
2577 return im();
2578}
2579
2580/** JIT-compile and run enough code to evaluate a Halide Tuple. Can
2581 * use GPU if jit target from environment specifies one. */
2582// @{
2583template<typename First, typename... Rest>
2584 HALIDE_NO_USER_CODE_INLINE void evaluate_may_gpu(Tuple t, First first, Rest &&...rest) {
2585 Internal::check_types<First, Rest...>(t, 0);
2586
2587 Func f;
2588 f() = t;
2589 Internal::schedule_scalar(f);
2590 Realization r = f.realize();
2591 Internal::assign_results(r, 0, first, rest...);
2592}
2593// @}
2594
2595} // namespace Halide
2596
2597#endif
Defines a type used for expressing the type signature of a generated halide pipeline.
#define internal_assert(c)
Definition Errors.h:19
Base classes for Halide expressions (Halide::Expr) and statements (Halide::Internal::Stmt)
Defines the struct representing lifetime and dependencies of a JIT compiled halide pipeline.
Defines Module, an IR container that fully describes a Halide program.
Classes for declaring scalar parameters to halide pipelines.
Defines the front-end class representing an entire Halide imaging pipeline.
Defines the front-end syntax for reduction domains and reduction variables.
Defines the structure that describes a Halide target.
Defines Tuple - the front-end handle on small arrays of expressions.
#define HALIDE_NO_USER_CODE_INLINE
Definition Util.h:45
Defines the Var - the front-end variable.
A Halide::Buffer is a named shared reference to a Halide::Runtime::Buffer.
Definition Buffer.h:122
Helper class for identifying purpose of an Expr passed to memoize.
Definition Func.h:672
EvictionKey(const Expr &expr=Expr())
Definition Func.h:678
A halide function.
Definition Func.h:687
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices and thread indices.
Func & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU block indices.
Func & bound_extent(const Var &var, Expr extent)
Bound the extent of a Func's realization, but not its min.
void print_loop_nest()
Write out the loop nests specified by the schedule for this Function.
Func & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then unroll the inner dimension.
bool is_extern() const
Is this function an external stage? That is, was it defined using define_extern?
FuncRef operator()(std::vector< Expr >) const
Either calls to the function, or the left-hand-side of an update definition (see RDom).
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func(const std::string &name)
Declare a new undefined function with the given name.
void compile_to_multitarget_object_files(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets, const std::vector< std::string > &suffixes)
Like compile_to_multitarget_static_library(), except that the object files are all output as object f...
Func & memoize(const EvictionKey &eviction_key=EvictionKey())
Use the halide_memoization_cache_... interface to store a computed version of this function across in...
Func & trace_stores()
Trace all stores to the buffer backing this Func by emitting calls to halide_trace.
Func & trace_loads()
Trace all loads from this Func by emitting calls to halide_trace.
void specialize_fail(const std::string &message)
Add a specialization to a Func that always terminates execution with a call to halide_error().
Func & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given factor, then vectorize the inner dimension.
Func & compute_at(const Func &f, const RVar &var)
Schedule a function to be computed within the iteration over some dimension of an update domain.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
The generalized tile, with a single tail strategy to apply to all vars.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to text assembly equivalent to the object file generated by compile_...
Internal::Function function() const
Get a handle on the internal halide function that this Func represents.
Definition Func.h:2438
bool has_update_definition() const
Does this function have at least one update definition?
void compile_jit(const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code.
Func & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func()
Declare a new undefined function with an automatically-generated unique name.
Func & store_in(MemoryType memory_type)
Set the type of memory this Func should be stored in.
const Type & type() const
Get the type(s) of the outputs of this Func.
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
void realize(Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function into an existing allocated buffer or buffers.
Func & async()
Produce this Func asynchronously in a separate thread.
Func & reorder(const std::vector< VarOrRVar > &vars)
Reorder variables to have the given nesting order, from innermost out.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Expr, Args... >::value, FuncRef >::type operator()(const Expr &x, Args &&...args) const
Definition Func.h:1273
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & set_estimate(const Var &var, const Expr &min, const Expr &extent)
Statically declare the range over which the function will be evaluated in the general case.
Func & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
The given dimension corresponds to the lanes in a GPU warp.
void compile_to_lowered_stmt(const std::string &filename, const std::vector< Argument > &args, StmtOutputFormat fmt=Text, const Target &target=get_target_from_environment())
Write out an internal representation of lowered code.
void compile_to_c(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Statically compile this function to C source code.
Stage update(int idx=0)
Get a handle on an update step for the purposes of scheduling it.
Func & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func(const Type &required_type, int required_dims, const std::string &name)
Declare a new undefined function with the given name.
bool defined() const
Does this function have at least a pure definition.
Func(const std::vector< Type > &required_types, int required_dims, const std::string &name)
Declare a new undefined function with the given name.
Func & align_storage(const Var &dim, const Expr &alignment)
Pad the storage extent of a particular dimension of realizations of this function up to be a multiple...
Func copy_to_host()
Declare that this function should be implemented by a call to halide_buffer_copy with a NULL target d...
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & serial(const VarOrRVar &var)
Mark a dimension to be traversed serially.
void compile_to_header(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name="", const Target &target=get_target_from_environment())
Emit a header file with the given filename for this function.
Func & align_bounds(const Var &var, Expr modulus, Expr remainder=0)
Expand the region computed so that the min coordinates is congruent to 'remainder' modulo 'modulus',...
Func & reorder_storage(const Var &x, const Var &y)
Func & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Split a dimension into inner and outer subdimensions with the given names, where the inner dimension ...
Func(const Expr &e)
Declare a new function with an automatically-generated unique name, and define it to return the given...
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Split two dimensions at once by the given factors, and then reorder the resulting dimensions to be xi...
int dimensions() const
The dimensionality (number of arguments) of this function.
void realize(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string source_location() const
Get the source location of the pure definition of this Func.
const std::vector< Type > & types() const
void infer_input_bounds(const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
For a given size of output, or a given output buffer, determine the bounds required of all unbound Im...
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
A more general form of tile, which defines tiles of any dimensionality.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
const std::vector< Expr > & update_args(int idx=0) const
Get the left-hand-side of the update definition.
int outputs() const
Get the number of outputs of this Func.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, Func & >::type reorder_storage(const Var &x, const Var &y, Args &&...args)
Definition Func.h:2053
Func & compute_root()
Compute all of this function once ahead of time.
Func & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Func & trace_realizations()
Trace all realizations of this Func by emitting calls to halide_trace.
JITHandlers & jit_handlers()
Get a struct containing the currently set custom functions used by JIT.
std::vector< Var > args() const
Get the pure arguments.
Tuple update_values(int idx=0) const
Get the right-hand-side of an update definition for functions that returns multiple values.
Func & allow_race_conditions()
Specify that race conditions are permitted for this Func, which enables parallelizing over RVars even...
void compile_to_bitcode(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm bitcode, with the given filename (which should probably end ...
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Func & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition Func.h:1577
int num_update_definitions() const
How many update definitions does this function have?
Func & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
A shorter form of tile, which reuses the old variable names as the new outer dimensions.
Stage specialize(const Expr &condition)
Specialize a Func.
Callable compile_to_callable(const std::vector< Argument > &args, const Target &target=get_jit_target_from_environment())
Eagerly jit compile the function to machine code and return a callable struct that behaves like a fun...
Func & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Realization realize(std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Evaluate this function over some rectangular domain and return the resulting buffer or buffers.
Func & store_at(LoopLevel loop_level)
Equivalent to the version of store_at that takes a Var, but schedules storage at a given LoopLevel.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE Func(Buffer< T, Dims > &im)
Construct a new Func to wrap a Buffer.
Definition Func.h:746
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Func & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over the initial definition of this function to be fused with another stage 's...
Expr value() const
The right-hand-side value of the pure definition of this function.
Func & align_extent(const Var &var, Expr modulus)
Expand the region computed so that the extent is a multiple of 'modulus'.
Func clone_in(const std::vector< Func > &fs)
Module compile_to_module(const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Store an internal representation of lowered code as a self contained Module suitable for further comp...
Func & atomic(bool override_associativity_test=false)
Issue atomic updates for this Func.
void infer_input_bounds(JITUserContext *context, const std::vector< int32_t > &sizes, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Versions of infer_input_bounds that take a custom user context to pass to runtime functions.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition Func.h:1189
Func & unroll(const VarOrRVar &var)
Mark a dimension to be completely unrolled.
Func & set_estimates(const Region &estimates)
Set (min, extent) estimates for all dimensions in the Func at once; this is equivalent to calling set...
Func in()
Create and return a global identity wrapper, which wraps all calls to this Func by any other Func.
OutputImageParam output_buffer() const
Get a handle on the output buffer for this Func.
Expr update_value(int idx=0) const
Get the right-hand-side of an update definition.
void compile_to(const std::map< OutputFileType, std::string > &output_files, const std::vector< Argument > &args, const std::string &fn_name, const Target &target=get_target_from_environment())
Compile and generate multiple target files with single call.
std::vector< Argument > infer_arguments() const
Infer the arguments to the Func, sorted into a canonical order: all buffers (sorted alphabetically by...
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & store_at(const Func &f, const Var &var)
Allocate storage for this function within f's loop over var.
void add_custom_lowering_pass(T *pass)
Add a custom pass to be used during lowering.
Definition Func.h:1076
Func in(const std::vector< Func > &fs)
Create and return an identity wrapper shared by all the Funcs in 'fs'.
Func & fold_storage(const Var &dim, const Expr &extent, bool fold_forward=true)
Store realizations of this function in a circular buffer of a given extent.
void infer_input_bounds(JITUserContext *context, Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
Func & compute_at(LoopLevel loop_level)
Schedule a function to be computed within the iteration over a given LoopLevel.
Func & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide that the following dimensions correspond to GPU thread indices.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, int dimensionality, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Add an extern definition for this Func.
Definition Func.h:1171
void compile_to_file(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to object file and header pair, with the given arguments.
void add_custom_lowering_pass(Internal::IRMutator *pass, std::function< void()> deleter)
Add a custom pass to be used during lowering, with the function that will be called to delete it also...
Func & add_trace_tag(const std::string &trace_tag)
Add a string of arbitrary text that will be passed thru to trace inspection code if the Func is reali...
Func & store_at(const Func &f, const RVar &var)
Equivalent to the version of store_at that takes a Var, but schedules storage within the loop over a ...
void clear_custom_lowering_passes()
Remove all previously-set custom lowering passes.
void compile_to_llvm_assembly(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to llvm assembly, with the given filename (which should probably end...
const std::string & name() const
The name of this function, either given during construction, or automatically generated.
void compile_to_multitarget_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::vector< Target > &targets)
Compile to static-library file and header pair once for each target; each resulting function will be ...
Func & hexagon(const VarOrRVar &x=Var::outermost())
Schedule for execution on Hexagon.
Func & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Generalized tiling, reusing the previous names as the outer names.
Func & store_root()
Equivalent to Func::store_at, but schedules storage outside the outermost loop.
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Prefetch data written to or read from a Func or an ImageParam by a subsequent loop iteration,...
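A sketch of prefetch() using the (at, from, offset) form listed above; names are illustrative. Prefetches for the input region needed two y-iterations ahead are issued at the loop over y.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam in(Float(32), 2);
        Var x("x"), y("y");
        Func f("f");
        f(x, y) = in(x, y) * 2.0f;
        // Fetch the region of 'in' needed two y-iterations ahead; out-of-range
        // prefetches are skipped via an if-guard (the default strategy).
        f.prefetch(in, y, y, 2, PrefetchBoundStrategy::GuardWithIf);

        Buffer<float> data(32, 32);
        data.fill(0.0f);
        in.set(data);
        f.realize({32, 32});
        return 0;
    }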
Realization realize(JITUserContext *context, std::vector< int32_t > sizes={}, const Target &target=Target(), const ParamMap &param_map=ParamMap::empty_map())
Same as above, but takes a custom user-provided context to be passed to runtime functions.
void compile_to_assembly(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
std::vector< RVar > rvars(int idx=0) const
Get the RVars of the reduction domain for an update definition, if there is one.
Func clone_in(const Func &f)
Similar to Func::in; however, instead of replacing the call to this Func with an identity Func that r...
const std::vector< CustomLoweringPass > & custom_lowering_passes()
Get the custom lowering passes.
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< Var, Args... >::value, FuncRef >::type operator()(Args &&...args) const
Definition Func.h:1256
Func & compute_inline()
Aggressively inline all uses of this function.
Func(Internal::Function f)
Construct a new Func to wrap an existing, already-defined Function object.
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const std::string &fn_name, const Target &target=get_target_from_environment())
Statically compile this function to an object file, with the given filename (which should probably en...
Func & bound_storage(const Var &dim, const Expr &bound)
Bound the extent of a Func's storage, but not extent of its compute.
Func & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Rename a dimension.
Tuple values() const
The values returned by this function.
const std::string & extern_function_name() const
Get the name of the extern function called for an extern definition.
Func copy_to_device(DeviceAPI d=DeviceAPI::Default_GPU)
Declare that this function should be implemented by a call to halide_buffer_copy with the given targe...
Func & parallel(const VarOrRVar &var)
Mark a dimension to be traversed in parallel.
Func & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void compile_to_object(const std::string &filename, const std::vector< Argument > &, const Target &target=get_target_from_environment())
Func & reorder_storage(const std::vector< Var > &dims)
Specify how the storage for the function is laid out.
Func & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, Type t, const std::vector< Var > &arguments, NameMangling mangling=NameMangling::Default, DeviceAPI device_api=DeviceAPI::Host)
Definition Func.h:1199
Func & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Func & vectorize(const VarOrRVar &var)
Mark a dimension to be computed all-at-once as a single vector.
void debug_to_file(const std::string &filename)
When this function is compiled, include code that dumps its values to a file after it is realized,...
Func & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Split a dimension by the given task_size, and then parallelize the outer dimension.
Func & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Join two dimensions into a single fused dimension.
Func in(const Func &f)
Creates and returns a new identity Func that wraps this Func.
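A sketch of a per-consumer wrapper via in(); names are illustrative. Only g reads through the wrapper, which is staged per row of g, while h keeps calling f directly.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Var x("x"), y("y");
        Func f("f"), g("g"), h("h");
        f(x, y) = x + y;
        g(x, y) = f(x, y) * 2;
        h(x, y) = f(x, y) + 1;
        f.compute_root();
        // The wrapper returned by f.in(g) is an identity Func that only g calls.
        f.in(g).compute_at(g, y);
        g.compute_root();
        h.compute_root();
        Pipeline({g, h}).realize({16, 16});
        return 0;
    }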
Func & bound(const Var &var, Expr min, Expr extent)
Statically declare that the range over which a function should be evaluated is given by the second an...
std::vector< OutputImageParam > output_buffers() const
Func & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition Func.h:2027
void compile_to_static_library(const std::string &filename_prefix, const std::vector< Argument > &args, const std::string &fn_name="", const Target &target=get_target_from_environment())
Compile to static-library file and header pair, with the given arguments.
Func & compute_at(const Func &f, const Var &var)
Compute this function as needed for each unique value of the given var for the given calling function...
void infer_input_bounds(Pipeline::RealizationArg outputs, const Target &target=get_jit_target_from_environment(), const ParamMap &param_map=ParamMap::empty_map())
FuncRef operator()(std::vector< Var >) const
Construct either the left-hand-side of a definition, or a call to a function that happens to only co...
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this initial Stage of this Func.
Definition Func.h:2474
Func & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Tell Halide to run this stage using a single gpu thread and block.
void define_extern(const std::string &function_name, const std::vector< ExternFuncArgument > &params, const std::vector< Type > &types, int dimensionality, NameMangling mangling)
Definition Func.h:1181
Func & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Short-hand for tiling a domain and mapping the tile indices to GPU block indices and the coordinates ...
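A sketch of gpu_tile() guarded by a target check (variable names are illustrative): 16x16 tiles are mapped onto GPU blocks and threads when the JIT target has a GPU runtime, with a plain CPU schedule otherwise.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Var x("x"), y("y"), bx("bx"), by("by"), tx("tx"), ty("ty");
        Func f("f");
        f(x, y) = x + y;
        Target t = get_jit_target_from_environment();
        if (t.has_gpu_feature()) {
            f.gpu_tile(x, y, bx, by, tx, ty, 16, 16);
        } else {
            f.vectorize(x, 8).parallel(y);
        }
        f.realize({256, 256}, t);
        return 0;
    }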
A fragment of front-end syntax of the form f(x, y, z), where x, y, z are Vars or Exprs.
Definition Func.h:478
Stage operator*=(const FuncRef &)
FuncTupleElementRef operator[](int) const
When a FuncRef refers to a function that provides multiple outputs, you can access each output as an ...
Stage operator-=(const FuncRef &)
size_t size() const
How many outputs does the function this refers to produce?
Internal::Function function() const
What function is this calling?
Definition Func.h:575
Stage operator+=(Expr)
Define a stage that adds the given expression to this Func.
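A sketch of an update definition written with operator+=: a small histogram (data and names are illustrative). The pure definition initializes the bins; the += line adds an update stage driven by an RDom.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Buffer<uint8_t> samples(100);
        for (int i = 0; i < samples.width(); i++) {
            samples(i) = (uint8_t)(i % 16);
        }
        Var b("b");
        Func hist("hist");
        hist(b) = 0;                                     // pure definition
        RDom r(0, samples.width());
        hist(clamp(cast<int>(samples(r)), 0, 15)) += 1;  // update definition
        Buffer<int> result = hist.realize({16});
        return 0;
    }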
Stage operator-=(Expr)
Define a stage that adds the negative of the given expression to this Func.
Stage operator*=(Expr)
Define a stage that multiplies this Func by the given expression.
Stage operator-=(const Tuple &)
Stage operator/=(Expr)
Define a stage that divides this Func by the given expression.
Stage operator+=(const FuncRef &)
Stage operator=(const Expr &)
Use this as the left-hand-side of a definition or an update definition (see RDom).
Stage operator=(const FuncRef &)
FuncRef(Internal::Function, const std::vector< Var > &, int placeholder_pos=-1, int count=0)
Stage operator+=(const Tuple &)
FuncRef(const Internal::Function &, const std::vector< Expr > &, int placeholder_pos=-1, int count=0)
Stage operator/=(const FuncRef &)
Stage operator*=(const Tuple &)
Stage operator/=(const Tuple &)
Stage operator=(const Tuple &)
Use this as the left-hand-side of a definition or an update definition for a Func with multiple outpu...
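A sketch of a multi-output Func defined with a Tuple (names are illustrative); each output of the resulting Realization is pulled out as its own typed Buffer.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Var x("x");
        Func multi("multi");
        multi(x) = Tuple(x * 2, cast<float>(x) * 0.5f);  // two values per site
        Realization r = multi.realize({8});
        Buffer<int> ints = r[0];
        Buffer<float> floats = r[1];
        return 0;
    }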
A fragment of front-end syntax of the form f(x, y, z)[index], where x, y, z are Vars or Exprs.
Definition Func.h:597
int index() const
Return the index into the function outputs.
Definition Func.h:661
Stage operator+=(const Expr &e)
Define a stage that adds the given expression to Tuple component 'idx' of this Func.
Stage operator*=(const Expr &e)
Define a stage that multiplies Tuple component 'idx' of this Func by the given expression.
Stage operator/=(const Expr &e)
Define a stage that divides Tuple component 'idx' of this Func by the given expression.
Stage operator=(const Expr &e)
Use this as the left-hand-side of an update definition of Tuple component 'idx' of a Func (see RDom).
Stage operator=(const FuncRef &e)
Internal::Function function() const
What function is this calling?
Definition Func.h:656
Stage operator-=(const Expr &e)
Define a stage that adds the negative of the given expression to Tuple component 'idx' of this Func.
FuncTupleElementRef(const FuncRef &ref, const std::vector< Expr > &args, int idx)
An Image parameter to a halide pipeline.
Definition ImageParam.h:23
A Function definition which can represent either an init or an update definition.
Definition Definition.h:38
const std::vector< Expr > & args() const
Get the default (no-specialization) arguments (left-hand-side) of the definition.
const StageSchedule & schedule() const
Get the default (no-specialization) stage-specific schedule associated with this definition.
bool defined() const
Definition objects are nullable.
const std::vector< StorageDim > & storage_dims() const
The list and order of dimensions used to store this function.
A reference-counted handle to Halide's internal representation of a function.
Definition Function.h:39
FuncSchedule & schedule()
Get a handle to the function-specific schedule for the purpose of modifying it.
const std::vector< std::string > & args() const
Get the pure arguments.
A base class for passes over the IR which modify it (e.g.
Definition IRMutator.h:26
A reference-counted handle to a parameter to a halide pipeline.
Definition Parameter.h:28
A schedule for a single stage of a Halide pipeline.
Definition Schedule.h:646
A reference to a site in a Halide statement at the top of the body of a particular for loop.
Definition Schedule.h:176
A halide module.
Definition Module.h:138
A handle on the output buffer of a pipeline.
static const ParamMap & empty_map()
A const ref to an empty ParamMap.
Definition ParamMap.h:110
A class representing a Halide pipeline.
Definition Pipeline.h:108
A multi-dimensional domain over which to iterate.
Definition RDom.h:193
A reduction variable represents a single dimension of a reduction domain (RDom).
Definition RDom.h:29
const std::string & name() const
The name of this reduction variable.
A Realization is a vector of references to existing Buffer objects.
Definition Realization.h:19
A single definition of a Func.
Definition Func.h:70
Stage & prefetch(const Func &f, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
std::string name() const
Return the name of this stage, e.g.
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
HALIDE_NO_USER_CODE_INLINE std::enable_if< Internal::all_are_convertible< VarOrRVar, Args... >::value, Stage & >::type reorder(const VarOrRVar &x, const VarOrRVar &y, Args &&...args)
Definition Func.h:378
Stage & compute_with(const Stage &s, const VarOrRVar &var, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Func rfactor(const RVar &r, const Var &v)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & vectorize(const VarOrRVar &var)
Stage & gpu_lanes(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var)
Stage & compute_with(LoopLevel loop_level, const std::vector< std::pair< VarOrRVar, LoopAlignStrategy > > &align)
Schedule the iteration over this stage to be fused with another stage 's' from outermost loop to a gi...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &bx, const VarOrRVar &tx, const Expr &x_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Func rfactor(std::vector< std::pair< RVar, Var > > preserved)
Calling rfactor() on an associative update definition of a Func will split the update into an intermedia...
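A sketch of the rfactor() pattern described above (names are illustrative): a whole-image reduction is factored over its row variable so per-row partial sums can run in parallel before being merged.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        Var i("i"), j("j"), y("y");
        Func input("input"), total("total");
        input(i, j) = i + j;
        input.compute_root();

        RDom r(0, 100, 0, 100);
        total() = 0;
        total() += input(r.x, r.y);

        // Pull r.y out into the pure dimension y of an intermediate Func,
        // then parallelize its update over y.
        Func partial = total.update().rfactor(r.y, y);
        partial.compute_root().update().parallel(y);

        Buffer<int> result = total.realize();
        return 0;
    }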
Stage & allow_race_conditions()
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & parallel(const VarOrRVar &var, const Expr &task_size, TailStrategy tail=TailStrategy::Auto)
Stage & rename(const VarOrRVar &old_name, const VarOrRVar &new_name)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_single_thread(DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & unroll(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & prefetch(const Internal::Parameter &param, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Stage specialize(const Expr &condition)
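A sketch of specialize(): the schedule accumulated so far is copied into the specialization, which then gets an extra vectorize for the case where the (hypothetical) input width is a multiple of 8.

    #include "Halide.h"
    using namespace Halide;

    int main() {
        ImageParam in(Float(32), 2);
        Var x("x"), y("y");
        Func f("f");
        f(x, y) = in(x, y) * 2.0f;
        f.parallel(y);
        // Two code paths: vectorized when in.width() % 8 == 0, scalar otherwise.
        f.specialize(in.width() % 8 == 0).vectorize(x, 8);
        return 0;
    }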
Stage & prefetch(const T &image, const VarOrRVar &at, const VarOrRVar &from, Expr offset=1, PrefetchBoundStrategy strategy=PrefetchBoundStrategy::GuardWithIf)
Definition Func.h:449
Stage & gpu_threads(const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & reorder(const std::vector< VarOrRVar > &vars)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, const VarOrRVar &thread_x, const VarOrRVar &thread_y, const VarOrRVar &thread_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage(Internal::Function f, Internal::Definition d, size_t stage_index)
Definition Func.h:94
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &block_z, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & tile(const std::vector< VarOrRVar > &previous, const std::vector< VarOrRVar > &outers, const std::vector< VarOrRVar > &inners, const std::vector< Expr > &factors, const std::vector< TailStrategy > &tails)
Stage & compute_with(LoopLevel loop_level, LoopAlignStrategy align=LoopAlignStrategy::Auto)
Stage & parallel(const VarOrRVar &var)
const Internal::StageSchedule & get_schedule() const
Return the current StageSchedule associated with this Stage.
Definition Func.h:107
Stage & serial(const VarOrRVar &var)
Stage & gpu_blocks(const VarOrRVar &block_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & fuse(const VarOrRVar &inner, const VarOrRVar &outer, const VarOrRVar &fused)
Stage & vectorize(const VarOrRVar &var, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu(const VarOrRVar &block_x, const VarOrRVar &block_y, const VarOrRVar &thread_x, const VarOrRVar &thread_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & compute_with(const Stage &s, const VarOrRVar &var, LoopAlignStrategy align=LoopAlignStrategy::Auto)
void specialize_fail(const std::string &message)
Stage & tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &xo, const VarOrRVar &yo, const VarOrRVar &xi, const VarOrRVar &yi, const Expr &xfactor, const Expr &yfactor, TailStrategy tail=TailStrategy::Auto)
Stage & gpu_blocks(const VarOrRVar &block_x, const VarOrRVar &block_y, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & hexagon(const VarOrRVar &x=Var::outermost())
Stage & split(const VarOrRVar &old, const VarOrRVar &outer, const VarOrRVar &inner, const Expr &factor, TailStrategy tail=TailStrategy::Auto)
Scheduling calls that control how the domain of this stage is traversed.
Stage & atomic(bool override_associativity_test=false)
std::string source_location() const
Attempt to get the source file and line where this stage was defined by parsing the process's own deb...
Stage & gpu_threads(const VarOrRVar &thread_x, DeviceAPI device_api=DeviceAPI::Default_GPU)
std::string dump_argument_list() const
Return a string describing the current var list taking into account all the splits,...
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &tx, const VarOrRVar &ty, const Expr &x_size, const Expr &y_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
Stage & gpu_tile(const VarOrRVar &x, const VarOrRVar &y, const VarOrRVar &z, const VarOrRVar &bx, const VarOrRVar &by, const VarOrRVar &bz, const VarOrRVar &tx, const VarOrRVar &ty, const VarOrRVar &tz, const Expr &x_size, const Expr &y_size, const Expr &z_size, TailStrategy tail=TailStrategy::Auto, DeviceAPI device_api=DeviceAPI::Default_GPU)
void unscheduled()
Assert that this stage has intentionally been given no schedule, and suppress the warning about unsch...
Create a small array of Exprs for defining and calling functions with multiple outputs.
Definition Tuple.h:18
A Halide variable, to be used when defining functions.
Definition Var.h:19
const std::string & name() const
Get the name of a Var.
static Var outermost()
A Var that represents the location outside the outermost loop.
Definition Var.h:163
void schedule_scalar(Func f)
Definition Func.h:2550
std::vector< Var > make_argument_list(int dimensionality)
Make a list of unique arguments for definitions with unnamed arguments.
void assign_results(Realization &r, int idx, Last last)
Definition Func.h:2497
void check_types(const Tuple &t, int idx)
Definition Func.h:2482
ForType
An enum describing a type of loop traversal.
Definition Expr.h:400
This file defines the class FunctionDAG, which is our representation of a Halide pipeline,...
@ Internal
Not visible externally, similar to 'static' linkage in C.
PrefetchBoundStrategy
Different ways to handle accesses outside the original extents in a prefetch.
@ GuardWithIf
Guard the prefetch with if-guards that ignore the prefetch if any of the prefetched region ever goes...
HALIDE_NO_USER_CODE_INLINE T evaluate_may_gpu(const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition Func.h:2568
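A sketch of evaluating a scalar expression directly; the expression is illustrative. evaluate_may_gpu() may use the GPU when the JIT target has one, while evaluate() runs with the default schedule.

    #include "Halide.h"
    #include <cstdio>
    using namespace Halide;

    int main() {
        Expr e = cast<float>(10) / 3.0f + sqrt(cast<float>(2));
        float a = evaluate_may_gpu<float>(e);
        float b = evaluate<float>(e);
        printf("%f %f\n", a, b);
        return 0;
    }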
Expr cast(Expr a)
Cast an expression to the halide type corresponding to the C++ type T.
Definition IROperator.h:358
TailStrategy
Different ways to handle a tail case in a split when the factor does not provably divide the extent.
Definition Schedule.h:32
@ Auto
For pure definitions use ShiftInwards.
LoopAlignStrategy
Different ways to handle the case when the start/end of the loops of stages computed with (fused) are...
Definition Schedule.h:110
@ Auto
By default, LoopAlignStrategy is set to NoAlign.
Expr min(const FuncRef &a, const FuncRef &b)
Explicit overloads of min and max for FuncRef.
Definition Func.h:584
NameMangling
An enum to specify calling convention for extern stages.
Definition Function.h:25
@ Default
Match whatever is specified in the Target.
Target get_jit_target_from_environment()
Return the target that Halide will use for jit-compilation.
DeviceAPI
An enum describing a type of device API.
Definition DeviceAPI.h:15
@ Host
Used to denote for loops that run on the same device as the containing code.
Target get_target_from_environment()
Return the target that Halide will use.
StmtOutputFormat
Used to determine if the output printed to file should be as a normal string or as an HTML file which...
Definition Pipeline.h:73
@ Text
Definition Pipeline.h:74
Stage ScheduleHandle
Definition Func.h:469
std::vector< Range > Region
A multi-dimensional box.
Definition Expr.h:344
Expr max(const FuncRef &a, const FuncRef &b)
Definition Func.h:587
MemoryType
An enum describing different address spaces to be used with Func::store_in.
Definition Expr.h:347
HALIDE_NO_USER_CODE_INLINE T evaluate(JITUserContext *ctx, const Expr &e)
JIT-Compile and run enough code to evaluate a Halide expression.
Definition Func.h:2514
A fragment of Halide syntax.
Definition Expr.h:257
HALIDE_ALWAYS_INLINE Type type() const
Get the type of this expression node.
Definition Expr.h:321
An argument to an extern-defined Func.
A set of custom overrides of runtime functions.
Definition JITModule.h:35
A context to be passed to Pipeline::realize.
Definition JITModule.h:136
A struct representing a target machine and os to generate code for.
Definition Target.h:19
bool has_gpu_feature() const
Is a fully featured GPU compute runtime enabled? I.e.
bool has_feature(Feature f) const
Types in the halide type system.
Definition Type.h:276
A class that can represent Vars or RVars.
Definition Func.h:30
VarOrRVar(const Var &v)
Definition Func.h:34
VarOrRVar(const RVar &r)
Definition Func.h:37
VarOrRVar(const std::string &n, bool r)
Definition Func.h:31
VarOrRVar(const ImplicitVar< N > &u)
Definition Func.h:44
const std::string & name() const
Definition Func.h:48
VarOrRVar(const RDom &r)
Definition Func.h:40
#define user_assert(c)
Definition test.h:10