#include <stdio.h>
#include <stdlib.h>

#include <vector>

#include "Halide.h"
#include "clock.h"
#include "halide_image_io.h"

using namespace Halide;
using namespace Halide::Tools;
// Forward declaration; the definition appears after main().
Target find_gpu_target();
// Halide variables shared by the pipeline definition and its schedules
// (pure coordinates plus the outer/inner indices produced by splits and tiles).
Var x, y, c, i, ii, xo, yo, xi, yi;
// A small image pipeline: clamp-to-edge padding, a 5-point sharpen, then a
// look-up-table curve. The same algorithm can be scheduled either for the
// CPU (schedule_for_cpu) or for a GPU (schedule_for_gpu); call exactly one
// of those before realizing `curved`.
class MyPipeline {
public:
    // Pipeline stages, listed roughly in the order they feed one another.
    Func lut, padded, padded16, sharpen, curved;
    // The 8-bit input image read by `padded`.
    Buffer<uint8_t> input;

    // Defines the algorithm (no scheduling happens here).
    //   in: the image the pipeline reads from.
    MyPipeline(Buffer<uint8_t> in)
        : input(in) {
        // Look-up table: (i / 255)^1.2 scaled back to 0..255, clamped and
        // narrowed to 8 bits.
        lut(i) = cast<uint8_t>(
            clamp(
                pow(i / 255.0f, 1.2f) * 255.0f, 0, 255));

        // Boundary condition: coordinates are clamped into the input's
        // width/height so neighbouring reads never leave the image.
        padded(x, y, c) = input(
            clamp(x, 0, input.width() - 1),
            clamp(y, 0, input.height() - 1), c);

        // Widen to 16 bits before doing arithmetic on the pixel values.
        padded16(x, y, c) = cast<uint16_t>(padded(x, y, c));

        // Sharpen: twice the centre pixel minus the average of its four
        // direct neighbours.
        sharpen(x, y, c) = (padded16(x, y, c) * 2 -
                            (padded16(x - 1, y, c) +
                             padded16(x, y - 1, c) +
                             padded16(x + 1, y, c) +
                             padded16(x, y + 1, c)) /
                                4);

        // Apply the look-up table to the sharpened value.
        curved(x, y, c) = lut(sharpen(x, y, c));
    }

    // Applies a CPU schedule and JIT-compiles `curved` for the host target.
    void schedule_for_cpu() {
        // Compute the whole look-up table once, up front.
        lut.compute_root();

        // Make the channel loop innermost, fix it to exactly three
        // channels, and unroll it.
        curved.reorder(c, x, y)
            .bound(c, 0, 3)
            .unroll(c);

        // Split rows into strips of 16 and run the strips in parallel.
        // These local Vars intentionally shadow the file-level yo/yi.
        Var yo, yi;
        curved.split(y, yo, yi, 16)
            .parallel(yo);

        // Compute sharpen per scanline of curved, vectorized 8 wide.
        sharpen.compute_at(curved, yi);
        sharpen.vectorize(x, 8);

        // Allocate padded storage once per strip, fill it per scanline,
        // vectorized 16 wide.
        padded.store_at(curved, yo)
            .compute_at(curved, yi);
        padded.vectorize(x, 16);

        // JIT-compile for the machine we are running on. The target has to
        // be declared here: neither this scope nor the class has one.
        Target target = get_host_target();
        curved.compile_jit(target);
    }

    // Applies a GPU schedule and JIT-compiles `curved` for the target
    // returned by find_gpu_target().
    // Returns false (leaving the pipeline unscheduled) when that target has
    // no GPU feature enabled; returns true otherwise.
    bool schedule_for_gpu() {
        Target target = find_gpu_target();
        if (!target.has_gpu_feature()) {
            return false;
        }

        // Compute the look-up table once, as its own GPU kernel: the index
        // is split by 16 into a block index and a thread index.
        lut.compute_root();
        Var block, thread;
        lut.split(i, block, thread, 16);
        lut.gpu_blocks(block)
            .gpu_threads(thread);

        // Same channel handling as the CPU schedule: innermost, exactly
        // three channels, unrolled.
        curved.reorder(c, x, y)
            .bound(c, 0, 3)
            .unroll(c);

        // Tile the output into 8x8 tiles; xo/yo index the tiles and xi/yi
        // index positions within a tile.
        curved.gpu_tile(x, y, xo, yo, xi, yi, 8, 8);

        // Compute padded once per tile of curved, mapping its x and y onto
        // GPU threads.
        padded.compute_at(curved, xo);
        padded.gpu_threads(x, y);

        printf("Target: %s\n", target.to_string().c_str());
        curved.compile_jit(target);
        return true;
    }

    // Times the scheduled pipeline and prints the best per-realization time
    // over three batches of 100 realizations each.
    void test_performance() {
        Buffer<uint8_t> output(input.width(), input.height(), input.channels());

        // One untimed realization first, so one-time setup cost is not
        // included in the measurements below.
        curved.realize(output);

        double best_time = 0.0;
        for (int i = 0; i < 3; i++) {
            double t1 = current_time();
            for (int j = 0; j < 100; j++) {
                curved.realize(output);
            }
            // Copy the result back to the host before reading the clock, so
            // the timed region includes that transfer.
            output.copy_to_host();
            double t2 = current_time();

            // Average time per realization for this batch; keep the minimum.
            double elapsed = (t2 - t1) / 100;
            if (i == 0 || elapsed < best_time) {
                best_time = elapsed;
            }
        }
        printf("%1.4f milliseconds\n", best_time);
    }

    // Realizes the pipeline at the input's dimensions and compares every
    // pixel with `reference_output`. On the first mismatch it prints both
    // values and the coordinates, then terminates the process with exit(-1).
    void test_correctness(Buffer<uint8_t> reference_output) {
        Buffer<uint8_t> output =
            curved.realize({input.width(), input.height(), input.channels()});

        for (int c = 0; c < input.channels(); c++) {
            for (int y = 0; y < input.height(); y++) {
                for (int x = 0; x < input.width(); x++) {
                    if (output(x, y, c) != reference_output(x, y, c)) {
                        printf("Mismatch between output (%d) and "
                               "reference output (%d) at %d, %d, %d\n",
                               output(x, y, c),
                               reference_output(x, y, c),
                               x, y, c);
                        exit(-1);
                    }
                }
            }
        }
    }
};
int main(int argc, char **argv) {
Buffer<uint8_t> input = load_image("images/rgb.png");
Buffer<uint8_t> reference_output(input.width(), input.height(), input.channels());
printf("Running pipeline on CPU:\n");
MyPipeline p1(input);
p1.schedule_for_cpu();
p1.curved.realize(reference_output);
printf("Running pipeline on GPU:\n");
MyPipeline p2(input);
bool has_gpu_target = p2.schedule_for_gpu();
if (has_gpu_target) {
printf("Testing GPU correctness:\n");
p2.test_correctness(reference_output);
} else {
printf("No GPU target available on the host\n");
}
printf("Testing performance on CPU:\n");
p1.test_performance();
if (has_gpu_target) {
printf("Testing performance on GPU:\n");
p2.test_performance();
}
return 0;
}
// Picks a JIT target for running the pipeline on a GPU.
//
// Starts from the host target and tries a short, OS-dependent list of GPU
// features in order. Returns the host target with the first feature that
// host_supports_target_device() accepts. If none is accepted, prints a
// diagnostic and returns the plain host target; callers detect that case
// with Target::has_gpu_feature().
Target find_gpu_target() {
    // Start with a target suitable for the machine this is running on.
    Target target = get_host_target();

    std::vector<Target::Feature> features_to_try;
    if (target.os == Target::Windows) {
        // Prefer D3D12Compute, but only offer it on 64-bit builds; fall
        // back to OpenCL.
        if (sizeof(void*) == 8) {
            features_to_try.push_back(Target::D3D12Compute);
        }
        features_to_try.push_back(Target::OpenCL);
    } else if (target.os == Target::OSX) {
        features_to_try.push_back(Target::Metal);
    } else {
        features_to_try.push_back(Target::OpenCL);
    }

    // Return the first candidate the host can actually use.
    for (Target::Feature f : features_to_try) {
        Target new_target = target.with_feature(f);
        if (host_supports_target_device(new_target)) {
            return new_target;
        }
    }

    printf("Requested GPU(s) are not supported. (Do you have the proper hardware and/or driver installed?)\n");
    return target;
}
// Reference notes for Halide API names (documentation excerpts):
//
//   This file defines the class FunctionDAG, which is our representation of
//   a Halide pipeline,...
//
//   Target get_host_target()
//     Return the target corresponding to the host machine.
//
//   bool host_supports_target_device(const Target &t)
//     This attempts to sniff whether a given Target (and its implied
//     DeviceAPI) is usable on the current host.
//
//   Expr clamp(Expr a, const Expr &min_val, const Expr &max_val)
//     Clamps an expression to lie within the given bounds.
//
//   Expr pow(Expr x, Expr y)
//     Return one floating point expression raised to the power of another.
//
//   Feature
//     Optional features a target can have.