#include "Halide.h"

#include <stdio.h>

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#include "clock.h"
#include "halide_image_io.h"

using namespace Halide;
using namespace Halide::Tools;
int main(int argc, char **argv) {
Var x("x"), y("y");
Buffer<uint8_t> input = load_image("images/gray.png");
{
Func f;
f(x, y) = x + y;
f(3, 7) = 42;
f(x, y) = f(x, y) + 17;
f(x, 3) = f(x, 0) * f(x, 10);
f(0, y) = f(0, y) / f(3, y);
f(x, 17) = x + 8;
f(0, y) = y * 8;
f(x, x + 1) = x + 8;
f(y / 2, y) = f(0, y) * 17;
f.realize({100, 101});
Func g("g");
g(x, y) = x + y;
g(2, 1) = 42;
g(x, 0) = g(x, 1);
g.trace_loads();
g.trace_stores();
g.realize({4, 4});
int result[4][4];
for (int y = 0; y < 4; y++) {
for (int x = 0; x < 4; x++) {
result[y][x] = x + y;
}
}
result[1][2] = 42;
for (int x = 0; x < 4; x++) {
result[0][x] = result[1][x];
}
}
{
Func f;
f(x, y) = (x + y) / 100.0f;
RDom r(0, 50);
f(x, r) = f(x, r) * f(x, r);
Buffer<float> halide_result = f.realize({100, 100});
float c_result[100][100];
for (int y = 0; y < 100; y++) {
for (int x = 0; x < 100; x++) {
c_result[y][x] = (x + y) / 100.0f;
}
}
for (int x = 0; x < 100; x++) {
for (int r = 0; r < 50; r++) {
c_result[r][x] = c_result[r][x] * c_result[r][x];
}
}
for (int y = 0; y < 100; y++) {
for (int x = 0; x < 100; x++) {
if (fabs(halide_result(x, y) - c_result[y][x]) > 0.01f) {
printf("halide_result(%d, %d) = %f instead of %f\n",
x, y, halide_result(x, y), c_result[y][x]);
return -1;
}
}
}
}
{
Func histogram("histogram");
histogram(x) = 0;
RDom r(0, input.width(), 0, input.height());
histogram(input(r.x, r.y)) += 1;
Buffer<int> halide_result = histogram.realize({256});
int c_result[256];
for (int x = 0; x < 256; x++) {
c_result[x] = 0;
}
for (int r_y = 0; r_y < input.height(); r_y++) {
for (int r_x = 0; r_x < input.width(); r_x++) {
c_result[input(r_x, r_y)] += 1;
}
}
for (int x = 0; x < 256; x++) {
if (c_result[x] != halide_result(x)) {
printf("halide_result(%d) = %d instead of %d\n",
x, halide_result(x), c_result[x]);
return -1;
}
}
}
{
Func f;
f(x, y) = x * y;
f(x, 0) = f(x, 8);
f(0, y) = f(8, y) + 2;
f.vectorize(x, 4).parallel(y);
f.update(0).vectorize(x, 4);
Var yo, yi;
f.update(1).split(y, yo, yi, 4).parallel(yo);
Buffer<int> halide_result = f.realize({16, 16});
int c_result[16][16];
for (int y = 0; y < 16; y++) {
for (int x_vec = 0; x_vec < 4; x_vec++) {
int x[] = {x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3};
c_result[y][x[0]] = x[0] * y;
c_result[y][x[1]] = x[1] * y;
c_result[y][x[2]] = x[2] * y;
c_result[y][x[3]] = x[3] * y;
}
}
for (int x_vec = 0; x_vec < 4; x_vec++) {
int x[] = {x_vec * 4, x_vec * 4 + 1, x_vec * 4 + 2, x_vec * 4 + 3};
c_result[0][x[0]] = c_result[8][x[0]];
c_result[0][x[1]] = c_result[8][x[1]];
c_result[0][x[2]] = c_result[8][x[2]];
c_result[0][x[3]] = c_result[8][x[3]];
}
for (int yo = 0; yo < 4; yo++) {
for (int yi = 0; yi < 4; yi++) {
int y = yo * 4 + yi;
c_result[y][0] = c_result[y][8] + 2;
}
}
for (int y = 0; y < 16; y++) {
for (int x = 0; x < 16; x++) {
if (halide_result(x, y) != c_result[y][x]) {
printf("halide_result(%d, %d) = %d instead of %d\n",
x, y, halide_result(x, y), c_result[y][x]);
return -1;
}
}
}
}
{
Func producer, consumer;
producer(x) = x * 2;
producer(x) += 10;
consumer(x) = 2 * producer(x);
Buffer<int> halide_result = consumer.realize({10});
int c_result[10];
for (int x = 0; x < 10; x++) {
int producer_storage[1];
producer_storage[0] = x * 2;
producer_storage[0] = producer_storage[0] + 10;
c_result[x] = 2 * producer_storage[0];
}
for (int x = 0; x < 10; x++) {
if (halide_result(x) != c_result[x]) {
printf("halide_result(%d) = %d instead of %d\n",
x, halide_result(x), c_result[x]);
return -1;
}
}
}
{
Func producer, consumer;
producer(x) = x * 17;
consumer(x) = 2 * producer(x);
consumer(x) += 50;
producer.compute_at(consumer, x);
Buffer<int> halide_result = consumer.realize({10});
int c_result[10];
for (int x = 0; x < 10; x++) {
int producer_storage[1];
producer_storage[0] = x * 17;
c_result[x] = 2 * producer_storage[0];
}
for (int x = 0; x < 10; x++) {
c_result[x] += 50;
}
for (int x = 0; x < 10; x++) {
if (halide_result(x) != c_result[x]) {
printf("halide_result(%d) = %d instead of %d\n",
x, halide_result(x), c_result[x]);
return -1;
}
}
}
{
Func producer, consumer;
producer(x) = x * 17;
consumer(x) = 100 - x * 10;
consumer(x) += producer(x);
producer.compute_at(consumer, x);
Buffer<int> halide_result = consumer.realize({10});
int c_result[10];
for (int x = 0; x < 10; x++) {
c_result[x] = 100 - x * 10;
}
for (int x = 0; x < 10; x++) {
int producer_storage[1];
producer_storage[0] = x * 17;
c_result[x] += producer_storage[0];
}
for (int x = 0; x < 10; x++) {
if (halide_result(x) != c_result[x]) {
printf("halide_result(%d) = %d instead of %d\n",
x, halide_result(x), c_result[x]);
return -1;
}
}
}
{
Func producer, consumer;
producer(x) = x * 17;
consumer(x) = 170 - producer(x);
consumer(x) += producer(x) / 2;
producer.compute_at(consumer, x);
Buffer<int> halide_result = consumer.realize({10});
int c_result[10];
for (int x = 0; x < 10; x++) {
int producer_storage[1];
producer_storage[0] = x * 17;
c_result[x] = 170 - producer_storage[0];
}
for (int x = 0; x < 10; x++) {
int producer_storage[1];
producer_storage[0] = x * 17;
c_result[x] += producer_storage[0] / 2;
}
for (int x = 0; x < 10; x++) {
if (halide_result(x) != c_result[x]) {
printf("halide_result(%d) = %d instead of %d\n",
x, halide_result(x), c_result[x]);
return -1;
}
}
}
{
Func producer, consumer;
producer(x, y) = (x * y) / 10 + 8;
consumer(x, y) = x + y;
consumer(x, 0) += producer(x, x);
consumer(0, y) += producer(y, 9 - y);
Func producer_1, producer_2, consumer_2;
producer_1(x, y) = producer(x, y);
producer_2(x, y) = producer(x, y);
consumer_2(x, y) = x + y;
consumer_2(x, 0) += producer_1(x, x);
consumer_2(0, y) += producer_2(y, 9 - y);
producer_1.compute_at(consumer_2, x);
producer_2.compute_at(consumer_2, y);
Buffer<int> halide_result = consumer_2.realize({10, 10});
int c_result[10][10];
for (int y = 0; y < 10; y++) {
for (int x = 0; x < 10; x++) {
c_result[y][x] = x + y;
}
}
for (int x = 0; x < 10; x++) {
int producer_1_storage[1];
producer_1_storage[0] = (x * x) / 10 + 8;
c_result[0][x] += producer_1_storage[0];
}
for (int y = 0; y < 10; y++) {
int producer_2_storage[1];
producer_2_storage[0] = (y * (9 - y)) / 10 + 8;
c_result[y][0] += producer_2_storage[0];
}
for (int y = 0; y < 10; y++) {
for (int x = 0; x < 10; x++) {
if (halide_result(x, y) != c_result[y][x]) {
printf("halide_result(%d, %d) = %d instead of %d\n",
x, y, halide_result(x, y), c_result[y][x]);
return -1;
}
}
}
}
{
Func producer, consumer;
RDom r(0, 5);
producer(x) = x % 8;
consumer(x) = x + 10;
consumer(x) += r + producer(x + r);
producer.compute_at(consumer, r);
Buffer<int> halide_result = consumer.realize({10});
int c_result[10];
for (int x = 0; x < 10; x++) {
c_result[x] = x + 10;
}
for (int x = 0; x < 10; x++) {
for (int r = 0; r < 5; r++) {
int producer_storage[1];
producer_storage[0] = (x + r) % 8;
c_result[x] += r + producer_storage[0];
}
}
for (int x = 0; x < 10; x++) {
if (halide_result(x) != c_result[x]) {
printf("halide_result(%d) = %d instead of %d\n",
x, halide_result(x), c_result[x]);
return -1;
}
}
}
{
RDom r(-2, 5, -2, 5);
Func local_sum;
local_sum(x, y) = 0;
local_sum(x, y) += clamped(x + r.x, y + r.y);
Func blurry;
blurry(x, y) = cast<uint8_t>(local_sum(x, y) / 25);
Buffer<uint8_t> halide_result = blurry.realize({input.width(), input.height()});
Buffer<uint8_t> c_result(input.width(), input.height());
for (int y = 0; y < input.height(); y++) {
for (int x = 0; x < input.width(); x++) {
int local_sum[1];
local_sum[0] = 0;
for (int r_y = -2; r_y <= 2; r_y++) {
for (int r_x = -2; r_x <= 2; r_x++) {
local_sum[0] += input(clamped_x, clamped_y);
}
}
c_result(x, y) = (
uint8_t)(local_sum[0] / 25);
}
}
for (int y = 0; y < input.height(); y++) {
for (int x = 0; x < input.width(); x++) {
if (halide_result(x, y) != c_result(x, y)) {
printf("halide_result(%d, %d) = %d instead of %d\n",
x, y, halide_result(x, y), c_result(x, y));
return -1;
}
}
}
}
{
Func f1;
RDom r(0, 100);
Func f2;
Func anon;
anon(x) = 0;
anon(x) += r + x;
f2(x) = anon(x) * 7;
Buffer<int> halide_result_1 = f1.realize({10});
Buffer<int> halide_result_2 = f2.realize({10});
int c_result[10];
for (int x = 0; x < 10; x++) {
int anon[1];
anon[0] = 0;
for (int r = 0; r < 100; r++) {
anon[0] += r + x;
}
c_result[x] = anon[0] * 7;
}
for (int x = 0; x < 10; x++) {
if (halide_result_1(x) != c_result[x]) {
printf("halide_result_1(%d) = %d instead of %d\n",
x, halide_result_1(x), c_result[x]);
return -1;
}
if (halide_result_2(x) != c_result[x]) {
printf("halide_result_2(%d) = %d instead of %d\n",
x, halide_result_2(x), c_result[x]);
return -1;
}
}
}
{
Func clamped;
Expr x_clamped =
clamp(x, 0, input.width() - 1);
Expr y_clamped =
clamp(y, 0, input.height() - 1);
clamped(x, y) = input(x_clamped, y_clamped);
RDom box(-2, 5, -2, 5);
Func spread;
spread(x, y) = (
maximum(clamped(x + box.x, y + box.y)) -
minimum(clamped(x + box.x, y + box.y)));
Var yo, yi;
spread.split(y, yo, yi, 32).parallel(yo);
spread.vectorize(x, 16);
clamped.store_at(spread, yo).compute_at(spread, yi);
Buffer<uint8_t> halide_result = spread.realize({input.width(), input.height()});
#ifdef __SSE2__
Buffer<uint8_t> c_result(input.width(), input.height());
#ifdef _OPENMP
double t1 = current_time();
#endif
for (int iters = 0; iters < 100; iters++) {
#pragma omp parallel for
for (int yo = 0; yo < (input.height() + 31) / 32; yo++) {
int y_base =
std::min(yo * 32, input.height() - 32);
int clamped_width = input.width() + 4;
for (int yi = 0; yi < 32; yi++) {
int y = y_base + yi;
uint8_t *output_row = &c_result(0, y);
int min_y_clamped = (yi == 0) ? (y - 2) : (y + 2);
int max_y_clamped = (y + 2);
for (int cy = min_y_clamped; cy <= max_y_clamped; cy++) {
clamped_storage + (cy & 7) * clamped_width;
uint8_t *input_row = &input(0, clamped_y);
for (int x = -2; x < input.width() + 2; x++) {
*clamped_row++ = input_row[clamped_x];
}
}
for (int x_vec = 0; x_vec < (input.width() + 15) / 16; x_vec++) {
int x_base =
std::min(x_vec * 16, input.width() - 16);
__m128i minimum_storage, maximum_storage;
maximum_storage = _mm_setzero_si128();
for (int max_y = y - 2; max_y <= y + 2; max_y++) {
clamped_storage + (max_y & 7) * clamped_width;
for (int max_x = x_base - 2; max_x <= x_base + 2; max_x++) {
__m128i v = _mm_loadu_si128(
(__m128i const *)(clamped_row + max_x + 2));
maximum_storage = _mm_max_epu8(maximum_storage, v);
}
}
minimum_storage = _mm_cmpeq_epi32(_mm_setzero_si128(),
_mm_setzero_si128());
for (int min_y = y - 2; min_y <= y + 2; min_y++) {
clamped_storage + (min_y & 7) * clamped_width;
for (int min_x = x_base - 2; min_x <= x_base + 2; min_x++) {
__m128i v = _mm_loadu_si128(
(__m128i const *)(clamped_row + min_x + 2));
minimum_storage = _mm_min_epu8(minimum_storage, v);
}
}
__m128i spread = _mm_sub_epi8(maximum_storage, minimum_storage);
_mm_storeu_si128((__m128i *)(output_row + x_base), spread);
}
}
}
}
#ifdef _OPENMP
double t2 = current_time();
for (int iters = 0; iters < 100; iters++) {
spread.realize(halide_result);
}
double t3 = current_time();
printf("Halide spread took %f ms. C equivalent took %f ms\n",
(t3 - t2) / 100, (t2 - t1) / 100);
#endif
for (int y = 0; y < input.height(); y++) {
for (int x = 0; x < input.width(); x++) {
if (halide_result(x, y) != c_result(x, y)) {
printf("halide_result(%d, %d) = %d instead of %d\n",
x, y, halide_result(x, y), c_result(x, y));
return -1;
}
}
}
#endif
}
printf("Success!\n");
return 0;
}
// Reference notes on the Halide API used above.
//
// Func repeat_edge(const Func &source, const Region &bounds)
//     Impose a boundary condition such that the nearest edge sample is
//     returned everywhere outside the given region.
//
// This file defines the class FunctionDAG, which is our representation of a
// Halide pipeline,...
//
// Expr maximum(Expr, const std::string &s="maximum")
//
// Expr clamp(Expr a, const Expr &min_val, const Expr &max_val)
//     Clamps an expression to lie within the given bounds.
//
// Expr sum(Expr, const std::string &s="sum")
//     An inline reduction.
//
// Expr min(const FuncRef &a, const FuncRef &b)
//     Explicit overloads of min and max for FuncRef.
//
// Expr minimum(Expr, const std::string &s="minimum")
//
// Expr max(const FuncRef &a, const FuncRef &b)
//
// unsigned __INT8_TYPE__ uint8_t