From bb1b5ddac03b8dc7350049374ca565f92a9238a5 Mon Sep 17 00:00:00 2001 From: mindv0rtex Date: Thu, 25 Feb 2021 15:17:06 -0500 Subject: [PATCH] Optimize vertical box filter to iterate over rows for cache friendliness. --- Cargo.lock | 10 +++ Cargo.toml | 1 + src/blur.rs | 215 ++++++++++++++++++++++++++++++---------------------- src/grid.rs | 17 ++--- src/main.rs | 5 +- 5 files changed, 143 insertions(+), 105 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1a63fb3..fa18a49 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -190,6 +190,15 @@ dependencies = [ "tiff", ] +[[package]] +name = "itertools" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37d572918e350e82412fe766d24b15e6682fb2ed2bbe018280caa810397cb319" +dependencies = [ + "either", +] + [[package]] name = "jpeg-decoder" version = "0.1.22" @@ -315,6 +324,7 @@ name = "physarum" version = "0.1.0" dependencies = [ "image", + "itertools", "rand", "rayon", ] diff --git a/Cargo.toml b/Cargo.toml index 1f1c1bd..584b8b2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -8,5 +8,6 @@ edition = "2018" [dependencies] image = "*" +itertools = "*" rand = "*" rayon = "*" diff --git a/src/blur.rs b/src/blur.rs index dc1852e..2e384c3 100644 --- a/src/blur.rs +++ b/src/blur.rs @@ -1,102 +1,116 @@ -/// Approximate 1D Gaussian filter of standard deviation sigma with N box filter passes. Each -/// element in the output array contains the radius of the box filter for the corresponding pass. -pub fn boxes_for_gaussian(sigma: f32) -> ([usize; N]) { - let w_ideal = (12.0 * sigma * sigma / N as f32 + 1.0).sqrt(); - let mut w = w_ideal as usize; - w -= 1 - w & 1; +use itertools::multizip; - let mut m = ((w * w + 4 * w + 3) * N) as f32; - m -= 12.0 * sigma * sigma; - m *= 0.25; - m /= (w + 1) as f32; - let m = m.round() as usize; - - let mut result = [0; N]; - for (i, value) in result.iter_mut().enumerate() { - *value = (if i < m { w - 1 } else { w + 1 }) / 2; - } - result -} - -/// Blur an image with 3 box filter passes. The result will be written to the src slice, while the -/// buf slice is used as a scratch space. -pub fn approximate_gauss_blur( - src: &mut [f32], - buf: &mut [f32], +#[derive(Debug)] +pub struct Blur { width: usize, height: usize, - sigma: f32, - decay: f32, -) { - let boxes = boxes_for_gaussian::<3>(sigma); - box_blur(src, buf, width, height, boxes[0], 1.0); - box_blur(src, buf, width, height, boxes[1], 1.0); - box_blur(src, buf, width, height, boxes[2], decay); + row_buffer: Vec, } -/// Perform one pass of the 2D box filter of the given radius. The result will be written to the src -/// slice, while the buf slice is used as a scratch space. -fn box_blur( - src: &mut [f32], - buf: &mut [f32], - width: usize, - height: usize, - radius: usize, - decay: f32, -) { - box_blur_h(src, buf, width, radius); - box_blur_v(buf, src, width, height, radius, decay); -} - -/// Perform one pass of the 1D box filter of the given radius along x axis. -fn box_blur_h(src: &[f32], dst: &mut [f32], width: usize, radius: usize) { - let weight = 1.0 / (2 * radius + 1) as f32; - - // TODO: Parallelize with rayon - for (src_row, dst_row) in src.chunks_exact(width).zip(dst.chunks_exact_mut(width)) { - // First we build a value for the beginning of each row. We assume periodic boundary - // conditions, so we need to push the left index to the opposite side of the row. - let mut value = src_row[width - radius - 1]; - for j in 0..radius { - value += src_row[width - radius + j] + src_row[j]; - } - // At this point "value" contains the unweighted sum for the right-most row element. - for current_id in 0..width { - let left_id = (current_id + width - radius - 1) & (width - 1); - let right_id = (current_id + radius) & (width - 1); - value += src_row[right_id] - src_row[left_id]; - dst_row[current_id] = value * weight; +impl Blur { + pub fn new(width: usize, height: usize) -> Self { + Blur { + width, + height, + row_buffer: vec![0.0; width], } } -} -/// Perform one pass of the 1D box filter of the given radius along y axis. Applies the decay factor -/// to the destination buffer. -fn box_blur_v( - src: &[f32], - dst: &mut [f32], - width: usize, - height: usize, - radius: usize, - decay: f32, -) { - let weight = decay / (2 * radius + 1) as f32; + /// Blur an image with 3 box filter passes. The result will be written to the src slice, while + /// the buf slice is used as a scratch space. + pub fn run(&mut self, src: &mut [f32], buf: &mut [f32], sigma: f32, decay: f32) { + let boxes = Blur::boxes_for_gaussian::<3>(sigma); + self.box_blur(src, buf, boxes[0], 1.0); + self.box_blur(src, buf, boxes[1], 1.0); + self.box_blur(src, buf, boxes[2], decay); + } - // TODO: Parallelize with rayon - for i in 0..width { - // First we build a value for the beginning of each column. We assume periodic boundary - // conditions, so we need to push the bottom index to the opposite side of the column. - let mut value = src[i + (height - radius - 1) * width]; - for j in 0..radius { - value += src[i + (height - radius + j) * width] + src[i + j * width]; + /// Approximate 1D Gaussian filter of standard deviation sigma with N box filter passes. Each + /// element in the output array contains the radius of the box filter for the corresponding + /// pass. + fn boxes_for_gaussian(sigma: f32) -> ([usize; N]) { + let w_ideal = (12.0 * sigma * sigma / N as f32 + 1.0).sqrt(); + let mut w = w_ideal as usize; + w -= 1 - w & 1; + + let mut m = 0.25 * (N * (w + 3)) as f32; + m -= 3.0 * sigma * sigma / (w + 1) as f32; + let m = m.round() as usize; + + let mut result = [0; N]; + for (i, value) in result.iter_mut().enumerate() { + *value = (if i < m { w - 1 } else { w + 1 }) / 2; } - // At this point "value" contains the unweighted sum for the top-most column element. + result + } - for current_id in (i..i + height * width).step_by(width) { - let bottom_id = (current_id + (height - radius - 1) * width) & (width * height - 1); - let top_id = (current_id + radius * width) & (width * height - 1); - value += src[top_id] - src[bottom_id]; - dst[current_id] = value * weight; + /// Perform one pass of the 2D box filter of the given radius. The result will be written to the + /// src slice, while the buf slice is used as a scratch space. + fn box_blur(&mut self, src: &mut [f32], buf: &mut [f32], radius: usize, decay: f32) { + self.box_blur_h(src, buf, radius); + self.box_blur_v(buf, src, radius, decay); + } + + /// Perform one pass of the 1D box filter of the given radius along x axis. + fn box_blur_h(&mut self, src: &[f32], dst: &mut [f32], radius: usize) { + let weight = 1.0 / (2 * radius + 1) as f32; + let width = self.width; + + for (src_row, dst_row) in src.chunks_exact(width).zip(dst.chunks_exact_mut(width)) { + // First we build a value for the beginning of each row. We assume periodic boundary + // conditions, so we need to push the left index to the opposite side of the row. + let mut value = src_row[width - radius - 1]; + for j in 0..radius { + value += src_row[width - radius + j] + src_row[j]; + } + // At this point "value" contains the unweighted sum for the right-most row element. + for i in 0..width { + let left = (i + width - radius - 1) & (width - 1); + let right = (i + radius) & (width - 1); + value += src_row[right] - src_row[left]; + dst_row[i] = value * weight; + } + } + } + + /// Perform one pass of the 1D box filter of the given radius along y axis. Applies the decay + /// factor to the destination buffer. + fn box_blur_v(&mut self, src: &[f32], dst: &mut [f32], radius: usize, decay: f32) { + let weight = decay / (2 * radius + 1) as f32; + let (width, height) = (self.width, self.height); + + // We don't replicate the horizontal filter logic because of the cache-unfriendly memory + // access patterns of sequential iteration over individual columns. Instead, we iterate over + // rows via loop interchange. + + let offset = (height - radius - 1) * width; + self.row_buffer + .copy_from_slice(&src[offset..offset + width]); + + for j in 0..radius { + let bottom_off = (height - radius + j) * width; + let bottom_row = &src[bottom_off..bottom_off + width]; + let top_off = j * width; + let top_row = &src[top_off..top_off + width]; + + for (buf, bottom, top) in multizip((&mut self.row_buffer, bottom_row, top_row)) { + *buf += bottom + top; + } + } + // At this point row_buffer contains the unweighted sum for all top elements. + + for (i, dst_row) in dst.chunks_exact_mut(width).enumerate() { + let bottom_off = ((i + height - radius - 1) & (height - 1)) * width; + let bottom_row = &src[bottom_off..bottom_off + width]; + let top_off = ((i + radius) & (height - 1)) * width; + let top_row = &src[top_off..top_off + width]; + + for (dst, buf, bottom, top) in + multizip((dst_row, &mut self.row_buffer, bottom_row, top_row)) + { + *buf += top - bottom; + *dst = *buf * weight; + } } } } @@ -107,9 +121,26 @@ mod tests { #[test] fn test_blur() { - let src = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]; - let mut dst = vec![0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]; - box_blur_h(&src, &mut dst, 4, 1); - println!("Out: {:?}", dst); + let src: Vec = (1..17).map(|v| v as f32).collect(); + let mut dst = vec![0.0; 16]; + let mut blur = Blur::new(4, 4); + + blur.box_blur_h(&src, &mut dst, 1); + assert_eq!( + dst, + &[ + 2.3333335, 2.0, 3.0, 2.6666667, 6.3333335, 6.0, 7.0, 6.666667, 10.333334, 10.0, + 11.0, 10.666667, 14.333334, 14.0, 15.0, 14.666667 + ] + ); + + blur.box_blur_v(&src, &mut dst, 1, 1.0); + assert_eq!( + dst, + [ + 6.3333335, 7.3333335, 8.333334, 9.333334, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, + 12.0, 7.666667, 8.666667, 9.666667, 10.666667 + ] + ) } } diff --git a/src/grid.rs b/src/grid.rs index 37b93cc..f3298cd 100644 --- a/src/grid.rs +++ b/src/grid.rs @@ -1,13 +1,17 @@ -use crate::blur::approximate_gauss_blur; use rand::{distributions::Uniform, Rng}; +use crate::blur::Blur; + /// A 2D grid with a scalar value per each grid block. #[derive(Debug)] pub struct Grid { width: usize, height: usize, data: Vec, + + // The scratch space for the blur operation. buf: Vec, + blur: Blur, } #[inline(always)] @@ -30,6 +34,7 @@ impl Grid { height, data, buf: vec![0.0; width * height], + blur: Blur::new(width, height), } } @@ -60,14 +65,8 @@ impl Grid { /// Diffuse grid data and apply a decay multiplier. pub fn diffuse(&mut self, radius: usize, decay_factor: f32) { - approximate_gauss_blur( - &mut self.data, - &mut self.buf, - self.width, - self.height, - radius as f32, - decay_factor, - ); + self.blur + .run(&mut self.data, &mut self.buf, radius as f32, decay_factor); } } diff --git a/src/main.rs b/src/main.rs index b58ee81..cc44dba 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2,7 +2,4 @@ mod blur; mod grid; mod model; -fn main() { - let boxes = blur::boxes_for_gaussian::<3>(2.5); - println!("boxes: {:?}", boxes); -} +fn main() {}