Another example of changing the algorithm to be faster but less precise came during the development of an image scaling library for Tantalum 3 (Complete source is available here). Here the goal was to make an image slightly smaller, but not give up image quality in doing so as with simpler pixel-decimation algorithms that can leave jagged lines and visual artifacts.
A very precise image scaling algorithm was created and hand optimised, including the use of fixed-point math to avoid slow floating-point calculations. It is quite fast, but due to loop unrolling for speed there is a good bit of code and complexity:
private static final int FP_SHIFT = 13; private static final int ALPHA = 0xFF000000; private static final int RED = 0x00FF0000; private static final int GREEN = 0x0000FF00; private static final int BLUE = 0x000000FF; private static void pureDownscale(final int[] data, final int srcW, final int srcH, final int w, final int h, final boolean preserveAspectRatio) { final int predictedCount = 1 + (srcW / w); final int[] lut = new int[predictedCount << 8]; // Init division lookup table for (int i = 0; i < lut.length; i++) { lut[i] = i / predictedCount; } { // precalculate src/dest ratios final int ratioW = (srcW << FP_SHIFT) / w; // horizontal resampling (srcY = destY) for (int destY = 0; destY < srcH; ++destY) { final int srcRowStartIndex = destY * srcW; final int destRowStartIndex = destY * w; for (int destX = 0; destX < w; ++destX) { int srcX = (destX * ratioW) >> FP_SHIFT; // calculate beginning of sample final int initialSrcX = srcX; final int srcX2 = ((destX + 1) * ratioW) >> FP_SHIFT; // calculate end of sample int a = 0; int r = 0; int g = 0; int b = 0; // now loop from srcX to srcX2 and add up the values for each channel do { final int argb = data[srcX + srcRowStartIndex]; a += (argb & ALPHA) >>> 24; r += argb & RED; g += argb & GREEN; b += argb & BLUE; ++srcX; // move on to the next pixel } while (srcX <= srcX2 && srcX + srcRowStartIndex < data.length); // average out the channel values // recreate color from the averaged channels and place it into the destination buffer r >>>= 16; g >>>= 8; final int count = srcX - initialSrcX; if (count == predictedCount) { data[destX + destRowStartIndex] = (lut[a] << 24) | (lut[r] << 16) | (lut[g] << 8) | lut[b]; } else { a /= count; r /= count; g /= count; b /= count; data[destX + destRowStartIndex] = ((a << 24) | (r << 16) | (g << 8) | b); } } } } // precalculate src/dest ratios final int predictedCount2; final int[] lut2; if (preserveAspectRatio) { predictedCount2 = predictedCount; lut2 = lut; } else { predictedCount2 = 1 
+ (srcH / h); lut2 = new int[predictedCount2 << 8]; // Init division lookup table for (int i = 0; i < lut2.length; i++) { lut2[i] = i / predictedCount2; } } // vertical resampling (srcX = destX) final int ratioH = (srcH << FP_SHIFT) / h; for (int destX = 0; destX < w; ++destX) { for (int destY = 0; destY < h; ++destY) { int srcY = (destY * ratioH) >> FP_SHIFT; // calculate beginning of sample final int initialSrcY = srcY; final int srcY2 = ((destY + 1) * ratioH) >> FP_SHIFT; // calculate end of sample int a = 0; int r = 0; int g = 0; int b = 0; // now loop from srcY to srcY2 and add up the values for each channel do { final int argb = data[destX + srcY * w]; a += (argb & ALPHA) >>> 24; r += argb & RED; g += argb & GREEN; b += argb & BLUE; ++srcY; // move on to the next pixel } while (srcY <= srcY2 && destX + srcY * w < data.length); // average out the channel values r >>>= 16; g >>>= 8; final int count = srcY - initialSrcY; if (count == predictedCount2) { data[destX + destY * w] = (lut2[a] << 24) | (lut2[r] << 16) | (lut2[g] << 8) | lut2[b]; } else { a /= count; r /= count; g /= count; b /= count; data[destX + destY * w] = (a << 24) | (r << 16) | (g << 8) | b; } } } }
Consultation with a colleague is often the best way to improve your algorithms. In this case, suggestions from an image processing expert showed that sampling 5 points from the source image can be done quite quickly — in fact, about four times faster than the original algorithm — with virtually no visible decrease in scaled image quality. As a bonus, the code is also much tighter.
private static final int M1 = 127 | 127 << 8 | 127 << 16 | 127 << 24; private static final int M3 = 31 | 31 << 8 | 31 << 16 | 31 << 24; private static void downscale(final int[] in, final int srcW, final int srcH, final int w, final int h) { final float dx = srcW / (float) (w + 1); final float dy = srcH / (float) (h + 1); int x, y = 0, z = 0, e; for (; y < h - 1; y++) { final int rowstart = 1 + srcW + (srcW * (int) (y * dy)); for (x = 0; x < w; x++) { int i = rowstart + (int) (x * dx); e = in[i--] >>> 1 & M1; i -= srcW; e += (in[i++] >>> 3 & M3) + (in[++i] >>> 3 & M3); i += srcW << 1; in[z++] = e + (in[i--] >>> 3 & M3) + (in[--i] >>> 3 & M3); } } }