Cone Step Mapping
Cone step mapping based on the paper by Jonathan “LoneSock” Dummer, “Cone Step Mapping: An Iterative Ray-Heightfield Intersection Algorithm”.
Shader code
// ElSuicio, 2026.
// GODOT v4.6.2.stable.
// x.com/ElSuicio
// github.com/ElSuicio
// Contact email [interdreamsoft@gmail.com]
shader_type spatial;
// vertex() performs the object -> world -> view transform itself.
render_mode skip_vertex_transform;

/* Extra material-level transform applied on top of the node transform in vertex(). */
group_uniforms _Transform;
uniform vec3 _Position = vec3(0.0);
uniform vec3 _Rotation = vec3(0.0); // Euler angles in degrees; vertex() converts them with radians().
uniform vec3 _Scale = vec3(1.0);

group_uniforms _Albedo;
uniform vec4 _DiffuseColor : source_color = vec4(1.0);
uniform sampler2D _DiffuseTexture : source_color, filter_nearest_mipmap, repeat_enable;

group_uniforms _Metallic;
uniform float _Metallic : hint_range(0.0, 1.0, 1e-3) = 0.0;
uniform float _Specular : hint_range(0.0, 1.0, 1e-3) = 0.5;
// fragment() reads the blue channel of this texture.
uniform sampler2D _MetallicTexture : hint_default_white, filter_nearest_mipmap, repeat_enable;

group_uniforms _Roughness;
uniform float _Roughness : hint_range(0.0, 1.0, 1e-3) = 1.0;
// fragment() reads the green channel of this texture, so the roughness hint
// must name the green channel as well.
uniform sampler2D _RoughnessTexture : hint_roughness_g, filter_nearest_mipmap, repeat_enable;

group_uniforms _Emission;
uniform vec4 _EmissionColor : source_color = vec4(vec3(0.0), 1.0);
uniform sampler2D _EmissionTexture : source_color, hint_default_black, filter_nearest_mipmap, repeat_enable;
uniform float _EmissionEnergyMultiplier : hint_range(0.0, 100.0, 1e-3) = 0.0;

group_uniforms _NormalMap;
uniform sampler2D _NormalMap : hint_roughness_normal, filter_nearest_mipmap, repeat_enable;
uniform float _NormalMapScale : hint_range(-16.0, 16.0, 1e-3) = 1.0;

group_uniforms _ConeStepMapping;
// The stepping functions add 1.0 / _TextureSide to every step.
uniform float _TextureSide = 2048.0;
// The stepping functions read .r as the surface height and square .g to get the cone ratio.
uniform sampler2D _ConeStepMap : filter_linear, repeat_enable;
// fragment() halves this value before using it.
uniform float _HeightScale : hint_range(0.0, 1.0, 1e-3) = 0.0;
// Cast to int in fragment() and used as the iteration limit of step_cone_mapping_loop().
uniform float _MaxSteps : hint_range(1.0, 1024.0, 1.0) = 256.0;

group_uniforms _UV;
uniform vec2 _Tiling = vec2(1.0, 1.0);
uniform vec2 _Offset = vec2(0.0, 0.0);

// Tangent-space basis built in vertex() and its transpose (view space -> tangent space).
varying mat3 TBN;
varying mat3 TBN_TRANSPOSE;
// Values written by fragment().
varying vec2 _parallax_uv;
varying float _parallax_height;
varying float _height_scale;
// Scales a UV coordinate per axis by `tiling`, then shifts it per axis by `offset`.
vec2 tiling_and_offset(
		in vec2 st,
		in vec2 tiling,
		in vec2 offset
)
{
	// Component-wise multiply-add on both axes at once.
	return st * tiling + offset;
}
// Builds a 4x4 matrix that translates by `position`.
mat4 translation_matrix(
		in vec3 position
)
{
	// Start from identity and overwrite the fourth column with the offset.
	mat4 result = mat4(1.0);
	result[3] = vec4(position, 1.0);
	return result;
}
// Builds a 4x4 rotation matrix from per-axis angles, composed as Y * X * Z.
mat4 rotation_matrix(
		in vec3 angle // In radians.
)
{
	// Evaluate the trigonometry once per axis.
	vec3 s = sin(angle);
	vec3 c = cos(angle);

	// Each constructor argument below is one matrix column.
	mat4 about_x = mat4(
		vec4(1.0, 0.0, 0.0, 0.0),
		vec4(0.0, c.x, s.x, 0.0),
		vec4(0.0, -s.x, c.x, 0.0),
		vec4(0.0, 0.0, 0.0, 1.0)
	);

	mat4 about_y = mat4(
		vec4(c.y, 0.0, -s.y, 0.0),
		vec4(0.0, 1.0, 0.0, 0.0),
		vec4(s.y, 0.0, c.y, 0.0),
		vec4(0.0, 0.0, 0.0, 1.0)
	);

	mat4 about_z = mat4(
		vec4(c.z, s.z, 0.0, 0.0),
		vec4(-s.z, c.z, 0.0, 0.0),
		vec4(0.0, 0.0, 1.0, 0.0),
		vec4(0.0, 0.0, 0.0, 1.0)
	);

	// Godot Default Rotation Order (YXZ).
	return about_y * about_x * about_z;
}
// Builds a 4x4 matrix that scales each axis by the matching component of `scale`.
mat4 escalation_matrix(
		in vec3 scale
)
{
	// Identity with the first three diagonal entries replaced by the scale factors.
	mat4 result = mat4(1.0);
	result[0] = vec4(scale.x, 0.0, 0.0, 0.0);
	result[1] = vec4(0.0, scale.y, 0.0, 0.0);
	result[2] = vec4(0.0, 0.0, scale.z, 0.0);
	return result;
}
// Cone step mapping that iterates until the ray reaches the height field.
//
// st                   - starting UV.
// view_ts              - view vector in tangent space.
// cone_step_map        - .r is read as the surface height, .g squared as the cone ratio.
// texture_side         - 1.0 / texture_side is added to every step and removed from the result.
// height_scale         - scales the UV displacement along view_ts.xy.
// current_layer_height - receives view_ts.z * (distance marched).
//
// Returns the displaced UV.
vec2 step_cone_mapping_exact(
		in vec2 st,
		in vec3 view_ts,
		in sampler2D cone_step_map,
		in float texture_side,
		in float height_scale,
		inout float current_layer_height
)
{
	// Safety cap: when view_ts.z <= 0.0 the ray height (1.0 - view_ts.z * scale)
	// never decreases, so an uncapped loop would never terminate and could hang
	// the GPU. The value matches the upper bound of the _MaxSteps uniform.
	const int SAFETY_MAX_STEPS = 1024;

	float w = 1.0 / texture_side;
	float view_length = length(view_ts.xy);

	vec4 current_cone_step_map = texture(cone_step_map, st);
	float scale = 0.0;

	for (int i = 0; i < SAFETY_MAX_STEPS; ++i) {
		float ray_height = 1.0 - view_ts.z * scale;

		// Stop as soon as the ray is no longer above the sampled surface.
		if (!(ray_height > current_cone_step_map.r)) {
			break;
		}

		// Advance to where the ray leaves the cone stored at the current sample,
		// plus a small fixed step of w.
		float cone_ratio = current_cone_step_map.g * current_cone_step_map.g;
		scale += w + (ray_height - current_cone_step_map.r) / (view_ts.z + view_length / cone_ratio);

		current_cone_step_map = texture(cone_step_map, st + view_ts.xy * scale * height_scale);
	}

	current_layer_height = view_ts.z * scale;

	// Remove one fixed step before computing the final UV.
	scale -= w;

	return st + view_ts.xy * scale * height_scale;
}
// Cone step mapping limited to at most `max_steps` iterations.
//
// Marches along view_ts until the ray height (1.0 - view_ts.z * t) is no longer
// above the height stored in cone_step_map.r, or the step budget runs out.
// Writes view_ts.z * t to current_layer_height and returns the displaced UV.
vec2 step_cone_mapping_loop(
		in vec2 st,
		in vec3 view_ts,
		in int max_steps,
		in sampler2D cone_step_map,
		in float texture_side,
		in float height_scale,
		inout float current_layer_height
)
{
	float texel = 1.0 / texture_side;
	float xy_length = length(view_ts.xy);

	float t = 0.0;
	vec4 cone = texture(cone_step_map, st);

	for (int step_index = 0; step_index < max_steps; ++step_index) {
		float ray_height = 1.0 - view_ts.z * t;

		if (ray_height <= cone.r) {
			break;
		}

		// Step to the edge of the cone stored at the current sample, plus one fixed step.
		t += texel + (ray_height - cone.r) / (view_ts.z + xy_length / (cone.g * cone.g));

		cone = texture(cone_step_map, st + view_ts.xy * t * height_scale);
	}

	current_layer_height = view_ts.z * t;

	// Remove one fixed step before computing the final UV.
	t -= texel;

	return st + view_ts.xy * t * height_scale;
}
// Computes the depth value of the fragment after moving it along the view ray
// by the parallax height.
//
// base_st         - UV whose screen-space derivatives are compared with those of vertex_vs.
// view_ts         - view vector in tangent space.
// vertex_vs       - fragment position in view space.
// height_scale    - multiplied with parallax_height.
// parallax_height - height value produced by the cone stepping function.
// tbn             - matrix applied to the tangent-space offset before it is subtracted from vertex_vs.
// projection_matrix - applied to the displaced position.
//
// Returns z / w of the projected, displaced position.
float pixel_depth_offset(
		in vec2 base_st,
		in vec3 view_ts,
		in vec3 vertex_vs,
		in float height_scale,
		in float parallax_height,
		in mat3 tbn,
		in mat4 projection_matrix
)
{
	// Ratio between position change and UV change across neighbouring pixels,
	// measured along both screen axes; the divisor is clamped to avoid dividing by zero.
	float ratio_x = length(dFdx(vertex_vs)) / max(length(dFdx(base_st)), 1e-5);
	float ratio_y = length(dFdy(vertex_vs)) / max(length(dFdy(base_st)), 1e-5);
	float mesh_scale = (ratio_x + ratio_y) * 0.5;

	float height = parallax_height * height_scale * mesh_scale;

	// Stretch the view vector so that its z component equals `height`.
	vec3 offset_ts = view_ts * (height / view_ts.z);
	vec3 offset_vs = tbn * offset_ts;

	vec4 clip = projection_matrix * vec4(vertex_vs - offset_vs, 1.0);

	// Perspective divide; only the depth component is needed.
	return clip.z / clip.w;
}
// Transforms the vertex manually and builds the tangent-space basis used by fragment().
void vertex()
{
	/* UV */
	UV = tiling_and_offset(UV, _Tiling, _Offset);

	/* Material-level transform matrices (_Rotation is converted to radians here). */
	mat4 scale_mat = escalation_matrix(_Scale);
	mat4 rotate_mat = rotation_matrix(radians(_Rotation));
	mat4 translate_mat = translation_matrix(_Position);

	/* Vertex: scale, rotate, MODEL_MATRIX, translate, VIEW_MATRIX - in that order. */
	vec4 position_os = rotate_mat * (scale_mat * vec4(VERTEX, 1.0));
	vec4 position_ws = translate_mat * (MODEL_MATRIX * position_os);
	vec4 position_vs = VIEW_MATRIX * position_ws;
	VERTEX = position_vs.xyz;

	/* Normal: rotated only; _Scale and _Position are not applied to directions. */
	vec4 normal_vs = VIEW_MATRIX * (MODEL_MATRIX * (rotate_mat * vec4(NORMAL, 0.0)));
	NORMAL = normalize(normal_vs.xyz);

	/* Binormal: same treatment as the normal. */
	vec4 binormal_vs = VIEW_MATRIX * (MODEL_MATRIX * (rotate_mat * vec4(BINORMAL, 0.0)));
	BINORMAL = normalize(binormal_vs.xyz);

	/* Tangent: rebuilt from the transformed binormal and normal. */
	TANGENT = normalize(cross(BINORMAL, NORMAL));

	/* TBN (first column is the negated tangent) and its transpose. */
	TBN = mat3(-TANGENT, BINORMAL, NORMAL);
	TBN_TRANSPOSE = transpose(TBN);

	/* Projection */
	POSITION = PROJECTION_MATRIX * position_vs;
}
// Runs cone step mapping to displace the UV, writes the matching depth, then
// samples every material texture at the displaced UV.
void fragment() {
	/* View in Tangent Space */
	vec3 view_ts = TBN_TRANSPOSE * VIEW;

	/* Cone Step Mapping */
	_height_scale = 0.5 * _HeightScale;
	_parallax_height = 0.0;

	// Alternative without an explicit step limit:
	//_parallax_uv = step_cone_mapping_exact(UV, view_ts, _ConeStepMap, _TextureSide, _height_scale, _parallax_height);
	_parallax_uv = step_cone_mapping_loop(UV, view_ts, int(_MaxSteps), _ConeStepMap, _TextureSide, _height_scale, _parallax_height);

	/* Pixel Depth Offset (PDO): uses the undisplaced UV and the height found above. */
	DEPTH = pixel_depth_offset(UV, view_ts, VERTEX, _height_scale, _parallax_height, TBN, PROJECTION_MATRIX);

	/* Diffuse Color */
	vec4 base_color = _DiffuseColor * texture(_DiffuseTexture, _parallax_uv);
	ALBEDO = base_color.rgb;
	//ALPHA = base_color.a;

	/* Metallic (blue channel) */
	METALLIC = _Metallic * texture(_MetallicTexture, _parallax_uv).b;
	SPECULAR = _Specular;

	/* Roughness (green channel) */
	ROUGHNESS = _Roughness * texture(_RoughnessTexture, _parallax_uv).g;

	/* Emission */
	vec4 emission_sample = texture(_EmissionTexture, _parallax_uv);
	vec4 emission_total = _EmissionColor + emission_sample; // Emission operator add.
	//vec4 emission_total = _EmissionColor * emission_sample; // Emission operator multiply.
	EMISSION = emission_total.rgb * _EmissionEnergyMultiplier;

	/* Normal Mapping */
	NORMAL_MAP = texture(_NormalMap, _parallax_uv).xyz;
	NORMAL_MAP_DEPTH = _NormalMapScale;
}



Nice to see an implementation of this! How are you generating the cone map?
Jonathan “LoneSock” Dummer generates the cone step map on the CPU using this algorithm described in his article (C++):
// Theirs #include <iostream> #include <fstream> #include <cstdlib> #include <cmath> #include <vector> // in-between #include <corona.h> using namespace std; /* basically, 99% of all pixels will fall in under 2.0 (most of the time, on the heightmaps I've tested) the question: Is reduced resolution worth missing the speedup of the slow ones? */ const float max_ratio = 1.0; /* And for the cone version, how tolerant am I? (and should it be a ratio, tolerance*r^2, or flat?) */ const float cone_tol2 = 4.0 / (255.0 * 255.0); // Do I want the textures to be computed as tileable? bool x_tileable = true; bool y_tileable = true; int main(int argc, char *argv[]) { char OutName[1024]; int FileCounter; long TheSize, ScanWidth; long width, height; float iwidth, iheight; long chans; corona::Image *image, *outimage; unsigned char *Data; long tin; int wProgress; float really_max = 1.0; cout << "********** Height Map Processor **********" << endl << endl; // Did I get a file name? if (argc < 2) { // Oops, no file to convert cout << "usage: HeightProc input_file" << endl << endl; system ("pause"); return (0); } for (FileCounter = 1; FileCounter < argc; ++FileCounter) { cout << "Converting file #" << FileCounter << endl; // OK, open the image file image = corona::OpenImage (argv[FileCounter], corona::PF_R8G8B8A8); if (!image) { cout << "I could not open " << argv[FileCounter] << endl << endl; system ("pause"); return (0); } cout << "Loading " << argv[FileCounter] << ":" << endl; width = image->getWidth (); height = image->getHeight (); chans = 4; // forced for now (by corona) ScanWidth = chans * width; TheSize = ScanWidth * height; Data = (unsigned char *)(image->getPixels()); // invert this (used to convert depth-map to height-map) if (false) { for (int px = 0; px < width*height; ++px) { Data[px*chans] = 255 - Data[px*chans]; } } iheight = 1.0 / height; iwidth = 1.0 / width; wProgress = width / 50; cout << endl << "The image " << argv[FileCounter] << " has been loaded." 
<< endl; cout << " size: " << width << " x " << height << " by " << 32 << " bits" << endl; cout << endl << "Computing:" << endl; // warning message if the texture is not square if (width != height) { cout << endl << "Warning: The image is not square! Results not guaranteed!" << endl; system ("pause"); } // Redo the best, and save it // (only writing formats supported: PNG, TGA) // ((and PNG is compressed and has alpha!!)) strcpy (OutName, argv[FileCounter]); strcat (OutName, "-step.png"); tin = clock (); // pre-processing: compute derivatives cout << "Calculating derivatives ["; for (int y = 0; y < height; ++y) { // progress report: works great...if it's square! if (y % wProgress == 0) { cout << "."; } for (int x = 0; x < width; ++x) { int der; // Blue is the slope in x if (x == 0) { der = (Data[y*ScanWidth + chans*(x+1)] - Data[y*ScanWidth + chans*(x)]) / 2; } else if (x == width - 1) { der = (Data[y*ScanWidth + chans*(x)] - Data[y*ScanWidth + chans*(x-1)]) / 2; } else { der = Data[y*ScanWidth + chans*(x+1)] - Data[y*ScanWidth + chans*(x-1)]; } Data[y*ScanWidth + chans*x + 2] = 127 + der / 2; // Alpha is the slope in y if (y == 0) { der = (Data[(y+1)*ScanWidth + chans*x] - Data[(y)*ScanWidth + chans*x]) / 2; } else if (y == height - 1) { der = (Data[(y)*ScanWidth + chans*x] - Data[(y-1)*ScanWidth + chans*x]) / 2; } else { der = (Data[(y+1)*ScanWidth + chans*x] - Data[(y-1)*ScanWidth + chans*x]); } // And the sign of Y will be reversed in OpenGL Data[y*ScanWidth + chans*x + 3] = 127 - der / 2; } } cout << "]" << endl; // OK, do the processing for (int y = 0; y < height; ++y) { cout << "img " << (argc - FileCounter) << ": line " << (height - y) << " ["; for (int x = 0; x < width; ++x) { float min_ratio2, actual_ratio; int x1, x2, y1, y2; float ht, dhdx, dhdy, r2, h2; if ((x % wProgress) == 0) { cout << "."; } // set up some initial values // (note I'm using ratio squared throughout, // and taking sqrt at the end...faster) min_ratio2 = max_ratio * max_ratio; // 
information about this center point ht = Data[y*ScanWidth + chans*x] / 255.0; dhdx = +(Data[y*ScanWidth + chans*x + 2] / 255.0 - 0.5) * width; dhdy = -(Data[y*ScanWidth + chans*x + 3] / 255.0 - 0.5) * height; // scan in outwardly expanding blocks // (so I can stop if I reach my minimum ratio) for ( int rad = 1; (rad*rad <= 1.1*1.1*(1.0-ht)*(1.0-ht)*min_ratio2*width*height) && (rad <= 1.1*(1.0-ht)*width) && (rad <= 1.1*(1.0-ht)*height); ++rad ) { // ok, for each of these lines... // West x1 = x - rad; while (x_tileable && (x1 < 0)) { x1 += width; } if (x1 >= 0) { float delx = -rad*iwidth; // y limits // (+- 1 because I'll cover the corners in the X-run) y1 = y - rad + 1; if (y1 < 0) { y1 = 0; } y2 = y + rad - 1; if (y2 >= height) { y2 = height - 1; } // and check the line for (int dy = y1; dy <= y2; ++dy) { float dely = (dy-y)*iheight; r2 = delx*delx + dely*dely; h2 = Data[dy*ScanWidth + chans*x1] / 255.0 - ht; if ((h2 > 0.0) && (h2*h2 * min_ratio2> r2)) { // this is the new (lowest) value min_ratio2 = r2 / (h2 * h2); } } } // East x2 = x + rad; while (x_tileable && (x2 >= width)) { x2 -= width; } if (x2 < width) { float delx = rad*iwidth; // y limits // (+- 1 because I'll cover the corners in the X-run) y1 = y - rad + 1; if (y1 < 0) { y1 = 0; } y2 = y + rad - 1; if (y2 >= height) { y2 = height - 1; } // and check the line for (int dy = y1; dy <= y2; ++dy) { float dely = (dy-y)*iheight; r2 = delx*delx + dely*dely; h2 = Data[dy*ScanWidth + chans*x2] / 255.0 - ht; if ((h2 > 0.0) && (h2*h2 * min_ratio2> r2)) { // this is the new (lowest) value min_ratio2 = r2 / (h2 * h2); } } } // North y1 = y - rad; while (y_tileable && (y1 < 0)) { y1 += height; } if (y1 >= 0) { float dely = -rad*iheight; // y limits // (+- 1 because I'll cover the corners in the X-run) x1 = x - rad; if (x1 < 0) { x1 = 0; } x2 = x + rad; if (x2 >= width) { x2 = width - 1; } // and check the line for (int dx = x1; dx <= x2; ++dx) { float delx = (dx-x)*iwidth; r2 = delx*delx + dely*dely; h2 = 
Data[y1*ScanWidth + chans*dx] / 255.0 - ht; if ((h2 > 0.0) && (h2*h2 * min_ratio2> r2)) { // this is the new (lowest) value min_ratio2 = r2 / (h2 * h2); } } } // South y2 = y + rad; while (y_tileable && (y2 >= height)) { y2 -= height; } if (y2 < height) { float dely = rad*iheight; // y limits // (+- 1 because I'll cover the corners in the X-run) x1 = x - rad; if (x1 < 0) { x1 = 0; } x2 = x + rad; if (x2 >= width) { x2 = width - 1; } // and check the line for (int dx = x1; dx <= x2; ++dx) { float delx = (dx-x)*iwidth; r2 = delx*delx + dely*dely; h2 = Data[y2*ScanWidth + chans*dx] / 255.0 - ht; if ((h2 > 0.0) && (h2*h2 * min_ratio2> r2)) { // this is the new (lowest) value min_ratio2 = r2 / (h2 * h2); } } } //done with the expanding loop } /********** CONE VERSION **********/ // actually I have the ratio squared. Sqrt it actual_ratio = sqrt (min_ratio2); // here I need to scale to 1.0 actual_ratio /= max_ratio; // most of the data is on the low end...sqrting again spreads it better // (plus multiply is a cheap operation in shaders!) 
actual_ratio = sqrt (actual_ratio); // Red stays height // Blue remains the slope in x // Alpha remains the slope in y // but Green becomes Step-Cone-Ratio Data[y*ScanWidth + chans*x + 1] = static_cast<unsigned char>(255.0 * actual_ratio + 0.5); // but make sure it is > 0.0, since I divide by it in the shader if (Data[y*ScanWidth + chans*x + 1] < 1) { Data[y*ScanWidth + chans*x + 1] = 1; } } cout << "]" << endl; } // end my timing after the computation phase tin = clock() - tin; cout << "Processed in " << tin * 0.001 << " seconds" << endl << endl; outimage = corona::CreateImage(width, height, corona::PF_R8G8B8A8, Data); if (outimage != NULL) { tin = clock(); cout << "Saved: " << OutName << " " << corona::SaveImage (OutName, corona::FF_PNG, outimage) << endl; tin = clock() - tin; cout << "(That took " << tin * 0.001 << " seconds)" << endl; } else { cout << "Couldn't create the new image" << endl; } // Report cout << endl << endl; } // and finish all file names passed in /* cout << "really_max = " << really_max << endl; system ("pause"); */ // And end it cout << endl << "Done processing the image(s)." << endl; //system("PAUSE"); return(0); }However, it can be quite slow, so you can use my implementation. You’ll need OpenGL v4.3+ Core and Glad2. I’ll be releasing a cone map generator for Godot soon.
Compute Shader (GLSL):
#version 430

layout(local_size_x = 16, local_size_y = 16) in;

// One uint per texel of a side x side image. Bits 0-7 are read as the height;
// bits 8-15, 16-23 and 24-31 are written by this shader (cone ratio, x slope, y slope).
layout(std430, binding = 0) buffer DataBuffer {
	uint data[];
};

const float inv_255 = 1.0 / 255.0;
const float max_ratio = 1.0;
const float tolerance = 1.1;
const float tolerance2 = tolerance * tolerance;

uniform int side;
uniform int side2; // Used as the texel count in the search-radius bound (presumably side * side).

// Returns the height stored in the low byte of texel `index`, scaled by 1/255.
float read_height(int index) {
	return float(data[index] & 0xFFu) * inv_255;
}

// Lowers min_ratio2 (a squared distance/height ratio) when a sample at squared
// distance `distance2` is `delta_height` higher than the centre texel and would
// cut into the current cone. Samples that are not higher are ignored.
void tighten_cone(float distance2, float delta_height, inout float min_ratio2) {
	if (delta_height > 0.0) {
		float delta_height2 = delta_height * delta_height;

		if (delta_height2 * min_ratio2 > distance2) {
			min_ratio2 = distance2 / delta_height2;
		}
	}
}

void main() {
	int x = int(gl_GlobalInvocationID.x);
	int y = int(gl_GlobalInvocationID.y);

	// Invocations outside the image have nothing to do.
	if (x >= side || y >= side) {
		return;
	}

	int idx = y * side + x;

	// Centre height and its four clamped neighbours (used for the slopes below).
	float height = read_height(idx);
	float lheight = read_height(y * side + max(x - 1, 0));
	float rheight = read_height(y * side + min(x + 1, side - 1));
	float uheight = read_height(max(y - 1, 0) * side + x);
	float dheight = read_height(min(y + 1, side - 1) * side + x);

	float inv_side = 1.0 / float(side);

	float depth = 1.0 - height;
	float depth2 = depth * depth;

	int x_left, x_right, y_up, y_down;
	float x_offset, y_offset, dhdx, dhdy;

	float min_ratio2 = max_ratio * max_ratio;
	float current_ratio = 0.0;

	// Scan square rings of growing radius r; the bound shrinks as min_ratio2 shrinks.
	for (int r = 1; float(r) * float(r) <= tolerance2 * depth2 * min_ratio2 * float(side2) && r <= tolerance * depth * float(side); ++r) {
		// Left.
		// r can reach tolerance * side (1.1 * side), so x + side - r could go
		// negative, and % is undefined for negative operands in GLSL. Biasing
		// by 2 * side keeps the operand non-negative for every reachable r.
		x_left = (x + 2 * side - r) % side;
		x_offset = -float(r) * inv_side;

		y_up = max(y - r + 1, 0);
		y_down = min(y + r - 1, side - 1);

		for (int delta_y = y_up; delta_y <= y_down; ++delta_y) {
			y_offset = float(delta_y - y) * inv_side;
			float distance2 = x_offset * x_offset + y_offset * y_offset;
			tighten_cone(distance2, read_height(delta_y * side + x_left) - height, min_ratio2);
		}

		// Right.
		x_right = (x + r) % side;
		x_offset = float(r) * inv_side;

		y_up = max(y - r + 1, 0);
		y_down = min(y + r - 1, side - 1);

		for (int delta_y = y_up; delta_y <= y_down; ++delta_y) {
			y_offset = float(delta_y - y) * inv_side;
			float distance2 = x_offset * x_offset + y_offset * y_offset;
			tighten_cone(distance2, read_height(delta_y * side + x_right) - height, min_ratio2);
		}

		// Up. Same non-negative bias as for the left column.
		y_up = (y + 2 * side - r) % side;
		y_offset = -float(r) * inv_side;

		x_left = max(x - r, 0);
		x_right = min(x + r, side - 1);

		for (int delta_x = x_left; delta_x <= x_right; ++delta_x) {
			x_offset = float(delta_x - x) * inv_side;
			float distance2 = x_offset * x_offset + y_offset * y_offset;
			tighten_cone(distance2, read_height(y_up * side + delta_x) - height, min_ratio2);
		}

		// Down.
		y_down = (y + r) % side;
		y_offset = float(r) * inv_side;

		x_left = max(x - r, 0);
		x_right = min(x + r, side - 1);

		for (int delta_x = x_left; delta_x <= x_right; ++delta_x) {
			x_offset = float(delta_x - x) * inv_side;
			float distance2 = x_offset * x_offset + y_offset * y_offset;
			tighten_cone(distance2, read_height(y_down * side + delta_x) - height, min_ratio2);
		}
	}

	// min_ratio2 is a squared ratio: take the root, normalise, then take the
	// root again before quantising to a byte.
	current_ratio = sqrt(min_ratio2);
	current_ratio /= max_ratio;
	current_ratio = sqrt(current_ratio);

	// Never store 0 in the cone byte.
	uint cone_byte = clamp(uint(255.0 * current_ratio + 0.5), 1u, 255u);

	// dhdx.
	if (x == 0) {
		dhdx = (rheight - height) * 0.5;
	}
	else if (x == side - 1) {
		dhdx = (height - lheight) * 0.5;
	}
	else {
		dhdx = (rheight - lheight);
	}
	uint dhdx_byte = uint(clamp((dhdx * 0.5 + 0.5) * 255.0, 0.0, 255.0));

	// dhdy.
	if (y == 0) {
		dhdy = (dheight - height) * 0.5;
	}
	else if (y == side - 1) {
		dhdy = (height - uheight) * 0.5;
	}
	else {
		dhdy = (dheight - uheight);
	}
	uint dhdy_byte = uint(clamp((-dhdy * 0.5 + 0.5) * 255.0, 0.0, 255.0));

	// Keep the height byte and replace the other three bytes in a single store.
	data[idx] = (data[idx] & 0xFFu) | (cone_byte << 8) | (dhdx_byte << 16) | (dhdy_byte << 24);
}

On my Quadro RTX 4000 GPU for a texture of:
1024 × 1024 – 2476318700 ns (≈ 2.48 s)
2048 × 2048 – 9084063900 ns (≈ 9.08 s)