// File: basisu_astc_ldr_common.cpp
// Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "basisu_enc.h"
#include "../transcoder/basisu_astc_helpers.h"
#include "../transcoder/basisu_astc_hdr_core.h"
#include "basisu_astc_hdr_common.h"
#include "basisu_astc_ldr_common.h"

#define BASISU_ASTC_LDR_DEBUG_MSGS (1)

namespace basisu
{

namespace astc_ldr
{

// Set to true once global_init() has filled g_astc_ls_raw_weights_ise.
static bool g_initialized;

// Least-squares terms for each raw weight value; populated by global_init().
static vec4F g_astc_ls_raw_weights_ise[ASTC_LDR_MAX_RAW_WEIGHTS];

// Applies the blue contraction encode transform to orig:
//   r' = 2*r - encoded_b, g' = 2*g - encoded_b, each clamped to [0,255].
// b is taken from orig.b and a from orig.a.
// did_clamp is set to true if either unclamped value fell outside [0,255]. It is never
// reset to false here, so a caller can accumulate the flag across several calls.
color_rgba blue_contract_enc(color_rgba orig, bool& did_clamp, int encoded_b)
{
	color_rgba enc;

	int tr = orig.r * 2 - encoded_b;
	int tg = orig.g * 2 - encoded_b;
	if ((tr < 0) || (tr > 255) || (tg < 0) || (tg > 255))
		did_clamp = true;

	enc.r = (uint8_t)basisu::clamp(tr, 0, 255);
	enc.g = (uint8_t)basisu::clamp(tg, 0, 255);
	enc.b = (uint8_t)orig.b;
	enc.a = orig.a;
	return enc;
}

// Blue contraction decode transform: r = (enc_r + enc_b) >> 1, g = (enc_g + enc_b) >> 1.
// b and a are passed through (cast to 8 bits).
color_rgba blue_contract_dec(int enc_r, int enc_g, int enc_b, int enc_a)
{
	color_rgba dec;
	dec.r = (uint8_t)((enc_r + enc_b) >> 1);
	dec.g = (uint8_t)((enc_g + enc_b) >> 1);
	dec.b = (uint8_t)enc_b;
	dec.a = (uint8_t)enc_a;
	return dec;
}

// One-time setup of g_astc_ls_raw_weights_ise. Later calls return immediately.
// NOTE(review): no synchronization is visible here - confirm this is only called from one thread.
void global_init()
{
	if (g_initialized)
		return;

	// Precomputed weight constants used during least fit determination.
	// For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
	// where w = iw / 64 for iw in [0,64] (65 entries).
	for (uint32_t iw = 0; iw <= 64; iw++)
	{
		float w = (float)iw * (1.0f / 64.0f);

		g_astc_ls_raw_weights_ise[iw].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w);
	}

	g_initialized = true;
}

// Returns the table of per-selector least-squares terms for the given weight range.
// g_astc_ls_weights_ise is declared outside this chunk.
static inline const vec4F* get_ls_weights_ise(uint32_t weight_ise_range)
{
	assert((weight_ise_range <= astc_helpers::BISE_32_LEVELS) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));

	// astc_helpers::BISE_64_LEVELS indicates raw [0,64] weights (65 total), otherwise ISE weights (<= 32 levels total)
	return (weight_ise_range == astc_helpers::BISE_64_LEVELS) ? g_astc_ls_raw_weights_ise : &g_astc_ls_weights_ise[weight_ise_range][0];
}

// Least-squares fit of a low/high endpoint pair to N scalar samples with fixed selectors.
//   pSelectors[i]          - selector of sample i, used to index pSelector_weights
//   pSelector_weights[sel] - terms read as [0], [1], [2], [3]; for the raw table built by
//                            global_init() these are w*w, (1-w)*w, (1-w)*(1-w), w
//   pVals[i]               - sample values
// Solves the 2x2 normal equations via the explicit inverse. Returns false, leaving
// *pXl / *pXh unwritten, if |det| < 1e-8. Otherwise both results are passed through
// saturate() (defined elsewhere; presumably clamps to [0,1]). If bounds_min == bounds_max
// both endpoints are instead forced to that value.
static bool compute_least_squares_endpoints_1D(
	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
	float* pXl, float* pXh, const float* pVals, float bounds_min, float bounds_max)
{
	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;

	for (uint32_t i = 0; i < N; i++)
	{
		const uint32_t sel = pSelectors[i];

		z00 += pSelector_weights[sel][0];
		z10 += pSelector_weights[sel][1];
		z11 += pSelector_weights[sel][2];

		float w = pSelector_weights[sel][3];

		q00_r += w * pVals[i];
		t_r += pVals[i];
	}

	// sum((1-w)*v) == sum(v) - sum(w*v)
	q10_r = t_r - q00_r;

	// The matrix is symmetric.
	z01 = z10;

	float det = z00 * z11 - z01 * z10;
	if (fabs(det) < 1e-8f)
		return false;

	det = 1.0f / det;

	float iz00, iz01, iz10, iz11;
	iz00 = z11 * det;
	iz01 = -z01 * det;
	iz10 = -z10 * det;
	iz11 = z00 * det;

	*pXh = (float)(iz00 * q00_r + iz01 * q10_r);
	*pXl = (float)(iz10 * q00_r + iz11 * q10_r);

	float l = saturate(*pXl), h = saturate(*pXh);

	if (bounds_min == bounds_max)
	{
		l = bounds_min;
		h = bounds_max;
	}

	*pXl = l;
	*pXh = h;

	return true;
}

// Two-channel variant of compute_least_squares_endpoints_1D(): the same 2x2 system is
// solved once and applied to channels 0 and 1 of pColors. The per-channel
// bounds_min[c] == bounds_max[c] override is applied independently for each channel.
static bool compute_least_squares_endpoints_2D(
	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
	vec2F* pXl, vec2F* pXh, const vec2F* pColors, const vec2F& bounds_min, const vec2F& bounds_max)
{
	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;

	for (uint32_t i = 0; i < N; i++)
	{
		const uint32_t sel = pSelectors[i];

		z00 += pSelector_weights[sel][0];
		z10 += pSelector_weights[sel][1];
		z11 += pSelector_weights[sel][2];

		float w = pSelector_weights[sel][3];

		q00_r += w * pColors[i][0];
		t_r += pColors[i][0];

		q00_g += w * pColors[i][1];
		t_g += pColors[i][1];
	}

	q10_r = t_r - q00_r;
	q10_g = t_g - q00_g;

	z01 = z10;

	float det = z00 * z11 - z01 * z10;
	if (fabs(det) < 1e-8f)
		return false;

	det = 1.0f / det;

	float iz00, iz01, iz10, iz11;
	iz00 = z11 * det;
	iz01 = -z01 * det;
	iz10 = -z10 * det;
	iz11 = z00 * det;

	(*pXh)[0] = (float)(iz00 * q00_r + iz01 * q10_r);
	(*pXl)[0] = (float)(iz10 * q00_r + iz11 * q10_r);

	(*pXh)[1] = (float)(iz00 * q00_g + iz01 * q10_g);
	(*pXl)[1] = (float)(iz10 * q00_g + iz11 * q10_g);

	for (uint32_t c = 0; c < 2; c++)
	{
		float l = saturate((*pXl)[c]), h = saturate((*pXh)[c]);

		if (bounds_min[c] == bounds_max[c])
		{
			l = bounds_min[c];
			h = bounds_max[c];
		}

		(*pXl)[c] = l;
		(*pXh)[c] = h;
	}

	return true;
}

// Three-channel variant of compute_least_squares_endpoints_1D() operating on channels
// 0..2 of vec4F colors. Channel 3 of both outputs is written as 0 on success.
static bool compute_least_squares_endpoints_3D(
	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
	vec4F* pXl, vec4F* pXh, const vec4F* pColors, const vec4F& bounds_min, const vec4F& bounds_max)
{
	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
	float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;

	for (uint32_t i = 0; i < N; i++)
	{
		const uint32_t sel = pSelectors[i];

		z00 += pSelector_weights[sel][0];
		z10 += pSelector_weights[sel][1];
		z11 += pSelector_weights[sel][2];

		float w = pSelector_weights[sel][3];

		q00_r += w * pColors[i][0];
		t_r += pColors[i][0];

		q00_g += w * pColors[i][1];
		t_g += pColors[i][1];

		q00_b += w * pColors[i][2];
		t_b += pColors[i][2];
	}

	q10_r = t_r - q00_r;
	q10_g = t_g - q00_g;
	q10_b = t_b - q00_b;

	z01 = z10;

	float det = z00 * z11 - z01 * z10;
	if (fabs(det) < 1e-8f)
		return false;

	det = 1.0f / det;

	float iz00, iz01, iz10, iz11;
	iz00 = z11 * det;
	iz01 = -z01 * det;
	iz10 = -z10 * det;
	iz11 = z00 * det;

	(*pXh)[0] = (float)(iz00 * q00_r + iz01 * q10_r);
	(*pXl)[0] = (float)(iz10 * q00_r + iz11 * q10_r);

	(*pXh)[1] = (float)(iz00 * q00_g + iz01 * q10_g);
	(*pXl)[1] = (float)(iz10 * q00_g + iz11 * q10_g);

	(*pXh)[2] = (float)(iz00 * q00_b + iz01 * q10_b);
	(*pXl)[2] = (float)(iz10 * q00_b + iz11 * q10_b);

	(*pXh)[3] = 0;
	(*pXl)[3] = 0;

	for (uint32_t c = 0; c < 3; c++)
	{
		float l = saturate((*pXl)[c]), h = saturate((*pXh)[c]);

		if (bounds_min[c] == bounds_max[c])
		{
			l = bounds_min[c];
			h = bounds_max[c];
		}

		(*pXl)[c] = l;
		(*pXh)[c] = h;
	}

	return true;
}

// Four-channel variant of compute_least_squares_endpoints_1D() operating on all of
// channels 0..3 of vec4F colors.
static bool compute_least_squares_endpoints_4D(
	uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
	vec4F* pXl, vec4F* pXh, const vec4F* pColors, const vec4F& bounds_min, const vec4F& bounds_max)
{
	float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
	float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
	float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
	float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
	float q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f;

	for (uint32_t i = 0; i < N; i++)
	{
		const uint32_t sel = pSelectors[i];

		z00 += pSelector_weights[sel][0];
		z10 += pSelector_weights[sel][1];
		z11 += pSelector_weights[sel][2];

		float w = pSelector_weights[sel][3];

		q00_r += w * pColors[i][0];
		t_r += pColors[i][0];

		q00_g += w * pColors[i][1];
		t_g += pColors[i][1];

		q00_b += w * pColors[i][2];
		t_b += pColors[i][2];

		q00_a += w * pColors[i][3];
		t_a += pColors[i][3];
	}

	q10_r = t_r - q00_r;
	q10_g = t_g - q00_g;
	q10_b = t_b - q00_b;
	q10_a = t_a - q00_a;

	z01 = z10;

	float det = z00 * z11 - z01 * z10;
	if (fabs(det) < 1e-8f)
		return false;

	det = 1.0f / det;

	float iz00, iz01, iz10, iz11;
	iz00 = z11 * det;
	iz01 = -z01 * det;
	iz10 = -z10 * det;
	iz11 = z00 * det;

	(*pXh)[0] = (float)(iz00 * q00_r + iz01 * q10_r);
	(*pXl)[0] = (float)(iz10 * q00_r + iz11 * q10_r);

	(*pXh)[1] = (float)(iz00 * q00_g + iz01 * q10_g);
	(*pXl)[1] = (float)(iz10 * q00_g + iz11 * q10_g);

	(*pXh)[2] = (float)(iz00 * q00_b + iz01 * q10_b);
	(*pXl)[2] = (float)(iz10 * q00_b + iz11 * q10_b);

	(*pXh)[3] = (float)(iz00 * q00_a + iz01 * q10_a);
	(*pXl)[3] = (float)(iz10 * q00_a + iz11 * q10_a);

	for (uint32_t c = 0; c < 4; c++)
	{
		float l = saturate((*pXl)[c]), h = saturate((*pXh)[c]);

		if (bounds_min[c] == bounds_max[c])
		{
			l = bounds_min[c];
			h = bounds_max[c];
		}

		(*pXl)[c] = l;
		(*pXh)[c] = h;
	}

	return true;
}

// Disabled helper: maps n ISE weight indices to dequantized values via the weight table.
#if 0
static void dequant_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights)
{
	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val;

	for (uint32_t i = 0; i < n; i++)
		pDst_raw_weights[i] = dequant_tab[pSrc_ise_vals[i]];
}
#endif

// Disabled helper: maps n ISE endpoint indices to dequantized values via the endpoint table.
#if 0
static void dequant_astc_endpoints(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights)
{
	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(from_ise_range).m_ISE_to_val;

	for (uint32_t i = 0; i < n; i++)
		pDst_raw_weights[i] = dequant_tab[pSrc_ise_vals[i]];
}
#endif

// Steps an ISE weight value by delta positions in rank order and returns the ISE value
// at the new rank. The rank is clamped to [0, num_ise_levels - 1]. delta == 0 returns
// ise_val unchanged without touching the tables.
int apply_delta_to_bise_weight_val(uint32_t weight_ise_range, int ise_val, int delta)
{
	if (delta == 0)
		return ise_val;

	uint32_t num_ise_levels = astc_helpers::get_ise_levels(weight_ise_range);

	const auto& ISE_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_rank;
	const auto& rank_to_ISE = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_rank_to_ISE;

	int cur_rank = ISE_to_rank[ise_val];
	int new_rank = basisu::clamp(cur_rank + delta, 0, (int)num_ise_levels - 1);

	return rank_to_ISE[new_rank];
}

// v must be [0,1]
// converts to nearest ISE index with proper precise rounding
// Starts from the table entry for round(v * 255), then tries that entry and its -1/+1
// neighbors (via apply_delta_to_bise_endpoint_val) and keeps whichever dequantizes
// closest to v. Ties keep the earlier candidate because the comparison is strict.
static uint8_t precise_round_bise_endpoint_val(float v, uint32_t endpoint_ise_range)
{
	assert((v >= 0) && (v <= 1.0f));

	const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val;

	// Still clamp in release builds, where the assert above compiles out.
	v = saturate(v);

	const int iv = clamp((int)std::roundf(v * 255.0f), 0, 255);

	uint8_t ise_index = 0;
	float best_err = BIG_FLOAT_VAL;

	for (int iscale_delta = -1; iscale_delta <= 1; iscale_delta++)
	{
		const int trial_ise_index = astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, quant_tab[iv], iscale_delta);

		const float dequant_val = dequant_tab[trial_ise_index] * (1.0f / 255.0f);

		const float dequant_err = fabs(dequant_val - v);
		if (dequant_err < best_err)
		{
			best_err = dequant_err;
			ise_index = (uint8_t)trial_ise_index;
		}
	} // iscale_delta

	return ise_index;
}

// Result flags of cem_encode_ldr_rgb_or_rgba_direct():
//   m_is_blue_contracted    - true if blue contraction was actually used
//   m_endpoints_are_swapped - true if the packed endpoint order is swapped relative to l/h
//   m_any_degen             - true if some channel packed to equal values although l and h differ
// note the encoded endpoints may be swapped
// TODO: Pass in vec4F l/h and let it more precisely quantize in here.
struct cem_encode_ldr_rgb_or_rgba_direct_result
{
	bool m_is_blue_contracted;
	bool m_endpoints_are_swapped;
	bool m_any_degen;
};

// Packs 8-bit endpoints l/h into ISE endpoint values for the RGB or RGBA direct CEMs.
// Writes 6 values (RGB) or 8 values (RGBA) to pEndpoint_vals, interleaved as
// r0,r1,g0,g1,b0,b1[,a0,a1].
// If try_blue_contract is set, blue contraction is attempted and kept only when neither
// endpoint clamps and the dequantized RGB sums of the two packed endpoints differ.
// The packed endpoints are then ordered so that the sum comparison (s1 >= s0) matches
// the blue contraction state; the _DEBUG block at the end asserts this.
static cem_encode_ldr_rgb_or_rgba_direct_result cem_encode_ldr_rgb_or_rgba_direct(
	uint32_t cem_index, uint32_t endpoint_ise_range, const color_rgba& l, const color_rgba& h, uint8_t* pEndpoint_vals, bool try_blue_contract)
{
	assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT));

	cem_encode_ldr_rgb_or_rgba_direct_result res;

	// Local aliases into res; all three are assigned before res is returned.
	bool& endpoints_are_swapped = res.m_endpoints_are_swapped;
	bool& any_degen = res.m_any_degen;
	bool& is_blue_contracted = res.m_is_blue_contracted;

	assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT));

	const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT);

	const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val;
	//const auto &ISE_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_rank;
	//const auto &rank_to_ISE = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_rank_to_ISE;

	color_rgba enc_l(l), enc_h(h);
	endpoints_are_swapped = false;
	is_blue_contracted = false;

	if (try_blue_contract)
	{
		// Contract against the quantized-then-dequantized blue values.
		int enc_v4 = quant_tab[enc_l.b], enc_v5 = quant_tab[enc_h.b];
		int dec_v4 = dequant_tab[enc_v4], dec_v5 = dequant_tab[enc_v5];

		bool did_clamp = false;
		enc_l = blue_contract_enc(h, did_clamp, dec_v5); // yes, they're swapped in the spec
		enc_h = blue_contract_enc(l, did_clamp, dec_v4);

		if (!did_clamp)
		{
			is_blue_contracted = true;
			endpoints_are_swapped = true;
		}
		else
		{
			// Contraction would clamp, so fall back to the plain endpoints.
			enc_l = l;
			enc_h = h;
		}
	}

	int enc_v0 = quant_tab[enc_l.r], enc_v2 = quant_tab[enc_l.g], enc_v4 = quant_tab[enc_l.b];
	int enc_v1 = quant_tab[enc_h.r], enc_v3 = quant_tab[enc_h.g], enc_v5 = quant_tab[enc_h.b];
	int enc_v6 = 0, enc_v7 = 0;

	if (has_alpha)
	{
		enc_v6 = quant_tab[enc_l.a];
		enc_v7 = quant_tab[enc_h.a];
	}

	// Degenerate: a channel's two packed values are equal although the requested values differ.
	// Note this is evaluated before the possible undo of blue contraction below and is not recomputed.
	any_degen = false;
	if ((enc_v0 == enc_v1) && (l.r != h.r))
		any_degen = true;
	if ((enc_v2 == enc_v3) && (l.g != h.g))
		any_degen = true;
	if ((enc_v4 == enc_v5) && (l.b != h.b))
		any_degen = true;
	if (has_alpha)
	{
		if ((enc_v6 == enc_v7) && (l.a != h.a))
			any_degen = true;
	}

	int dec_v0 = dequant_tab[enc_v0], dec_v2 = dequant_tab[enc_v2], dec_v4 = dequant_tab[enc_v4];
	int dec_v1 = dequant_tab[enc_v1], dec_v3 = dequant_tab[enc_v3], dec_v5 = dequant_tab[enc_v5];

	int s0 = dec_v0 + dec_v2 + dec_v4;
	int s1 = dec_v1 + dec_v3 + dec_v5;

	bool should_swap = false;

	if ((s1 == s0) && (is_blue_contracted))
	{
		// if sums are equal we can't use blue contraction at all, so undo it
		enc_l = l;
		enc_h = h;
		is_blue_contracted = false;
		endpoints_are_swapped = false;

		enc_v0 = quant_tab[enc_l.r], enc_v2 = quant_tab[enc_l.g], enc_v4 = quant_tab[enc_l.b];
		enc_v1 = quant_tab[enc_h.r], enc_v3 = quant_tab[enc_h.g], enc_v5 = quant_tab[enc_h.b];

		dec_v0 = dequant_tab[enc_v0], dec_v2 = dequant_tab[enc_v2], dec_v4 = dequant_tab[enc_v4];
		dec_v1 = dequant_tab[enc_v1], dec_v3 = dequant_tab[enc_v3], dec_v5 = dequant_tab[enc_v5];

		if (has_alpha)
		{
			enc_v6 = quant_tab[enc_l.a];
			enc_v7 = quant_tab[enc_h.a];
		}

		s0 = dec_v0 + dec_v2 + dec_v4;
		s1 = dec_v1 + dec_v3 + dec_v5;
	}

	// Order the packed endpoints so that (s1 >= s0) holds exactly when blue contraction is NOT used.
	if (s1 >= s0)
	{
		if (is_blue_contracted)
			should_swap = true;
	}
	else
	{
		if (!is_blue_contracted)
			should_swap = true;
	}

	if (should_swap)
	{
		endpoints_are_swapped = !endpoints_are_swapped;

		std::swap(enc_v0, enc_v1);
		std::swap(enc_v2, enc_v3);
		std::swap(enc_v4, enc_v5);
		std::swap(enc_v6, enc_v7);
	}

	pEndpoint_vals[0] = (uint8_t)enc_v0;
	pEndpoint_vals[1] = (uint8_t)enc_v1;

	pEndpoint_vals[2] = (uint8_t)enc_v2;
	pEndpoint_vals[3] = (uint8_t)enc_v3;

	pEndpoint_vals[4] = (uint8_t)enc_v4;
	pEndpoint_vals[5] = (uint8_t)enc_v5;

	if (has_alpha)
	{
		pEndpoint_vals[6] = (uint8_t)enc_v6;
		pEndpoint_vals[7] = (uint8_t)enc_v7;
	}

#ifdef _DEBUG
	{
		// Re-derive the sums from the final packed values and check the ordering invariant.
		int check_s0 = dequant_tab[enc_v0] + dequant_tab[enc_v2] + dequant_tab[enc_v4];
		int check_s1 = dequant_tab[enc_v1] + dequant_tab[enc_v3] + dequant_tab[enc_v5];

		if (check_s1 >= check_s0)
		{
			assert(!is_blue_contracted);
		}
		else
		{
			assert(is_blue_contracted);
		}
	}
#endif

	return res;
}

// Cannot fail
// scale=1 cannot be packed
// Packs the RGB base+scale CEMs: h.rgb, then the scale, then (for the +two alpha CEM)
// l_a and h[3]. Writes 4 or 6 ISE values to pEndpoint_vals. The scale is stored as
// scale * 256/255 (clamped to [0,1]). Each value is quantized by trying the table entry
// for round(v * 255) and its -1/+1 neighbors, keeping the closest.
static void cem_encode_ldr_rgb_or_rgba_base_scale(
	uint32_t cem_index, uint32_t endpoint_ise_range, float scale, float l_a, const vec4F& h, uint8_t* pEndpoint_vals)
{
	assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
	assert((scale >= 0.0f) && (scale < 1.0f));

	const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A);

	const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val;

	const uint32_t total_vals_to_pack = has_alpha ? 6 : 4;

	float vals_to_pack[6] = { 0 };
	vals_to_pack[0] = h[0];
	vals_to_pack[1] = h[1];
	vals_to_pack[2] = h[2];
	vals_to_pack[3] = clamp(scale * (256.0f / 255.0f), 0.0f, 1.0f);

	if (has_alpha)
	{
		vals_to_pack[4] = l_a;
		vals_to_pack[5] = h[3];
	}

	for (uint32_t c = 0; c < total_vals_to_pack; c++)
	{
		const float v = vals_to_pack[c];

		const int iv = clamp((int)std::roundf(v * 255.0f), 0, 255);

		float best_err = BIG_FLOAT_VAL;

		for (int iscale_delta = -1; iscale_delta <= 1; iscale_delta++)
		{
			const int trial_ise_index = astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, quant_tab[iv], iscale_delta);

			const float dequant_val = dequant_tab[trial_ise_index] * (1.0f / 255.0f);

			const float dequant_err = fabs(dequant_val - v);
			if (dequant_err < best_err)
			{
				best_err = dequant_err;
				pEndpoint_vals[c] = (uint8_t)trial_ise_index;
			}
		} // iscale_delta
	} // c
}

// Disabled helper: clamps val to [-32,31] and sets was_clamped if it had to.
#if 0
static int clamp6(int val, bool& was_clamped)
{
	if (val < -32)
	{
		val = -32;
		was_clamped = true;
	}
	else if (val > 31)
	{
		val = 31;
		was_clamped = true;
	}
	return val;
}
#endif

// Result flags of cem_encode_ldr_rgb_or_rgba_base_offset():
//   m_used_blue_contraction is true if blue contraction was used
// note the encoded endpoints may be swapped
struct rgb_base_offset_res
{
	bool m_failed_flag;
	bool m_used_blue_contraction;
	bool m_blue_contraction_clamped;
	bool m_delta_clamped;
	bool m_any_degen;
	bool m_endpoints_swapped;
};

// May fail if the tiebreaking logic isn't strong enough.
static rgb_base_offset_res cem_encode_ldr_rgb_or_rgba_base_offset(uint32_t cem_index, uint32_t endpoint_ise_range, const color_rgba& orig_l, const color_rgba& orig_h, uint8_t* pEndpoint_vals, bool use_blue_contract) { assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET)); const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); rgb_base_offset_res res; res.m_failed_flag = false; res.m_used_blue_contraction = false; res.m_blue_contraction_clamped = false; res.m_delta_clamped = false; res.m_any_degen = false; res.m_endpoints_swapped = false; bool blue_contraction_clamped = false; bool status = basist::astc_ldr_t::pack_base_offset( cem_index, endpoint_ise_range, pEndpoint_vals, convert_to_basist_color_rgba(orig_l), convert_to_basist_color_rgba(orig_h), use_blue_contract, true, blue_contraction_clamped, res.m_delta_clamped, res.m_endpoints_swapped); assert(status); if (!status) { res.m_failed_flag = true; return res; } // Verify the actual BC status by unpacking to be absolutely sure res.m_used_blue_contraction = astc_helpers::used_blue_contraction(cem_index, pEndpoint_vals, endpoint_ise_range); color_rgba dec_l, dec_h; astc_ldr::decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_range, dec_l, dec_h); const uint32_t num_comps = (has_alpha ? 4 : 3); for (uint32_t c = 0; c < num_comps; c++) { if (orig_l[c] != orig_h[c]) continue; // Desired L/H are not equal, but packed are equal=degenerate pack (loss of freedom). 
if (dec_l[c] == dec_h[c]) { res.m_any_degen = true; break; } } // c return res; } // L or LA direct static void encode_cem0_4(uint32_t cem_index, float lum_l, float lum_h, float a_l, float a_h, uint32_t endpoint_ise_range, uint8_t* pEndpoints) { assert((cem_index == astc_helpers::CEM_LDR_LUM_DIRECT) || (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT)); const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); pEndpoints[0] = precise_round_bise_endpoint_val(lum_l, endpoint_ise_range); pEndpoints[1] = precise_round_bise_endpoint_val(lum_h, endpoint_ise_range); if (has_alpha) { pEndpoints[2] = precise_round_bise_endpoint_val(a_l, endpoint_ise_range); pEndpoints[3] = precise_round_bise_endpoint_val(a_h, endpoint_ise_range); } } // Returned in ISE order uint32_t get_colors(const color_rgba& l, const color_rgba& h, uint32_t weight_ise_index, color_rgba* pColors, bool decode_mode_srgb) { const uint32_t total_weights = astc_helpers::get_ise_levels(weight_ise_index); for (uint32_t i = 0; i < total_weights; i++) { uint32_t w = basisu::g_ise_weight_lerps[weight_ise_index][1 + i]; for (uint32_t c = 0; c < 4; c++) { int le = l[c], he = h[c]; // TODO: Investigate alpha handling here vs. latest spec. // https://raw.githubusercontent.com/KhronosGroup/DataFormat/refs/heads/main/astc.txt // The safest thing to do may be to assume non-sRGB in the encoder. I don't know yet. // How should alpha be handled here for lowest divergence from actual ASTC decoding hardware? if (decode_mode_srgb) { le = (le << 8) | 0x80; he = (he << 8) | 0x80; } else { le = (le << 8) | le; he = (he << 8) | he; } uint32_t k = astc_helpers::weight_interpolate(le, he, w); // See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt // All channels including alpha >>8. pColors[i][c] = (uint8_t)(k >> 8); } // c } // i return total_weights; } // Returns 65 colors (NOT just 64 - 0-64 weight levels, so 65). 
uint32_t get_colors_raw_weights(const color_rgba& l, const color_rgba& h, color_rgba* pColors, bool decode_mode_srgb) { for (uint32_t w = 0; w <= 64; w++) { for (uint32_t c = 0; c < 4; c++) { int le = l[c], he = h[c]; // TODO: Investigate alpha handling here vs. latest spec. // https://raw.githubusercontent.com/KhronosGroup/DataFormat/refs/heads/main/astc.txt // The safest thing to do may be to assume non-sRGB in the encoder. I don't know yet. // How should alpha be handled here for lowest divergence from actual ASTC decoding hardware? if (decode_mode_srgb) { le = (le << 8) | 0x80; he = (he << 8) | 0x80; } else { le = (le << 8) | le; he = (he << 8) | he; } uint32_t k = astc_helpers::weight_interpolate(le, he, w); // See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt // All channels including alpha >>8. pColors[w][c] = (uint8_t)(k >> 8); } // c } // i return ASTC_LDR_MAX_RAW_WEIGHTS; } // Assumes ise 20 (256 levels) void decode_endpoints_ise20(uint32_t cem_index, const uint8_t* pEndpoint_vals, color_rgba& l, color_rgba& h) { assert(astc_helpers::is_cem_ldr(cem_index)); int ldr_endpoints[4][2]; astc_helpers::decode_endpoint(cem_index, ldr_endpoints, pEndpoint_vals); for (uint32_t c = 0; c < 4; c++) { assert((ldr_endpoints[c][0] >= 0) && (ldr_endpoints[c][0] <= 255)); assert((ldr_endpoints[c][1] >= 0) && (ldr_endpoints[c][1] <= 255)); l[c] = (uint8_t)ldr_endpoints[c][0]; h[c] = (uint8_t)ldr_endpoints[c][1]; } } void decode_endpoints(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba& l, color_rgba& h, float* pScale) { const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const auto& endpoint_dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_index).m_ISE_to_val; uint8_t dequantized_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; for (uint32_t i = 0; i < total_endpoint_vals; i++) dequantized_endpoints[i] = 
endpoint_dequant_tab[pEndpoint_vals[i]]; decode_endpoints_ise20(cem_index, dequantized_endpoints, l, h); if ((pScale) && ((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A))) { *pScale = (float)dequantized_endpoints[3] * (1.0f / 256.0f); } } uint32_t get_colors(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, uint32_t weight_ise_index, color_rgba* pColors, bool decode_mode_srgb) { color_rgba l, h; decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_index, l, h); return get_colors(l, h, weight_ise_index, pColors, decode_mode_srgb); } // Decodes 65 colors uint32_t get_colors_raw_weights(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba* pColors, bool decode_mode_srgb) { color_rgba l, h; decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_index, l, h); return get_colors_raw_weights(l, h, pColors, decode_mode_srgb); } #if 0 static vec4F calc_incremental_pca_4D(uint32_t num_pixels, const vec4F* pPixels, const vec4F& mean_f) { vec4F mean_axis(0.0f); for (uint32_t i = 0; i < num_pixels; i++) { vec4F orig_color(pPixels[i]); vec4F color(orig_color - mean_f); vec4F a(color * color[0]); vec4F b(color * color[1]); vec4F c(color * color[2]); vec4F d(color * color[3]); vec4F n(i ? mean_axis : color); n.normalize_in_place(); mean_axis[0] += a.dot(n); mean_axis[1] += b.dot(n); mean_axis[2] += c.dot(n); mean_axis[3] += d.dot(n); } if (mean_axis.norm() < 1e-5f) mean_axis = vec4F(1.0f, 1.0f, 1.0f, 1.0f); mean_axis.normalize_in_place(); return mean_axis; } #endif // TODO: Try two-step Lanczos iteration/Rayleigh–Ritz approximation in a 2-dimensional Krylov subspace method vs. power method. 
// Estimates the principal axis of num_pixels 4D points, taken relative to mean_f, by
// power iteration on the accumulated (not divided by N) 4x4 covariance matrix.
// Returns the iterated vector, or (.5, .5, .5, .5) if its norm is below 1e-5.
static vec4F calc_pca_4D(uint32_t num_pixels, const vec4F* pPixels, const vec4F& mean_f)
{
	// Upper triangle of the symmetric matrix.
	float m00 = 0, m01 = 0, m02 = 0, m03 = 0;
	float m11 = 0, m12 = 0, m13 = 0;
	float m22 = 0, m23 = 0;
	float m33 = 0;

	for (size_t i = 0; i < num_pixels; ++i)
	{
		const vec4F v(pPixels[i] - mean_f);

		m00 += v[0] * v[0];
		m01 += v[0] * v[1];
		m02 += v[0] * v[2];
		m03 += v[0] * v[3];

		m11 += v[1] * v[1];
		m12 += v[1] * v[2];
		m13 += v[1] * v[3];

		m22 += v[2] * v[2];
		m23 += v[2] * v[3];

		m33 += v[3] * v[3];
	}

	// TODO: Seed from channel variances
	vec4F v(.6f, .75f, .4f, .75f);

	// The vector is only normalized on odd iterations, so an even count guarantees the
	// last iteration normalizes.
	const uint32_t NUM_POW_ITERS = 6; // must be even
	for (uint32_t i = 0; i < NUM_POW_ITERS; ++i)
	{
		// w = M * v, using symmetry for the lower triangle.
		vec4F w(
			m00 * v[0] + m01 * v[1] + m02 * v[2] + m03 * v[3],
			m01 * v[0] + m11 * v[1] + m12 * v[2] + m13 * v[3],
			m02 * v[0] + m12 * v[1] + m22 * v[2] + m23 * v[3],
			m03 * v[0] + m13 * v[1] + m23 * v[2] + m33 * v[3]
		);

		if (i & 1)
			w.normalize_in_place();

		v = w;
	}

	if (v.norm() < 1e-5f)
		v = vec4F(.5f, .5f, .5f, .5f);

	return v;
}

// Estimates the principal RGB axis of num_pixels points (channels 0..2 only), taken
// relative to mean_f, using 3 power iterations on the accumulated covariance matrix.
// Returns a unit-length axis with component 3 set to 0, or the diagonal
// (0.57735027, 0.57735027, 0.57735027, 0) if the iterated vector's squared length is <= 1e-5.
static vec4F calc_pca_3D(uint32_t num_pixels, const vec4F* pPixels, const vec4F& mean_f)
{
	// Upper triangle of the symmetric 3x3 matrix: rr, rg, rb, gg, gb, bb.
	float cov[6] = { 0, 0, 0, 0, 0, 0 };

	for (uint32_t i = 0; i < num_pixels; i++)
	{
		const vec4F& v = pPixels[i];

		float r = v[0] - mean_f[0];
		float g = v[1] - mean_f[1];
		float b = v[2] - mean_f[2];

		cov[0] += r * r;
		cov[1] += r * g;
		cov[2] += r * b;
		cov[3] += g * g;
		cov[4] += g * b;
		cov[5] += b * b;
	}

	float xr = .9f, xg = 1.0f, xb = .7f;
	for (uint32_t iter = 0; iter < 3; iter++)
	{
		float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
		float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
		float b = xr * cov[2] + xg * cov[4] + xb * cov[5];

		// Rescale by the largest magnitude component each iteration.
		float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
		if (m > 1e-10f)
		{
			m = 1.0f / m;
			r *= m;
			g *= m;
			b *= m;
		}

		xr = r;
		xg = g;
		xb = b;
	}

	float nrm = xr * xr + xg * xg + xb * xb;

	vec4F axis(0.57735027f, 0.57735027f, 0.57735027f, 0.0f);

	if (nrm > 1e-5f)
	{
		float inv_nrm = 1.0f / sqrtf(nrm);
		xr *= inv_nrm;
		xg *= inv_nrm;
		xb *= inv_nrm;
		axis.set(xr, xg, xb, 0);
	}

	return axis;
}

// Copies num_pixels pixels into this object and computes per-block statistics:
// 8-bit and float copies of the pixels, per-channel min/max (8-bit and float), the float
// mean, whether any alpha is below 255, the PCA axes and per-channel m_rgba_stats.
// NOTE(review): the capacity of m_pixels / m_pixels_f is not checked here, and
// num_pixels == 0 would divide by zero when computing the mean - confirm callers
// never pass 0 or more pixels than the arrays hold.
void pixel_stats_t::init(uint32_t num_pixels, const color_rgba* pPixels)
{
	m_num_pixels = num_pixels;
	m_has_alpha = false;

	m_min.set(255, 255, 255, 255);
	m_max.set(0, 0, 0, 0);

	m_mean_f.clear();

	for (uint32_t i = 0; i < m_num_pixels; i++)
	{
		const color_rgba& px = pPixels[i];

		m_pixels[i] = px;

		// Float copy scaled by 1/255.
		m_pixels_f[i].set((float)px.r * (1.0f / 255.0f), (float)px.g * (1.0f / 255.0f), (float)px.b * (1.0f / 255.0f), (float)px.a * (1.0f / 255.0f));

		m_mean_f += m_pixels_f[i];

		m_min.r = basisu::minimum(m_min.r, px.r);
		m_min.g = basisu::minimum(m_min.g, px.g);
		m_min.b = basisu::minimum(m_min.b, px.b);
		m_min.a = basisu::minimum(m_min.a, px.a);

		m_max.r = basisu::maximum(m_max.r, px.r);
		m_max.g = basisu::maximum(m_max.g, px.g);
		m_max.b = basisu::maximum(m_max.b, px.b);
		m_max.a = basisu::maximum(m_max.a, px.a);
	}

	m_mean_f *= (1.0f / (float)m_num_pixels);
	m_mean_f.clamp(0.0f, 1.0f);

	m_min_f.set(m_min.r * (1.0f / 255.0f), m_min.g * (1.0f / 255.0f), m_min.b * (1.0f / 255.0f), m_min.a * (1.0f / 255.0f));
	m_max_f.set(m_max.r * (1.0f / 255.0f), m_max.g * (1.0f / 255.0f), m_max.b * (1.0f / 255.0f), m_max.a * (1.0f / 255.0f));

	// Any pixel with alpha below 255 marks the block as having alpha.
	m_has_alpha = (m_min.a < 255);

	// Mean and zero relative RGB (3D) PCA axes
	m_mean_rel_axis3 = calc_pca_3D(m_num_pixels, m_pixels_f, m_mean_f);
	m_zero_rel_axis3 = calc_pca_3D(m_num_pixels, m_pixels_f, vec4F(0.0f));

	// Mean relative RGBA (4D) PCA axis (no zero relative 4D axis is computed here)
	m_mean_rel_axis4 = calc_pca_4D(m_num_pixels, m_pixels_f, m_mean_f);

	// Per-channel stats; each channel is read from the float pixel array with a stride of 4.
	for (uint32_t c = 0; c < 4u; c++)
		m_rgba_stats[c].calc_simplified_with_range(m_num_pixels, &m_pixels_f[0][c], 4);
}

// Returns (a - b)^2. Both inputs are asserted to be in [0,255].
static inline uint32_t square_of_diff(int a, int b)
{
	assert((a >= 0) && (a <= 255));
	assert((b >= 0) && (b <= 255));

	int d = a - b;
	return (uint32_t)(d * d);
}

// Assigns a weight to every pixel and returns the total weighted squared error.
//   total_weights  - number of entries in pWeight_colors (asserted <= 32, or 65 for raw weights)
//   pWeight_colors - candidate color for each weight index
//   pWeight_vals   - receives the chosen weight index per pixel
//   params         - m_comp_weights scales each channel's squared error; if
//                    m_pForced_weight_vals0 is non-null those indices are used as-is
//                    instead of searching
// When searching, the first index with the lowest error wins (strict < comparison).
// weight_ise_index is unused.
uint64_t eval_solution(
	const pixel_stats_t& pixel_stats,
	uint32_t total_weights, const color_rgba* pWeight_colors,
	uint8_t* pWeight_vals, uint32_t weight_ise_index,
	const cem_encode_params& params)
{
	BASISU_NOTE_UNUSED(weight_ise_index);

	assert((total_weights <= 32) || (total_weights == 65));

	uint64_t total_err = 0;

	if (params.m_pForced_weight_vals0)
	{
		// Forced weights: just measure the error of the supplied assignment.
		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
		{
			const color_rgba& px = pixel_stats.m_pixels[c];

			const uint32_t w = params.m_pForced_weight_vals0[c];
			assert(w < total_weights);

			uint32_t err =
				params.m_comp_weights[0] * square_of_diff(px.r, pWeight_colors[w].r) +
				params.m_comp_weights[1] * square_of_diff(px.g, pWeight_colors[w].g) +
				params.m_comp_weights[2] * square_of_diff(px.b, pWeight_colors[w].b) +
				params.m_comp_weights[3] * square_of_diff(px.a, pWeight_colors[w].a);

			total_err += err;

			pWeight_vals[c] = (uint8_t)w;
		}
	}
	else
	{
		// Exhaustive search over all candidate colors for each pixel.
		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
		{
			const color_rgba& px = pixel_stats.m_pixels[c];

			uint32_t best_err = UINT32_MAX;
			uint32_t best_sel = 0;

			for (uint32_t i = 0; i < total_weights; i++)
			{
				uint32_t err =
					params.m_comp_weights[0] * square_of_diff(px.r, pWeight_colors[i].r) +
					params.m_comp_weights[1] * square_of_diff(px.g, pWeight_colors[i].g) +
					params.m_comp_weights[2] * square_of_diff(px.b, pWeight_colors[i].b) +
					params.m_comp_weights[3] * square_of_diff(px.a, pWeight_colors[i].a);

				if (err < best_err)
				{
					best_err = err;
					best_sel = i;
				}
			}

			total_err += best_err;

			pWeight_vals[c] = (uint8_t)best_sel;
		}
	} // if (params.m_pForced_weight_vals0)

	return total_err;
}

// Evaluates against raw weights [0,64], or to ISE quantized weights, depending on weight_ise_index.
// Builds the candidate color table from the packed endpoints, then forwards to the
// table-based eval_solution() overload and returns its error.
uint64_t eval_solution(
	const pixel_stats_t& pixel_stats,
	uint32_t cem_index,
	const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index,
	uint8_t* pWeight_vals, uint32_t weight_ise_index,
	const cem_encode_params& params)
{
	assert((weight_ise_index <= astc_helpers::BISE_32_LEVELS) || (weight_ise_index == astc_helpers::BISE_64_LEVELS));

	color_rgba weight_colors[ASTC_LDR_MAX_RAW_WEIGHTS];
	uint32_t num_weights;

	assert((weight_ise_index <= astc_helpers::BISE_32_LEVELS) || (weight_ise_index == astc_helpers::BISE_64_LEVELS));

	// 64 levels isn't valid ASTC. It's used for raw weight mode.
	if (weight_ise_index == astc_helpers::BISE_64_LEVELS)
		num_weights = get_colors_raw_weights(cem_index, pEndpoint_vals, endpoint_ise_index, weight_colors, params.m_decode_mode_srgb);
	else
		num_weights = get_colors(cem_index, pEndpoint_vals, endpoint_ise_index, weight_ise_index, weight_colors, params.m_decode_mode_srgb);

	assert(num_weights <= std::size(weight_colors));

	uint64_t trial_err = eval_solution(
		pixel_stats,
		num_weights, weight_colors,
		pWeight_vals, weight_ise_index,
		params);

	return trial_err;
}

// Evaluates against raw weights [0,64], or to ISE quantized weights, depending on weight_ise_index.
// Dual-plane variant of eval_solution(): the channel ccs_index gets its own weight per
// pixel (pWeight_vals1) and the remaining three channels share another (pWeight_vals0).
// Each plane independently uses its forced weights (params.m_pForced_weight_vals0 /
// m_pForced_weight_vals1) when supplied, otherwise the first lowest-error index.
// Returns the summed weighted squared error of both planes. weight_ise_index is unused.
uint64_t eval_solution_dp(
	uint32_t ccs_index,
	const pixel_stats_t& pixel_stats,
	uint32_t total_weights, const color_rgba* pWeight_colors,
	uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint32_t weight_ise_index,
	const cem_encode_params& params)
{
	BASISU_NOTE_UNUSED(weight_ise_index);

	// ccs_index is unsigned, so only the upper bound of this check can ever fail.
	assert((ccs_index >= 0) && (ccs_index <= 3));
	assert((total_weights <= 32) || (total_weights == 65));

	uint64_t total_err = 0;

	// Plane 0: every channel except ccs_index.
	if (params.m_pForced_weight_vals0)
	{
		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
		{
			const color_rgba& px = pixel_stats.m_pixels[c];

			const uint32_t w = params.m_pForced_weight_vals0[c];
			assert(w < total_weights);

			uint32_t err = 0;
			for (uint32_t o = 0; o < 4; o++)
				if (o != ccs_index)
					err += params.m_comp_weights[o] * square_of_diff(px[o], pWeight_colors[w][o]);

			total_err += err;

			pWeight_vals0[c] = (uint8_t)w;
		}
	}
	else
	{
		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
		{
			const color_rgba& px = pixel_stats.m_pixels[c];

			uint32_t best_err = UINT32_MAX;
			uint32_t best_sel = 0;

			for (uint32_t i = 0; i < total_weights; i++)
			{
				uint32_t err = 0;
				for (uint32_t o = 0; o < 4; o++)
					if (o != ccs_index)
						err += params.m_comp_weights[o] * square_of_diff(px[o], pWeight_colors[i][o]);

				if (err < best_err)
				{
					best_err = err;
					best_sel = i;
				}
			}

			total_err += best_err;

			pWeight_vals0[c] = (uint8_t)best_sel;
		}
	}

	// Plane 1: the ccs_index channel only. The channel weight is applied after the
	// search, which compares unweighted squared differences.
	if (params.m_pForced_weight_vals1)
	{
		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
		{
			const color_rgba& px = pixel_stats.m_pixels[c];

			const uint32_t w = params.m_pForced_weight_vals1[c];
			assert(w < total_weights);

			uint32_t err = square_of_diff(px[ccs_index], pWeight_colors[w][ccs_index]);

			total_err += err * params.m_comp_weights[ccs_index];

			pWeight_vals1[c] = (uint8_t)w;
		}
	}
	else
	{
		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
		{
			const color_rgba& px = pixel_stats.m_pixels[c];

			uint32_t best_err = UINT32_MAX;
			uint32_t best_sel = 0;

			for (uint32_t i = 0; i < total_weights; i++)
			{
				uint32_t err = square_of_diff(px[ccs_index], pWeight_colors[i][ccs_index]);

				if (err < best_err)
				{
					best_err = err;
					best_sel = i;
				}
			}

			total_err += best_err * params.m_comp_weights[ccs_index];

			pWeight_vals1[c] = (uint8_t)best_sel;
		}
	}

	return total_err;
}

// Evaluates against raw weights [0,64], or to ISE quantized weights, depending on weight_ise_index.
// Builds the candidate color table from the packed endpoints, then forwards to the
// table-based eval_solution_dp() overload and returns its error.
uint64_t eval_solution_dp(
	const pixel_stats_t& pixel_stats,
	uint32_t cem_index, uint32_t ccs_index,
	const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index,
	uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint32_t weight_ise_index,
	const cem_encode_params& params)
{
	assert((weight_ise_index <= astc_helpers::BISE_32_LEVELS) || (weight_ise_index == astc_helpers::BISE_64_LEVELS));

	color_rgba weight_colors[ASTC_LDR_MAX_RAW_WEIGHTS];
	uint32_t num_weights;

	// 64 levels isn't valid ASTC. It's used for raw weight mode.
	if (weight_ise_index == astc_helpers::BISE_64_LEVELS)
		num_weights = get_colors_raw_weights(cem_index, pEndpoint_vals, endpoint_ise_index, weight_colors, params.m_decode_mode_srgb);
	else
		num_weights = get_colors(cem_index, pEndpoint_vals, endpoint_ise_index, weight_ise_index, weight_colors, params.m_decode_mode_srgb);

	uint64_t trial_err = eval_solution_dp(
		ccs_index,
		pixel_stats,
		num_weights, weight_colors,
		pWeight_vals0, pWeight_vals1, weight_ise_index,
		params);

	return trial_err;
}

// Direct - refine ISE quantized endpoints from float endpoints
static void refine_cem8_or_12_endpoints(uint32_t cem_index, uint32_t endpoint_ise_range, uint8_t* pTrial_endpoint_vals, const vec4F& low_color_f, const vec4F& high_color_f, bool endpoints_are_swapped)
{
	assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT));

	if (endpoint_ise_range == astc_helpers::BISE_256_LEVELS)
		return;

	const uint32_t total_comps = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) ?
4 : 3; assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const uint32_t num_endpoint_ise_levels = astc_helpers::get_ise_levels(endpoint_ise_range); const auto& endpoint_dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val; const auto& ISE_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_rank; const auto& rank_to_ISE = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_rank_to_ISE; const bool orig_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, pTrial_endpoint_vals, endpoint_ise_range); uint32_t first_comp = 0; uint8_t refined_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS]; memcpy(refined_endpoint_vals, pTrial_endpoint_vals, total_endpoint_vals); if (orig_used_blue_contraction) { // TODO expensive: 2*3*9 = 54 tries for (uint32_t e = 0; e < 2; e++) { float best_err = BIG_FLOAT_VAL; uint8_t best_refined_endpoint_vals[3] = { 0, 0, 0 }; for (int b_delta = -1; b_delta <= 1; b_delta++) { for (int k = 0; k < 9; k++) { const int r_delta = (k % 3) - 1; const int g_delta = (k / 3) - 1; const int comp_deltas[3] = { r_delta, g_delta, b_delta }; uint8_t trial_refined_endpoint_vals[3] = { 0, 0, 0 }; for (uint32_t c = 0; c < 3; c++) { const int enc_val = pTrial_endpoint_vals[c * 2 + e]; const int orig_rank = ISE_to_rank[enc_val]; const int v_delta = comp_deltas[c]; const int new_rank = basisu::clamp(orig_rank + v_delta, 0, (int)num_endpoint_ise_levels - 1); const int new_enc_ise_val = rank_to_ISE[new_rank]; trial_refined_endpoint_vals[c] = (uint8_t)new_enc_ise_val; } // c color_rgba 
trial_refined_endpoints_dequant(blue_contract_dec(endpoint_dequant_tab[trial_refined_endpoint_vals[0]], endpoint_dequant_tab[trial_refined_endpoint_vals[1]], endpoint_dequant_tab[trial_refined_endpoint_vals[2]], 255)); vec3F trial_refined_endpoints_dequant_f(0.0f); for (uint32_t c = 0; c < 3; c++) trial_refined_endpoints_dequant_f[c] = (float)trial_refined_endpoints_dequant[c] * (1.0f / 255.0f); vec3F desired_endpoint; if (endpoints_are_swapped) desired_endpoint = (e == 0) ? vec3F(high_color_f) : vec3F(low_color_f); else desired_endpoint = (e == 0) ? vec3F(low_color_f) : vec3F(high_color_f); float trial_err = desired_endpoint.squared_distance(trial_refined_endpoints_dequant_f); if (trial_err < best_err) { best_err = trial_err; memcpy(best_refined_endpoint_vals, trial_refined_endpoint_vals, 3); } } // k } // b_delta for (uint32_t c = 0; c < 3; c++) { refined_endpoint_vals[c * 2 + e] = best_refined_endpoint_vals[c]; } // c } // e // just refine A now (if it exists) first_comp = 3; } if (first_comp < total_comps) { for (uint32_t e = 0; e < 2; e++) { for (uint32_t c = first_comp; c < total_comps; c++) { const uint32_t idx = c * 2 + e; const int enc_val = pTrial_endpoint_vals[idx]; const int orig_rank = ISE_to_rank[enc_val]; int best_rank = orig_rank; float best_err = BIG_FLOAT_VAL; for (int v_delta = -1; v_delta <= 1; v_delta++) { int new_rank = basisu::clamp(orig_rank + v_delta, 0, (int)num_endpoint_ise_levels - 1); int new_enc_ise_val = rank_to_ISE[new_rank]; float dequant_val = (float)endpoint_dequant_tab[new_enc_ise_val] * (1.0f / 255.0f); float orig_val; if (endpoints_are_swapped) orig_val = (e == 0) ? high_color_f[c] : low_color_f[c]; else orig_val = (e == 0) ? 
low_color_f[c] : high_color_f[c]; float err = fabsf(dequant_val - orig_val); if (err < best_err) { best_err = err; best_rank = new_rank; } } refined_endpoint_vals[idx] = (uint8_t)rank_to_ISE[best_rank]; } // c } // e } bool refined_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, refined_endpoint_vals, endpoint_ise_range); if (refined_used_blue_contraction == orig_used_blue_contraction) { memcpy(pTrial_endpoint_vals, refined_endpoint_vals, total_endpoint_vals); } } // Direct L/LA, single plane static bool try_cem0_or_4(uint32_t cem_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, float lum_l, float lum_h, float a_l, float a_h, uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals, uint64_t& trial_blk_error) { assert(g_initialized); assert((cem_index == astc_helpers::CEM_LDR_LUM_DIRECT) || (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT)); const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS] = { 0 }; uint8_t trial_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; encode_cem0_4(cem_index, lum_l, lum_h, a_l, a_h, endpoint_ise_range, trial_endpoint_vals); uint64_t trial_err = eval_solution( pixel_stats, cem_index, trial_endpoint_vals, endpoint_ise_range, trial_weight_vals, weight_ise_range, enc_params); bool improved_flag = false; if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); improved_flag = true; } bool any_degen = false; if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h)) any_degen = true; if (cem_has_alpha) { if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && 
(a_l != a_h))
			any_degen = true;
	}

	if (any_degen)
	{
		// Direction that moves the low endpoint further from the high one (and vice versa below).
		const int l_delta = (lum_l < lum_h) ? -1 : 1;
		const int a_delta = (a_l < a_h) ? -1 : 1;

		// t is a bitmask: bit 0 = adjust the low endpoint(s), bit 1 = adjust the high endpoint(s).
		for (uint32_t t = 1; t <= 3; t++)
		{
			uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS];
			memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);

			if (t & 1)
			{
				if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h))
					fixed_endpoint_vals[0] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[0], l_delta);

				if (cem_has_alpha)
				{
					if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h))
						fixed_endpoint_vals[2] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[2], a_delta);
				}
			}

			if (t & 2)
			{
				if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h))
					fixed_endpoint_vals[1] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[1], -l_delta);

				if (cem_has_alpha)
				{
					if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h))
						fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[3], -a_delta);
				}
			}

			// trial_weight_vals is scratch: it is rewritten by every evaluation.
			trial_err = eval_solution(
				pixel_stats,
				cem_index, fixed_endpoint_vals, endpoint_ise_range,
				trial_weight_vals, weight_ise_range,
				enc_params);

			if (trial_err < trial_blk_error)
			{
				trial_blk_error = trial_err;
				memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
				memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels);
				improved_flag = true;
			}

		} // t
	}

	return improved_flag;
}

// Direct LA, dual plane with alpha (component 3) in the second weight plane.
//
// Encodes the given luminance/alpha range as endpoints, evaluates it with eval_solution_dp(), and
// commits the result to pTrial_endpoint_vals/pTrial_weight_vals0/pTrial_weight_vals1/trial_blk_error
// only when the error is strictly lower than trial_blk_error. If quantization collapsed distinct
// inputs to equal endpoint values, also tries pushing the low and/or high endpoint apart by one step.
// Returns true if the trial solution was improved.
static bool try_cem4_dp_a(uint32_t cem_index,
	const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	float lum_l, float lum_h, float a_l, float a_h,
	uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error)
{
	assert(g_initialized);
	assert(cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT);

	// Always true given the assert above.
	const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT);
	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);

	uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS] = { 0 };
	uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
	uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };

	encode_cem0_4(cem_index, lum_l, lum_h, a_l, a_h, endpoint_ise_range, trial_endpoint_vals);

	// The literal 3 is the ccs_index argument (alpha).
	uint64_t trial_err = eval_solution_dp(
		pixel_stats,
		cem_index, 3, trial_endpoint_vals, endpoint_ise_range,
		trial_weight_vals0, trial_weight_vals1, weight_ise_range,
		enc_params);

	bool improved_flag = false;
	if (trial_err < trial_blk_error)
	{
		trial_blk_error = trial_err;
		memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
		memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
		memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
		improved_flag = true;
	}

	// Degenerate: the inputs differed but both endpoints quantized to the same value.
	bool any_degen = false;
	if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h))
		any_degen = true;

	if (cem_has_alpha)
	{
		if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h))
			any_degen = true;
	}

	if (any_degen)
	{
		const int l_delta = (lum_l < lum_h) ? -1 : 1;
		const int a_delta = (a_l < a_h) ? -1 : 1;

		// t is a bitmask: bit 0 = adjust the low endpoint(s), bit 1 = adjust the high endpoint(s).
		for (uint32_t t = 1; t <= 3; t++)
		{
			uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS];
			memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);

			if (t & 1)
			{
				if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h))
					fixed_endpoint_vals[0] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[0], l_delta);

				if (cem_has_alpha)
				{
					if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h))
						fixed_endpoint_vals[2] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[2], a_delta);
				}
			}

			if (t & 2)
			{
				if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h))
					fixed_endpoint_vals[1] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[1], -l_delta);

				if (cem_has_alpha)
				{
					if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h))
						fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[3], -a_delta);
				}
			}

			// The two scratch weight buffers are rewritten by every evaluation.
			trial_err = eval_solution_dp(
				pixel_stats,
				cem_index, 3, fixed_endpoint_vals, endpoint_ise_range,
				trial_weight_vals0, trial_weight_vals1, weight_ise_range,
				enc_params);

			if (trial_err < trial_blk_error)
			{
				trial_blk_error = trial_err;
				memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
				memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
				memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
				improved_flag = true;
			}

		} // t
	}

	return improved_flag;
}

// Direct RGB/RGBA
// Cannot fail, but may have to fall back to non-blue-contracted
// Returns false if trial solution not improved
//
// Outputs (pTrial_endpoint_vals, pTrial_weight_vals, trial_blk_error, trial_used_blue_contraction)
// are only written when a candidate's error is strictly lower than trial_blk_error.
// tried_used_blue_contraction is always set to whether the initial encoding was blue contracted.
static bool try_cem8_12(
	uint32_t cem_index,
	const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	const vec4F& low_color_f, const vec4F& high_color_f,
	uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals, uint64_t& trial_blk_error, bool& trial_used_blue_contraction,
	bool try_blue_contract, bool& tried_used_blue_contraction)
{
	assert(g_initialized);
	assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT));

	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
	const uint32_t num_comps = (cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) ? 3 : 4;

	// Convert the float endpoints to 8-bit by scaling by 255, rounding and clamping.
	color_rgba low_color, high_color;
	for (uint32_t c = 0; c < 4; c++)
	{
		low_color[c] = (uint8_t)basisu::clamp((int)std::round(low_color_f[c] * 255.0f), 0, 255);
		high_color[c] = (uint8_t)basisu::clamp((int)std::round(high_color_f[c] * 255.0f), 0, 255);
	}

	uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS] = { 0 };
	uint8_t trial_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };

	// Cannot fail, but may have to fall back to non-blue-contracted
	cem_encode_ldr_rgb_or_rgba_direct_result res = cem_encode_ldr_rgb_or_rgba_direct(cem_index, endpoint_ise_range, low_color, high_color, trial_endpoint_vals, try_blue_contract);

	// Let caller know if we tried blue contraction
	tried_used_blue_contraction = res.m_is_blue_contracted;

	if (endpoint_ise_range < astc_helpers::BISE_256_LEVELS)
	{
		refine_cem8_or_12_endpoints(cem_index, endpoint_ise_range, trial_endpoint_vals, low_color_f, high_color_f, res.m_endpoints_are_swapped);
	}

	uint64_t trial_err = eval_solution(
		pixel_stats,
		cem_index, trial_endpoint_vals, endpoint_ise_range,
		trial_weight_vals, weight_ise_range,
		enc_params);

	bool improved_flag = false;
	if (trial_err < trial_blk_error)
	{
		trial_blk_error = trial_err;
		memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
		memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels);
		trial_used_blue_contraction = res.m_is_blue_contracted;
		improved_flag = true;
	}

	if (res.m_any_degen)
	{
		color_rgba dec_l(0), dec_h(0);
		decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h);

		// Component sums of the decoded endpoints decide which way to push the encoded values.
		uint32_t s0 = dec_l.r + dec_l.g + dec_l.b + dec_l.a;
		uint32_t s1 = dec_h.r + dec_h.g + dec_h.b + dec_h.a;

		if (astc_helpers::cem8_or_12_used_blue_contraction(cem_index, trial_endpoint_vals, endpoint_ise_range))
			std::swap(s0, s1);

		// t is a bitmask: bit 0 = adjust the low encoded values, bit 1 = adjust the high encoded values.
		for (uint32_t t = 1; t <= 3; t++)
		{
			uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS];
			memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);

			if (t & 1)
			{
				for (uint32_t c = 0; c < num_comps; c++)
				{
					uint32_t l_idx = c * 2 + 0;
					uint32_t h_idx = c * 2 + 1;

					// Only touch components that were distinct before quantization but collapsed after it.
					if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c]))
					{
						int delta = (s0 <= s1) ? -1 : 1;
						fixed_endpoint_vals[l_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[l_idx], delta);
					}
				}
			}

			if (t & 2)
			{
				for (uint32_t c = 0; c < num_comps; c++)
				{
					uint32_t l_idx = c * 2 + 0;
					uint32_t h_idx = c * 2 + 1;

					if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c]))
					{
						int delta = (s0 <= s1) ?
1 : -1; fixed_endpoint_vals[h_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[h_idx], delta); } } } bool fixed_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, fixed_endpoint_vals, endpoint_ise_range); if (fixed_used_blue_contraction != res.m_is_blue_contracted) continue; trial_err = eval_solution( pixel_stats, cem_index, fixed_endpoint_vals, endpoint_ise_range, trial_weight_vals, weight_ise_range, enc_params); if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); trial_used_blue_contraction = res.m_is_blue_contracted; improved_flag = true; } } // t } // if (res.m_any_degen) return improved_flag; } static bool try_cem8_12_dp( uint32_t cem_index, uint32_t ccs_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, const vec4F& low_color_f, const vec4F& high_color_f, uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error, bool& trial_used_blue_contraction, bool try_blue_contract, bool& tried_used_blue_contraction) { assert(g_initialized); assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)); bool improved_flag = false; const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const uint32_t num_comps = (cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) ? 
3 : 4; color_rgba low_color, high_color; for (uint32_t c = 0; c < 4; c++) { low_color[c] = (uint8_t)basisu::clamp((int)std::round(low_color_f[c] * 255.0f), 0, 255); high_color[c] = (uint8_t)basisu::clamp((int)std::round(high_color_f[c] * 255.0f), 0, 255); } uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS] = { 0 }; uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; // Cannot fail, but may have to fall back to non-blue-contracted cem_encode_ldr_rgb_or_rgba_direct_result res = cem_encode_ldr_rgb_or_rgba_direct(cem_index, endpoint_ise_range, low_color, high_color, trial_endpoint_vals, try_blue_contract); // Let caller know if we tried blue contraction tried_used_blue_contraction = res.m_is_blue_contracted; if (endpoint_ise_range < astc_helpers::BISE_256_LEVELS) { refine_cem8_or_12_endpoints(cem_index, endpoint_ise_range, trial_endpoint_vals, low_color_f, high_color_f, res.m_endpoints_are_swapped); } uint64_t trial_err = eval_solution_dp(pixel_stats, cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range, trial_weight_vals0, trial_weight_vals1, weight_ise_range, enc_params); if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); trial_used_blue_contraction = res.m_is_blue_contracted; improved_flag = true; } if (res.m_any_degen) { color_rgba dec_l(0), dec_h(0); decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h); uint32_t s0 = dec_l.r + dec_l.g + dec_l.b + dec_l.a; uint32_t s1 = dec_h.r + dec_h.g + dec_h.b + dec_h.a; if (astc_helpers::cem8_or_12_used_blue_contraction(cem_index, trial_endpoint_vals, endpoint_ise_range)) std::swap(s0, s1); for (uint32_t t = 1; t <= 3; t++) { uint8_t 
fixed_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS]; memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); if (t & 1) { for (uint32_t c = 0; c < num_comps; c++) { uint32_t l_idx = c * 2 + 0; uint32_t h_idx = c * 2 + 1; if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c])) { int delta = (s0 <= s1) ? -1 : 1; fixed_endpoint_vals[l_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[l_idx], delta); } } } if (t & 2) { for (uint32_t c = 0; c < num_comps; c++) { uint32_t l_idx = c * 2 + 0; uint32_t h_idx = c * 2 + 1; if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c])) { int delta = (s0 <= s1) ? 1 : -1; fixed_endpoint_vals[h_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[h_idx], delta); } } } bool fixed_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, fixed_endpoint_vals, endpoint_ise_range); if (fixed_used_blue_contraction != res.m_is_blue_contracted) continue; trial_err = eval_solution_dp(pixel_stats, cem_index, ccs_index, fixed_endpoint_vals, endpoint_ise_range, trial_weight_vals0, trial_weight_vals1, weight_ise_range, enc_params); if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); improved_flag = true; } } // t } // if (res.m_any_degen) return improved_flag; } // base+offset rgb/rgba, single or dual plane static bool try_cem9_13_sp_or_dp( uint32_t cem_index, int ccs_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, const vec4F& low_color_f, const vec4F& high_color_f, uint8_t* 
pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error, bool& trial_used_blue_contraction,
	bool try_blue_contract, bool& tried_used_blue_contraction, bool &tried_base_ofs_clamped)
{
	// ccs_index == -1 selects single plane evaluation; 0..3 selects dual plane with that component
	// in the second plane. pTrial_weight_vals1 may be null only in the single plane case.
	// Outputs are only written when a candidate's error is strictly lower than trial_blk_error.
	// Returns true if the trial solution was improved, false otherwise (including encode failure).
	assert(g_initialized);
	assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET));
	assert((ccs_index >= -1) && (ccs_index <= 3));
	assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
	assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
	assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
	assert(pTrial_weight_vals0);
	assert((ccs_index == -1) || (pTrial_weight_vals1));

	//const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
	const uint32_t num_comps = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) ? 3 : 4;

	// Convert the float endpoints to 8-bit by scaling by 255, rounding and clamping.
	color_rgba low_color, high_color;
	for (uint32_t c = 0; c < 4; c++)
	{
		low_color[c] = (uint8_t)basisu::clamp((int)std::round(low_color_f[c] * 255.0f), 0, 255);
		high_color[c] = (uint8_t)basisu::clamp((int)std::round(high_color_f[c] * 255.0f), 0, 255);
	}

	uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE13_ENDPOINTS] = { 0 };
	uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
	uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };

	rgb_base_offset_res res = cem_encode_ldr_rgb_or_rgba_base_offset(cem_index, endpoint_ise_range, low_color, high_color, trial_endpoint_vals, try_blue_contract);

	// Report what the initial encode did, even if it failed.
	tried_used_blue_contraction = res.m_used_blue_contraction;
	tried_base_ofs_clamped = res.m_delta_clamped;

	if (res.m_failed_flag)
		return false;

	bool improved_flag = false;

	if (ccs_index == -1)
	{
		uint64_t trial_err = eval_solution(
			pixel_stats,
			cem_index, trial_endpoint_vals, endpoint_ise_range,
			trial_weight_vals0, weight_ise_range,
			enc_params);

		if (trial_err < trial_blk_error)
		{
			trial_blk_error = trial_err;
			memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
			memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
			// Single plane: clear the second plane's weights if the caller provided a buffer.
			if (pTrial_weight_vals1)
				memset(pTrial_weight_vals1, 0, pixel_stats.m_num_pixels);
			trial_used_blue_contraction = res.m_used_blue_contraction;
			improved_flag = true;
		}
	}
	else
	{
		uint64_t trial_err = eval_solution_dp(
			pixel_stats,
			cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range,
			trial_weight_vals0, trial_weight_vals1, weight_ise_range,
			enc_params);

		if (trial_err < trial_blk_error)
		{
			trial_blk_error = trial_err;
			memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
			memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
			memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
			trial_used_blue_contraction = res.m_used_blue_contraction;
			improved_flag = true;
		}
	}

	if (res.m_any_degen)
	{
		color_rgba dec_l(0), dec_h(0);
		decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h);

		// The packing in these modes is so complex that we're going to approximate the biasing, and hope for the best.
		const uint32_t num_ise_levels = astc_helpers::get_ise_levels(endpoint_ise_range);

		// Step size: 256 divided by the number of endpoint levels, rounded up.
		int vals_per_ise_level = (256 + num_ise_levels - 1) / num_ise_levels;

		// TODO: There is potential cross-talk between RGB and A with the way this is done.
		// p is a bitmask: bit 0 = bias the low color, bit 1 = bias the high color.
		for (uint32_t p = 1; p <= 3; p++)
		{
			color_rgba trial_low_color(low_color), trial_high_color(high_color);

			for (uint32_t c = 0; c < num_comps; c++)
			{
				// Only bias components that differ in the input but decode to the same value.
				if (low_color[c] == high_color[c])
					continue;

				if (dec_l[c] != dec_h[c])
					continue;

				int delta = (low_color[c] < high_color[c]) ? -1 : 1;

				if (p & 1)
					trial_low_color[c] = (uint8_t)basisu::clamp((int)trial_low_color[c] + vals_per_ise_level * delta, 0, 255);

				if (p & 2)
					trial_high_color[c] = (uint8_t)basisu::clamp((int)trial_high_color[c] + vals_per_ise_level * -delta, 0, 255);

			} // c

			// Re-encode from the biased colors; trial_endpoint_vals is reused as scratch.
			res = cem_encode_ldr_rgb_or_rgba_base_offset(cem_index, endpoint_ise_range, trial_low_color, trial_high_color, trial_endpoint_vals, try_blue_contract);

			if (res.m_failed_flag)
				continue;

			if (ccs_index == -1)
			{
				uint64_t trial_err = eval_solution(
					pixel_stats,
					cem_index, trial_endpoint_vals, endpoint_ise_range,
					trial_weight_vals0, weight_ise_range,
					enc_params);

				if (trial_err < trial_blk_error)
				{
					trial_blk_error = trial_err;
					memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
					memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
					if (pTrial_weight_vals1)
						memset(pTrial_weight_vals1, 0, pixel_stats.m_num_pixels);
					trial_used_blue_contraction = res.m_used_blue_contraction;

					// From here on the clamped flag is only ever raised, never cleared.
					if (res.m_delta_clamped)
						tried_base_ofs_clamped = true;

					improved_flag = true;
				}
			}
			else
			{
				uint64_t trial_err = eval_solution_dp(
					pixel_stats,
					cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range,
					trial_weight_vals0, trial_weight_vals1, weight_ise_range,
					enc_params);

				if (trial_err < trial_blk_error)
				{
					trial_blk_error = trial_err;
					memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
					memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
					memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
					trial_used_blue_contraction = res.m_used_blue_contraction;

					if (res.m_delta_clamped)
						tried_base_ofs_clamped = true;

					improved_flag = true;
				}
			}

		} // p
	}
	else
	{
		// Now factor in the quantization introduced into the low (base) color, and apply this to the offset, for gain.
		color_rgba dec_l(0), dec_h(0);
		decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h);

		// Keep the decoded (quantized) base, restore the other endpoint to its original target, then re-encode.
		if (res.m_endpoints_swapped)
			dec_l = low_color; // high color is the quantized base
		else
			dec_h = high_color; // low color is the quantized base

		res = cem_encode_ldr_rgb_or_rgba_base_offset(cem_index, endpoint_ise_range, dec_l, dec_h, trial_endpoint_vals, try_blue_contract);

		if (!res.m_failed_flag)
		{
			if (ccs_index == -1)
			{
				uint64_t trial_err = eval_solution(
					pixel_stats,
					cem_index, trial_endpoint_vals, endpoint_ise_range,
					trial_weight_vals0, weight_ise_range,
					enc_params);

				if (trial_err < trial_blk_error)
				{
					trial_blk_error = trial_err;
					memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
					memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
					if (pTrial_weight_vals1)
						memset(pTrial_weight_vals1, 0, pixel_stats.m_num_pixels);
					trial_used_blue_contraction = res.m_used_blue_contraction;

					if (res.m_delta_clamped)
						tried_base_ofs_clamped = true;

					improved_flag = true;
				}
			}
			else
			{
				uint64_t trial_err = eval_solution_dp(
					pixel_stats,
					cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range,
					trial_weight_vals0, trial_weight_vals1, weight_ise_range,
					enc_params);

				if (trial_err < trial_blk_error)
				{
					trial_blk_error = trial_err;
					memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
					memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
					memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
					trial_used_blue_contraction = res.m_used_blue_contraction;

					if (res.m_delta_clamped)
						tried_base_ofs_clamped = true;

					improved_flag = true;
				}
			}
		}
	}

	return improved_flag;
}

// l/la direct, single plane
//
// Searches for luminance (and optionally alpha) endpoints plus per-pixel weights for the block.
// pEndpoint_vals/pWeight_vals are only written when a solution with error below cur_blk_error is
// found. Returns the resulting best error (cur_blk_error unchanged if nothing better was found).
static uint64_t encode_cem0_4(
	uint32_t cem_index,
	const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	uint8_t* pEndpoint_vals, uint8_t* pWeight_vals, uint64_t cur_blk_error)
{
	assert(g_initialized);
	assert((cem_index == astc_helpers::CEM_LDR_LUM_DIRECT) || (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT));
	assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
	assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
	assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));

	const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT);

	const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
	const uint32_t total_weights = pixel_stats.m_num_pixels;

	float lum_l = BIG_FLOAT_VAL, lum_h = -BIG_FLOAT_VAL;

	// Per-pixel luminance (pixel1F) and (luminance, alpha) pairs (pixel2F) for the least squares solvers.
	float pixel1F[ASTC_LDR_MAX_BLOCK_PIXELS];
	vec2F pixel2F[ASTC_LDR_MAX_BLOCK_PIXELS];

	for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++)
	{
		const vec4F& px = pixel_stats.m_pixels_f[i];

		// Luminance is the unweighted mean of the first three components.
		float l = (px[0] + px[1] + px[2]) * (1.0f / 3.0f);

		pixel1F[i] = l;

		pixel2F[i][0] = l;
		pixel2F[i][1] = px[3];

		lum_l = minimum(lum_l, l);
		lum_h = maximum(lum_h, l);
	}

	const float a_l = pixel_stats.m_min_f[3];
	const float a_h = pixel_stats.m_max_f[3];

	const vec2F min_pixel2F(lum_l, a_l), max_pixel2F(lum_h, a_h);

	uint8_t trial_blk_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS] = { 0 };
uint8_t trial_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint64_t trial_blk_error = UINT64_MAX; bool did_improve = try_cem0_or_4( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, lum_l, lum_h, a_l, a_h, trial_blk_endpoints, trial_blk_weights, trial_blk_error); BASISU_NOTE_UNUSED(did_improve); if (trial_blk_error == UINT64_MAX) return cur_blk_error; if (trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } const uint32_t NUM_LS_OPT_PASSES = 3; for (uint32_t pass = 0; pass < NUM_LS_OPT_PASSES; pass++) { vec2F xl(lum_l, a_l), xh(lum_h, a_h); bool ls_res; if (cem_has_alpha) { ls_res = compute_least_squares_endpoints_2D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel2F, min_pixel2F, max_pixel2F); } else { ls_res = compute_least_squares_endpoints_1D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl[0], &xh[0], pixel1F, lum_l, lum_h); } if (!ls_res) break; bool did_improve_res = false; did_improve_res = try_cem0_or_4( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl[0], xh[0], xl[1], xh[1], trial_blk_endpoints, trial_blk_weights, trial_blk_error); BASISU_NOTE_UNUSED(did_improve_res); if (trial_blk_error >= cur_blk_error) break; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } // pass return cur_blk_error; } // lum+alpha direct, dual plane static uint64_t encode_cem4_dp_a( uint32_t cem_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error) { assert(g_initialized); assert(cem_index == 
astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const uint32_t total_weights = pixel_stats.m_num_pixels; float alpha_vals[ASTC_LDR_MAX_BLOCK_PIXELS]; for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) { const vec4F& px = pixel_stats.m_pixels_f[i]; alpha_vals[i] = px[3]; } // First get plane0's low/high (lum) uint8_t lum_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; uint8_t lum_weights0[ASTC_LDR_MAX_BLOCK_PIXELS]; uint64_t lum_blk_error = encode_cem0_4( astc_helpers::CEM_LDR_LUM_DIRECT, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, lum_endpoints, lum_weights0, UINT64_MAX); if (lum_blk_error == UINT64_MAX) return cur_blk_error; const auto& dequant_endpoints_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val; float lum_l = (float)dequant_endpoints_tab[lum_endpoints[0]] * (1.0f / 255.0f); float lum_h = (float)dequant_endpoints_tab[lum_endpoints[1]] * (1.0f / 255.0f); float a_l = pixel_stats.m_min_f[3]; float a_h = pixel_stats.m_max_f[3]; uint8_t trial_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; uint8_t trial_weights0[ASTC_LDR_MAX_BLOCK_PIXELS]; uint8_t trial_weights1[ASTC_LDR_MAX_BLOCK_PIXELS]; uint64_t trial_blk_error = UINT64_MAX; bool did_improve = try_cem4_dp_a( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, lum_l, lum_h, a_l, a_h, trial_endpoints, trial_weights0, trial_weights1, trial_blk_error); if (!did_improve) { assert(0); return cur_blk_error; } if 
(trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_weights0, total_weights); memcpy(pWeight_vals1, trial_weights1, total_weights); } const uint32_t NUM_LS_OPT_PASSES = 3; for (uint32_t pass = 0; pass < NUM_LS_OPT_PASSES; pass++) { float xl = pixel_stats.m_min_f[3], xh = pixel_stats.m_max_f[3]; bool ls_res = compute_least_squares_endpoints_1D( pixel_stats.m_num_pixels, trial_weights1, get_ls_weights_ise(weight_ise_range), &xl, &xh, alpha_vals, pixel_stats.m_min_f[3], pixel_stats.m_max_f[3]); if (!ls_res) break; did_improve = try_cem4_dp_a( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, lum_l, lum_h, xl, xh, trial_endpoints, trial_weights0, trial_weights1, trial_blk_error); if (!did_improve) break; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_weights0, total_weights); memcpy(pWeight_vals1, trial_weights1, total_weights); } // pass return cur_blk_error; } struct weight_refiner { void init(uint32_t weight_ise_range, uint32_t total_pixels, const uint8_t *pInitial_ise_weights) { m_weight_ise_range = weight_ise_range; m_total_pixels = total_pixels; m_pISE_to_rank = &astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_rank; m_pRank_to_ise = &astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_rank_to_ISE; m_num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range); for (uint32_t i = 0; i < total_pixels; i++) m_start_weights[i] = (*m_pISE_to_rank)[pInitial_ise_weights[i]]; m_min_weight = UINT32_MAX; m_max_weight = 0; m_sum_weight = 0; for (uint32_t i = 0; i < total_pixels; i++) { const uint32_t weight = m_start_weights[i]; m_sum_weight += weight; m_min_weight = minimumu(m_min_weight, weight); m_max_weight = maximumu(m_max_weight, weight); } } void refine(uint32_t pass_index, uint8_t* pTrial_ise_weights) { switch 
(pass_index) { case 0: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) v++; pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 1: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if ((v == m_max_weight) && (v > 0)) v--; pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 2: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) v++; else if ((v == m_max_weight) && (v > 0)) v--; pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 3: { const int max_weight_rank_index = m_num_weight_levels - 1; int ly = -1, hy = max_weight_rank_index + 1; for (uint32_t i = 0; i < m_total_pixels; i++) { int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; } break; } case 4: { const int max_weight_rank_index = m_num_weight_levels - 1; int ly = -2, hy = max_weight_rank_index + 2; for (uint32_t i = 0; i < m_total_pixels; i++) { int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; } break; } case 5: { const int max_weight_rank_index = m_num_weight_levels - 1; int ly = -1, hy = max_weight_rank_index + 2; for (uint32_t i = 0; i < m_total_pixels; i++) { int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; } break; } case 6: { const int max_weight_rank_index = m_num_weight_levels - 1; int ly = -2, hy = max_weight_rank_index + 1; for (uint32_t i = 0; i < m_total_pixels; i++) { int s = 
(int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; } break; } case 7: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) { v++; if (v < (m_num_weight_levels - 1)) v++; } pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; break; } case 8: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if ((v == m_max_weight) && (v > 0)) { v--; if (v > 0) v--; } pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 9: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) { v++; if (v < (m_num_weight_levels - 1)) v++; } else if ((v == m_max_weight) && (v > 0)) { v--; if (v > 0) v--; } pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 10: { float mid_weight = (float)m_sum_weight / (float)m_total_pixels; for (uint32_t i = 0; i < m_total_pixels; i++) { int v = m_start_weights[i]; float fv = ((float)v - mid_weight) * .8f + ((float)m_num_weight_levels * .5f); v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 11: { float mid_weight = (float)m_sum_weight / (float)m_total_pixels; for (uint32_t i = 0; i < m_total_pixels; i++) { int v = m_start_weights[i]; float fv = ((float)v - mid_weight) * .9f + ((float)m_num_weight_levels * .5f); v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 12: { float mid_weight = (float)m_sum_weight / (float)m_total_pixels; for (uint32_t i = 0; i < m_total_pixels; i++) { int v = m_start_weights[i]; float fv = ((float)v - mid_weight) * 1.1f + ((float)m_num_weight_levels * .5f); v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); 
pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 13: { float mid_weight = (float)m_sum_weight / (float)m_total_pixels; for (uint32_t i = 0; i < m_total_pixels; i++) { int v = m_start_weights[i]; float fv; if (v < mid_weight) fv = ((float)v - mid_weight) * .8f + ((float)m_num_weight_levels * .5f); else fv = (float)v; v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 14: { float mid_weight = (float)m_sum_weight / (float)m_total_pixels; for (uint32_t i = 0; i < m_total_pixels; i++) { int v = m_start_weights[i]; float fv; if (v >= mid_weight) fv = ((float)v - mid_weight) * .8f + ((float)m_num_weight_levels * .5f); else fv = (float)v; v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 15: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if (v < (m_num_weight_levels - 1)) v++; pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } case 16: { for (uint32_t i = 0; i < m_total_pixels; i++) { uint32_t v = m_start_weights[i]; if (v) v--; pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; } break; } default: { assert(0); memset(pTrial_ise_weights, 0, m_total_pixels); break; } } } uint32_t m_total_pixels; uint32_t m_weight_ise_range; uint32_t m_num_weight_levels; uint8_t m_start_weights[ASTC_LDR_MAX_BLOCK_PIXELS]; // ranks, not ISE uint32_t m_min_weight, m_max_weight, m_sum_weight; const basisu::vector* m_pISE_to_rank; const basisu::vector* m_pRank_to_ise; }; // rgb/rgba direct or rgb/rgba base+offset, single plane static uint64_t encode_cem8_12_9_13( uint32_t cem_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, uint8_t* pEndpoint_vals, uint8_t* pWeight_vals, uint64_t cur_blk_error, bool use_blue_contraction, bool* pBase_ofs_clamped_flag) { assert(g_initialized); assert((cem_index == 
astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET)); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); if (pBase_ofs_clamped_flag) *pBase_ofs_clamped_flag = false; const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); const bool cem_is_base_offset = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const uint32_t total_weights = pixel_stats.m_num_pixels; float best_l = BIG_FLOAT_VAL, best_h = -BIG_FLOAT_VAL; //int best_l_index = 0, best_h_index = 0; for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) { const vec4F px(pixel_stats.m_pixels_f[c] - pixel_stats.m_mean_f); float p = cem_has_alpha ? 
px.dot(pixel_stats.m_mean_rel_axis4) : px.dot3(pixel_stats.m_mean_rel_axis3); if (p < best_l) { best_l = p; //best_l_index = c; } if (p > best_h) { best_h = p; //best_h_index = c; } } // c #if 0 vec4F low_color_f(pixel_stats.m_pixels_f[best_l_index]), high_color_f(pixel_stats.m_pixels_f[best_h_index]); #else vec4F low_color_f, high_color_f; if (cem_has_alpha) { low_color_f = pixel_stats.m_mean_rel_axis4 * best_l + pixel_stats.m_mean_f; high_color_f = pixel_stats.m_mean_rel_axis4 * best_h + pixel_stats.m_mean_f; } else { low_color_f = vec4F(pixel_stats.m_mean_rel_axis3) * best_l + pixel_stats.m_mean_f; high_color_f = vec4F(pixel_stats.m_mean_rel_axis3) * best_h + pixel_stats.m_mean_f; } low_color_f.clamp(0.0f, 1.0f); high_color_f.clamp(0.0f, 1.0f); #endif uint8_t trial_blk_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS] = { 0 }; uint8_t trial_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint64_t trial_blk_error = UINT64_MAX; bool trial_used_blue_contraction = false; bool tried_used_blue_contraction = false; if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, 
high_color_f, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error == UINT64_MAX) return cur_blk_error; if (trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } for (uint32_t pass = 0; pass < enc_params.m_max_ls_passes; pass++) { vec4F xl, xh; bool ls_res; if (cem_has_alpha) { ls_res = compute_least_squares_endpoints_4D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); } else { ls_res = compute_least_squares_endpoints_3D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); } if (!ls_res) break; if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error >= cur_blk_error) break; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } // pass if ((enc_params.m_total_weight_refine_passes) && ((weight_ise_range != astc_helpers::BISE_2_LEVELS) && (weight_ise_range != astc_helpers::BISE_64_LEVELS))) { weight_refiner refiner; refiner.init(weight_ise_range, pixel_stats.m_num_pixels, pWeight_vals); for (uint32_t pass = 0; pass < enc_params.m_total_weight_refine_passes; pass++) { refiner.refine(pass, trial_blk_weights); vec4F xl, xh; bool ls_res; if (cem_has_alpha) { ls_res = compute_least_squares_endpoints_4D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); } else { ls_res = compute_least_squares_endpoints_3D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); } if (!ls_res) continue; if 
(cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } } // pass } const uint32_t N = 4; if ((enc_params.m_worst_weight_nudging_flag) && (pixel_stats.m_num_pixels > N) && ((weight_ise_range != astc_helpers::BISE_2_LEVELS) && (weight_ise_range != astc_helpers::BISE_64_LEVELS))) { const uint32_t NUM_NUDGING_PASSES = 1; for (uint32_t pass = 0; pass < NUM_NUDGING_PASSES; pass++) { color_rgba l, h; decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_range, l, h); vec4F dir; dir[0] = (float)(h[0] - l[0]); dir[1] = (float)(h[1] - l[1]); dir[2] = (float)(h[2] - l[2]); dir[3] = cem_has_alpha ? (float)(h[3] - l[3]) : 0.0f; dir.normalize_in_place(); float errs[ASTC_LDR_MAX_BLOCK_PIXELS]; float delta_dots[ASTC_LDR_MAX_BLOCK_PIXELS]; for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) { vec4F ofs(pixel_stats.m_pixels_f[i] - pixel_stats.m_mean_f); float proj = dir.dot(ofs); vec4F proj_vec(pixel_stats.m_mean_f + proj * dir); vec4F delta_vec(pixel_stats.m_pixels_f[i] - proj_vec); delta_dots[i] = dir.dot(delta_vec); errs[i] = cem_has_alpha ? vec4F::dot_product(delta_vec, delta_vec) : vec4F::dot_product3(delta_vec, delta_vec); } uint32_t errs_indices[ASTC_LDR_MAX_BLOCK_PIXELS]; indirect_sort(pixel_stats.m_num_pixels, errs_indices, errs); memcpy(trial_blk_weights, pWeight_vals, total_weights); for (uint32_t i = 0; i < N; i++) { const uint32_t idx = errs_indices[pixel_stats.m_num_pixels - 1 - i]; int delta_to_apply = (delta_dots[idx] > 0.0f) ? 
1 : -1; trial_blk_weights[idx] = (uint8_t)apply_delta_to_bise_weight_val(weight_ise_range, trial_blk_weights[idx], delta_to_apply); } // i vec4F xl, xh; bool ls_res; if (cem_has_alpha) { ls_res = compute_least_squares_endpoints_4D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); } else { ls_res = compute_least_squares_endpoints_3D( pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); } if (!ls_res) break; if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. try_cem9_13_sp_or_dp( cem_index, -1, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
try_cem8_12( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } else { break; } } // pass } if (enc_params.m_endpoint_refinement_flag) { const uint32_t num_comps = cem_has_alpha ? 4 : 3; for (uint32_t c = 0; c < num_comps; c++) { uint8_t base_endpoint_vals[astc_helpers::MAX_CEM_ENDPOINT_VALS]; memcpy(base_endpoint_vals, pEndpoint_vals, total_endpoint_vals); for (int dl = -1; dl <= 1; dl++) { for (int dh = -1; dh <= 1; dh++) { if (!dl && !dh) continue; memcpy(trial_blk_endpoints, base_endpoint_vals, total_endpoint_vals); trial_blk_endpoints[c * 2 + 0] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_blk_endpoints[c * 2 + 0], dl); trial_blk_endpoints[c * 2 + 1] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_blk_endpoints[c * 2 + 1], dh); if (!use_blue_contraction) { const bool uses_blue_contraction = astc_helpers::used_blue_contraction(cem_index, trial_blk_endpoints, endpoint_ise_range); if (uses_blue_contraction) continue; } trial_blk_error = eval_solution( pixel_stats, cem_index, trial_blk_endpoints, endpoint_ise_range, trial_blk_weights, weight_ise_range, enc_params); if (trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } } // dh } // dl } } return cur_blk_error; } // rgb/rgba direct, or rgb/rgba base+offset, dual plane static uint64_t encode_cem8_12_9_13_dp( uint32_t cem_index, uint32_t ccs_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t 
weight_ise_range, uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error, bool use_blue_contraction, bool *pBase_ofs_clamped_flag) { assert(g_initialized); assert(ccs_index <= 3); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); if (pBase_ofs_clamped_flag) *pBase_ofs_clamped_flag = false; bool cem_has_alpha = false, cem_is_base_offset = false; switch (cem_index) { case astc_helpers::CEM_LDR_RGB_DIRECT: break; case astc_helpers::CEM_LDR_RGBA_DIRECT: cem_has_alpha = true; break; case astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET: cem_is_base_offset = true; break; case astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET: cem_is_base_offset = true; cem_has_alpha = true; break; default: assert(0); return false; } assert((ccs_index <= 2) || cem_has_alpha); const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const uint32_t total_weights = pixel_stats.m_num_pixels; // Remove influence of the 2nd plane's values, recalc principle axis on other values. 
vec4F flattened_pixels[ASTC_LDR_MAX_BLOCK_PIXELS]; for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) { flattened_pixels[i] = pixel_stats.m_pixels_f[i]; flattened_pixels[i][ccs_index] = 0.0f; if (!cem_has_alpha) flattened_pixels[i][3] = 0.0f; } vec4F flattened_pixels_mean(pixel_stats.m_mean_f); flattened_pixels_mean[ccs_index] = 0.0f; if (!cem_has_alpha) flattened_pixels_mean[3] = 0.0f; vec4F flattened_axis; if (!cem_has_alpha) flattened_axis = calc_pca_3D(pixel_stats.m_num_pixels, flattened_pixels, flattened_pixels_mean); else flattened_axis = calc_pca_4D(pixel_stats.m_num_pixels, flattened_pixels, flattened_pixels_mean); float best_l = BIG_FLOAT_VAL, best_h = -BIG_FLOAT_VAL; //int best_l_index = 0, best_h_index = 0; for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) { const vec4F px(flattened_pixels[c] - flattened_pixels_mean); float p = px.dot(flattened_axis); if (p < best_l) { best_l = p; //best_l_index = c; } if (p > best_h) { best_h = p; //best_h_index = c; } } // c #if 0 vec4F low_color_f(pixel_stats.m_pixels_f[best_l_index]), high_color_f(pixel_stats.m_pixels_f[best_h_index]); #else vec4F low_color_f, high_color_f; low_color_f = flattened_pixels_mean + flattened_axis * best_l; high_color_f = flattened_pixels_mean + flattened_axis * best_h; low_color_f.clamp(0.0f, 1.0f); high_color_f.clamp(0.0f, 1.0f); #endif low_color_f[ccs_index] = pixel_stats.m_min_f[ccs_index]; high_color_f[ccs_index] = pixel_stats.m_max_f[ccs_index]; uint8_t trial_blk_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS] = { 0 }; uint8_t trial_blk_weights0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint8_t trial_blk_weights1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint64_t trial_blk_error = UINT64_MAX; bool trial_used_blue_contraction = false; bool tried_used_blue_contraction = false; if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, 
trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, low_color_f, high_color_f, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error == UINT64_MAX) return cur_blk_error; if (trial_blk_error < cur_blk_error) { cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_blk_weights0, total_weights); memcpy(pWeight_vals1, trial_blk_weights1, total_weights); } vec4F flattened_pixels_min_f(pixel_stats.m_min_f); flattened_pixels_min_f[ccs_index] = 0; vec4F flattened_pixels_max_f(pixel_stats.m_max_f); flattened_pixels_max_f[ccs_index] = 0; for (uint32_t pass = 0; pass < enc_params.m_max_ls_passes; pass++) { vec4F xl, xh; // TODO: Switch between 4D or 3D if (!compute_least_squares_endpoints_4D( pixel_stats.m_num_pixels, 
trial_blk_weights0, get_ls_weights_ise(weight_ise_range), &xl, &xh, flattened_pixels, flattened_pixels_min_f, flattened_pixels_max_f)) { break; } color_rgba dec_l(0), dec_h(0); decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); xl[ccs_index] = dec_l[ccs_index] * (1.0f / 255.0f); xh[ccs_index] = dec_h[ccs_index] * (1.0f / 255.0f); if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error >= cur_blk_error) break; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_blk_weights0, total_weights); memcpy(pWeight_vals1, trial_blk_weights1, total_weights); } // pass const float ccs_bounds_min = pixel_stats.m_min_f[ccs_index]; const float ccs_bounds_max = pixel_stats.m_max_f[ccs_index]; float ccs_vals[ASTC_LDR_MAX_BLOCK_PIXELS]; if (ccs_bounds_min != ccs_bounds_max) { for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) ccs_vals[i] = pixel_stats.m_pixels_f[i][ccs_index]; for (uint32_t pass = 0; pass < enc_params.m_max_ls_passes; pass++) { float xl = 0.0f, xh = 0.0f; if (!compute_least_squares_endpoints_1D( pixel_stats.m_num_pixels, trial_blk_weights1, get_ls_weights_ise(weight_ise_range), &xl, &xh, ccs_vals, ccs_bounds_min, ccs_bounds_max)) { break; } color_rgba dec_l(0), dec_h(0); decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); vec4F vl, vh; for (uint32_t c = 0; c < 4; c++) { if (c == ccs_index) { vl[c] = xl; vh[c] = xh; } else { vl[c] = (float)dec_l[c] * (1.0f / 255.0f); vh[c] = (float)dec_h[c] * (1.0f / 255.0f); } } if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error >= cur_blk_error) break; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_blk_weights0, total_weights); memcpy(pWeight_vals1, trial_blk_weights1, total_weights); } // pass } if ((enc_params.m_total_weight_refine_passes) && ((weight_ise_range != astc_helpers::BISE_2_LEVELS) && (weight_ise_range != astc_helpers::BISE_64_LEVELS))) { weight_refiner refiner; refiner.init(weight_ise_range, pixel_stats.m_num_pixels, pWeight_vals0); for (uint32_t pass = 0; pass < enc_params.m_total_weight_refine_passes; pass++) { refiner.refine(pass, trial_blk_weights0); vec4F xl, xh; if (!compute_least_squares_endpoints_4D( pixel_stats.m_num_pixels, trial_blk_weights0, get_ls_weights_ise(weight_ise_range), &xl, &xh, flattened_pixels, flattened_pixels_min_f, flattened_pixels_max_f)) { break; } color_rgba dec_l(0), dec_h(0); decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); 
xl[ccs_index] = dec_l[ccs_index] * (1.0f / 255.0f); xh[ccs_index] = dec_h[ccs_index] * (1.0f / 255.0f); if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, xl, xh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error >= cur_blk_error) continue; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_blk_weights0, total_weights); memcpy(pWeight_vals1, trial_blk_weights1, total_weights); } // pass if (ccs_bounds_min != ccs_bounds_max) { refiner.init(weight_ise_range, pixel_stats.m_num_pixels, pWeight_vals1); for (uint32_t pass = 0; pass < WEIGHT_REFINER_MAX_PASSES; pass++) { refiner.refine(pass, trial_blk_weights1); float xl = 0.0f, xh = 0.0f; if (!compute_least_squares_endpoints_1D( pixel_stats.m_num_pixels, trial_blk_weights1, get_ls_weights_ise(weight_ise_range), &xl, &xh, ccs_vals, ccs_bounds_min, ccs_bounds_max)) { break; } color_rgba dec_l(0), dec_h(0); decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); vec4F vl, vh; for (uint32_t c = 0; c < 4; c++) { if (c == ccs_index) { vl[c] = xl; vh[c] = xh; } else { vl[c] = (float)dec_l[c] * (1.0f / 255.0f); vh[c] = (float)dec_h[c] * (1.0f / 255.0f); } } bool did_improve_res = false; if (cem_is_base_offset) { bool tried_base_ofs_clamped = false; did_improve_res = try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); BASISU_NOTE_UNUSED(did_improve_res); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. 
did_improve_res = try_cem9_13_sp_or_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) *pBase_ofs_clamped_flag = true; } } else { did_improve_res = try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); if (tried_used_blue_contraction) { // Try without blue contraction for a minor gain. did_improve_res = try_cem8_12_dp( cem_index, ccs_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, vl, vh, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); } } if (trial_blk_error >= cur_blk_error) continue; cur_blk_error = trial_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals0, trial_blk_weights0, total_weights); memcpy(pWeight_vals1, trial_blk_weights1, total_weights); } // pass } } return cur_blk_error; } // base scale rgb/rgba // returns true if improved static bool try_cem6_10( uint32_t cem_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, float scale, float low_a_f, const vec4F& high_color_f, uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals, uint64_t& trial_blk_error) { assert(g_initialized); assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A)); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= 
astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 }; uint8_t trial_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; cem_encode_ldr_rgb_or_rgba_base_scale(cem_index, endpoint_ise_range, scale, low_a_f, high_color_f, trial_endpoint_vals); uint64_t trial_err = eval_solution( pixel_stats, cem_index, trial_endpoint_vals, endpoint_ise_range, trial_weight_vals, weight_ise_range, enc_params); bool improved_flag = false; if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); improved_flag = true; } const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); // TODO for (int delta = -1; delta <= 1; delta += 1) { if (!delta) continue; uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS]; memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, fixed_endpoint_vals[3], delta); trial_err = eval_solution( pixel_stats, cem_index, fixed_endpoint_vals, endpoint_ise_range, trial_weight_vals, weight_ise_range, enc_params); if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); improved_flag = true; } } return improved_flag; } static bool try_cem6_10_dp( uint32_t cem_index, uint32_t ccs_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, 
uint32_t endpoint_ise_range, uint32_t weight_ise_range, float scale, float low_a_f, const vec4F& high_color_f, uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error) { assert(g_initialized); assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A)); assert(ccs_index <= 3); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); assert(pTrial_weight_vals0 && pTrial_weight_vals1); uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 }; uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; cem_encode_ldr_rgb_or_rgba_base_scale(cem_index, endpoint_ise_range, scale, low_a_f, high_color_f, trial_endpoint_vals); uint64_t trial_err = eval_solution_dp( pixel_stats, cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range, trial_weight_vals0, trial_weight_vals1, weight_ise_range, enc_params); bool improved_flag = false; if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); improved_flag = true; } const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); for (int delta = -1; delta <= 1; delta += 1) { if (!delta) continue; uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS]; 
memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, fixed_endpoint_vals[3], delta); trial_err = eval_solution_dp( pixel_stats, cem_index, ccs_index, fixed_endpoint_vals, endpoint_ise_range, trial_weight_vals0, trial_weight_vals1, weight_ise_range, enc_params); if (trial_err < trial_blk_error) { trial_blk_error = trial_err; memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); improved_flag = true; } } return improved_flag; } // rgb/rgba base+scale static uint64_t encode_cem6_10( uint32_t cem_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, uint8_t* pEndpoint_vals, uint8_t* pWeight_vals, uint64_t cur_blk_error) { assert(g_initialized); assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A)); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A); const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); const uint32_t total_weights = pixel_stats.m_num_pixels; float best_l = BIG_FLOAT_VAL, best_h = -BIG_FLOAT_VAL; //int best_l_index = 0, best_h_index = 0; for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) { const 
vec3F px(pixel_stats.m_pixels_f[c]); float p = px.dot(pixel_stats.m_zero_rel_axis3); if (p < best_l) { best_l = p; //best_l_index = c; } if (p > best_h) { best_h = p; //best_h_index = c; } } // c const float MAX_S = 255.0f / 256.0f; const float EPS = 1e-6f; uint64_t trial_blk_error = UINT64_MAX; uint8_t trial_blk_endpoints[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 }; uint8_t trial_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint64_t best_blk_error = UINT64_MAX; uint8_t best_blk_endpoints[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 }; uint8_t best_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; vec3F low_color3_f(best_l * pixel_stats.m_zero_rel_axis3); low_color3_f.clamp(0.0f, 1.0f); vec3F high_color3_f(best_h * pixel_stats.m_zero_rel_axis3); high_color3_f.clamp(0.0f, 1.0f); float scale = MAX_S; float d = low_color3_f.dot(high_color3_f); float nrm = high_color3_f.norm(); if (nrm > 0.0f) scale = saturate(d / nrm); scale = minimum(scale, MAX_S); vec4F low_color_f(low_color3_f[0], low_color3_f[1], low_color3_f[2], pixel_stats.m_min_f[3]); vec4F high_color_f(high_color3_f[0], high_color3_f[1], high_color3_f[2], pixel_stats.m_max_f[3]); try_cem6_10( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, scale, low_color_f[3], high_color_f, trial_blk_endpoints, trial_blk_weights, trial_blk_error); best_blk_error = trial_blk_error; memcpy(best_blk_endpoints, trial_blk_endpoints, total_endpoint_vals); memcpy(best_blk_weights, trial_blk_weights, total_weights); const uint32_t NUM_PASSES = 2; for (uint32_t pass = 0; pass < NUM_PASSES; pass++) { color_rgba actual_l(0), actual_h(0); float actual_scale = 0; decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, actual_l, actual_h, &actual_scale); vec3F actual_high_f((float)actual_h[0], (float)actual_h[1], (float)actual_h[2]); actual_high_f *= (1.0f / 255.0f); // invalid on raw weights const auto& dequant_weights_tab = 
astc_helpers::g_dequant_tables.get_weight_tab(minimum(astc_helpers::BISE_32_LEVELS, weight_ise_range)).m_ISE_to_val; vec3F Pa(0.0f), Pb(0.0f); float A = 0.0f, B = 0.0f, C = 0.0f; for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) { const vec3F px(pixel_stats.m_pixels_f[i]); const int iw = (weight_ise_range == astc_helpers::BISE_64_LEVELS) ? trial_blk_weights[i] : dequant_weights_tab[trial_blk_weights[i]]; float t = (float)iw * (1.0f / 64.0f); float bi = t, ai = 1.0f - t; Pa += px * ai; Pb += px * bi; A += ai * ai; B += ai * bi; C += bi * bi; } vec3F new_high = actual_high_f; float new_scale = actual_scale; float h2 = actual_high_f.norm(); if ((h2 > EPS) && (A > EPS)) { new_scale = (Pa.dot(actual_high_f) / h2 - B) / A; new_scale = clamp(new_scale, 0.0f, MAX_S); } const float den = A * new_scale * new_scale + 2.0f * B * new_scale + C; if (den > EPS) { new_high = (Pb + Pa * new_scale) / den; } h2 = new_high.norm(); if ((h2 > EPS) && (A > EPS)) { new_scale = (Pa.dot(new_high) / h2 - B) / A; new_scale = clamp(new_scale, 0.0f, MAX_S); } try_cem6_10( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, new_scale, (float)actual_l[3] * (1.0f / 255.0f), vec4F(new_high[0], new_high[1], new_high[2], (float)actual_h[3] * (1.0f / 255.0f)), trial_blk_endpoints, trial_blk_weights, trial_blk_error); if (trial_blk_error >= best_blk_error) break; best_blk_error = trial_blk_error; memcpy(best_blk_endpoints, trial_blk_endpoints, total_endpoint_vals); memcpy(best_blk_weights, trial_blk_weights, total_weights); } // pass if (cem_has_alpha) { // Try to refine low a/high given the current selectors. 
float bounds_min = pixel_stats.m_min_f[3]; float bounds_max = pixel_stats.m_max_f[3]; if (bounds_min != bounds_max) { float a_vals[ASTC_LDR_MAX_BLOCK_PIXELS]; for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) a_vals[i] = pixel_stats.m_pixels_f[i][3]; const uint32_t TOTAL_PASSES = 1; for (uint32_t pass = 0; pass < TOTAL_PASSES; pass++) { float xl = 0.0f, xh = 0.0f; if (compute_least_squares_endpoints_1D( pixel_stats.m_num_pixels, best_blk_weights, get_ls_weights_ise(weight_ise_range), &xl, &xh, a_vals, bounds_min, bounds_max)) { color_rgba actual_l(0), actual_h(0); float actual_scale = 0; decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, actual_l, actual_h, &actual_scale); try_cem6_10( cem_index, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, actual_scale, xl, vec4F(actual_h[0], actual_h[1], actual_h[2], xh), trial_blk_endpoints, trial_blk_weights, trial_blk_error); if (trial_blk_error < best_blk_error) { best_blk_error = trial_blk_error; memcpy(best_blk_endpoints, trial_blk_endpoints, total_endpoint_vals); memcpy(best_blk_weights, trial_blk_weights, total_weights); } else { break; } } else { break; } } // pass } } if (best_blk_error < cur_blk_error) { cur_blk_error = best_blk_error; memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); memcpy(pWeight_vals, trial_blk_weights, total_weights); } return cur_blk_error; } // rgba base+scale, dual plane a, ccs_index must be 3 static uint64_t encode_cem10_dp_a( uint32_t cem_index, const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error) { assert(g_initialized); assert(cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A); assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && 
(endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); // RGB uses plane0, alpha plane1. So solve RGB first. uint8_t rgba_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 }; uint8_t rgb_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint8_t a_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; // First just solve RGB, single plane. uint64_t rgb_blk_error = encode_cem6_10( astc_helpers::CEM_LDR_RGB_BASE_SCALE, pixel_stats, enc_params, endpoint_ise_range, weight_ise_range, rgba_endpoint_vals, rgb_weight_vals, UINT64_MAX); assert(rgb_blk_error != UINT64_MAX); if (rgb_blk_error == UINT64_MAX) return cur_blk_error; const auto& endpoint_quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise; rgba_endpoint_vals[4] = endpoint_quant_tab[pixel_stats.m_min[3]]; rgba_endpoint_vals[5] = endpoint_quant_tab[pixel_stats.m_max[3]]; uint64_t rgba_blk_error = eval_solution_dp( pixel_stats, cem_index, 3, rgba_endpoint_vals, endpoint_ise_range, rgb_weight_vals, a_weight_vals, weight_ise_range, enc_params); assert(rgba_blk_error != UINT64_MAX); if (rgba_blk_error < cur_blk_error) { cur_blk_error = rgba_blk_error; memcpy(pEndpoint_vals, rgba_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS); memcpy(pWeight_vals0, rgb_weight_vals, pixel_stats.m_num_pixels); memcpy(pWeight_vals1, a_weight_vals, pixel_stats.m_num_pixels); if (!cur_blk_error) return cur_blk_error; } float bounds_min = pixel_stats.m_min_f[3], bounds_max = pixel_stats.m_max_f[3]; if (bounds_min != bounds_max) { float a_vals[ASTC_LDR_MAX_BLOCK_PIXELS]; for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) a_vals[i] = pixel_stats.m_pixels_f[i][3]; const uint32_t TOTAL_PASSES = 2; uint8_t trial_rgba_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 }; uint8_t 
trial_rgb_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; uint8_t trial_a_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; for (uint32_t pass = 0; pass < TOTAL_PASSES; pass++) { float xl = 0.0f, xh = 0.0f; if (compute_least_squares_endpoints_1D( pixel_stats.m_num_pixels, pass ? trial_a_weight_vals : a_weight_vals, get_ls_weights_ise(weight_ise_range), &xl, &xh, a_vals, bounds_min, bounds_max)) { memcpy(trial_rgba_endpoint_vals, rgba_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS); trial_rgba_endpoint_vals[4] = precise_round_bise_endpoint_val(xl, endpoint_ise_range); trial_rgba_endpoint_vals[5] = precise_round_bise_endpoint_val(xh, endpoint_ise_range); uint64_t trial_rgba_blk_error = eval_solution_dp( pixel_stats, cem_index, 3, trial_rgba_endpoint_vals, endpoint_ise_range, trial_rgb_weight_vals, trial_a_weight_vals, weight_ise_range, enc_params); assert(trial_rgba_blk_error != UINT64_MAX); if (trial_rgba_blk_error < cur_blk_error) { cur_blk_error = trial_rgba_blk_error; memcpy(pEndpoint_vals, trial_rgba_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS); memcpy(pWeight_vals0, trial_rgb_weight_vals, pixel_stats.m_num_pixels); memcpy(pWeight_vals1, trial_a_weight_vals, pixel_stats.m_num_pixels); } else { break; } } else { break; } } // pass } return cur_blk_error; } // rgb/rgba base+scale, dual plane rgb (not a!) 
static uint64_t encode_cem6_10_dp_rgb(
	uint32_t cem_index, uint32_t ccs_index,
	const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error)
{
	// Dual plane base+scale encoder where the second plane is one of R/G/B (ccs_index 0..2).
	// Solves the block as single plane first, re-evaluates those endpoints as dual plane, then makes a
	// single attempt at refining the scale (s) and high color (H) from the resulting weights.
	// Outputs are only written when a candidate's error is strictly below cur_blk_error.
	// Returns the (possibly lowered) cur_blk_error.
	assert(g_initialized);
	assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
	assert(ccs_index <= 2);
	assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
	assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
	assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
	assert(pWeight_vals0 && pWeight_vals1);

	// First solve using a single plane, then we'll introduce the other plane's weights and tune the encoded H/s values
	uint8_t sp_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
	uint8_t sp_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };

	uint64_t sp_block_err = encode_cem6_10(
		cem_index,
		pixel_stats, enc_params,
		endpoint_ise_range, weight_ise_range,
		sp_endpoint_vals, sp_weight_vals, UINT64_MAX);

	assert(sp_block_err != UINT64_MAX);
	BASISU_NOTE_UNUSED(sp_block_err);

	// Now compute both plane's weights using the initial H/s values
	uint8_t trial_weights0_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
	uint8_t trial_weights1_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };

	uint64_t dp_blk_error = eval_solution_dp(
		pixel_stats, cem_index, ccs_index, sp_endpoint_vals, endpoint_ise_range,
		trial_weights0_vals, trial_weights1_vals, weight_ise_range, enc_params);

	if (dp_blk_error < cur_blk_error)
	{
		cur_blk_error = dp_blk_error;
		memcpy(pEndpoint_vals, sp_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS);
		memcpy(pWeight_vals0, trial_weights0_vals, pixel_stats.m_num_pixels);
		memcpy(pWeight_vals1, trial_weights1_vals, pixel_stats.m_num_pixels);

		// Zero error cannot be improved on.
		if (!cur_blk_error)
			return cur_blk_error;
	}

	// Compute refined H/s values using the current weights.
	const float MAX_S = 255.0f / 256.0f;
	const float EPS = 1e-6f;

	vec3F Pa(0.0f); // (Pa_r,Pa_g,Pa_b)
	vec3F Pb(0.0f); // (Pb_r,Pb_g,Pb_b)
	float A[3] = { 0 }, B[3] = { 0 }, C[3] = { 0 }; // per-channel

	// invalid on raw weights
	const auto& dequant_weights_tab = astc_helpers::g_dequant_tables.get_weight_tab(minimum(astc_helpers::BISE_32_LEVELS, weight_ise_range)).m_ISE_to_val;

	for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++)
	{
		// Per-plane weights scaled by 1/64; raw weights are used directly, ISE weights go through the dequant table.
		float w0, w1;
		if (weight_ise_range == astc_helpers::BISE_64_LEVELS)
		{
			w0 = (float)trial_weights0_vals[i] * (1.0f / 64.0f);
			w1 = (float)trial_weights1_vals[i] * (1.0f / 64.0f);
		}
		else
		{
			w0 = dequant_weights_tab[trial_weights0_vals[i]] * (1.0f / 64.0f);
			w1 = dequant_weights_tab[trial_weights1_vals[i]] * (1.0f / 64.0f);
		}

		// Channel ccs_index takes the second plane's weight, the other two the first plane's.
		float w[3] = { w0, w0, w0 };
		w[ccs_index] = w1;

		const vec3F& p = pixel_stats.m_pixels_f[i];

		for (int c = 0; c < 3; ++c)
		{
			const float a = 1.0f - w[c];
			const float b = w[c];

			Pa[c] += a * p[c];
			Pb[c] += b * p[c];
			A[c] += a * a;
			B[c] += a * b;
			C[c] += b * b;
		} // c
	} // i

	color_rgba actual_l(0), actual_h(0);
	float actual_scale = 0;
	decode_endpoints(cem_index, sp_endpoint_vals, endpoint_ise_range, actual_l, actual_h, &actual_scale);

	// Decoded high color, normalized to [0,1].
	vec3F H((float)actual_h[0], (float)actual_h[1], (float)actual_h[2]);
	H *= (1.0f / 255.0f);

	// Solve the shared scale s for the current H from the per-channel sums.
	const float S1 = H[0] * Pa[0] + H[1] * Pa[1] + H[2] * Pa[2];
	float S2 = 0.0f, S3 = 0.0f;
	for (int c = 0; c < 3; c++)
	{
		const float H2 = H[c] * H[c];
		S2 += H2 * A[c];
		S3 += H2 * B[c];
	}

	float new_s = actual_scale;
	if (S2 > EPS)
		new_s = (S1 - S3) / S2;

	new_s = clamp(new_s, 0.0f, MAX_S);

	// Solve each channel of H for that scale; a channel with a near-zero denominator is set to 0.
	vec3F new_H(0.0f);
	for (int c = 0; c < 3; ++c)
	{
		const float den = A[c] * new_s * new_s + 2.0f * B[c] * new_s + C[c];
		float Hc = 0.0f;
		if (den > EPS)
		{
			const float num = Pb[c] + new_s * Pa[c];
			Hc = num / den;
		}
		new_H[c] = Hc;
	}

	// try_cem6_10_dp() writes the caller's outputs and lowers cur_blk_error itself if the refined values are better.
	bool improved_flag = try_cem6_10_dp(
		cem_index, ccs_index,
		pixel_stats, enc_params,
		endpoint_ise_range, weight_ise_range,
		new_s, (float)actual_l[3] * (1.0f / 255.0f), vec4F(new_H[0], new_H[1], new_H[2], (float)actual_h[3] * (1.0f / 255.0f)),
		pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error);
	(void)improved_flag;

	return cur_blk_error;
}

// dispatcher
// Routes to the encoder matching cem_index and single/dual plane mode; ccs_index >= 0 selects dual
// plane. *pBase_ofs_clamped_flag (if non-null) is reset to false here and is only forwarded to the
// RGB/RGBA direct and base+offset encoders. Returns the selected encoder's result, or UINT64_MAX if
// cem_index matches no case.
uint64_t cem_encode_pixels(
	uint32_t cem_index, int ccs_index,
	const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error,
	bool use_blue_contraction, bool *pBase_ofs_clamped_flag)
{
	assert(g_initialized);
	assert((ccs_index >= -1) && (ccs_index <= 3));
	assert(astc_helpers::is_cem_ldr(cem_index));
	assert(pEndpoint_vals);
	assert(pWeight_vals0);

	const bool dual_plane = (ccs_index >= 0);

	if (pBase_ofs_clamped_flag)
		*pBase_ofs_clamped_flag = false;

	uint64_t blk_error = UINT64_MAX;

	switch (cem_index)
	{
	case astc_helpers::CEM_LDR_LUM_DIRECT:
	{
		// Single plane only.
		assert(!dual_plane);

		blk_error = encode_cem0_4(
			cem_index,
			pixel_stats, enc_params,
			endpoint_ise_range, weight_ise_range,
			pEndpoint_vals, pWeight_vals0, cur_blk_error);

		break;
	}
	case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT:
	{
		if (dual_plane)
		{
			// Only alpha is accepted as the second plane here.
			assert(ccs_index == 3);
			assert(pWeight_vals1);

			blk_error = encode_cem4_dp_a(
				cem_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error);
		}
		else
		{
			blk_error = encode_cem0_4(
				cem_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, cur_blk_error);
		}
		break;
	}
	case astc_helpers::CEM_LDR_RGB_DIRECT:
	case astc_helpers::CEM_LDR_RGBA_DIRECT:
	case astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET:
	case astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET:
	{
		if (dual_plane)
		{
			assert(pWeight_vals1);

			blk_error = encode_cem8_12_9_13_dp(
				cem_index, ccs_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error, use_blue_contraction, pBase_ofs_clamped_flag);
		}
		else
		{
			blk_error = encode_cem8_12_9_13(
				cem_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, cur_blk_error, use_blue_contraction, pBase_ofs_clamped_flag);
		}
		break;
	}
	case astc_helpers::CEM_LDR_RGB_BASE_SCALE:
	{
		if (dual_plane)
		{
			// No alpha in this CEM, so the second plane must be R, G or B.
			assert(ccs_index <= 2);
			assert(pWeight_vals1);

			blk_error = encode_cem6_10_dp_rgb(
				cem_index, ccs_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error);
		}
		else
		{
			blk_error = encode_cem6_10(
				cem_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, cur_blk_error);
		}
		break;
	}
	case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
	{
		if (dual_plane)
		{
			assert(pWeight_vals1);

			// Alpha as the second plane has its own encoder; R/G/B share the dual plane rgb one.
			if (ccs_index == 3)
			{
				blk_error = encode_cem10_dp_a(
					cem_index,
					pixel_stats, enc_params,
					endpoint_ise_range, weight_ise_range,
					pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error);
			}
			else
			{
				blk_error = encode_cem6_10_dp_rgb(
					cem_index, ccs_index,
					pixel_stats, enc_params,
					endpoint_ise_range, weight_ise_range,
					pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error);
			}
		}
		else
		{
			blk_error = encode_cem6_10(
				cem_index,
				pixel_stats, enc_params,
				endpoint_ise_range, weight_ise_range,
				pEndpoint_vals, pWeight_vals0, cur_blk_error);
		}
		break;
	}
	default:
	{
		assert(0);
		break;
	}
	}

	return blk_error;
}

//---------------------------------------------------------------------------------------------

// Single plane surrogate evaluation: for each pixel, computes a weight by projecting the pixel onto
// the l -> h segment (clamped to [0,1]) and stores it in pWeights0. Unless cFlagNoError is set, also
// accumulates the component-weighted squared error of the interpolated color and returns it
// (otherwise 0 is returned). With cFlagDisableQuant the weights are continuous; otherwise they are
// snapped to multiples of 1/(num_weight_levels - 1).
float surrogate_evaluate_rgba_sp(const pixel_stats_t& ps, const vec4F& l, const vec4F& h, float* pWeights0, uint32_t num_weight_levels, const cem_encode_params& enc_params, uint32_t flags)
{
	assert(g_initialized);
	assert((ps.m_num_pixels) && (ps.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
	assert(pWeights0);

	const float wr = (float)enc_params.m_comp_weights[0], wg = (float)enc_params.m_comp_weights[1],
		wb = (float)enc_params.m_comp_weights[2], wa = (float)enc_params.m_comp_weights[3];

	float total_err = 0;

	const bool compute_error = ((flags & cFlagNoError) == 0);

	float lr = l[0], lg = l[1], lb = l[2], la = l[3];
	float dr = h[0] - lr, dg = h[1] - lg, db = h[2] - lb, da = h[3] - la;

	float delta_col_nrm = dr * dr + dg * dg + db * db + da * da;

	if (flags & cFlagDisableQuant)
	{
		// The small constant keeps the divide finite when l == h.
		float f = (float)1.0f / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL);

		// Fold -dot(l, d) * f into one constant so the per-pixel weight is a dot product plus a constant.
		lr *= -dr; lg *= -dg; lb *= -db; la *= -da;

		dr *= f; dg *= f; db *= f; da *= f;

		float l_sum = (lr + lg + lb + la) * f;

		for (uint32_t i = 0; i < ps.m_num_pixels; i++)
		{
			const vec4F& p = ps.m_pixels_f[i];
			const float r = p[0], g = p[1], b = p[2], a = p[3];

			float w = r * dr + g * dg + b * db + a * da + l_sum;

			if (w < 0.0f)
				w = 0.0f;
			else if (w > 1.0f)
				w = 1.0f;

			pWeights0[i] = w;

			if (compute_error)
			{
				float one_minus_w = 1.0f - w;

				float dec_r = l[0] * one_minus_w + h[0] * w;
				float dec_g = l[1] * one_minus_w + h[1] * w;
				float dec_b = l[2] * one_minus_w + h[2] * w;
				float dec_a = l[3] * one_minus_w + h[3] * w;

				float diff_r = r - dec_r;
				float diff_g = g - dec_g;
				float diff_b = b - dec_b;
				float diff_a = a - dec_a;

				total_err += (wr * diff_r * diff_r) + (wg * diff_g * diff_g) + (wb * diff_b * diff_b) + (wa * diff_a * diff_a);
			}
		} // i
	}
	else
	{
		const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1);

		// Same projection, pre-scaled by (num_weight_levels - 1) so it can be rounded to a level index.
		float f = (float)(num_weight_levels - 1) / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL);

		lr *= -dr; lg *= -dg; lb *= -db; la *= -da;

		dr *= f; dg *= f; db *= f; da *= f;

		// The +.5 bias makes the floor below round to the nearest level.
		float l_sum_biased = (lr + lg + lb + la) * f + .5f;

		for (uint32_t i = 0; i < ps.m_num_pixels; i++)
		{
			const vec4F& p = ps.m_pixels_f[i];
			const float r = p[0], g = p[1], b = p[2], a = p[3];

			float w = (float)fast_floorf_int(r * dr + g * dg + b * db + a * da + l_sum_biased) * inv_weight_levels;

			if (w < 0.0f)
				w = 0.0f;
			else if (w > 1.0f)
				w = 1.0f;

			pWeights0[i] = w;

			if (compute_error)
			{
				float one_minus_w = 1.0f - w;

				float dec_r = l[0] * one_minus_w + h[0] * w;
				float dec_g = l[1] * one_minus_w + h[1] * w;
				float dec_b = l[2] * one_minus_w + h[2] * w;
				float dec_a = l[3] * one_minus_w + h[3] * w;

				float diff_r = r - dec_r;
				float diff_g = g - dec_g;
				float diff_b = b - dec_b;
				float diff_a = a - dec_a;

				total_err += (wr * diff_r * diff_r) + (wg * diff_g * diff_g) + (wb * diff_b * diff_b) + (wa * diff_a * diff_a);
			}
		} // i
	}

	return total_err;
}

float surrogate_evaluate_rgba_dp(uint32_t ccs_index, const pixel_stats_t& ps, const vec4F& l, const vec4F& h, float* pWeights0, float* pWeights1, uint32_t num_weight_levels, const cem_encode_params& enc_params, uint32_t flags)
{
	assert(g_initialized);
	assert((ccs_index >= 0) && (ccs_index <= 3));
	assert((ps.m_num_pixels) && (ps.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
	assert(pWeights0 && pWeights1);

	const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1);

	const uint32_t c0 = (ccs_index + 1) & 3, c1 = (ccs_index + 2) & 3, c2 = (ccs_index + 3) & 3;

	const float orig_lx = l[c0], orig_ly = l[c1], orig_lz = l[c2], orig_lw = l[ccs_index];
	const float orig_hx = h[c0], orig_hy = h[c1], orig_hz = h[c2], orig_hw = h[ccs_index];

	const float wx = (float)enc_params.m_comp_weights[c0], wy = (float)enc_params.m_comp_weights[c1],
		wz = (float)enc_params.m_comp_weights[c2], ww = (float)enc_params.m_comp_weights[ccs_index];

	float total_err = 0;

	const bool compute_error = ((flags & cFlagNoError) == 0);

	if (flags & cFlagDisableQuant)
	{
		// Plane 0
		{
			float dx = orig_hx - orig_lx, dy = orig_hy - orig_ly, dz = orig_hz - orig_lz;

			float delta_col_nrm = dx * dx + dy * dy + dz * dz;

			float f = (float)1.0f / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL);

			float lx = orig_lx, ly = orig_ly, lz = orig_lz;

			lx *= -dx; ly *= -dy; lz *= -dz;

			dx *= f; dy *= f; dz *= f;

			float l_sum = (lx + ly + lz) * f;

			for (uint32_t i = 0; i < ps.m_num_pixels; i++)
			{
				const vec4F& p = ps.m_pixels_f[i];
				const float x = p[c0], y = p[c1], z = p[c2];

				float weight = x * dx
+ y * dy + z * dz + l_sum; if (weight < 0.0f) weight = 0.0f; else if (weight > 1.0f) weight = 1.0f; pWeights0[i] = weight; if (compute_error) { float one_minus_weight = 1.0f - weight; float dec_x = orig_lx * one_minus_weight + orig_hx * weight; float dec_y = orig_ly * one_minus_weight + orig_hy * weight; float dec_z = orig_lz * one_minus_weight + orig_hz * weight; float diff_x = x - dec_x; float diff_y = y - dec_y; float diff_z = z - dec_z; total_err += (wx * diff_x * diff_x) + (wy * diff_y * diff_y) + (wz * diff_z * diff_z); } } // i } // Plane 1 { const float delta_w = orig_hw - orig_lw; const float f = (fabsf(delta_w) > REALLY_SMALL_FLOAT_VAL) ? (1.0f / delta_w) : 0.0f; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { const vec4F& p = ps.m_pixels_f[i]; const float w = p[ccs_index]; float weight = (w - orig_lw) * f; if (weight < 0.0f) weight = 0.0f; else if (weight > 1.0f) weight = 1.0f; pWeights1[i] = weight; if (compute_error) { // Error for DP here is 0 if there's no quant and L/H are sufficient to cover the entire span. 
if ((w < orig_lw) || (w > orig_hw)) { float one_minus_weight = 1.0f - weight; float dec_w = orig_lw * one_minus_weight + orig_hw * weight; float diff_w = w - dec_w; total_err += (ww * diff_w * diff_w); } } } // i } } else { // Plane 0 { float dx = orig_hx - orig_lx, dy = orig_hy - orig_ly, dz = orig_hz - orig_lz; float delta_col_nrm = dx * dx + dy * dy + dz * dz; float f = (float)(num_weight_levels - 1) / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL); float lx = orig_lx, ly = orig_ly, lz = orig_lz; lx *= -dx; ly *= -dy; lz *= -dz; dx *= f; dy *= f; dz *= f; float l_sum_biased = (lx + ly + lz) * f + .5f; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { const vec4F& p = ps.m_pixels_f[i]; const float x = p[c0], y = p[c1], z = p[c2]; float weight = (float)fast_floorf_int(x * dx + y * dy + z * dz + l_sum_biased) * inv_weight_levels; if (weight < 0.0f) weight = 0.0f; else if (weight > 1.0f) weight = 1.0f; pWeights0[i] = weight; if (compute_error) { float one_minus_weight = 1.0f - weight; float dec_x = orig_lx * one_minus_weight + orig_hx * weight; float dec_y = orig_ly * one_minus_weight + orig_hy * weight; float dec_z = orig_lz * one_minus_weight + orig_hz * weight; float diff_x = x - dec_x; float diff_y = y - dec_y; float diff_z = z - dec_z; total_err += (wx * diff_x * diff_x) + (wy * diff_y * diff_y) + (wz * diff_z * diff_z); } } // i } // Plane 1 { const float delta_w = orig_hw - orig_lw; const float f = (fabs(delta_w) > REALLY_SMALL_FLOAT_VAL) ? 
((float)(num_weight_levels - 1) / delta_w) : 0.0f; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { const vec4F& p = ps.m_pixels_f[i]; const float w = p[ccs_index]; float weight = (float)fast_floorf_int((w - orig_lw) * f + .5f) * inv_weight_levels; if (weight < 0.0f) weight = 0.0f; else if (weight > 1.0f) weight = 1.0f; pWeights1[i] = weight; if (compute_error) { float one_minus_weight = 1.0f - weight; float dec_w = orig_lw * one_minus_weight + orig_hw * weight; float diff_w = w - dec_w; total_err += (ww * diff_w * diff_w); } } // i } } return total_err; } //--------------------------------------------------------------------------------------------- float surrogate_quant_endpoint_val(float e, uint32_t num_endpoint_levels, uint32_t flags) { assert((e >= 0.0f) && (e <= 1.0f)); if (flags & cFlagDisableQuant) return e; const float endpoint_levels_minus_1 = (float)(num_endpoint_levels - 1); const float inv_endpoint_levels = 1.0f / endpoint_levels_minus_1; return (float)fast_roundf_pos_int(e * endpoint_levels_minus_1) * inv_endpoint_levels; } vec4F surrogate_quant_endpoint(const vec4F& e, uint32_t num_endpoint_levels, uint32_t flags) { if (flags & cFlagDisableQuant) return e; const float endpoint_levels_minus_1 = (float)(num_endpoint_levels - 1); const float inv_endpoint_levels = 1.0f / endpoint_levels_minus_1; assert((e[0] >= 0.0f) && (e[0] <= 1.0f)); assert((e[1] >= 0.0f) && (e[1] <= 1.0f)); assert((e[2] >= 0.0f) && (e[2] <= 1.0f)); assert((e[3] >= 0.0f) && (e[3] <= 1.0f)); vec4F res; res[0] = (float)fast_roundf_pos_int(e[0] * endpoint_levels_minus_1) * inv_endpoint_levels; res[1] = (float)fast_roundf_pos_int(e[1] * endpoint_levels_minus_1) * inv_endpoint_levels; res[2] = (float)fast_roundf_pos_int(e[2] * endpoint_levels_minus_1) * inv_endpoint_levels; res[3] = (float)fast_roundf_pos_int(e[3] * endpoint_levels_minus_1) * inv_endpoint_levels; return res; } static uint32_t get_num_weight_levels(uint32_t weight_ise_range) { // astc_helpers::BISE_64_LEVELS=raw weights 
([0,64], NOT [0,63]) const uint32_t num_weight_levels = (weight_ise_range == astc_helpers::BISE_64_LEVELS) ? 65 : astc_helpers::get_ise_levels(weight_ise_range); return num_weight_levels; } //--------------------------------------------------------------------------------------------- static float cem_surrogate_encode_cem6_10_sp( uint32_t cem_index, const pixel_stats_t& ps, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, vec4F& low_endpoint, vec4F& high_endpoint, float &s, float* pWeights0, uint32_t flags) { const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A); float d_min = BIG_FLOAT_VAL, d_max = -BIG_FLOAT_VAL; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { const vec4F p(ps.m_pixels_f[i]); float dot = p.dot3(ps.m_zero_rel_axis3); if (dot < d_min) d_min = dot; if (dot > d_max) d_max = dot; } vec3F low_color3_f(d_min * ps.m_zero_rel_axis3); low_color3_f.clamp(0.0f, 1.0f); vec3F high_color3_f(d_max * ps.m_zero_rel_axis3); high_color3_f.clamp(0.0f, 1.0f); const float MAX_S = 255.0f / 256.0f; float scale = MAX_S; float d = low_color3_f.dot(high_color3_f); float nrm = high_color3_f.norm(); if (nrm > 0.0f) scale = d / nrm; scale = clamp(scale, 0.0f, MAX_S); scale = surrogate_quant_endpoint_val(scale * (256.0f / 255.0f), num_endpoint_levels, flags); s = scale; high_endpoint = surrogate_quant_endpoint(vec4F(high_color3_f[0], high_color3_f[1], high_color3_f[2], cem_has_alpha ? ps.m_max_f[3] : 1.0f), num_endpoint_levels, flags); low_endpoint = vec4F(high_endpoint[0] * scale, high_endpoint[1] * scale, high_endpoint[2] * scale, cem_has_alpha ? 
ps.m_min_f[3] : 1.0f); return surrogate_evaluate_rgba_sp(ps, low_endpoint, high_endpoint, pWeights0, num_weight_levels, enc_params, flags); } static float cem_surrogate_encode_cem6_10_dp( uint32_t cem_index, uint32_t ccs_index, const pixel_stats_t& ps, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, vec4F& low_endpoint, vec4F& high_endpoint, float& s, float* pWeights0, float* pWeights1, uint32_t flags) { const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A); BASISU_NOTE_UNUSED(cem_has_alpha); // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); assert(cem_has_alpha || (ccs_index <= 2)); float temp_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; cem_surrogate_encode_cem6_10_sp( (ccs_index == 3) ? (uint32_t)astc_helpers::CEM_LDR_RGB_BASE_SCALE : cem_index, ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, s, temp_weights, flags); if (ccs_index == 3) { low_endpoint[3] = ps.m_min_f[3]; high_endpoint[3] = ps.m_max_f[3]; } return surrogate_evaluate_rgba_dp(ccs_index, ps, low_endpoint, high_endpoint, pWeights0, pWeights1, num_weight_levels, enc_params, flags); } static float cem_surrogate_encode_cem8_12_sp( uint32_t cem_index, const pixel_stats_t& ps, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, vec4F& low_endpoint, vec4F& high_endpoint, float* pWeights0, uint32_t flags) { const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT); const uint32_t num_comps = cem_has_alpha ? 
4 : 3; float d_min = BIG_FLOAT_VAL, d_max = -BIG_FLOAT_VAL; uint32_t l_idx = 0, h_idx = 0; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { const vec4F p(ps.m_pixels_f[i] - ps.m_mean_f); float dot = cem_has_alpha ? p.dot(ps.m_mean_rel_axis4) : p.dot3(ps.m_mean_rel_axis3); if (dot < d_min) { d_min = dot; l_idx = i; } if (dot > d_max) { d_max = dot; h_idx = i; } } low_endpoint = surrogate_quant_endpoint(ps.m_pixels_f[l_idx], num_endpoint_levels, flags); high_endpoint = surrogate_quant_endpoint(ps.m_pixels_f[h_idx], num_endpoint_levels, flags); if (!cem_has_alpha) { low_endpoint[3] = 1.0f; high_endpoint[3] = 1.0f; } if (low_endpoint.dot(vec4F(1.0f)) > high_endpoint.dot(vec4F(1.0f))) std::swap(low_endpoint, high_endpoint); if ((flags & cFlagDisableQuant) == 0) { for (uint32_t i = 0; i < num_comps; i++) { if ((low_endpoint[i] == high_endpoint[i]) && (ps.m_min_f[i] != ps.m_max_f[i])) { const float inv_endpoint_levels = 1.0f / (float)(num_endpoint_levels - 1); float best_dist = BIG_FLOAT_VAL; float best_l = 0.0f, best_h = 0.0f; for (int ld = -2; ld <= 0; ld++) { float actual_l = saturate(low_endpoint[i] + (float)ld * inv_endpoint_levels); for (int hd = 0; hd <= 2; hd++) { float actual_h = saturate(high_endpoint[i] + (float)hd * inv_endpoint_levels); float v0 = lerp(actual_l, actual_h, 1.0f / 3.0f); float v1 = lerp(actual_l, actual_h, 2.0f / 3.0f); assert(v0 <= v1); float dist0 = v0 - ps.m_min_f[0]; float dist1 = v1 - ps.m_max_f[0]; float total_dist = dist0 * dist0 + dist1 * dist1; if (total_dist < best_dist) { best_dist = total_dist; best_l = actual_l; best_h = actual_h; } } // hd } // ld low_endpoint[i] = best_l; high_endpoint[i] = best_h; } } } return surrogate_evaluate_rgba_sp(ps, low_endpoint, high_endpoint, pWeights0, num_weight_levels, enc_params, flags); } static float cem_surrogate_encode_cem8_12_dp( uint32_t cem_index, uint32_t ccs_index, const pixel_stats_t& ps, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, vec4F& 
low_endpoint, vec4F& high_endpoint, float* pWeights0, float *pWeights1, uint32_t flags) { assert((ccs_index >= 0) && (ccs_index <= 3)); const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT); const uint32_t num_comps = cem_has_alpha ? 4 : 3; assert(cem_has_alpha || (ccs_index <= 2)); vec4F flattened_pixels[ASTC_LDR_MAX_BLOCK_PIXELS]; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { flattened_pixels[i] = ps.m_pixels_f[i]; flattened_pixels[i][ccs_index] = 0.0f; if (!cem_has_alpha) flattened_pixels[i][3] = 0.0f; } vec4F flattened_pixels_mean(ps.m_mean_f); flattened_pixels_mean[ccs_index] = 0.0f; if (!cem_has_alpha) flattened_pixels_mean[3] = 0.0f; // suppress bogus gcc warning on flattened_pixels #ifndef __clang__ #if defined(__GNUC__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #endif const vec4F flattened_axis(calc_pca_4D(ps.m_num_pixels, flattened_pixels, flattened_pixels_mean)); #ifndef __clang__ #if defined(__GNUC__) #pragma GCC diagnostic pop #endif #endif float best_dl = BIG_FLOAT_VAL, best_dh = -BIG_FLOAT_VAL; int best_l_index = 0, best_h_index = 0; for (uint32_t c = 0; c < ps.m_num_pixels; c++) { const vec4F px(flattened_pixels[c] - flattened_pixels_mean); float p = px.dot(flattened_axis); if (p < best_dl) { best_dl = p; best_l_index = c; } if (p > best_dh) { best_dh = p; best_h_index = c; } } // c vec4F low_color_f(ps.m_pixels_f[best_l_index]), high_color_f(ps.m_pixels_f[best_h_index]); low_color_f[ccs_index] = 0.0f; high_color_f[ccs_index] = 0.0f; if (!cem_has_alpha) { low_color_f[3] = 1.0f; high_color_f[3] = 1.0f; } if (low_color_f.dot(vec4F(1.0f)) > high_color_f.dot(vec4F(1.0f))) std::swap(low_color_f, high_color_f); low_color_f[ccs_index] = ps.m_min_f[ccs_index]; 
high_color_f[ccs_index] = ps.m_max_f[ccs_index]; if (!cem_has_alpha) { low_color_f[3] = 1.0f; high_color_f[3] = 1.0f; } low_endpoint = surrogate_quant_endpoint(low_color_f, num_endpoint_levels, flags); high_endpoint = surrogate_quant_endpoint(high_color_f, num_endpoint_levels, flags); if ((flags & cFlagDisableQuant) == 0) { for (uint32_t i = 0; i < num_comps; i++) { if ((low_endpoint[i] == high_endpoint[i]) && (ps.m_min_f[i] != ps.m_max_f[i])) { const float inv_endpoint_levels = 1.0f / (float)(num_endpoint_levels - 1); float best_dist = BIG_FLOAT_VAL; float best_l = 0.0f, best_h = 0.0f; for (int ld = -2; ld <= 0; ld++) { float actual_l = saturate(low_endpoint[i] + (float)ld * inv_endpoint_levels); for (int hd = 0; hd <= 2; hd++) { float actual_h = saturate(high_endpoint[i] + (float)hd * inv_endpoint_levels); float v0 = lerp(actual_l, actual_h, 1.0f / 3.0f); float v1 = lerp(actual_l, actual_h, 2.0f / 3.0f); assert(v0 <= v1); //if (v0 > v1) // std::swap(v0, v1); float dist0 = v0 - ps.m_min_f[0]; float dist1 = v1 - ps.m_max_f[0]; float total_dist = dist0 * dist0 + dist1 * dist1; if (total_dist < best_dist) { best_dist = total_dist; best_l = actual_l; best_h = actual_h; } } // hd } // ld low_endpoint[i] = best_l; high_endpoint[i] = best_h; } } } return surrogate_evaluate_rgba_dp(ccs_index, ps, low_endpoint, high_endpoint, pWeights0, pWeights1, num_weight_levels, enc_params, flags); } static float cem_surrogate_encode_cem0_4_sp_or_dp( uint32_t cem_index, int ccs_index, const pixel_stats_t& ps, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, vec4F& low_endpoint, vec4F& high_endpoint, float* pWeights0, float *pWeights1, uint32_t flags) { const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); const bool dual_plane = (ccs_index == 3); if (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT) { assert((ccs_index == -1) || (ccs_index == 3)); } else { assert(cem_index == astc_helpers::CEM_LDR_LUM_DIRECT); 
assert(ccs_index == -1); } const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); float lum_l = BIG_FLOAT_VAL, lum_h = -BIG_FLOAT_VAL; for (uint32_t i = 0; i < ps.m_num_pixels; i++) { const vec4F& px = ps.m_pixels_f[i]; float l = (px[0] + px[1] + px[2]) * (1.0f / 3.0f); lum_l = minimum(lum_l, l); lum_h = maximum(lum_h, l); } const float a_l = cem_has_alpha ? ps.m_min_f[3] : 1.0f; const float a_h = cem_has_alpha ? ps.m_max_f[3] : 1.0f; low_endpoint.set(lum_l, lum_l, lum_l, a_l); high_endpoint.set(lum_h, lum_h, lum_h, a_h); low_endpoint = surrogate_quant_endpoint(low_endpoint, num_endpoint_levels, flags); high_endpoint = surrogate_quant_endpoint(high_endpoint, num_endpoint_levels, flags); if (dual_plane) return surrogate_evaluate_rgba_dp(ccs_index, ps, low_endpoint, high_endpoint, pWeights0, pWeights1, num_weight_levels, enc_params, flags); else return surrogate_evaluate_rgba_sp(ps, low_endpoint, high_endpoint, pWeights0, num_weight_levels, enc_params, flags); } float cem_surrogate_encode_pixels( uint32_t cem_index, int ccs_index, const pixel_stats_t& ps, const cem_encode_params& enc_params, uint32_t endpoint_ise_range, uint32_t weight_ise_range, vec4F &low_endpoint, vec4F &high_endpoint, float &s, float* pWeights0, float* pWeights1, uint32_t flags) { assert(g_initialized); assert((ccs_index >= -1) && (ccs_index <= 3)); assert(astc_helpers::is_cem_ldr(cem_index)); assert(pWeights0 && pWeights1); const bool dual_plane = (ccs_index >= 0); switch (cem_index) { case astc_helpers::CEM_LDR_LUM_DIRECT: case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT: { return cem_surrogate_encode_cem0_4_sp_or_dp( cem_index, ccs_index, ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, pWeights0, pWeights1, flags); } case astc_helpers::CEM_LDR_RGB_BASE_SCALE: case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: { if (dual_plane) { return 
cem_surrogate_encode_cem6_10_dp( cem_index, ccs_index, ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, s, pWeights0, pWeights1, flags); } else { return cem_surrogate_encode_cem6_10_sp( cem_index, ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, s, pWeights0, flags); } break; } case astc_helpers::CEM_LDR_RGB_DIRECT: case astc_helpers::CEM_LDR_RGBA_DIRECT: { if (dual_plane) { return cem_surrogate_encode_cem8_12_dp( cem_index, ccs_index, ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, pWeights0, pWeights1, flags); } else { return cem_surrogate_encode_cem8_12_sp( cem_index, ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, pWeights0, flags); } break; } default: assert(0); break; } return BIG_FLOAT_VAL; } //--------------------------------------------------------------------------------------------- uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] = { { 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 }, { 0, 2, 1 }, { 1, 0, 2 }, { 2, 1, 0 } }; partition_pattern_vec::partition_pattern_vec() { clear(); } partition_pattern_vec::partition_pattern_vec(const partition_pattern_vec& other) { *this = other; } partition_pattern_vec::partition_pattern_vec(uint32_t width, uint32_t height, const uint8_t *pParts) : m_width(width), m_height(height) { if (pParts) { memcpy(m_parts, pParts, get_total()); } } void partition_pattern_vec::init(uint32_t width, uint32_t height, const uint8_t* pParts) { m_width = width; m_height = height; if (pParts) { const uint32_t num_texels = get_total(); memcpy(m_parts, pParts, num_texels); } } void partition_pattern_vec::clear() { m_width = 0; m_height = 0; memset(m_parts, 0, sizeof(m_parts)); } partition_pattern_vec& partition_pattern_vec::operator= (const partition_pattern_vec& rhs) { if (this == &rhs) return *this; m_width = rhs.m_width; m_height = rhs.m_height; memcpy(m_parts, rhs.m_parts, get_total()); return *this; } // misnamed- 
just SAD distance, not square int partition_pattern_vec::get_squared_distance(const partition_pattern_vec& other) const { const uint32_t total_pixels = get_total(); int total_dist = 0; for (uint32_t i = 0; i < total_pixels; i++) total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]); return total_dist; } partition_pattern_vec partition_pattern_vec::get_permuted2(uint32_t permute_index) const { assert(permute_index <= 1); const uint32_t total_pixels = get_total(); partition_pattern_vec res(m_width, m_height); for (uint32_t i = 0; i < total_pixels; i++) { assert(m_parts[i] <= 1); res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index); } return res; } partition_pattern_vec partition_pattern_vec::get_permuted3(uint32_t permute_index) const { assert(permute_index <= 5); const uint32_t total_pixels = get_total(); partition_pattern_vec res(m_width, m_height); for (uint32_t i = 0; i < total_pixels; i++) { assert(m_parts[i] <= 2); res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]]; } return res; } partition_pattern_vec partition_pattern_vec::get_canonicalized() const { partition_pattern_vec res(m_width, m_height); const uint32_t total_pixels = get_total(); int new_labels[4] = { -1, -1, -1, -1 }; uint32_t next_index = 0; for (uint32_t i = 0; i < total_pixels; i++) { uint32_t p = m_parts[i]; assert(p <= 3); if (new_labels[p] == -1) new_labels[p] = next_index++; res.m_parts[i] = (uint8_t)new_labels[p]; } return res; } // This requires no redundant patterns, i.e. all must be unique. 
bool vp_tree::init(uint32_t n, const partition_pattern_vec* pUnique_pats) { clear(); uint_vec pat_indices(n); for (uint32_t i = 0; i < n; i++) pat_indices[i] = i; std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); if (root_idx.first == -1) return false; m_nodes.resize(1); m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first]; m_nodes[0].m_point_index = root_idx.first; m_nodes[0].m_dist = root_idx.second; m_nodes[0].m_inner_node = -1; m_nodes[0].m_outer_node = -1; uint_vec inner_list, outer_list; inner_list.reserve(n / 2); outer_list.reserve(n / 2); for (uint32_t pat_index = 0; pat_index < n; pat_index++) { if ((int)pat_index == root_idx.first) continue; const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]); if (dist <= root_idx.second) inner_list.push_back(pat_index); else outer_list.push_back(pat_index); } if (inner_list.size()) { m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list); if (m_nodes[0].m_inner_node < 0) return false; } if (outer_list.size()) { m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list); if (m_nodes[0].m_outer_node < 0) return false; } return true; } void vp_tree::find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results) const { assert((num_subsets >= 2) && (num_subsets <= 3)); results.clear(); if (!m_nodes.size()) return; uint32_t num_desired_pats; partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS]; if (num_subsets == 2) { num_desired_pats = 2; for (uint32_t i = 0; i < 2; i++) desired_pats[i] = desired_pat.get_permuted2(i); } else { num_desired_pats = NUM_PART3_MAPPINGS; for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++) desired_pats[i] = desired_pat.get_permuted3(i); } #if 0 find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results); #else find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results); #endif } void 
vp_tree::find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) const { float best_dist_to_vantage = BIG_FLOAT_VAL; uint32_t best_mapping = 0; for (uint32_t i = 0; i < num_desired_pats; i++) { float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); if (dist < best_dist_to_vantage) { best_dist_to_vantage = dist; best_mapping = i; } } result r; r.m_dist = best_dist_to_vantage; r.m_mapping_index = best_mapping; r.m_pat_index = m_nodes[node_index].m_point_index; results.insert(r, max_results); if (best_dist_to_vantage <= m_nodes[node_index].m_dist) { // inner first if (m_nodes[node_index].m_inner_node >= 0) find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); if (m_nodes[node_index].m_outer_node >= 0) { if ((results.get_size() < max_results) || ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) ) { find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); } } } else { // outer first if (m_nodes[node_index].m_outer_node >= 0) find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); if (m_nodes[node_index].m_inner_node >= 0) { if ((results.get_size() < max_results) || ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) ) { find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); } } } } void vp_tree::find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) const { uint_vec node_stack; node_stack.reserve(16); node_stack.push_back(init_node_index); do { const uint32_t node_index = node_stack.back(); node_stack.pop_back(); float best_dist_to_vantage = 
BIG_FLOAT_VAL; uint32_t best_mapping = 0; for (uint32_t i = 0; i < num_desired_pats; i++) { float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); if (dist < best_dist_to_vantage) { best_dist_to_vantage = dist; best_mapping = i; } } result r; r.m_dist = best_dist_to_vantage; r.m_mapping_index = best_mapping; r.m_pat_index = m_nodes[node_index].m_point_index; results.insert(r, max_results); if (best_dist_to_vantage <= m_nodes[node_index].m_dist) { if (m_nodes[node_index].m_outer_node >= 0) { if ((results.get_size() < max_results) || ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) ) { node_stack.push_back(m_nodes[node_index].m_outer_node); } } // inner first if (m_nodes[node_index].m_inner_node >= 0) { node_stack.push_back(m_nodes[node_index].m_inner_node); } } else { if (m_nodes[node_index].m_inner_node >= 0) { if ((results.get_size() < max_results) || ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) ) { node_stack.push_back(m_nodes[node_index].m_inner_node); } } // outer first if (m_nodes[node_index].m_outer_node >= 0) { node_stack.push_back(m_nodes[node_index].m_outer_node); } } } while (!node_stack.empty()); } // returns the index of the new node, or -1 on error int vp_tree::create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices) { std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); if (root_idx.first < 0) return -1; m_nodes.resize(m_nodes.size() + 1); const uint32_t new_node_index = m_nodes.size_u32() - 1; m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first]; m_nodes[new_node_index].m_point_index = root_idx.first; m_nodes[new_node_index].m_dist = root_idx.second; m_nodes[new_node_index].m_inner_node = -1; m_nodes[new_node_index].m_outer_node = -1; uint_vec inner_list, outer_list; inner_list.reserve(pat_indices.size_u32() / 2); outer_list.reserve(pat_indices.size_u32() / 2); for 
(uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++) { const uint32_t pat_index = pat_indices[pat_indices_iter]; if ((int)pat_index == root_idx.first) continue; const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]); if (dist <= root_idx.second) inner_list.push_back(pat_index); else outer_list.push_back(pat_index); } if (inner_list.size()) m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list); if (outer_list.size()) m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list); return new_node_index; } // returns the pattern index of the vantage point (-1 on error), and the optimal split distance std::pair vp_tree::find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices) { BASISU_NOTE_UNUSED(num_unique_pats); const uint32_t n = pat_indices.size_u32(); assert(n); if (n == 1) return std::pair(pat_indices[0], 0.0f); float best_split_metric = -1.0f; int best_split_pat = -1; float best_split_dist = 0.0f; float best_split_var = 0.0f; basisu::vector< std::pair > dists; dists.reserve(n); float_vec float_dists; float_dists.reserve(n); for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++) { const uint32_t split_pat_index = pat_indices[pat_indices_iter]; assert(split_pat_index < num_unique_pats); const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index]; dists.resize(0); float_dists.resize(0); for (uint32_t j = 0; j < n; j++) { const uint32_t pat_index = pat_indices[j]; assert(pat_index < num_unique_pats); if (pat_index == split_pat_index) continue; float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); dists.emplace_back(std::pair(dist, pat_index)); float_dists.push_back(dist); } stats s; s.calc(float_dists.size_u32(), float_dists.data()); std::sort(dists.begin(), dists.end(), [](const auto& a, const auto& b) { return a.first < b.first; 
}); const uint32_t num_dists = dists.size_u32(); float split_dist = dists[num_dists / 2].first; if ((num_dists & 1) == 0) split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f; uint32_t total_inner = 0, total_outer = 0; for (uint32_t j = 0; j < n; j++) { const uint32_t pat_index = pat_indices[j]; if (pat_index == split_pat_index) continue; float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); if (dist <= split_dist) total_inner++; else total_outer++; } float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer); if ((split_metric > best_split_metric) || ((split_metric == best_split_metric) && (s.m_var > best_split_var))) { best_split_metric = split_metric; best_split_dist = split_dist; best_split_pat = split_pat_index; best_split_var = (float)s.m_var; } } return std::pair(best_split_pat, best_split_dist); } void partitions_data::init(uint32_t num_partitions, uint32_t block_width, uint32_t block_height, bool init_vp_tree) { assert((num_partitions >= 2) && (num_partitions <= 4)); //const uint32_t total_texels = block_width * block_height; m_width = block_width; m_height = block_height; m_num_partitions = num_partitions; m_part_vp_tree.clear(); for (uint32_t i = 0; i < 1024; i++) { m_part_seed_to_unique_index[i] = -1; m_unique_index_to_part_seed[i] = -1; } //const bool is_small_block = astc_helpers::is_small_block(block_width, block_height); partition_hash_map part_hash; part_hash.reserve(1024); m_total_unique_patterns = 0; clear_obj(m_partition_pat_histograms); for (uint32_t seed_index = 0; seed_index < astc_helpers::NUM_PARTITION_PATTERNS; seed_index++) { partition_pattern_vec pat; uint32_t part_hist[4] = { 0 }; pat.init(block_width, block_height); for (uint32_t y = 0; y < block_height; y++) { for (uint32_t x = 0; x < block_width; x++) { //const uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, m_num_partitions, is_small_block); const uint8_t p = 
(uint8_t)astc_helpers::get_precomputed_texel_partition(block_width, block_height, seed_index, x, y, num_partitions); assert((p < m_num_partitions) && (p < 4)); pat(x, y) = p; part_hist[p]++; } // x } // y bool skip_pat = false; for (uint32_t i = 0; i < m_num_partitions; i++) { if (!part_hist[i]) { skip_pat = true; break; } } if (skip_pat) continue; partition_pattern_vec std_pat(pat.get_canonicalized()); if (part_hash.contains(std_pat)) continue; if (num_partitions == 2) { assert(!part_hash.contains(pat)); assert(!part_hash.contains(pat.get_permuted2(1))); } else if (num_partitions == 3) { for (uint32_t i = 0; i < partition_pattern_vec::cMaxPermute3Index; i++) { assert(!part_hash.contains(pat.get_permuted3(i))); } } for (uint32_t c = 0; c < 4; c++) m_partition_pat_histograms[m_total_unique_patterns].m_hist[c] = (uint8_t)part_hist[c]; part_hash.insert(std_pat, std::make_pair(seed_index, m_total_unique_patterns)); m_part_seed_to_unique_index[seed_index] = (int16_t)m_total_unique_patterns; m_unique_index_to_part_seed[m_total_unique_patterns] = (int16_t)seed_index; m_partition_pats[m_total_unique_patterns] = pat; m_total_unique_patterns++; } // seed_index if (init_vp_tree) m_part_vp_tree.init(m_total_unique_patterns, m_partition_pats); } } // namespace astc_ldr } // namespace basisu