diff --git a/encoder/basisu_astc_ldr_common.cpp b/encoder/basisu_astc_ldr_common.cpp
new file mode 100644
index 0000000..a66969a
--- /dev/null
+++ b/encoder/basisu_astc_ldr_common.cpp
@@ -0,0 +1,5667 @@
+// File: basisu_astc_ldr_common.cpp
+// Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "basisu_enc.h"
+#include "../transcoder/basisu_astc_helpers.h"
+#include "../transcoder/basisu_astc_hdr_core.h"
+#include "basisu_astc_hdr_common.h"
+#include "basisu_astc_ldr_common.h"
+
+#define BASISU_ASTC_LDR_DEBUG_MSGS (1)
+
+namespace basisu
+{
+
+namespace astc_ldr
+{
+    // Set once global_init() has filled the tables below.
+    static bool g_initialized;
+    // Least-squares terms for raw weights 0..64; each entry is (w*w, (1-w)*w, (1-w)*(1-w), w) with w = iw/64.
+    static vec4F g_astc_ls_raw_weights_ise[ASTC_LDR_MAX_RAW_WEIGHTS];
+
+    // Forward blue-contraction transform for one endpoint.
+    // R and G become (2*c - encoded_b), clamped to [0,255]; B and A are copied from orig.
+    // did_clamp is set to true if R or G fell outside [0,255]; it is never cleared here,
+    // so the caller can accumulate the flag across several calls.
+    color_rgba blue_contract_enc(color_rgba orig, bool& did_clamp, int encoded_b)
+    {
+        color_rgba enc;
+
+        int tr = orig.r * 2 - encoded_b;
+        int tg = orig.g * 2 - encoded_b;
+        if ((tr < 0) || (tr > 255) || (tg < 0) || (tg > 255))
+            did_clamp = true;
+
+        enc.r = (uint8_t)basisu::clamp(tr, 0, 255);
+        enc.g = (uint8_t)basisu::clamp(tg, 0, 255);
+        enc.b = (uint8_t)orig.b;
+        enc.a = orig.a;
+        return enc;
+    }
+
+    // Inverse blue-contraction transform: R and G become (c + b) >> 1, B and A pass through.
+    color_rgba blue_contract_dec(int enc_r, int enc_g, int enc_b, int enc_a)
+    {
+        color_rgba dec;
+        dec.r = (uint8_t)((enc_r + enc_b) >> 1);
+        dec.g = (uint8_t)((enc_g + enc_b) >> 1);
+        dec.b = (uint8_t)enc_b;
+        dec.a = (uint8_t)enc_a;
+        return dec;
+    }
+
+    // Fills g_astc_ls_raw_weights_ise. Calls after the first return immediately (guarded by g_initialized).
+    void global_init()
+    {
+        if (g_initialized)
+            return;
+
+        // Precomputed weight constants used during least squares fit determination. For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w
+        for (uint32_t iw = 0; iw <= 64; iw++)
+        {
+            float w = (float)iw * (1.0f / 64.0f);
+
+            g_astc_ls_raw_weights_ise[iw].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w);
+        }
+
+        g_initialized = true;
+    }
+
+    // Returns the per-selector least-squares term table for the given weight range.
+    // The returned table is indexed by selector; each entry has the layout described in global_init().
+    static inline const vec4F* get_ls_weights_ise(uint32_t weight_ise_range)
+    {
+        assert((weight_ise_range <= astc_helpers::BISE_32_LEVELS) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
+
+        // astc_helpers::BISE_64_LEVELS indicates raw [0,64] weights (65 total), otherwise ISE weights (<= 32 levels total)
+        return (weight_ise_range == astc_helpers::BISE_64_LEVELS) ? g_astc_ls_raw_weights_ise : &g_astc_ls_weights_ise[weight_ise_range][0];
+    }
+
+    // Least-squares refit of a low/high endpoint pair for N scalar samples with fixed selectors.
+    //   pSelectors[i]        - selector of sample i, used to index pSelector_weights
+    //   pSelector_weights[s] - (w*w, (1-w)*w, (1-w)*(1-w), w) for selector s
+    //   pVals[i]             - sample values
+    // Solves the 2x2 normal equations by explicit inversion. Returns false, without writing
+    // *pXl / *pXh, when the system is near-singular (|det| < 1e-8).
+    // On success both outputs are passed through saturate(); if bounds_min == bounds_max the
+    // outputs are forced to that bound instead.
+    static bool compute_least_squares_endpoints_1D(
+        uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+        float* pXl, float* pXh, const float* pVals, float bounds_min, float bounds_max)
+    {
+        float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+        float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
+
+        for (uint32_t i = 0; i < N; i++)
+        {
+            const uint32_t sel = pSelectors[i];
+
+            z00 += pSelector_weights[sel][0];
+            z10 += pSelector_weights[sel][1];
+            z11 += pSelector_weights[sel][2];
+
+            float w = pSelector_weights[sel][3];
+
+            q00_r += w * pVals[i];
+            t_r += pVals[i];
+        }
+
+        // sum((1-w)*v) == sum(v) - sum(w*v)
+        q10_r = t_r - q00_r;
+
+        // The normal matrix is symmetric.
+        z01 = z10;
+
+        float det = z00 * z11 - z01 * z10;
+        if (fabs(det) < 1e-8f)
+            return false;
+
+        det = 1.0f / det;
+
+        // Inverse of the 2x2 matrix [z00 z01; z10 z11].
+        float iz00, iz01, iz10, iz11;
+        iz00 = z11 * det;
+        iz01 = -z01 * det;
+        iz10 = -z10 * det;
+        iz11 = z00 * det;
+
+        *pXh = (float)(iz00 * q00_r + iz01 * q10_r); *pXl = (float)(iz10 * q00_r + iz11 * q10_r);
+
+        float l = saturate(*pXl), h = saturate(*pXh);
+
+        // Degenerate input range: pin both endpoints to the single value.
+        if (bounds_min == bounds_max)
+        {
+            l = bounds_min;
+            h = bounds_max;
+        }
+
+        *pXl = l;
+        *pXh = h;
+
+        return true;
+    }
+
+    // Two-channel variant of compute_least_squares_endpoints_1D: the same 2x2 system is shared by
+    // both channels, and the saturate / equal-bounds handling is applied per channel.
+    static bool compute_least_squares_endpoints_2D(
+        uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+        vec2F* pXl, vec2F* pXh, const vec2F* pColors, const vec2F& bounds_min, const vec2F& bounds_max)
+    {
+        float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+        float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
+        float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
+
+        for (uint32_t i = 0; i < N; i++)
+        {
+            const uint32_t sel = pSelectors[i];
+
+            z00 += pSelector_weights[sel][0];
+            z10 += pSelector_weights[sel][1];
+            z11 += pSelector_weights[sel][2];
+
+            float w = pSelector_weights[sel][3];
+
+            q00_r += w * pColors[i][0];
+            t_r += pColors[i][0];
+
+            q00_g += w * pColors[i][1];
+            t_g += pColors[i][1];
+        }
+
+        q10_r = t_r - q00_r;
+        q10_g = t_g - q00_g;
+
+        z01 = z10;
+
+        float det = z00 * z11 - z01 * z10;
+        if (fabs(det) < 1e-8f)
+            return false;
+
+        det = 1.0f / det;
+
+        float iz00, iz01, iz10, iz11;
+        iz00 = z11 * det;
+        iz01 = -z01 * det;
+        iz10 = -z10 * det;
+        iz11 = z00 * det;
+
+        (*pXh)[0] = (float)(iz00 * q00_r + iz01 * q10_r); (*pXl)[0] = (float)(iz10 * q00_r + iz11 * q10_r);
+        (*pXh)[1] = (float)(iz00 * q00_g + iz01 * q10_g); (*pXl)[1] = (float)(iz10 * q00_g + iz11 * q10_g);
+
+        for (uint32_t c = 0; c < 2; c++)
+        {
+            float l = saturate((*pXl)[c]), h = saturate((*pXh)[c]);
+
+            if (bounds_min[c] == bounds_max[c])
+            {
+                l = bounds_min[c];
+                h = bounds_max[c];
+            }
+
+            (*pXl)[c] = l;
+            (*pXh)[c] = h;
+        }
+
+        return true;
+    }
+
+    // Three-channel (RGB) variant of compute_least_squares_endpoints_1D. Only components 0..2 of
+    // pColors are fitted; component 3 of both outputs is written as 0.
+    static bool compute_least_squares_endpoints_3D(
+        uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+        vec4F* pXl, vec4F* pXh, const vec4F* pColors, const vec4F& bounds_min, const vec4F& bounds_max)
+    {
+        float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+        float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
+        float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
+        float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
+
+        for (uint32_t i = 0; i < N; i++)
+        {
+            const uint32_t sel = pSelectors[i];
+
+            z00 += pSelector_weights[sel][0];
+            z10 += pSelector_weights[sel][1];
+            z11 += pSelector_weights[sel][2];
+
+            float w = pSelector_weights[sel][3];
+
+            q00_r += w * pColors[i][0];
+            t_r += pColors[i][0];
+
+            q00_g += w * pColors[i][1];
+            t_g += pColors[i][1];
+
+            q00_b += w * pColors[i][2];
+            t_b += pColors[i][2];
+        }
+
+        q10_r = t_r - q00_r;
+        q10_g = t_g - q00_g;
+        q10_b = t_b - q00_b;
+
+        z01 = z10;
+
+        float det = z00 * z11 - z01 * z10;
+        if (fabs(det) < 1e-8f)
+            return false;
+
+        det = 1.0f / det;
+
+        float iz00, iz01, iz10, iz11;
+        iz00 = z11 * det;
+        iz01 = -z01 * det;
+        iz10 = -z10 * det;
+        iz11 = z00 * det;
+
+        (*pXh)[0] = (float)(iz00 * q00_r + iz01 * q10_r); (*pXl)[0] = (float)(iz10 * q00_r + iz11 * q10_r);
+        (*pXh)[1] = (float)(iz00 * q00_g + iz01 * q10_g); (*pXl)[1] = (float)(iz10 * q00_g + iz11 * q10_g);
+        (*pXh)[2] = (float)(iz00 * q00_b + iz01 * q10_b); (*pXl)[2] = (float)(iz10 * q00_b + iz11 * q10_b);
+
+        (*pXh)[3] = 0;
+        (*pXl)[3] = 0;
+
+        for (uint32_t c = 0; c < 3; c++)
+        {
+            float l = saturate((*pXl)[c]), h = saturate((*pXh)[c]);
+
+            if (bounds_min[c] == bounds_max[c])
+            {
+                l = bounds_min[c];
+                h = bounds_max[c];
+            }
+
+            (*pXl)[c] = l;
+            (*pXh)[c] = h;
+        }
+
+        return true;
+    }
+
+    // Four-channel (RGBA) variant of compute_least_squares_endpoints_1D.
+    static bool compute_least_squares_endpoints_4D(
+        uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights,
+        vec4F* pXl, vec4F* pXh, const vec4F* pColors, const vec4F& bounds_min, const vec4F& bounds_max)
+    {
+        float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f;
+        float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f;
+        float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f;
+        float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f;
+        float q00_a = 0.0f, q10_a = 0.0f, t_a = 0.0f;
+
+        for (uint32_t i = 0; i < N; i++)
+        {
+            const uint32_t sel = pSelectors[i];
+            z00 += pSelector_weights[sel][0];
+            z10 += pSelector_weights[sel][1];
+            z11 += pSelector_weights[sel][2];
+
+            float w = pSelector_weights[sel][3];
+            q00_r += w * pColors[i][0]; t_r += pColors[i][0];
+            q00_g += w * pColors[i][1]; t_g += pColors[i][1];
+            q00_b += w * pColors[i][2]; t_b += pColors[i][2];
+            q00_a += w * pColors[i][3]; t_a += pColors[i][3];
+        }
+
+        q10_r = t_r - q00_r;
+        q10_g = t_g - q00_g;
+        q10_b = t_b - q00_b;
+        q10_a = t_a - q00_a;
+
+        z01 = z10;
+
+        float det = z00 * z11 - z01 * z10;
+        if (fabs(det) < 1e-8f)
+            return false;
+
+        det = 1.0f / det;
+
+        float iz00, iz01, iz10, iz11;
+        iz00 = z11 * det;
+        iz01 = -z01 * det;
+        iz10 = -z10 * det;
+        iz11 = z00 * det;
+
+        (*pXh)[0] = (float)(iz00 * q00_r + iz01 * q10_r); (*pXl)[0] = (float)(iz10 * q00_r + iz11 * q10_r);
+        (*pXh)[1] = (float)(iz00 * q00_g + iz01 * q10_g); (*pXl)[1] = (float)(iz10 * q00_g + iz11 * q10_g);
+        (*pXh)[2] = (float)(iz00 * q00_b + iz01 * q10_b); (*pXl)[2] = (float)(iz10 * q00_b + iz11 * q10_b);
+        (*pXh)[3] = (float)(iz00 * q00_a + iz01 * q10_a); (*pXl)[3] = (float)(iz10 * q00_a + iz11 * q10_a);
+
+        for (uint32_t c = 0; c < 4; c++)
+        {
+            float l = saturate((*pXl)[c]), h = saturate((*pXh)[c]);
+
+            if (bounds_min[c] == bounds_max[c])
+            {
+                l = bounds_min[c];
+                h = bounds_max[c];
+            }
+
+            (*pXl)[c] = l;
+            (*pXh)[c] = h;
+        }
+
+        return true;
+    }
+
+// Disabled helper: table-driven dequantization of n ISE weight values.
+#if 0
+    static void dequant_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights)
+    {
+        const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val;
+
+        for (uint32_t i = 0; i < n; i++)
+            pDst_raw_weights[i] = dequant_tab[pSrc_ise_vals[i]];
+    }
+#endif
+
+// Disabled helper: table-driven dequantization of n ISE endpoint values.
+#if 0
+    static void dequant_astc_endpoints(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights)
+    {
+        const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(from_ise_range).m_ISE_to_val;
+
+        for (uint32_t i = 0; i < n; i++)
+            pDst_raw_weights[i] = dequant_tab[pSrc_ise_vals[i]];
+    }
+#endif
+
+    // Steps an ISE weight value by `delta` positions in rank order and returns the resulting ISE value.
+    // The new rank is clamped to [0, num_ise_levels - 1]. delta == 0 returns ise_val untouched.
+    int apply_delta_to_bise_weight_val(uint32_t weight_ise_range, int ise_val, int delta)
+    {
+        if (delta == 0)
+            return ise_val;
+
+        uint32_t num_ise_levels = astc_helpers::get_ise_levels(weight_ise_range);
+
+        const auto& ISE_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_rank;
+        const auto& rank_to_ISE = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_rank_to_ISE;
+
+        int cur_rank = ISE_to_rank[ise_val];
+        int new_rank = basisu::clamp(cur_rank + delta, 0, (int)num_ise_levels - 1);
+
+        return rank_to_ISE[new_rank];
+    }
+
+    // v must be [0,1]
+    // converts to nearest ISE index with proper precise rounding
+    // Starts from the table quantization of round(v * 255) and also tries its -1/+1 neighbors
+    // (via apply_delta_to_bise_endpoint_val), keeping the candidate whose dequantized value / 255
+    // is closest to v. On ties the first candidate tried (lowest delta) wins.
+    static uint8_t precise_round_bise_endpoint_val(float v, uint32_t endpoint_ise_range)
+    {
+        assert((v >= 0) && (v <= 1.0f));
+
+        const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
+        const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val;
+
+        // Release builds skip the assert above, so clamp again defensively.
+        v = saturate(v);
+
+        const int iv = clamp((int)std::roundf(v * 255.0f), 0, 255);
+
+        uint8_t ise_index = 0;
+
+        float best_err = BIG_FLOAT_VAL;
+        for (int iscale_delta = -1; iscale_delta <= 1; iscale_delta++)
+        {
+            const int trial_ise_index = astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, quant_tab[iv], iscale_delta);
+
+            const float dequant_val = dequant_tab[trial_ise_index] * (1.0f / 255.0f);
+
+            const float dequant_err = fabs(dequant_val - v);
+            if (dequant_err < best_err)
+            {
+                best_err = dequant_err;
+                ise_index = (uint8_t)trial_ise_index;
+            }
+        } // iscale_delta
+
+        return ise_index;
+    }
+
+    // returns true if blue contraction was actually used
+    // note the encoded endpoints may be swapped
+    // TODO: Pass in vec4F l/h and let it more precisely quantize in here.
+    // Result of cem_encode_ldr_rgb_or_rgba_direct().
+    struct cem_encode_ldr_rgb_or_rgba_direct_result
+    {
+        bool m_is_blue_contracted;      // the packed endpoints use blue contraction
+        bool m_endpoints_are_swapped;   // the packed low/high order is reversed relative to the l/h inputs
+        bool m_any_degen;               // some channel packed to equal values although l and h differ in it
+    };
+
+    // Packs 8-bit endpoints l/h into ISE endpoint values for the RGB or RGBA direct CEMs.
+    // Writes 6 values (RGB) or 8 values (RGBA) to pEndpoint_vals, interleaved as
+    // (r0, r1, g0, g1, b0, b1[, a0, a1]).
+    // When try_blue_contract is set, blue contraction is attempted and kept only if the transform
+    // does not clamp and the dequantized RGB sums of the two endpoints differ. The endpoint order
+    // is then chosen so that the sum comparison (s1 >= s0 or s1 < s0) matches the blue contraction
+    // state that was decided on.
+    static cem_encode_ldr_rgb_or_rgba_direct_result cem_encode_ldr_rgb_or_rgba_direct(
+        uint32_t cem_index, uint32_t endpoint_ise_range, const color_rgba& l, const color_rgba& h, uint8_t* pEndpoint_vals,
+        bool try_blue_contract)
+    {
+        assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT));
+
+        cem_encode_ldr_rgb_or_rgba_direct_result res;
+
+        // Aliases into the result so the code below can use short names.
+        bool& endpoints_are_swapped = res.m_endpoints_are_swapped;
+        bool& any_degen = res.m_any_degen;
+        bool& is_blue_contracted = res.m_is_blue_contracted;
+
+        assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT));
+
+        const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT);
+
+        const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
+        const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val;
+
+        //const auto &ISE_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_rank;
+        //const auto &rank_to_ISE = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_rank_to_ISE;
+
+        color_rgba enc_l(l), enc_h(h);
+        endpoints_are_swapped = false;
+
+        is_blue_contracted = false;
+        if (try_blue_contract)
+        {
+            // The transform must use the blue values as they will be dequantized, not the ideal ones.
+            int enc_v4 = quant_tab[enc_l.b], enc_v5 = quant_tab[enc_h.b];
+            int dec_v4 = dequant_tab[enc_v4], dec_v5 = dequant_tab[enc_v5];
+
+            bool did_clamp = false;
+            enc_l = blue_contract_enc(h, did_clamp, dec_v5); // yes, they're swapped in the spec
+            enc_h = blue_contract_enc(l, did_clamp, dec_v4);
+
+            if (!did_clamp)
+            {
+                is_blue_contracted = true;
+                endpoints_are_swapped = true;
+            }
+            else
+            {
+                // Clamping would change the decoded colors, so fall back to the plain endpoints.
+                enc_l = l;
+                enc_h = h;
+            }
+        }
+
+        int enc_v0 = quant_tab[enc_l.r], enc_v2 = quant_tab[enc_l.g], enc_v4 = quant_tab[enc_l.b];
+        int enc_v1 = quant_tab[enc_h.r], enc_v3 = quant_tab[enc_h.g], enc_v5 = quant_tab[enc_h.b];
+
+        int enc_v6 = 0, enc_v7 = 0;
+        if (has_alpha)
+        {
+            enc_v6 = quant_tab[enc_l.a];
+            enc_v7 = quant_tab[enc_h.a];
+        }
+
+        // Degenerate: a channel quantized to a single value although the requested endpoints differ.
+        any_degen = false;
+        if ((enc_v0 == enc_v1) && (l.r != h.r))
+            any_degen = true;
+        if ((enc_v2 == enc_v3) && (l.g != h.g))
+            any_degen = true;
+        if ((enc_v4 == enc_v5) && (l.b != h.b))
+            any_degen = true;
+        if (has_alpha)
+        {
+            if ((enc_v6 == enc_v7) && (l.a != h.a))
+                any_degen = true;
+        }
+
+        int dec_v0 = dequant_tab[enc_v0], dec_v2 = dequant_tab[enc_v2], dec_v4 = dequant_tab[enc_v4];
+        int dec_v1 = dequant_tab[enc_v1], dec_v3 = dequant_tab[enc_v3], dec_v5 = dequant_tab[enc_v5];
+
+        // Dequantized RGB sums of the two packed endpoints; their ordering selects blue contraction.
+        int s0 = dec_v0 + dec_v2 + dec_v4;
+        int s1 = dec_v1 + dec_v3 + dec_v5;
+
+        bool should_swap = false;
+
+        if ((s1 == s0) && (is_blue_contracted))
+        {
+            // if sums are equal we can't use blue contraction at all, so undo it
+            enc_l = l;
+            enc_h = h;
+
+            is_blue_contracted = false;
+            endpoints_are_swapped = false;
+
+            enc_v0 = quant_tab[enc_l.r], enc_v2 = quant_tab[enc_l.g], enc_v4 = quant_tab[enc_l.b];
+            enc_v1 = quant_tab[enc_h.r], enc_v3 = quant_tab[enc_h.g], enc_v5 = quant_tab[enc_h.b];
+
+            dec_v0 = dequant_tab[enc_v0], dec_v2 = dequant_tab[enc_v2], dec_v4 = dequant_tab[enc_v4];
+            dec_v1 = dequant_tab[enc_v1], dec_v3 = dequant_tab[enc_v3], dec_v5 = dequant_tab[enc_v5];
+
+            if (has_alpha)
+            {
+                enc_v6 = quant_tab[enc_l.a];
+                enc_v7 = quant_tab[enc_h.a];
+            }
+
+            s0 = dec_v0 + dec_v2 + dec_v4;
+            s1 = dec_v1 + dec_v3 + dec_v5;
+        }
+
+        // Final packed order must satisfy: s1 >= s0 without blue contraction, s1 < s0 with it
+        // (this is what the _DEBUG check at the end verifies).
+        if (s1 >= s0)
+        {
+            if (is_blue_contracted)
+                should_swap = true;
+        }
+        else
+        {
+            if (!is_blue_contracted)
+                should_swap = true;
+        }
+
+        if (should_swap)
+        {
+            endpoints_are_swapped = !endpoints_are_swapped;
+
+            std::swap(enc_v0, enc_v1);
+            std::swap(enc_v2, enc_v3);
+            std::swap(enc_v4, enc_v5);
+            std::swap(enc_v6, enc_v7);
+        }
+
+        pEndpoint_vals[0] = (uint8_t)enc_v0;
+        pEndpoint_vals[1] = (uint8_t)enc_v1;
+
+        pEndpoint_vals[2] = (uint8_t)enc_v2;
+        pEndpoint_vals[3] = (uint8_t)enc_v3;
+
+        pEndpoint_vals[4] = (uint8_t)enc_v4;
+        pEndpoint_vals[5] = (uint8_t)enc_v5;
+
+        if (has_alpha)
+        {
+            pEndpoint_vals[6] = (uint8_t)enc_v6;
+            pEndpoint_vals[7] = (uint8_t)enc_v7;
+        }
+
+        #ifdef _DEBUG
+        {
+            // Re-derive the sums from the final packed values and confirm they agree with the flag.
+            int check_s0 = dequant_tab[enc_v0] + dequant_tab[enc_v2] + dequant_tab[enc_v4];
+            int check_s1 = dequant_tab[enc_v1] + dequant_tab[enc_v3] + dequant_tab[enc_v5];
+
+            if (check_s1 >= check_s0)
+            {
+                assert(!is_blue_contracted);
+            }
+            else
+            {
+                assert(is_blue_contracted);
+            }
+        }
+        #endif
+
+        return res;
+    }
+
+    // Cannot fail
+    // scale=1 cannot be packed
+    // Packs the base+scale CEMs: writes the quantized high RGB (h[0..2]) and the scale to
+    // pEndpoint_vals[0..3], and for the "plus two A" CEM also l_a and h[3] to pEndpoint_vals[4..5].
+    // The scale is multiplied by 256/255 before quantization (decode_endpoints() recovers it as
+    // dequantized value / 256). Each value is rounded to 8 bits, table-quantized, and then refined
+    // by trying the -1/0/+1 neighboring ISE values and keeping the closest dequantized match.
+    static void cem_encode_ldr_rgb_or_rgba_base_scale(
+        uint32_t cem_index, uint32_t endpoint_ise_range, float scale, float l_a, const vec4F& h, uint8_t* pEndpoint_vals)
+    {
+        assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
+        assert((scale >= 0.0f) && (scale < 1.0f));
+
+        const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A);
+
+        const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
+        const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val;
+
+        const uint32_t total_vals_to_pack = has_alpha ? 6 : 4;
+
+        float vals_to_pack[6] = { 0 };
+
+        vals_to_pack[0] = h[0];
+        vals_to_pack[1] = h[1];
+        vals_to_pack[2] = h[2];
+        vals_to_pack[3] = clamp(scale * (256.0f / 255.0f), 0.0f, 1.0f);
+
+        if (has_alpha)
+        {
+            vals_to_pack[4] = l_a;
+            vals_to_pack[5] = h[3];
+        }
+
+        for (uint32_t c = 0; c < total_vals_to_pack; c++)
+        {
+            const float v = vals_to_pack[c];
+            const int iv = clamp((int)std::roundf(v * 255.0f), 0, 255);
+
+            float best_err = BIG_FLOAT_VAL;
+            for (int iscale_delta = -1; iscale_delta <= 1; iscale_delta++)
+            {
+                const int trial_ise_index = astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, quant_tab[iv], iscale_delta);
+
+                const float dequant_val = dequant_tab[trial_ise_index] * (1.0f / 255.0f);
+
+                const float dequant_err = fabs(dequant_val - v);
+                if (dequant_err < best_err)
+                {
+                    best_err = dequant_err;
+                    pEndpoint_vals[c] = (uint8_t)trial_ise_index;
+                }
+            } // iscale_delta
+
+        } // c
+    }
+
+// Disabled helper: clamps val to the signed 6-bit range [-32, 31] and reports whether it clamped.
+#if 0
+    static int clamp6(int val, bool& was_clamped)
+    {
+        if (val < -32)
+        {
+            val = -32;
+            was_clamped = true;
+        }
+        else if (val > 31)
+        {
+            val = 31;
+            was_clamped = true;
+        }
+        return val;
+    }
+#endif
+
+    // returns true if blue contraction was used
+    // note the encoded endpoints may be swapped
+    // Result flags reported by cem_encode_ldr_rgb_or_rgba_base_offset().
+    struct rgb_base_offset_res
+    {
+        bool m_failed_flag;
+        bool m_used_blue_contraction;
+        bool m_blue_contraction_clamped;
+        bool m_delta_clamped;
+        bool m_any_degen;
+        bool m_endpoints_swapped;
+    };
+
+    // May fail if the tiebreaking logic isn't strong enough.
+    // Packs 8-bit endpoints orig_l/orig_h into ISE endpoint values for the RGB or RGBA
+    // base+offset CEMs by delegating to basist::astc_ldr_t::pack_base_offset().
+    // On pack failure the result has m_failed_flag set and the other flags are not meaningful.
+    // On success:
+    //   m_used_blue_contraction - determined from the packed values themselves
+    //   m_delta_clamped / m_endpoints_swapped - as reported by the packer
+    //   m_any_degen - some channel decodes to equal low/high values even though the requested
+    //                 endpoints differ in that channel
+    static rgb_base_offset_res cem_encode_ldr_rgb_or_rgba_base_offset(uint32_t cem_index, uint32_t endpoint_ise_range, const color_rgba& orig_l, const color_rgba& orig_h, uint8_t* pEndpoint_vals, bool use_blue_contract)
+    {
+        assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET));
+
+        const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET);
+
+        rgb_base_offset_res res;
+        res.m_failed_flag = false;
+        res.m_used_blue_contraction = false;
+        res.m_blue_contraction_clamped = false;
+        res.m_delta_clamped = false;
+        res.m_any_degen = false;
+        res.m_endpoints_swapped = false;
+
+        // NOTE(review): this flag is filled in by the packer but never copied into
+        // res.m_blue_contraction_clamped, which therefore always stays false. Left as-is because
+        // callers are not visible here - confirm whether the result field should mirror it.
+        bool blue_contraction_clamped = false;
+
+        bool status = basist::astc_ldr_t::pack_base_offset(
+            cem_index, endpoint_ise_range, pEndpoint_vals,
+            convert_to_basist_color_rgba(orig_l), convert_to_basist_color_rgba(orig_h),
+            use_blue_contract, true,
+            blue_contraction_clamped, res.m_delta_clamped, res.m_endpoints_swapped);
+
+        assert(status);
+
+        if (!status)
+        {
+            res.m_failed_flag = true;
+            return res;
+        }
+
+        // Verify the actual BC status by unpacking to be absolutely sure
+        res.m_used_blue_contraction = astc_helpers::used_blue_contraction(cem_index, pEndpoint_vals, endpoint_ise_range);
+
+        color_rgba dec_l, dec_h;
+        astc_ldr::decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_range, dec_l, dec_h);
+
+        const uint32_t num_comps = (has_alpha ? 4 : 3);
+        for (uint32_t c = 0; c < num_comps; c++)
+        {
+            // A channel whose requested endpoints are already equal cannot be degenerate.
+            // (This test was previously inverted, so differing channels were never examined.)
+            if (orig_l[c] == orig_h[c])
+                continue;
+
+            // Desired L/H are not equal, but packed are equal=degenerate pack (loss of freedom).
+            if (dec_l[c] == dec_h[c])
+            {
+                res.m_any_degen = true;
+                break;
+            }
+        } // c
+
+        return res;
+    }
+
+    // L or LA direct
+    // Quantizes the low/high luminance (and, for the LA CEM, low/high alpha) to ISE endpoint
+    // values using precise_round_bise_endpoint_val(). Writes 2 values (L) or 4 values (LA).
+    // a_l/a_h are ignored for the luminance-only CEM.
+    static void encode_cem0_4(uint32_t cem_index, float lum_l, float lum_h, float a_l, float a_h, uint32_t endpoint_ise_range, uint8_t* pEndpoints)
+    {
+        assert((cem_index == astc_helpers::CEM_LDR_LUM_DIRECT) || (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT));
+
+        const bool has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT);
+
+        pEndpoints[0] = precise_round_bise_endpoint_val(lum_l, endpoint_ise_range);
+        pEndpoints[1] = precise_round_bise_endpoint_val(lum_h, endpoint_ise_range);
+
+        if (has_alpha)
+        {
+            pEndpoints[2] = precise_round_bise_endpoint_val(a_l, endpoint_ise_range);
+            pEndpoints[3] = precise_round_bise_endpoint_val(a_h, endpoint_ise_range);
+        }
+    }
+
+    // Returned in ISE order
+    // Builds the palette of colors reachable between decoded endpoints l and h for every weight
+    // level of weight_ise_index, writing one color per level to pColors. Returns the level count.
+    // Endpoints are expanded to 16 bits before interpolation: (c << 8) | 0x80 when
+    // decode_mode_srgb is set, (c << 8) | c otherwise. The top 8 bits of the result are kept.
+    uint32_t get_colors(const color_rgba& l, const color_rgba& h, uint32_t weight_ise_index, color_rgba* pColors, bool decode_mode_srgb)
+    {
+        const uint32_t total_weights = astc_helpers::get_ise_levels(weight_ise_index);
+
+        for (uint32_t i = 0; i < total_weights; i++)
+        {
+            // Lerp weights are read starting at table index 1.
+            uint32_t w = basisu::g_ise_weight_lerps[weight_ise_index][1 + i];
+
+            for (uint32_t c = 0; c < 4; c++)
+            {
+                int le = l[c], he = h[c];
+
+                // TODO: Investigate alpha handling here vs. latest spec.
+                // https://raw.githubusercontent.com/KhronosGroup/DataFormat/refs/heads/main/astc.txt
+                // The safest thing to do may be to assume non-sRGB in the encoder. I don't know yet.
+                // How should alpha be handled here for lowest divergence from actual ASTC decoding hardware?
+                if (decode_mode_srgb)
+                {
+                    le = (le << 8) | 0x80;
+                    he = (he << 8) | 0x80;
+                }
+                else
+                {
+                    le = (le << 8) | le;
+                    he = (he << 8) | he;
+                }
+
+                uint32_t k = astc_helpers::weight_interpolate(le, he, w);
+
+                // See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt
+                // All channels including alpha >>8.
+                pColors[i][c] = (uint8_t)(k >> 8);
+            } // c
+        } // i
+
+        return total_weights;
+    }
+
+    // Returns 65 colors (NOT just 64 - 0-64 weight levels, so 65).
+    // Same interpolation as get_colors(), but for every raw weight 0..64; pColors is indexed
+    // directly by the raw weight and must have room for ASTC_LDR_MAX_RAW_WEIGHTS entries.
+    uint32_t get_colors_raw_weights(const color_rgba& l, const color_rgba& h, color_rgba* pColors, bool decode_mode_srgb)
+    {
+        for (uint32_t w = 0; w <= 64; w++)
+        {
+            for (uint32_t c = 0; c < 4; c++)
+            {
+                int le = l[c], he = h[c];
+
+                // TODO: Investigate alpha handling here vs. latest spec.
+                // https://raw.githubusercontent.com/KhronosGroup/DataFormat/refs/heads/main/astc.txt
+                // The safest thing to do may be to assume non-sRGB in the encoder. I don't know yet.
+                // How should alpha be handled here for lowest divergence from actual ASTC decoding hardware?
+                if (decode_mode_srgb)
+                {
+                    le = (le << 8) | 0x80;
+                    he = (he << 8) | 0x80;
+                }
+                else
+                {
+                    le = (le << 8) | le;
+                    he = (he << 8) | he;
+                }
+
+                uint32_t k = astc_helpers::weight_interpolate(le, he, w);
+
+                // See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_compression_astc_decode_mode.txt
+                // All channels including alpha >>8.
+                pColors[w][c] = (uint8_t)(k >> 8);
+
+            } // c
+        } // w
+
+        return ASTC_LDR_MAX_RAW_WEIGHTS;
+    }
+
+    // Assumes ise 20 (256 levels)
+    // Decodes already-dequantized (8-bit) endpoint values for an LDR CEM into low/high colors.
+    void decode_endpoints_ise20(uint32_t cem_index, const uint8_t* pEndpoint_vals, color_rgba& l, color_rgba& h)
+    {
+        assert(astc_helpers::is_cem_ldr(cem_index));
+
+        int ldr_endpoints[4][2];
+        astc_helpers::decode_endpoint(cem_index, ldr_endpoints, pEndpoint_vals);
+
+        for (uint32_t c = 0; c < 4; c++)
+        {
+            assert((ldr_endpoints[c][0] >= 0) && (ldr_endpoints[c][0] <= 255));
+            assert((ldr_endpoints[c][1] >= 0) && (ldr_endpoints[c][1] <= 255));
+
+            l[c] = (uint8_t)ldr_endpoints[c][0];
+            h[c] = (uint8_t)ldr_endpoints[c][1];
+        }
+    }
+
+    // Dequantizes ISE endpoint values through the endpoint table for endpoint_ise_index, then
+    // decodes them into low/high colors. If pScale is non-null and the CEM is one of the
+    // base+scale modes, *pScale receives the dequantized scale value / 256; for other CEMs
+    // *pScale is left untouched.
+    void decode_endpoints(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba& l, color_rgba& h, float* pScale)
+    {
+        const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
+
+        const auto& endpoint_dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_index).m_ISE_to_val;
+
+        uint8_t dequantized_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS];
+        for (uint32_t i = 0; i < total_endpoint_vals; i++)
+            dequantized_endpoints[i] = endpoint_dequant_tab[pEndpoint_vals[i]];
+
+        decode_endpoints_ise20(cem_index, dequantized_endpoints, l, h);
+
+        if ((pScale) && ((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A)))
+        {
+            *pScale = (float)dequantized_endpoints[3] * (1.0f / 256.0f);
+        }
+    }
+
+    // Convenience overload: decodes the packed endpoints, then builds the ISE-order palette.
+    uint32_t get_colors(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, uint32_t weight_ise_index, color_rgba* pColors, bool decode_mode_srgb)
+    {
+        color_rgba l, h;
+        decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_index, l, h);
+
+        return get_colors(l, h, weight_ise_index, pColors, decode_mode_srgb);
+    }
+
+    // Decodes 65 colors
+    // Convenience overload: decodes the packed endpoints, then builds the raw-weight palette.
+    uint32_t get_colors_raw_weights(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba* pColors, bool decode_mode_srgb)
+    {
+        color_rgba l, h;
+        decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_index, l, h);
+
+        return get_colors_raw_weights(l, h, pColors, decode_mode_srgb);
+    }
+
+// Disabled alternative: single-pass incremental estimate of the 4D principal axis.
+#if 0
+    static vec4F calc_incremental_pca_4D(uint32_t num_pixels, const vec4F* pPixels, const vec4F& mean_f)
+    {
+        vec4F mean_axis(0.0f);
+
+        for (uint32_t i = 0; i < num_pixels; i++)
+        {
+            vec4F orig_color(pPixels[i]);
+
+            vec4F color(orig_color - mean_f);
+
+            vec4F a(color * color[0]);
+            vec4F b(color * color[1]);
+            vec4F c(color * color[2]);
+            vec4F d(color * color[3]);
+            vec4F n(i ? mean_axis : color);
+
+            n.normalize_in_place();
+
+            mean_axis[0] += a.dot(n);
+            mean_axis[1] += b.dot(n);
+            mean_axis[2] += c.dot(n);
+            mean_axis[3] += d.dot(n);
+        }
+
+        if (mean_axis.norm() < 1e-5f)
+            mean_axis = vec4F(1.0f, 1.0f, 1.0f, 1.0f);
+
+        mean_axis.normalize_in_place();
+
+        return mean_axis;
+    }
+#endif
+
+    // TODO: Try two-step Lanczos iteration/Rayleigh–Ritz approximation in a 2-dimensional Krylov subspace method vs. power method.
+    // Estimates the principal axis of the RGBA pixels about mean_f using power iteration on the
+    // 4x4 scatter matrix (only the upper triangle is accumulated, since the matrix is symmetric).
+    // The iterate is normalized on odd iterations only, so with an even NUM_POW_ITERS the
+    // returned vector has been normalized on the final iteration. If the result is near zero
+    // (norm < 1e-5), the fixed fallback (.5, .5, .5, .5) is returned instead.
+    static vec4F calc_pca_4D(uint32_t num_pixels, const vec4F* pPixels, const vec4F& mean_f)
+    {
+        float m00 = 0, m01 = 0, m02 = 0, m03 = 0;
+        float m11 = 0, m12 = 0, m13 = 0;
+        float m22 = 0, m23 = 0;
+        float m33 = 0;
+
+        for (size_t i = 0; i < num_pixels; ++i)
+        {
+            const vec4F v(pPixels[i] - mean_f);
+
+            m00 += v[0] * v[0]; m01 += v[0] * v[1]; m02 += v[0] * v[2]; m03 += v[0] * v[3];
+            m11 += v[1] * v[1]; m12 += v[1] * v[2]; m13 += v[1] * v[3];
+            m22 += v[2] * v[2]; m23 += v[2] * v[3];
+            m33 += v[3] * v[3];
+        }
+
+        // TODO: Seed from channel variances
+        vec4F v(.6f, .75f, .4f, .75f);
+
+        const uint32_t NUM_POW_ITERS = 6; // must be even
+        for (uint32_t i = 0; i < NUM_POW_ITERS; ++i)
+        {
+            // w = M * v
+            vec4F w(
+                m00 * v[0] + m01 * v[1] + m02 * v[2] + m03 * v[3],
+                m01 * v[0] + m11 * v[1] + m12 * v[2] + m13 * v[3],
+                m02 * v[0] + m12 * v[1] + m22 * v[2] + m23 * v[3],
+                m03 * v[0] + m13 * v[1] + m23 * v[2] + m33 * v[3]
+            );
+
+            // Normalize every other step only.
+            if (i & 1)
+                w.normalize_in_place();
+            v = w;
+        }
+
+        if (v.norm() < 1e-5f)
+            v = vec4F(.5f, .5f, .5f, .5f);
+
+        return v;
+    }
+
+    // Estimates the principal axis of the RGB components about mean_f using 3 power iterations on
+    // the 3x3 scatter matrix, stored as cov[] = { rr, rg, rb, gg, gb, bb }.
+    // Each iterate is rescaled by its largest absolute component. The result is a unit-length
+    // axis with w = 0, or the normalized grey diagonal if the iterate's squared length is <= 1e-5.
+    static vec4F calc_pca_3D(uint32_t num_pixels, const vec4F* pPixels, const vec4F& mean_f)
+    {
+        float cov[6] = { 0, 0, 0, 0, 0, 0 };
+
+        for (uint32_t i = 0; i < num_pixels; i++)
+        {
+            const vec4F& v = pPixels[i];
+            float r = v[0] - mean_f[0];
+            float g = v[1] - mean_f[1];
+            float b = v[2] - mean_f[2];
+            cov[0] += r * r; cov[1] += r * g; cov[2] += r * b; cov[3] += g * g; cov[4] += g * b; cov[5] += b * b;
+        }
+
+        float xr = .9f, xg = 1.0f, xb = .7f;
+        for (uint32_t iter = 0; iter < 3; iter++)
+        {
+            float r = xr * cov[0] + xg * cov[1] + xb * cov[2];
+            float g = xr * cov[1] + xg * cov[3] + xb * cov[4];
+            float b = xr * cov[2] + xg * cov[4] + xb * cov[5];
+
+            // Rescale by the largest component to keep the iterate in a sane range.
+            float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));
+            if (m > 1e-10f)
+            {
+                m = 1.0f / m;
+                r *= m; g *= m; b *= m;
+            }
+
+            xr = r; xg = g; xb = b;
+        }
+
+        float nrm = xr * xr + xg * xg + xb * xb;
+
+        // Default axis: (1,1,1) / sqrt(3).
+        vec4F axis(0.57735027f, 0.57735027f, 0.57735027f, 0.0f);
+        if (nrm > 1e-5f)
+        {
+            float inv_nrm = 1.0f / sqrtf(nrm);
+            xr *= inv_nrm; xg *= inv_nrm; xb *= inv_nrm;
+            axis.set(xr, xg, xb, 0);
+        }
+
+        return axis;
+    }
+
+    // Captures a block of pixels and precomputes the statistics used by the encoders:
+    // 8-bit and float ([0,1]) copies of the pixels, per-channel min/max, the mean color,
+    // whether any pixel has alpha < 255, the PCA axes, and per-channel stats.
+    // NOTE(review): num_pixels is not checked here against the capacity of m_pixels / m_pixels_f,
+    // and num_pixels == 0 divides by zero when averaging - confirm callers guarantee a valid count.
+    void pixel_stats_t::init(uint32_t num_pixels, const color_rgba* pPixels)
+    {
+        m_num_pixels = num_pixels;
+        m_has_alpha = false;
+
+        // Start min/max at the opposite extremes so the first pixel replaces them.
+        m_min.set(255, 255, 255, 255);
+        m_max.set(0, 0, 0, 0);
+
+        m_mean_f.clear();
+
+        for (uint32_t i = 0; i < m_num_pixels; i++)
+        {
+            const color_rgba& px = pPixels[i];
+
+            m_pixels[i] = px;
+
+            m_pixels_f[i].set((float)px.r * (1.0f / 255.0f), (float)px.g * (1.0f / 255.0f), (float)px.b * (1.0f / 255.0f), (float)px.a * (1.0f / 255.0f));
+
+            m_mean_f += m_pixels_f[i];
+
+            m_min.r = basisu::minimum(m_min.r, px.r);
+            m_min.g = basisu::minimum(m_min.g, px.g);
+            m_min.b = basisu::minimum(m_min.b, px.b);
+            m_min.a = basisu::minimum(m_min.a, px.a);
+
+            m_max.r = basisu::maximum(m_max.r, px.r);
+            m_max.g = basisu::maximum(m_max.g, px.g);
+            m_max.b = basisu::maximum(m_max.b, px.b);
+            m_max.a = basisu::maximum(m_max.a, px.a);
+        }
+
+        m_mean_f *= (1.0f / (float)m_num_pixels);
+        m_mean_f.clamp(0.0f, 1.0f);
+
+        m_min_f.set(m_min.r * (1.0f / 255.0f), m_min.g * (1.0f / 255.0f), m_min.b * (1.0f / 255.0f), m_min.a * (1.0f / 255.0f));
+        m_max_f.set(m_max.r * (1.0f / 255.0f), m_max.g * (1.0f / 255.0f), m_max.b * (1.0f / 255.0f), m_max.a * (1.0f / 255.0f));
+
+        m_has_alpha = (m_min.a < 255);
+
+        // Mean and zero relative RGB (3D) PCA axes
+        m_mean_rel_axis3 = calc_pca_3D(m_num_pixels, m_pixels_f, m_mean_f);
+        m_zero_rel_axis3 = calc_pca_3D(m_num_pixels, m_pixels_f, vec4F(0.0f));
+
+        // Mean relative RGBA (4D) PCA axis (no zero relative 4D axis is computed here)
+        m_mean_rel_axis4 = calc_pca_4D(m_num_pixels, m_pixels_f, m_mean_f);
+
+        // Per-channel stats; each channel is read from the float pixels with a stride of 4 floats.
+        for (uint32_t c = 0; c < 4u; c++)
+            m_rgba_stats[c].calc_simplified_with_range(m_num_pixels, &m_pixels_f[0][c], 4);
+    }
+
+    // Squared difference of two 8-bit channel values (each must be in [0,255]).
+    static inline uint32_t square_of_diff(int a, int b)
+    {
+        assert((a >= 0) && (a <= 255));
+        assert((b >= 0) && (b <= 255));
+
+        int d = a - b;
+        return (uint32_t)(d * d);
+    }
+
+    // Scores a palette of candidate colors against the block's pixels and returns the summed
+    // per-pixel error, where each pixel's error is the channel-weighted (params.m_comp_weights)
+    // sum of squared 8-bit differences.
+    //   pWeight_colors / total_weights - the palette (at most 32 entries, or exactly 65)
+    //   pWeight_vals                   - receives the palette index chosen for each pixel
+    // If params.m_pForced_weight_vals0 is set, those indices are used as-is and only scored;
+    // otherwise every palette entry is tried per pixel and the lowest-error one is kept
+    // (the first one wins on ties).
+    uint64_t eval_solution(
+        const pixel_stats_t& pixel_stats,
+        uint32_t total_weights, const color_rgba* pWeight_colors,
+        uint8_t* pWeight_vals, uint32_t weight_ise_index,
+        const cem_encode_params& params)
+    {
+        BASISU_NOTE_UNUSED(weight_ise_index);
+        assert((total_weights <= 32) || (total_weights == 65));
+
+        uint64_t total_err = 0;
+
+        if (params.m_pForced_weight_vals0)
+        {
+            for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
+            {
+                const color_rgba& px = pixel_stats.m_pixels[c];
+
+                const uint32_t w = params.m_pForced_weight_vals0[c];
+                assert(w < total_weights);
+
+                uint32_t err =
+                    params.m_comp_weights[0] * square_of_diff(px.r, pWeight_colors[w].r) +
+                    params.m_comp_weights[1] * square_of_diff(px.g, pWeight_colors[w].g) +
+                    params.m_comp_weights[2] * square_of_diff(px.b, pWeight_colors[w].b) +
+                    params.m_comp_weights[3] * square_of_diff(px.a, pWeight_colors[w].a);
+
+                total_err += err;
+
+                pWeight_vals[c] = (uint8_t)w;
+            }
+        }
+        else
+        {
+            for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
+            {
+                const color_rgba& px = pixel_stats.m_pixels[c];
+
+                uint32_t best_err = UINT32_MAX;
+                uint32_t best_sel = 0;
+
+                // Exhaustive search over the palette.
+                for (uint32_t i = 0; i < total_weights; i++)
+                {
+                    uint32_t err =
+                        params.m_comp_weights[0] * square_of_diff(px.r, pWeight_colors[i].r) +
+                        params.m_comp_weights[1] * square_of_diff(px.g, pWeight_colors[i].g) +
+                        params.m_comp_weights[2] * square_of_diff(px.b, pWeight_colors[i].b) +
+                        params.m_comp_weights[3] * square_of_diff(px.a, pWeight_colors[i].a);
+
+                    if (err < best_err)
+                    {
+                        best_err = err;
+                        best_sel = i;
+                    }
+                }
+
+                total_err += best_err;
+                pWeight_vals[c] = (uint8_t)best_sel;
+            }
+        } // if (params.m_pForced_weight_vals0)
+
+        return total_err;
+    }
+
+    // Evaluates against raw weights [0,64], or to ISE quantized weights, depending on weight_ise_index.
+    // Builds the color palette for the given packed endpoints, then scores it with the palette
+    // overload of eval_solution(). weight_ise_index selects the palette: BISE_64_LEVELS means the
+    // 65-entry raw-weight palette, anything else (<= BISE_32_LEVELS) the ISE-order palette.
+    // Returns the total error; pWeight_vals receives the chosen index for each pixel.
+    uint64_t eval_solution(
+        const pixel_stats_t& pixel_stats,
+        uint32_t cem_index,
+        const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index,
+        uint8_t* pWeight_vals, uint32_t weight_ise_index,
+        const cem_encode_params& params)
+    {
+        // 64 levels isn't valid ASTC. It's used for raw weight mode.
+        const bool use_raw_weights = (weight_ise_index == astc_helpers::BISE_64_LEVELS);
+        assert(use_raw_weights || (weight_ise_index <= astc_helpers::BISE_32_LEVELS));
+
+        // Sized for the larger (raw weight) case so both palettes fit.
+        color_rgba palette[ASTC_LDR_MAX_RAW_WEIGHTS];
+
+        const uint32_t palette_size = use_raw_weights ?
+            get_colors_raw_weights(cem_index, pEndpoint_vals, endpoint_ise_index, palette, params.m_decode_mode_srgb) :
+            get_colors(cem_index, pEndpoint_vals, endpoint_ise_index, weight_ise_index, palette, params.m_decode_mode_srgb);
+
+        assert(palette_size <= std::size(palette));
+
+        return eval_solution(pixel_stats, palette_size, palette, pWeight_vals, weight_ise_index, params);
+    }
+
+    // Evaluates against raw weights [0,64], or to ISE quantized weights, depending on weight_ise_index.
+ uint64_t eval_solution_dp( + uint32_t ccs_index, + const pixel_stats_t& pixel_stats, + uint32_t total_weights, const color_rgba* pWeight_colors, + uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint32_t weight_ise_index, + const cem_encode_params& params) + { + BASISU_NOTE_UNUSED(weight_ise_index); + + assert((ccs_index >= 0) && (ccs_index <= 3)); + assert((total_weights <= 32) || (total_weights == 65)); + + uint64_t total_err = 0; + + if (params.m_pForced_weight_vals0) + { + for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) + { + const color_rgba& px = pixel_stats.m_pixels[c]; + + const uint32_t w = params.m_pForced_weight_vals0[c]; + assert(w < total_weights); + + uint32_t err = 0; + for (uint32_t o = 0; o < 4; o++) + if (o != ccs_index) + err += params.m_comp_weights[o] * square_of_diff(px[o], pWeight_colors[w][o]); + + total_err += err; + + pWeight_vals0[c] = (uint8_t)w; + } + } + else + { + for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) + { + const color_rgba& px = pixel_stats.m_pixels[c]; + + uint32_t best_err = UINT32_MAX; + uint32_t best_sel = 0; + + for (uint32_t i = 0; i < total_weights; i++) + { + uint32_t err = 0; + for (uint32_t o = 0; o < 4; o++) + if (o != ccs_index) + err += params.m_comp_weights[o] * square_of_diff(px[o], pWeight_colors[i][o]); + + if (err < best_err) + { + best_err = err; + best_sel = i; + } + } + + total_err += best_err; + pWeight_vals0[c] = (uint8_t)best_sel; + } + } + + if (params.m_pForced_weight_vals1) + { + for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) + { + const color_rgba& px = pixel_stats.m_pixels[c]; + + const uint32_t w = params.m_pForced_weight_vals1[c]; + assert(w < total_weights); + + uint32_t err = square_of_diff(px[ccs_index], pWeight_colors[w][ccs_index]); + + total_err += err * params.m_comp_weights[ccs_index]; + pWeight_vals1[c] = (uint8_t)w; + } + } + else + { + for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) + { + const color_rgba& px = pixel_stats.m_pixels[c]; + + uint32_t 
best_err = UINT32_MAX; + uint32_t best_sel = 0; + + for (uint32_t i = 0; i < total_weights; i++) + { + uint32_t err = square_of_diff(px[ccs_index], pWeight_colors[i][ccs_index]); + + if (err < best_err) + { + best_err = err; + best_sel = i; + } + } + + total_err += best_err * params.m_comp_weights[ccs_index]; + pWeight_vals1[c] = (uint8_t)best_sel; + } + } + + return total_err; + } + + // Evaluates against raw weights [0,64], or to ISE quantized weights, depending on weight_ise_index. + uint64_t eval_solution_dp( + const pixel_stats_t& pixel_stats, + uint32_t cem_index, uint32_t ccs_index, + const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, + uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint32_t weight_ise_index, + const cem_encode_params& params) + { + assert((weight_ise_index <= astc_helpers::BISE_32_LEVELS) || (weight_ise_index == astc_helpers::BISE_64_LEVELS)); + + color_rgba weight_colors[ASTC_LDR_MAX_RAW_WEIGHTS]; + uint32_t num_weights; + + // 64 levels isn't valid ASTC. It's used for raw weight mode. 
+ if (weight_ise_index == astc_helpers::BISE_64_LEVELS) + num_weights = get_colors_raw_weights(cem_index, pEndpoint_vals, endpoint_ise_index, weight_colors, params.m_decode_mode_srgb); + else + num_weights = get_colors(cem_index, pEndpoint_vals, endpoint_ise_index, weight_ise_index, weight_colors, params.m_decode_mode_srgb); + + uint64_t trial_err = eval_solution_dp( + ccs_index, + pixel_stats, + num_weights, weight_colors, + pWeight_vals0, pWeight_vals1, weight_ise_index, + params); + + return trial_err; + } + + // Direct - refine ISE quantized endpoints from float endpoints + static void refine_cem8_or_12_endpoints(uint32_t cem_index, uint32_t endpoint_ise_range, uint8_t* pTrial_endpoint_vals, const vec4F& low_color_f, const vec4F& high_color_f, bool endpoints_are_swapped) + { + assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)); + + if (endpoint_ise_range == astc_helpers::BISE_256_LEVELS) + return; + + const uint32_t total_comps = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) ? 
4 : 3; + + assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)); + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t num_endpoint_ise_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + + const auto& endpoint_dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val; + + const auto& ISE_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_rank; + const auto& rank_to_ISE = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_rank_to_ISE; + + const bool orig_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, pTrial_endpoint_vals, endpoint_ise_range); + + uint32_t first_comp = 0; + + uint8_t refined_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS]; + memcpy(refined_endpoint_vals, pTrial_endpoint_vals, total_endpoint_vals); + + if (orig_used_blue_contraction) + { + // TODO expensive: 2*3*9 = 54 tries + for (uint32_t e = 0; e < 2; e++) + { + float best_err = BIG_FLOAT_VAL; + uint8_t best_refined_endpoint_vals[3] = { 0, 0, 0 }; + + for (int b_delta = -1; b_delta <= 1; b_delta++) + { + for (int k = 0; k < 9; k++) + { + const int r_delta = (k % 3) - 1; + const int g_delta = (k / 3) - 1; + + const int comp_deltas[3] = { r_delta, g_delta, b_delta }; + + uint8_t trial_refined_endpoint_vals[3] = { 0, 0, 0 }; + + for (uint32_t c = 0; c < 3; c++) + { + const int enc_val = pTrial_endpoint_vals[c * 2 + e]; + + const int orig_rank = ISE_to_rank[enc_val]; + + const int v_delta = comp_deltas[c]; + const int new_rank = basisu::clamp(orig_rank + v_delta, 0, (int)num_endpoint_ise_levels - 1); + const int new_enc_ise_val = rank_to_ISE[new_rank]; + + trial_refined_endpoint_vals[c] = (uint8_t)new_enc_ise_val; + 
+ } // c + + color_rgba trial_refined_endpoints_dequant(blue_contract_dec(endpoint_dequant_tab[trial_refined_endpoint_vals[0]], endpoint_dequant_tab[trial_refined_endpoint_vals[1]], endpoint_dequant_tab[trial_refined_endpoint_vals[2]], 255)); + + vec3F trial_refined_endpoints_dequant_f(0.0f); + for (uint32_t c = 0; c < 3; c++) + trial_refined_endpoints_dequant_f[c] = (float)trial_refined_endpoints_dequant[c] * (1.0f / 255.0f); + + vec3F desired_endpoint; + if (endpoints_are_swapped) + desired_endpoint = (e == 0) ? vec3F(high_color_f) : vec3F(low_color_f); + else + desired_endpoint = (e == 0) ? vec3F(low_color_f) : vec3F(high_color_f); + + float trial_err = desired_endpoint.squared_distance(trial_refined_endpoints_dequant_f); + if (trial_err < best_err) + { + best_err = trial_err; + memcpy(best_refined_endpoint_vals, trial_refined_endpoint_vals, 3); + } + + } // k + + } // b_delta + + for (uint32_t c = 0; c < 3; c++) + { + refined_endpoint_vals[c * 2 + e] = best_refined_endpoint_vals[c]; + } // c + + } // e + + // just refine A now (if it exists) + first_comp = 3; + } + + if (first_comp < total_comps) + { + for (uint32_t e = 0; e < 2; e++) + { + for (uint32_t c = first_comp; c < total_comps; c++) + { + const uint32_t idx = c * 2 + e; + const int enc_val = pTrial_endpoint_vals[idx]; + + const int orig_rank = ISE_to_rank[enc_val]; + + int best_rank = orig_rank; + float best_err = BIG_FLOAT_VAL; + for (int v_delta = -1; v_delta <= 1; v_delta++) + { + int new_rank = basisu::clamp(orig_rank + v_delta, 0, (int)num_endpoint_ise_levels - 1); + int new_enc_ise_val = rank_to_ISE[new_rank]; + + float dequant_val = (float)endpoint_dequant_tab[new_enc_ise_val] * (1.0f / 255.0f); + + float orig_val; + if (endpoints_are_swapped) + orig_val = (e == 0) ? high_color_f[c] : low_color_f[c]; + else + orig_val = (e == 0) ? 
low_color_f[c] : high_color_f[c]; + + float err = fabsf(dequant_val - orig_val); + if (err < best_err) + { + best_err = err; + best_rank = new_rank; + } + } + + refined_endpoint_vals[idx] = (uint8_t)rank_to_ISE[best_rank]; + + } // c + } // e + } + + bool refined_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, refined_endpoint_vals, endpoint_ise_range); + if (refined_used_blue_contraction == orig_used_blue_contraction) + { + memcpy(pTrial_endpoint_vals, refined_endpoint_vals, total_endpoint_vals); + } + } + + // Direct L/LA, single plane + static bool try_cem0_or_4(uint32_t cem_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + float lum_l, float lum_h, float a_l, float a_h, + uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals, uint64_t& trial_blk_error) + { + assert(g_initialized); + assert((cem_index == astc_helpers::CEM_LDR_LUM_DIRECT) || (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT)); + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); + + const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + + uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS] = { 0 }; + uint8_t trial_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + + encode_cem0_4(cem_index, lum_l, lum_h, a_l, a_h, endpoint_ise_range, trial_endpoint_vals); + + uint64_t trial_err = eval_solution( + pixel_stats, + cem_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals, weight_ise_range, + enc_params); + + bool improved_flag = false; + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); + improved_flag = true; + } + + bool any_degen = false; + if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && 
(lum_l != lum_h)) + any_degen = true; + + if (cem_has_alpha) + { + if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h)) + any_degen = true; + } + + if (any_degen) + { + const int l_delta = (lum_l < lum_h) ? -1 : 1; + const int a_delta = (a_l < a_h) ? -1 : 1; + + for (uint32_t t = 1; t <= 3; t++) + { + uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS]; + memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); + + if (t & 1) + { + if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h)) + fixed_endpoint_vals[0] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[0], l_delta); + + if (cem_has_alpha) + { + if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h)) + fixed_endpoint_vals[2] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[2], a_delta); + } + } + + if (t & 2) + { + if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h)) + fixed_endpoint_vals[1] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[1], -l_delta); + + if (cem_has_alpha) + { + if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h)) + fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[3], -a_delta); + } + } + + trial_err = eval_solution( + pixel_stats, + cem_index, fixed_endpoint_vals, endpoint_ise_range, + trial_weight_vals, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); + improved_flag = true; + } + + } // t + } + + return improved_flag; + } + + static bool try_cem4_dp_a(uint32_t cem_index, + const pixel_stats_t& pixel_stats, const 
cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + float lum_l, float lum_h, float a_l, float a_h, + uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error) + { + assert(g_initialized); + assert(cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); + + const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + + uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS] = { 0 }; + uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + + encode_cem0_4(cem_index, lum_l, lum_h, a_l, a_h, endpoint_ise_range, trial_endpoint_vals); + + uint64_t trial_err = eval_solution_dp( + pixel_stats, cem_index, 3, + trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, trial_weight_vals1, weight_ise_range, + enc_params); + + bool improved_flag = false; + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + improved_flag = true; + } + + bool any_degen = false; + if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h)) + any_degen = true; + + if (cem_has_alpha) + { + if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h)) + any_degen = true; + } + + if (any_degen) + { + const int l_delta = (lum_l < lum_h) ? -1 : 1; + const int a_delta = (a_l < a_h) ? 
-1 : 1; + + for (uint32_t t = 1; t <= 3; t++) + { + uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE4_ENDPOINTS]; + memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); + + if (t & 1) + { + if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h)) + fixed_endpoint_vals[0] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[0], l_delta); + + if (cem_has_alpha) + { + if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h)) + fixed_endpoint_vals[2] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[2], a_delta); + } + } + + if (t & 2) + { + if ((trial_endpoint_vals[0] == trial_endpoint_vals[1]) && (lum_l != lum_h)) + fixed_endpoint_vals[1] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[1], -l_delta); + + if (cem_has_alpha) + { + if ((trial_endpoint_vals[2] == trial_endpoint_vals[3]) && (a_l != a_h)) + fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[3], -a_delta); + } + } + + trial_err = eval_solution_dp( + pixel_stats, cem_index, 3, + fixed_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, trial_weight_vals1, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + improved_flag = true; + } + + } // t + } + + return improved_flag; + } + + // Direct RGB/RGBA + // Cannot fail, but may have to fall back to non-blue-contracted + // Returns false if trial solution not improved + static bool try_cem8_12( + uint32_t cem_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t 
endpoint_ise_range, uint32_t weight_ise_range, + const vec4F& low_color_f, const vec4F& high_color_f, + uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals, uint64_t& trial_blk_error, bool& trial_used_blue_contraction, + bool try_blue_contract, bool& tried_used_blue_contraction) + { + assert(g_initialized); + assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)); + + const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t num_comps = (cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) ? 3 : 4; + + color_rgba low_color, high_color; + for (uint32_t c = 0; c < 4; c++) + { + low_color[c] = (uint8_t)basisu::clamp((int)std::round(low_color_f[c] * 255.0f), 0, 255); + high_color[c] = (uint8_t)basisu::clamp((int)std::round(high_color_f[c] * 255.0f), 0, 255); + } + + uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS] = { 0 }; + uint8_t trial_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + + // Cannot fail, but may have to fall back to non-blue-contracted + cem_encode_ldr_rgb_or_rgba_direct_result res = cem_encode_ldr_rgb_or_rgba_direct(cem_index, endpoint_ise_range, low_color, high_color, trial_endpoint_vals, try_blue_contract); + + // Let caller know if we tried blue contraction + tried_used_blue_contraction = res.m_is_blue_contracted; + + if (endpoint_ise_range < astc_helpers::BISE_256_LEVELS) + { + refine_cem8_or_12_endpoints(cem_index, endpoint_ise_range, trial_endpoint_vals, low_color_f, high_color_f, res.m_endpoints_are_swapped); + } + + uint64_t trial_err = eval_solution( + pixel_stats, cem_index, + trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals, weight_ise_range, + enc_params); + + bool improved_flag = false; + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals, trial_weight_vals, 
pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_is_blue_contracted; + improved_flag = true; + } + + if (res.m_any_degen) + { + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h); + + uint32_t s0 = dec_l.r + dec_l.g + dec_l.b + dec_l.a; + uint32_t s1 = dec_h.r + dec_h.g + dec_h.b + dec_h.a; + if (astc_helpers::cem8_or_12_used_blue_contraction(cem_index, trial_endpoint_vals, endpoint_ise_range)) + std::swap(s0, s1); + + for (uint32_t t = 1; t <= 3; t++) + { + uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS]; + memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); + + if (t & 1) + { + for (uint32_t c = 0; c < num_comps; c++) + { + uint32_t l_idx = c * 2 + 0; + uint32_t h_idx = c * 2 + 1; + + if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c])) + { + int delta = (s0 <= s1) ? -1 : 1; + + fixed_endpoint_vals[l_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[l_idx], delta); + } + } + } + + if (t & 2) + { + for (uint32_t c = 0; c < num_comps; c++) + { + uint32_t l_idx = c * 2 + 0; + uint32_t h_idx = c * 2 + 1; + + if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c])) + { + int delta = (s0 <= s1) ? 
1 : -1; + + fixed_endpoint_vals[h_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[h_idx], delta); + } + } + } + + bool fixed_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, fixed_endpoint_vals, endpoint_ise_range); + if (fixed_used_blue_contraction != res.m_is_blue_contracted) + continue; + + trial_err = eval_solution( + pixel_stats, + cem_index, fixed_endpoint_vals, endpoint_ise_range, + trial_weight_vals, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_is_blue_contracted; + improved_flag = true; + } + + } // t + + } // if (res.m_any_degen) + + return improved_flag; + } + + static bool try_cem8_12_dp( + uint32_t cem_index, uint32_t ccs_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + const vec4F& low_color_f, const vec4F& high_color_f, + uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error, bool& trial_used_blue_contraction, + bool try_blue_contract, bool& tried_used_blue_contraction) + { + assert(g_initialized); + assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)); + + bool improved_flag = false; + + const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t num_comps = (cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) ? 
3 : 4; + + color_rgba low_color, high_color; + for (uint32_t c = 0; c < 4; c++) + { + low_color[c] = (uint8_t)basisu::clamp((int)std::round(low_color_f[c] * 255.0f), 0, 255); + high_color[c] = (uint8_t)basisu::clamp((int)std::round(high_color_f[c] * 255.0f), 0, 255); + } + + uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS] = { 0 }; + uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + + // Cannot fail, but may have to fall back to non-blue-contracted + cem_encode_ldr_rgb_or_rgba_direct_result res = cem_encode_ldr_rgb_or_rgba_direct(cem_index, endpoint_ise_range, low_color, high_color, trial_endpoint_vals, try_blue_contract); + + // Let caller know if we tried blue contraction + tried_used_blue_contraction = res.m_is_blue_contracted; + + if (endpoint_ise_range < astc_helpers::BISE_256_LEVELS) + { + refine_cem8_or_12_endpoints(cem_index, endpoint_ise_range, trial_endpoint_vals, low_color_f, high_color_f, res.m_endpoints_are_swapped); + } + + uint64_t trial_err = eval_solution_dp(pixel_stats, cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range, trial_weight_vals0, trial_weight_vals1, weight_ise_range, enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_is_blue_contracted; + improved_flag = true; + } + + if (res.m_any_degen) + { + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h); + + uint32_t s0 = dec_l.r + dec_l.g + dec_l.b + dec_l.a; + uint32_t s1 = dec_h.r + dec_h.g + dec_h.b + dec_h.a; + if (astc_helpers::cem8_or_12_used_blue_contraction(cem_index, trial_endpoint_vals, 
endpoint_ise_range)) + std::swap(s0, s1); + + for (uint32_t t = 1; t <= 3; t++) + { + uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE12_ENDPOINTS]; + memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals); + + if (t & 1) + { + for (uint32_t c = 0; c < num_comps; c++) + { + uint32_t l_idx = c * 2 + 0; + uint32_t h_idx = c * 2 + 1; + + if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c])) + { + int delta = (s0 <= s1) ? -1 : 1; + + fixed_endpoint_vals[l_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[l_idx], delta); + } + } + } + + if (t & 2) + { + for (uint32_t c = 0; c < num_comps; c++) + { + uint32_t l_idx = c * 2 + 0; + uint32_t h_idx = c * 2 + 1; + + if ((trial_endpoint_vals[l_idx] == trial_endpoint_vals[h_idx]) && (low_color[c] != high_color[c])) + { + int delta = (s0 <= s1) ? 1 : -1; + + fixed_endpoint_vals[h_idx] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_endpoint_vals[h_idx], delta); + } + } + } + + bool fixed_used_blue_contraction = astc_helpers::cem8_or_12_used_blue_contraction(cem_index, fixed_endpoint_vals, endpoint_ise_range); + if (fixed_used_blue_contraction != res.m_is_blue_contracted) + continue; + + trial_err = eval_solution_dp(pixel_stats, cem_index, ccs_index, fixed_endpoint_vals, endpoint_ise_range, trial_weight_vals0, trial_weight_vals1, weight_ise_range, enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + improved_flag = true; + } + + } // t + + } // if (res.m_any_degen) + + return improved_flag; + } + + // base+offset rgb/rgba, single or dual plane + static bool try_cem9_13_sp_or_dp( + uint32_t cem_index, 
int ccs_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + const vec4F& low_color_f, const vec4F& high_color_f, + uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error, bool& trial_used_blue_contraction, + bool try_blue_contract, bool& tried_used_blue_contraction, bool &tried_base_ofs_clamped) + { + assert(g_initialized); + assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET)); + assert((ccs_index >= -1) && (ccs_index <= 3)); + assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + assert(pTrial_weight_vals0); + assert((ccs_index == -1) || (pTrial_weight_vals1)); + + //const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t num_comps = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) ? 
3 : 4; + + color_rgba low_color, high_color; + for (uint32_t c = 0; c < 4; c++) + { + low_color[c] = (uint8_t)basisu::clamp((int)std::round(low_color_f[c] * 255.0f), 0, 255); + high_color[c] = (uint8_t)basisu::clamp((int)std::round(high_color_f[c] * 255.0f), 0, 255); + } + + uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE13_ENDPOINTS] = { 0 }; + uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + + rgb_base_offset_res res = cem_encode_ldr_rgb_or_rgba_base_offset(cem_index, endpoint_ise_range, low_color, high_color, trial_endpoint_vals, try_blue_contract); + + tried_used_blue_contraction = res.m_used_blue_contraction; + tried_base_ofs_clamped = res.m_delta_clamped; + + if (res.m_failed_flag) + return false; + + bool improved_flag = false; + + if (ccs_index == -1) + { + uint64_t trial_err = eval_solution( + pixel_stats, + cem_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + if (pTrial_weight_vals1) + memset(pTrial_weight_vals1, 0, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_used_blue_contraction; + improved_flag = true; + } + } + else + { + uint64_t trial_err = eval_solution_dp( + pixel_stats, + cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, trial_weight_vals1, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + 
trial_used_blue_contraction = res.m_used_blue_contraction; + improved_flag = true; + } + } + + if (res.m_any_degen) + { + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h); + + // The packing in these modes is so complex that we're going to approximate the biasing, and hope for the best. + const uint32_t num_ise_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + int vals_per_ise_level = (256 + num_ise_levels - 1) / num_ise_levels; + + // TODO: There is potential cross-talk between RGB and A with the way this is done. + for (uint32_t p = 1; p <= 3; p++) + { + color_rgba trial_low_color(low_color), trial_high_color(high_color); + + for (uint32_t c = 0; c < num_comps; c++) + { + if (low_color[c] == high_color[c]) + continue; + + if (dec_l[c] != dec_h[c]) + continue; + + int delta = (low_color[c] < high_color[c]) ? -1 : 1; + if (p & 1) + trial_low_color[c] = (uint8_t)basisu::clamp((int)trial_low_color[c] + vals_per_ise_level * delta, 0, 255); + + if (p & 2) + trial_high_color[c] = (uint8_t)basisu::clamp((int)trial_high_color[c] + vals_per_ise_level * -delta, 0, 255); + } // c + + res = cem_encode_ldr_rgb_or_rgba_base_offset(cem_index, endpoint_ise_range, trial_low_color, trial_high_color, trial_endpoint_vals, try_blue_contract); + + if (res.m_failed_flag) + continue; + + if (ccs_index == -1) + { + uint64_t trial_err = eval_solution( + pixel_stats, + cem_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + if (pTrial_weight_vals1) + memset(pTrial_weight_vals1, 0, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_used_blue_contraction; + if (res.m_delta_clamped) + tried_base_ofs_clamped = 
true; + improved_flag = true; + } + } + else + { + uint64_t trial_err = eval_solution_dp( + pixel_stats, + cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, trial_weight_vals1, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_used_blue_contraction; + if (res.m_delta_clamped) + tried_base_ofs_clamped = true; + improved_flag = true; + } + } + + } // p + } + else + { + // Now factor in the quantization introduced into the low (base) color, and apply this to the offset, for gain. + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_endpoint_vals, endpoint_ise_range, dec_l, dec_h); + + if (res.m_endpoints_swapped) + dec_l = low_color; // high color is the quantized base + else + dec_h = high_color; // low color is the quantized base + + res = cem_encode_ldr_rgb_or_rgba_base_offset(cem_index, endpoint_ise_range, dec_l, dec_h, trial_endpoint_vals, try_blue_contract); + + if (!res.m_failed_flag) + { + if (ccs_index == -1) + { + uint64_t trial_err = eval_solution( + pixel_stats, + cem_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + if (pTrial_weight_vals1) + memset(pTrial_weight_vals1, 0, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_used_blue_contraction; + if (res.m_delta_clamped) + tried_base_ofs_clamped = true; + improved_flag = true; + } + } + else + { + uint64_t 
trial_err = eval_solution_dp( + pixel_stats, + cem_index, ccs_index, trial_endpoint_vals, endpoint_ise_range, + trial_weight_vals0, trial_weight_vals1, weight_ise_range, + enc_params); + + if (trial_err < trial_blk_error) + { + trial_blk_error = trial_err; + memcpy(pTrial_endpoint_vals, trial_endpoint_vals, astc_helpers::get_num_cem_values(cem_index)); + memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels); + memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels); + trial_used_blue_contraction = res.m_used_blue_contraction; + if (res.m_delta_clamped) + tried_base_ofs_clamped = true; + improved_flag = true; + } + } + } + } + + return improved_flag; + } + + // l/la direct, single plane + static uint64_t encode_cem0_4( + uint32_t cem_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint8_t* pEndpoint_vals, uint8_t* pWeight_vals, uint64_t cur_blk_error) + { + assert(g_initialized); + assert((cem_index == astc_helpers::CEM_LDR_LUM_DIRECT) || (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT)); + assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); + + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t total_weights = pixel_stats.m_num_pixels; + + float lum_l = BIG_FLOAT_VAL, lum_h = -BIG_FLOAT_VAL; + + float pixel1F[ASTC_LDR_MAX_BLOCK_PIXELS]; + vec2F pixel2F[ASTC_LDR_MAX_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < 
pixel_stats.m_num_pixels; i++) + { + const vec4F& px = pixel_stats.m_pixels_f[i]; + + float l = (px[0] + px[1] + px[2]) * (1.0f / 3.0f); + + pixel1F[i] = l; + + pixel2F[i][0] = l; + pixel2F[i][1] = px[3]; + + lum_l = minimum(lum_l, l); + lum_h = maximum(lum_h, l); + } + + const float a_l = pixel_stats.m_min_f[3]; + const float a_h = pixel_stats.m_max_f[3]; + + const vec2F min_pixel2F(lum_l, a_l), max_pixel2F(lum_h, a_h); + + uint8_t trial_blk_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS] = { 0 }; + uint8_t trial_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint64_t trial_blk_error = UINT64_MAX; + + bool did_improve = try_cem0_or_4( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + lum_l, lum_h, a_l, a_h, + trial_blk_endpoints, trial_blk_weights, trial_blk_error); + BASISU_NOTE_UNUSED(did_improve); + + if (trial_blk_error == UINT64_MAX) + return cur_blk_error; + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + } + + const uint32_t NUM_LS_OPT_PASSES = 3; + + for (uint32_t pass = 0; pass < NUM_LS_OPT_PASSES; pass++) + { + vec2F xl(lum_l, a_l), xh(lum_h, a_h); + + bool ls_res; + if (cem_has_alpha) + { + ls_res = compute_least_squares_endpoints_2D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl, &xh, pixel2F, min_pixel2F, max_pixel2F); + + } + else + { + ls_res = compute_least_squares_endpoints_1D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl[0], &xh[0], pixel1F, lum_l, lum_h); + } + if (!ls_res) + break; + + bool did_improve_res = false; + + did_improve_res = try_cem0_or_4( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl[0], xh[0], xl[1], xh[1], + trial_blk_endpoints, trial_blk_weights, trial_blk_error); + + BASISU_NOTE_UNUSED(did_improve_res); + + if 
(trial_blk_error >= cur_blk_error) + break; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + + } // pass + + return cur_blk_error; + } + + // lum+alpha direct, dual plane + static uint64_t encode_cem4_dp_a( + uint32_t cem_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error) + { + assert(g_initialized); + assert(cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); + assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t total_weights = pixel_stats.m_num_pixels; + + float alpha_vals[ASTC_LDR_MAX_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) + { + const vec4F& px = pixel_stats.m_pixels_f[i]; + + alpha_vals[i] = px[3]; + } + + // First get plane0's low/high (lum) + uint8_t lum_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; + uint8_t lum_weights0[ASTC_LDR_MAX_BLOCK_PIXELS]; + + uint64_t lum_blk_error = encode_cem0_4( + astc_helpers::CEM_LDR_LUM_DIRECT, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + lum_endpoints, lum_weights0, UINT64_MAX); + + if (lum_blk_error == UINT64_MAX) + return cur_blk_error; + + const auto& dequant_endpoints_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_val; + + float lum_l = 
(float)dequant_endpoints_tab[lum_endpoints[0]] * (1.0f / 255.0f); + float lum_h = (float)dequant_endpoints_tab[lum_endpoints[1]] * (1.0f / 255.0f); + float a_l = pixel_stats.m_min_f[3]; + float a_h = pixel_stats.m_max_f[3]; + + uint8_t trial_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; + uint8_t trial_weights0[ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t trial_weights1[ASTC_LDR_MAX_BLOCK_PIXELS]; + uint64_t trial_blk_error = UINT64_MAX; + + bool did_improve = try_cem4_dp_a( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + lum_l, lum_h, a_l, a_h, + trial_endpoints, trial_weights0, trial_weights1, trial_blk_error); + + if (!did_improve) + { + assert(0); + return cur_blk_error; + } + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_weights0, total_weights); + memcpy(pWeight_vals1, trial_weights1, total_weights); + } + + const uint32_t NUM_LS_OPT_PASSES = 3; + + for (uint32_t pass = 0; pass < NUM_LS_OPT_PASSES; pass++) + { + float xl = pixel_stats.m_min_f[3], xh = pixel_stats.m_max_f[3]; + + bool ls_res = compute_least_squares_endpoints_1D( + pixel_stats.m_num_pixels, trial_weights1, get_ls_weights_ise(weight_ise_range), + &xl, &xh, alpha_vals, pixel_stats.m_min_f[3], pixel_stats.m_max_f[3]); + if (!ls_res) + break; + + did_improve = try_cem4_dp_a( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + lum_l, lum_h, xl, xh, + trial_endpoints, trial_weights0, trial_weights1, trial_blk_error); + + if (!did_improve) + break; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_weights0, total_weights); + memcpy(pWeight_vals1, trial_weights1, total_weights); + + } // pass + + return cur_blk_error; + } + + struct weight_refiner + { + void init(uint32_t weight_ise_range, uint32_t total_pixels, const uint8_t 
*pInitial_ise_weights) + { + m_weight_ise_range = weight_ise_range; + m_total_pixels = total_pixels; + m_pISE_to_rank = &astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_rank; + m_pRank_to_ise = &astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_rank_to_ISE; + m_num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range); + + for (uint32_t i = 0; i < total_pixels; i++) + m_start_weights[i] = (*m_pISE_to_rank)[pInitial_ise_weights[i]]; + + m_min_weight = UINT32_MAX; + m_max_weight = 0; + m_sum_weight = 0; + + for (uint32_t i = 0; i < total_pixels; i++) + { + const uint32_t weight = m_start_weights[i]; + m_sum_weight += weight; + m_min_weight = minimumu(m_min_weight, weight); + m_max_weight = maximumu(m_max_weight, weight); + } + } + + void refine(uint32_t pass_index, uint8_t* pTrial_ise_weights) + { + switch (pass_index) + { + case 0: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) + v++; + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 1: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if ((v == m_max_weight) && (v > 0)) + v--; + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 2: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) + v++; + else if ((v == m_max_weight) && (v > 0)) + v--; + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 3: + { + const int max_weight_rank_index = m_num_weight_levels - 1; + int ly = -1, hy = max_weight_rank_index + 1; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); + pTrial_ise_weights[i] = 
(*m_pRank_to_ise)[s]; + } + + break; + } + case 4: + { + const int max_weight_rank_index = m_num_weight_levels - 1; + int ly = -2, hy = max_weight_rank_index + 2; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); + pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; + } + + break; + } + case 5: + { + const int max_weight_rank_index = m_num_weight_levels - 1; + int ly = -1, hy = max_weight_rank_index + 2; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); + pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; + } + + break; + } + case 6: + { + const int max_weight_rank_index = m_num_weight_levels - 1; + int ly = -2, hy = max_weight_rank_index + 1; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int s = (int)clampf(floor((float)max_weight_rank_index * ((float)m_start_weights[i] - (float)ly) / ((float)hy - (float)ly) + .5f), 0, (float)max_weight_rank_index); + pTrial_ise_weights[i] = (*m_pRank_to_ise)[s]; + } + + break; + } + case 7: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) + { + v++; + if (v < (m_num_weight_levels - 1)) + v++; + } + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + + break; + } + case 8: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if ((v == m_max_weight) && (v > 0)) + { + v--; + if (v > 0) + v--; + } + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 9: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if ((v == m_min_weight) && (v < (m_num_weight_levels - 1))) + { + v++; + if (v < 
(m_num_weight_levels - 1)) + v++; + } + else if ((v == m_max_weight) && (v > 0)) + { + v--; + if (v > 0) + v--; + } + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 10: + { + float mid_weight = (float)m_sum_weight / (float)m_total_pixels; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int v = m_start_weights[i]; + + float fv = ((float)v - mid_weight) * .8f + ((float)m_num_weight_levels * .5f); + + v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 11: + { + float mid_weight = (float)m_sum_weight / (float)m_total_pixels; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int v = m_start_weights[i]; + + float fv = ((float)v - mid_weight) * .9f + ((float)m_num_weight_levels * .5f); + + v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 12: + { + float mid_weight = (float)m_sum_weight / (float)m_total_pixels; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int v = m_start_weights[i]; + + float fv = ((float)v - mid_weight) * 1.1f + ((float)m_num_weight_levels * .5f); + + v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 13: + { + float mid_weight = (float)m_sum_weight / (float)m_total_pixels; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int v = m_start_weights[i]; + + float fv; + if (v < mid_weight) + fv = ((float)v - mid_weight) * .8f + ((float)m_num_weight_levels * .5f); + else + fv = (float)v; + + v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 14: + { + float mid_weight = (float)m_sum_weight / (float)m_total_pixels; + + for (uint32_t i = 0; i < m_total_pixels; i++) + { + int v = m_start_weights[i]; + + float fv; + if (v >= mid_weight) + fv = ((float)v - mid_weight) * .8f + 
((float)m_num_weight_levels * .5f); + else + fv = (float)v; + + v = clamp((int)std::round(fv), 0, m_num_weight_levels - 1); + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 15: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if (v < (m_num_weight_levels - 1)) + v++; + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + case 16: + { + for (uint32_t i = 0; i < m_total_pixels; i++) + { + uint32_t v = m_start_weights[i]; + if (v) + v--; + + pTrial_ise_weights[i] = (*m_pRank_to_ise)[v]; + } + break; + } + default: + { + assert(0); + memset(pTrial_ise_weights, 0, m_total_pixels); + break; + } + } + } + + uint32_t m_total_pixels; + uint32_t m_weight_ise_range; + uint32_t m_num_weight_levels; + uint8_t m_start_weights[ASTC_LDR_MAX_BLOCK_PIXELS]; // ranks, not ISE + + uint32_t m_min_weight, m_max_weight, m_sum_weight; + + const basisu::vector* m_pISE_to_rank; + const basisu::vector* m_pRank_to_ise; + }; + + // rgb/rgba direct or rgb/rgba base+offset, single plane + static uint64_t encode_cem8_12_9_13( + uint32_t cem_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint8_t* pEndpoint_vals, uint8_t* pWeight_vals, uint64_t cur_blk_error, bool use_blue_contraction, bool* pBase_ofs_clamped_flag) + { + assert(g_initialized); + assert((cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) || + (cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET)); + + assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= 
astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + if (pBase_ofs_clamped_flag) + *pBase_ofs_clamped_flag = false; + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); + const bool cem_is_base_offset = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); + + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t total_weights = pixel_stats.m_num_pixels; + + float best_l = BIG_FLOAT_VAL, best_h = -BIG_FLOAT_VAL; + //int best_l_index = 0, best_h_index = 0; + + for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) + { + const vec4F px(pixel_stats.m_pixels_f[c] - pixel_stats.m_mean_f); + + float p = cem_has_alpha ? px.dot(pixel_stats.m_mean_rel_axis4) : px.dot3(pixel_stats.m_mean_rel_axis3); + if (p < best_l) + { + best_l = p; + //best_l_index = c; + } + + if (p > best_h) + { + best_h = p; + //best_h_index = c; + } + } // c + +#if 0 + vec4F low_color_f(pixel_stats.m_pixels_f[best_l_index]), high_color_f(pixel_stats.m_pixels_f[best_h_index]); +#else + vec4F low_color_f, high_color_f; + if (cem_has_alpha) + { + low_color_f = pixel_stats.m_mean_rel_axis4 * best_l + pixel_stats.m_mean_f; + high_color_f = pixel_stats.m_mean_rel_axis4 * best_h + pixel_stats.m_mean_f; + } + else + { + low_color_f = vec4F(pixel_stats.m_mean_rel_axis3) * best_l + pixel_stats.m_mean_f; + high_color_f = vec4F(pixel_stats.m_mean_rel_axis3) * best_h + pixel_stats.m_mean_f; + } + + low_color_f.clamp(0.0f, 1.0f); + high_color_f.clamp(0.0f, 1.0f); +#endif + + uint8_t trial_blk_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS] = { 0 }; + uint8_t trial_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint64_t trial_blk_error = UINT64_MAX; + bool trial_used_blue_contraction = false; + + bool tried_used_blue_contraction = false; + + if (cem_is_base_offset) 
+ { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, + tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, + tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); + } + } + + if (trial_blk_error == UINT64_MAX) + return cur_blk_error; + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + } + + for (uint32_t pass = 0; pass < enc_params.m_max_ls_passes; pass++) + { + vec4F xl, xh; + + bool ls_res; + if (cem_has_alpha) + { + ls_res = compute_least_squares_endpoints_4D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + 
&xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); + } + else + { + ls_res = compute_least_squares_endpoints_3D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); + } + if (!ls_res) + break; + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. 
+ try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); + } + } + + if (trial_blk_error >= cur_blk_error) + break; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + + } // pass + + if ((enc_params.m_total_weight_refine_passes) && ((weight_ise_range != astc_helpers::BISE_2_LEVELS) && (weight_ise_range != astc_helpers::BISE_64_LEVELS))) + { + weight_refiner refiner; + refiner.init(weight_ise_range, pixel_stats.m_num_pixels, pWeight_vals); + + for (uint32_t pass = 0; pass < enc_params.m_total_weight_refine_passes; pass++) + { + refiner.refine(pass, trial_blk_weights); + + vec4F xl, xh; + + bool ls_res; + if (cem_has_alpha) + { + ls_res = compute_least_squares_endpoints_4D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); + } + else + { + ls_res = compute_least_squares_endpoints_3D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); + } + if (!ls_res) + continue; + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. 
+ try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); + } + } + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + } + + } // pass + } + + const uint32_t N = 4; + if ((enc_params.m_worst_weight_nudging_flag) && + (pixel_stats.m_num_pixels > N) && + ((weight_ise_range != astc_helpers::BISE_2_LEVELS) && (weight_ise_range != astc_helpers::BISE_64_LEVELS))) + { + const uint32_t NUM_NUDGING_PASSES = 1; + for (uint32_t pass = 0; pass < NUM_NUDGING_PASSES; pass++) + { + color_rgba l, h; + decode_endpoints(cem_index, pEndpoint_vals, endpoint_ise_range, l, h); + + vec4F dir; + dir[0] = (float)(h[0] - l[0]); + dir[1] = (float)(h[1] - l[1]); + dir[2] = (float)(h[2] - l[2]); + dir[3] = cem_has_alpha ? 
(float)(h[3] - l[3]) : 0.0f; + + dir.normalize_in_place(); + + float errs[ASTC_LDR_MAX_BLOCK_PIXELS]; + float delta_dots[ASTC_LDR_MAX_BLOCK_PIXELS]; + for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) + { + vec4F ofs(pixel_stats.m_pixels_f[i] - pixel_stats.m_mean_f); + + float proj = dir.dot(ofs); + + vec4F proj_vec(pixel_stats.m_mean_f + proj * dir); + + vec4F delta_vec(pixel_stats.m_pixels_f[i] - proj_vec); + + delta_dots[i] = dir.dot(delta_vec); + + errs[i] = cem_has_alpha ? vec4F::dot_product(delta_vec, delta_vec) : vec4F::dot_product3(delta_vec, delta_vec); + } + + uint32_t errs_indices[ASTC_LDR_MAX_BLOCK_PIXELS]; + indirect_sort(pixel_stats.m_num_pixels, errs_indices, errs); + + memcpy(trial_blk_weights, pWeight_vals, total_weights); + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t idx = errs_indices[pixel_stats.m_num_pixels - 1 - i]; + + int delta_to_apply = (delta_dots[idx] > 0.0f) ? 1 : -1; + + trial_blk_weights[idx] = (uint8_t)apply_delta_to_bise_weight_val(weight_ise_range, trial_blk_weights[idx], delta_to_apply); + } // i + + vec4F xl, xh; + + bool ls_res; + if (cem_has_alpha) + { + ls_res = compute_least_squares_endpoints_4D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); + } + else + { + ls_res = compute_least_squares_endpoints_3D( + pixel_stats.m_num_pixels, trial_blk_weights, get_ls_weights_ise(weight_ise_range), + &xl, &xh, pixel_stats.m_pixels_f, pixel_stats.m_min_f, pixel_stats.m_max_f); + } + if (!ls_res) + break; + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && 
(tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem9_13_sp_or_dp( + cem_index, -1, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, nullptr, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem8_12( + cem_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); + } + } + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + } + else + { + break; + } + } // pass + } + + if (enc_params.m_endpoint_refinement_flag) + { + const uint32_t num_comps = cem_has_alpha ? 
4 : 3; + + for (uint32_t c = 0; c < num_comps; c++) + { + uint8_t base_endpoint_vals[astc_helpers::MAX_CEM_ENDPOINT_VALS]; + memcpy(base_endpoint_vals, pEndpoint_vals, total_endpoint_vals); + + for (int dl = -1; dl <= 1; dl++) + { + for (int dh = -1; dh <= 1; dh++) + { + if (!dl && !dh) + continue; + + memcpy(trial_blk_endpoints, base_endpoint_vals, total_endpoint_vals); + + trial_blk_endpoints[c * 2 + 0] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_blk_endpoints[c * 2 + 0], dl); + trial_blk_endpoints[c * 2 + 1] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, trial_blk_endpoints[c * 2 + 1], dh); + + if (!use_blue_contraction) + { + const bool uses_blue_contraction = astc_helpers::used_blue_contraction(cem_index, trial_blk_endpoints, endpoint_ise_range); + if (uses_blue_contraction) + continue; + } + + trial_blk_error = eval_solution( + pixel_stats, + cem_index, trial_blk_endpoints, endpoint_ise_range, + trial_blk_weights, weight_ise_range, + enc_params); + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals, trial_blk_weights, total_weights); + } + + } // dh + + } // dl + } + } + + return cur_blk_error; + } + + // rgb/rgba direct, or rgb/rgba base+offset, dual plane + static uint64_t encode_cem8_12_9_13_dp( + uint32_t cem_index, uint32_t ccs_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, + uint64_t cur_blk_error, bool use_blue_contraction, bool *pBase_ofs_clamped_flag) + { + assert(g_initialized); + assert(ccs_index <= 3); + assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= 
astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + if (pBase_ofs_clamped_flag) + *pBase_ofs_clamped_flag = false; + + bool cem_has_alpha = false, cem_is_base_offset = false; + switch (cem_index) + { + case astc_helpers::CEM_LDR_RGB_DIRECT: break; + case astc_helpers::CEM_LDR_RGBA_DIRECT: cem_has_alpha = true; break; + case astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET: cem_is_base_offset = true; break; + case astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET: cem_is_base_offset = true; cem_has_alpha = true; break; + default: + assert(0); + return false; + } + + assert((ccs_index <= 2) || cem_has_alpha); + + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index); + const uint32_t total_weights = pixel_stats.m_num_pixels; + + // Remove influence of the 2nd plane's values, recalc principle axis on other values. 
+ vec4F flattened_pixels[ASTC_LDR_MAX_BLOCK_PIXELS]; + for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) + { + flattened_pixels[i] = pixel_stats.m_pixels_f[i]; + flattened_pixels[i][ccs_index] = 0.0f; + + if (!cem_has_alpha) + flattened_pixels[i][3] = 0.0f; + } + + vec4F flattened_pixels_mean(pixel_stats.m_mean_f); + flattened_pixels_mean[ccs_index] = 0.0f; + + if (!cem_has_alpha) + flattened_pixels_mean[3] = 0.0f; + + vec4F flattened_axis; + if (!cem_has_alpha) + flattened_axis = calc_pca_3D(pixel_stats.m_num_pixels, flattened_pixels, flattened_pixels_mean); + else + flattened_axis = calc_pca_4D(pixel_stats.m_num_pixels, flattened_pixels, flattened_pixels_mean); + + float best_l = BIG_FLOAT_VAL, best_h = -BIG_FLOAT_VAL; + //int best_l_index = 0, best_h_index = 0; + + for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++) + { + const vec4F px(flattened_pixels[c] - flattened_pixels_mean); + + float p = px.dot(flattened_axis); + if (p < best_l) + { + best_l = p; + //best_l_index = c; + } + + if (p > best_h) + { + best_h = p; + //best_h_index = c; + } + } // c + +#if 0 + vec4F low_color_f(pixel_stats.m_pixels_f[best_l_index]), high_color_f(pixel_stats.m_pixels_f[best_h_index]); +#else + vec4F low_color_f, high_color_f; + low_color_f = flattened_pixels_mean + flattened_axis * best_l; + high_color_f = flattened_pixels_mean + flattened_axis * best_h; + + low_color_f.clamp(0.0f, 1.0f); + high_color_f.clamp(0.0f, 1.0f); +#endif + + low_color_f[ccs_index] = pixel_stats.m_min_f[ccs_index]; + high_color_f[ccs_index] = pixel_stats.m_max_f[ccs_index]; + + uint8_t trial_blk_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS] = { 0 }; + uint8_t trial_blk_weights0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint8_t trial_blk_weights1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 }; + uint64_t trial_blk_error = UINT64_MAX; + bool trial_used_blue_contraction = false; + + bool tried_used_blue_contraction = false; + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + 
try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, + trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, + trial_blk_error, trial_used_blue_contraction, use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + low_color_f, high_color_f, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, false, tried_used_blue_contraction); + } + } + + if (trial_blk_error == UINT64_MAX) + return cur_blk_error; + + if (trial_blk_error < cur_blk_error) + { + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_blk_weights0, total_weights); + memcpy(pWeight_vals1, trial_blk_weights1, total_weights); + } + + vec4F flattened_pixels_min_f(pixel_stats.m_min_f); + flattened_pixels_min_f[ccs_index] = 0; + + vec4F 
flattened_pixels_max_f(pixel_stats.m_max_f); + flattened_pixels_max_f[ccs_index] = 0; + + for (uint32_t pass = 0; pass < enc_params.m_max_ls_passes; pass++) + { + vec4F xl, xh; + + // TODO: Switch between 4D or 3D + if (!compute_least_squares_endpoints_4D( + pixel_stats.m_num_pixels, trial_blk_weights0, get_ls_weights_ise(weight_ise_range), + &xl, &xh, flattened_pixels, flattened_pixels_min_f, flattened_pixels_max_f)) + { + break; + } + + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); + + xl[ccs_index] = dec_l[ccs_index] * (1.0f / 255.0f); + xh[ccs_index] = dec_h[ccs_index] * (1.0f / 255.0f); + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. 
+ try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction); + } + } + + if (trial_blk_error >= cur_blk_error) + break; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_blk_weights0, total_weights); + memcpy(pWeight_vals1, trial_blk_weights1, total_weights); + + } // pass + + const float ccs_bounds_min = pixel_stats.m_min_f[ccs_index]; + const float ccs_bounds_max = pixel_stats.m_max_f[ccs_index]; + float ccs_vals[ASTC_LDR_MAX_BLOCK_PIXELS]; + + if (ccs_bounds_min != ccs_bounds_max) + { + for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++) + ccs_vals[i] = pixel_stats.m_pixels_f[i][ccs_index]; + + for (uint32_t pass = 0; pass < enc_params.m_max_ls_passes; pass++) + { + float xl = 0.0f, xh = 0.0f; + + if (!compute_least_squares_endpoints_1D( + pixel_stats.m_num_pixels, trial_blk_weights1, get_ls_weights_ise(weight_ise_range), + &xl, &xh, ccs_vals, ccs_bounds_min, ccs_bounds_max)) + { + break; + } + + color_rgba 
dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); + + vec4F vl, vh; + for (uint32_t c = 0; c < 4; c++) + { + if (c == ccs_index) + { + vl[c] = xl; + vh[c] = xh; + } + else + { + vl[c] = (float)dec_l[c] * (1.0f / 255.0f); + vh[c] = (float)dec_h[c] * (1.0f / 255.0f); + } + } + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. 
+ try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction); + } + } + + if (trial_blk_error >= cur_blk_error) + break; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_blk_weights0, total_weights); + memcpy(pWeight_vals1, trial_blk_weights1, total_weights); + + } // pass + } + + if ((enc_params.m_total_weight_refine_passes) && ((weight_ise_range != astc_helpers::BISE_2_LEVELS) && (weight_ise_range != astc_helpers::BISE_64_LEVELS))) + { + weight_refiner refiner; + refiner.init(weight_ise_range, pixel_stats.m_num_pixels, pWeight_vals0); + + for (uint32_t pass = 0; pass < enc_params.m_total_weight_refine_passes; pass++) + { + refiner.refine(pass, trial_blk_weights0); + + vec4F xl, xh; + + if (!compute_least_squares_endpoints_4D( + pixel_stats.m_num_pixels, trial_blk_weights0, get_ls_weights_ise(weight_ise_range), + &xl, &xh, flattened_pixels, flattened_pixels_min_f, flattened_pixels_max_f)) + { + break; + } + + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); + + xl[ccs_index] = dec_l[ccs_index] * (1.0f / 255.0f); + xh[ccs_index] = dec_h[ccs_index] * (1.0f / 255.0f); + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a 
minor gain. + try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + xl, xh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction); + } + } + + if (trial_blk_error >= cur_blk_error) + continue; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_blk_weights0, total_weights); + memcpy(pWeight_vals1, trial_blk_weights1, total_weights); + + } // pass + + if (ccs_bounds_min != ccs_bounds_max) + { + refiner.init(weight_ise_range, pixel_stats.m_num_pixels, pWeight_vals1); + + for (uint32_t pass = 0; pass < WEIGHT_REFINER_MAX_PASSES; pass++) + { + refiner.refine(pass, trial_blk_weights1); + + float xl = 0.0f, xh = 0.0f; + + if (!compute_least_squares_endpoints_1D( + pixel_stats.m_num_pixels, trial_blk_weights1, get_ls_weights_ise(weight_ise_range), + &xl, &xh, ccs_vals, ccs_bounds_min, ccs_bounds_max)) + { + break; + } + + color_rgba dec_l(0), dec_h(0); + decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, dec_l, dec_h); + + vec4F vl, vh; + for (uint32_t c = 0; c 
< 4; c++) + { + if (c == ccs_index) + { + vl[c] = xl; + vh[c] = xh; + } + else + { + vl[c] = (float)dec_l[c] * (1.0f / 255.0f); + vh[c] = (float)dec_h[c] * (1.0f / 255.0f); + } + } + + bool did_improve_res = false; + + if (cem_is_base_offset) + { + bool tried_base_ofs_clamped = false; + + did_improve_res = try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction, tried_base_ofs_clamped); + BASISU_NOTE_UNUSED(did_improve_res); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. + did_improve_res = try_cem9_13_sp_or_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction, tried_base_ofs_clamped); + + if ((pBase_ofs_clamped_flag) && (tried_base_ofs_clamped)) + *pBase_ofs_clamped_flag = true; + } + } + else + { + did_improve_res = try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + use_blue_contraction, tried_used_blue_contraction); + + if (tried_used_blue_contraction) + { + // Try without blue contraction for a minor gain. 
+ did_improve_res = try_cem8_12_dp( + cem_index, ccs_index, pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + vl, vh, + trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_used_blue_contraction, + false, tried_used_blue_contraction); + } + } + + if (trial_blk_error >= cur_blk_error) + continue; + + cur_blk_error = trial_blk_error; + memcpy(pEndpoint_vals, trial_blk_endpoints, total_endpoint_vals); + memcpy(pWeight_vals0, trial_blk_weights0, total_weights); + memcpy(pWeight_vals1, trial_blk_weights1, total_weights); + + } // pass + } + } + + return cur_blk_error; + }
+
+	// base scale rgb/rgba, single plane trial.
+	//
+	// Quantizes (scale, low_a_f, high_color_f) into endpoint values with
+	// cem_encode_ldr_rgb_or_rgba_base_scale(), evaluates them with eval_solution(), then also
+	// evaluates the two neighbours (-1/+1 BISE steps) of endpoint value [3].
+	// NOTE(review): value [3] is presumably the encoded scale - confirm against the CEM 6/10 layout.
+	//
+	// high_color_f and low_a_f are passed by the callers in this file as normalized [0,1] floats.
+	//
+	// Whenever a candidate's error is strictly lower than trial_blk_error, trial_blk_error is
+	// lowered and the candidate's endpoints/weights are copied to pTrial_endpoint_vals /
+	// pTrial_weight_vals. Otherwise the output buffers are left untouched.
+	//
+	// returns true if improved
+	static bool try_cem6_10(
+		uint32_t cem_index,
+		const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
+		uint32_t endpoint_ise_range, uint32_t weight_ise_range,
+		float scale, float low_a_f, const vec4F& high_color_f,
+		uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals, uint64_t& trial_blk_error)
+	{
+		assert(g_initialized);
+		assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
+		assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
+		assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+		assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
+
+		// Number of endpoint values actually used by this CEM (computed once, reused for every copy).
+		const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
+
+		uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+		uint8_t trial_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+		cem_encode_ldr_rgb_or_rgba_base_scale(cem_index, endpoint_ise_range, scale, low_a_f, high_color_f, trial_endpoint_vals);
+
+		uint64_t trial_err = eval_solution(
+			pixel_stats, cem_index, trial_endpoint_vals, endpoint_ise_range,
+			trial_weight_vals, weight_ise_range,
+			enc_params);
+
+		bool improved_flag = false;
+		if (trial_err < trial_blk_error)
+		{
+			trial_blk_error = trial_err;
+			memcpy(pTrial_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);
+			memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels);
+			improved_flag = true;
+		}
+
+		// TODO
+		// Perturb endpoint value [3] by one BISE step in each direction; delta == 0 was evaluated above.
+		for (int delta = -1; delta <= 1; delta += 1)
+		{
+			if (!delta)
+				continue;
+
+			uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS];
+			memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);
+
+			fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, fixed_endpoint_vals[3], delta);
+
+			trial_err = eval_solution(
+				pixel_stats, cem_index, fixed_endpoint_vals, endpoint_ise_range,
+				trial_weight_vals, weight_ise_range,
+				enc_params);
+
+			if (trial_err < trial_blk_error)
+			{
+				trial_blk_error = trial_err;
+				memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, num_endpoint_vals);
+				memcpy(pTrial_weight_vals, trial_weight_vals, pixel_stats.m_num_pixels);
+				improved_flag = true;
+			}
+		}
+
+		return improved_flag;
+	}
+
+	// base scale rgb/rgba, dual plane trial.
+	//
+	// Same procedure as try_cem6_10(), but candidates are evaluated with eval_solution_dp() using
+	// ccs_index as the second plane's component, and both planes' weights are copied out
+	// (pTrial_weight_vals0 / pTrial_weight_vals1) when a candidate strictly improves trial_blk_error.
+	//
+	// returns true if improved
+	static bool try_cem6_10_dp(
+		uint32_t cem_index, uint32_t ccs_index,
+		const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
+		uint32_t endpoint_ise_range, uint32_t weight_ise_range,
+		float scale, float low_a_f, const vec4F& high_color_f,
+		uint8_t* pTrial_endpoint_vals, uint8_t* pTrial_weight_vals0, uint8_t* pTrial_weight_vals1, uint64_t& trial_blk_error)
+	{
+		assert(g_initialized);
+		assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
+		assert(ccs_index <= 3);
+		assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
+		assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+		assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
+		assert(pTrial_weight_vals0 && pTrial_weight_vals1);
+
+		// Number of endpoint values actually used by this CEM (computed once, reused for every copy).
+		const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
+
+		uint8_t trial_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+		uint8_t trial_weight_vals0[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+		uint8_t trial_weight_vals1[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+		cem_encode_ldr_rgb_or_rgba_base_scale(cem_index, endpoint_ise_range, scale, low_a_f, high_color_f, trial_endpoint_vals);
+
+		uint64_t trial_err = eval_solution_dp(
+			pixel_stats, cem_index, ccs_index,
+			trial_endpoint_vals, endpoint_ise_range,
+			trial_weight_vals0, trial_weight_vals1, weight_ise_range,
+			enc_params);
+
+		bool improved_flag = false;
+		if (trial_err < trial_blk_error)
+		{
+			trial_blk_error = trial_err;
+			memcpy(pTrial_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);
+			memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
+			memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
+			improved_flag = true;
+		}
+
+		// Perturb endpoint value [3] by one BISE step in each direction; delta == 0 was evaluated above.
+		for (int delta = -1; delta <= 1; delta += 1)
+		{
+			if (!delta)
+				continue;
+
+			uint8_t fixed_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS];
+			memcpy(fixed_endpoint_vals, trial_endpoint_vals, num_endpoint_vals);
+
+			fixed_endpoint_vals[3] = (uint8_t)astc_helpers::apply_delta_to_bise_endpoint_val(endpoint_ise_range, fixed_endpoint_vals[3], delta);
+
+			trial_err = eval_solution_dp(
+				pixel_stats, cem_index, ccs_index,
+				fixed_endpoint_vals, endpoint_ise_range,
+				trial_weight_vals0, trial_weight_vals1, weight_ise_range,
+				enc_params);
+
+			if (trial_err < trial_blk_error)
+			{
+				trial_blk_error = trial_err;
+				memcpy(pTrial_endpoint_vals, fixed_endpoint_vals, num_endpoint_vals);
+				memcpy(pTrial_weight_vals0, trial_weight_vals0, pixel_stats.m_num_pixels);
+				memcpy(pTrial_weight_vals1, trial_weight_vals1, pixel_stats.m_num_pixels);
+				improved_flag = true;
+			}
+		}
+
+		return improved_flag;
+	}
+
+	// rgb/rgba base+scale, single plane.
+	//
+	// Steps:
+	//  1. Project every pixel's RGB onto pixel_stats.m_zero_rel_axis3 and take the min/max
+	//     projections as the initial low/high colors; derive the initial scale from them.
+	//  2. Up to NUM_PASSES refinement passes: with the current weights fixed, re-solve the high
+	//     color H and the scale s of the model  pixel ~= ((1 - t) * s + t) * H  in the least
+	//     squares sense, and retry. Stops at the first pass that does not improve.
+	//  3. For the RGBA variant with non-constant alpha: one least squares pass on the two alpha
+	//     endpoints using the current weights.
+	//
+	// pEndpoint_vals/pWeight_vals are only written if the best error found is strictly lower than
+	// cur_blk_error. Returns the (possibly lowered) cur_blk_error.
+	static uint64_t encode_cem6_10(
+		uint32_t cem_index,
+		const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
+		uint32_t endpoint_ise_range, uint32_t weight_ise_range,
+		uint8_t* pEndpoint_vals, uint8_t* pWeight_vals, uint64_t cur_blk_error)
+	{
+		assert(g_initialized);
+		assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
+		assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
+		assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+		assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
+
+		const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A);
+
+		const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
+		const uint32_t total_weights = pixel_stats.m_num_pixels;
+
+		// Extents of the pixels' RGB along the zero-relative axis.
+		float best_l = BIG_FLOAT_VAL, best_h = -BIG_FLOAT_VAL;
+		//int best_l_index = 0, best_h_index = 0;
+
+		for (uint32_t c = 0; c < pixel_stats.m_num_pixels; c++)
+		{
+			const vec3F px(pixel_stats.m_pixels_f[c]);
+
+			float p = px.dot(pixel_stats.m_zero_rel_axis3);
+
+			if (p < best_l)
+			{
+				best_l = p;
+				//best_l_index = c;
+			}
+
+			if (p > best_h)
+			{
+				best_h = p;
+				//best_h_index = c;
+			}
+		} // c
+
+		const float MAX_S = 255.0f / 256.0f;
+		const float EPS = 1e-6f;
+
+		// trial_* is the in/out state handed to try_cem6_10(); best_* mirrors the best accepted state.
+		// try_cem6_10() only writes trial_* when it lowers trial_blk_error, so the two sets never diverge.
+		uint64_t trial_blk_error = UINT64_MAX;
+		uint8_t trial_blk_endpoints[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+		uint8_t trial_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+		uint64_t best_blk_error = UINT64_MAX;
+		uint8_t best_blk_endpoints[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+		uint8_t best_blk_weights[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+		vec3F low_color3_f(best_l * pixel_stats.m_zero_rel_axis3);
+		low_color3_f.clamp(0.0f, 1.0f);
+
+		vec3F high_color3_f(best_h * pixel_stats.m_zero_rel_axis3);
+		high_color3_f.clamp(0.0f, 1.0f);
+
+		// Initial scale: (low . high) / high.norm(), limited to [0, MAX_S].
+		// NOTE(review): norm() is used here (and as "h2" below) as if it were the squared length - confirm.
+		float scale = MAX_S;
+
+		float d = low_color3_f.dot(high_color3_f);
+		float nrm = high_color3_f.norm();
+		if (nrm > 0.0f)
+			scale = saturate(d / nrm);
+		scale = minimum(scale, MAX_S);
+
+		vec4F low_color_f(low_color3_f[0], low_color3_f[1], low_color3_f[2], pixel_stats.m_min_f[3]);
+		vec4F high_color_f(high_color3_f[0], high_color3_f[1], high_color3_f[2], pixel_stats.m_max_f[3]);
+
+		try_cem6_10(
+			cem_index,
+			pixel_stats, enc_params,
+			endpoint_ise_range, weight_ise_range,
+			scale, low_color_f[3], high_color_f,
+			trial_blk_endpoints, trial_blk_weights, trial_blk_error);
+
+		best_blk_error = trial_blk_error;
+		memcpy(best_blk_endpoints, trial_blk_endpoints, total_endpoint_vals);
+		memcpy(best_blk_weights, trial_blk_weights, total_weights);
+
+		// Weight dequantization table. Only consulted for ISE weights; with raw [0,64] weights
+		// (BISE_64_LEVELS) the range is limited to BISE_32_LEVELS for the lookup and the table goes unused.
+		// Loop invariant, so looked up once.
+		const auto& dequant_weights_tab = astc_helpers::g_dequant_tables.get_weight_tab(minimum(astc_helpers::BISE_32_LEVELS, weight_ise_range)).m_ISE_to_val;
+
+		const uint32_t NUM_PASSES = 2;
+		for (uint32_t pass = 0; pass < NUM_PASSES; pass++)
+		{
+			color_rgba actual_l(0), actual_h(0);
+			float actual_scale = 0;
+			decode_endpoints(cem_index, trial_blk_endpoints, endpoint_ise_range, actual_l, actual_h, &actual_scale);
+
+			vec3F actual_high_f((float)actual_h[0], (float)actual_h[1], (float)actual_h[2]);
+			actual_high_f *= (1.0f / 255.0f);
+
+			// Accumulate, with a_i = 1 - t_i and b_i = t_i:
+			//   Pa = sum a_i * px_i,  Pb = sum b_i * px_i,  A = sum a_i^2,  B = sum a_i * b_i,  C = sum b_i^2
+			vec3F Pa(0.0f), Pb(0.0f);
+			float A = 0.0f, B = 0.0f, C = 0.0f;
+
+			for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++)
+			{
+				const vec3F px(pixel_stats.m_pixels_f[i]);
+
+				const int iw = (weight_ise_range == astc_helpers::BISE_64_LEVELS) ? trial_blk_weights[i] : dequant_weights_tab[trial_blk_weights[i]];
+				float t = (float)iw * (1.0f / 64.0f);
+				float bi = t, ai = 1.0f - t;
+
+				Pa += px * ai;
+				Pb += px * bi;
+
+				A += ai * ai;
+				B += ai * bi;
+				C += bi * bi;
+			}
+
+			vec3F new_high = actual_high_f;
+			float new_scale = actual_scale;
+
+			// Solve s for the current H:  s = (Pa.H / h2 - B) / A
+			float h2 = actual_high_f.norm();
+			if ((h2 > EPS) && (A > EPS))
+			{
+				new_scale = (Pa.dot(actual_high_f) / h2 - B) / A;
+				new_scale = clamp(new_scale, 0.0f, MAX_S);
+			}
+
+			// Solve H for that s:  H = (Pb + s * Pa) / (A s^2 + 2 B s + C)
+			const float den = A * new_scale * new_scale + 2.0f * B * new_scale + C;
+			if (den > EPS)
+			{
+				new_high = (Pb + Pa * new_scale) / den;
+			}
+
+			// Re-solve s for the updated H.
+			h2 = new_high.norm();
+			if ((h2 > EPS) && (A > EPS))
+			{
+				new_scale = (Pa.dot(new_high) / h2 - B) / A;
+				new_scale = clamp(new_scale, 0.0f, MAX_S);
+			}
+
+			try_cem6_10(
+				cem_index,
+				pixel_stats, enc_params,
+				endpoint_ise_range, weight_ise_range,
+				new_scale, (float)actual_l[3] * (1.0f / 255.0f), vec4F(new_high[0], new_high[1], new_high[2], (float)actual_h[3] * (1.0f / 255.0f)),
+				trial_blk_endpoints, trial_blk_weights, trial_blk_error);
+
+			if (trial_blk_error >= best_blk_error)
+				break;
+
+			best_blk_error = trial_blk_error;
+			memcpy(best_blk_endpoints, trial_blk_endpoints, total_endpoint_vals);
+			memcpy(best_blk_weights, trial_blk_weights, total_weights);
+
+		} // pass
+
+		if (cem_has_alpha)
+		{
+			// Try to refine low a/high given the current selectors.
+			float bounds_min = pixel_stats.m_min_f[3];
+			float bounds_max = pixel_stats.m_max_f[3];
+			if (bounds_min != bounds_max)
+			{
+				float a_vals[ASTC_LDR_MAX_BLOCK_PIXELS];
+				for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++)
+					a_vals[i] = pixel_stats.m_pixels_f[i][3];
+
+				const uint32_t TOTAL_PASSES = 1;
+				for (uint32_t pass = 0; pass < TOTAL_PASSES; pass++)
+				{
+					float xl = 0.0f, xh = 0.0f;
+
+					if (compute_least_squares_endpoints_1D(
+						pixel_stats.m_num_pixels, best_blk_weights, get_ls_weights_ise(weight_ise_range),
+						&xl, &xh, a_vals, bounds_min, bounds_max))
+					{
+						color_rgba actual_l(0), actual_h(0);
+						float actual_scale = 0;
+						decode_endpoints(cem_index, best_blk_endpoints, endpoint_ise_range, actual_l, actual_h, &actual_scale);
+
+						// The decoded high color is 8-bit [0,255]; try_cem6_10() is fed normalized [0,1]
+						// colors everywhere else, so normalize RGB here too (xl/xh are already [0,1]).
+						const vec4F refined_high_f(
+							(float)actual_h[0] * (1.0f / 255.0f),
+							(float)actual_h[1] * (1.0f / 255.0f),
+							(float)actual_h[2] * (1.0f / 255.0f),
+							xh);
+
+						try_cem6_10(
+							cem_index,
+							pixel_stats, enc_params,
+							endpoint_ise_range, weight_ise_range,
+							actual_scale, xl, refined_high_f,
+							trial_blk_endpoints, trial_blk_weights, trial_blk_error);
+
+						if (trial_blk_error < best_blk_error)
+						{
+							best_blk_error = trial_blk_error;
+							memcpy(best_blk_endpoints, trial_blk_endpoints, total_endpoint_vals);
+							memcpy(best_blk_weights, trial_blk_weights, total_weights);
+						}
+						else
+						{
+							break;
+						}
+					}
+					else
+					{
+						break;
+					}
+				} // pass
+			}
+		}
+
+		if (best_blk_error < cur_blk_error)
+		{
+			cur_blk_error = best_blk_error;
+			memcpy(pEndpoint_vals, best_blk_endpoints, total_endpoint_vals);
+			memcpy(pWeight_vals, best_blk_weights, total_weights);
+		}
+
+		return cur_blk_error;
+	}
+
+	// rgba base+scale, dual plane a, ccs_index must be 3
+	//
+	// RGB is solved first as a single plane base+scale problem (encode_cem6_10 with
+	// CEM_LDR_RGB_BASE_SCALE). The two alpha endpoints are then seeded from the block's min/max alpha,
+	// the dual plane solution is evaluated, and, if alpha is not constant, the alpha endpoints are
+	// refined by up to TOTAL_PASSES least squares passes on the alpha plane's weights.
+	//
+	// Outputs are only written when a candidate's error is strictly lower than cur_blk_error.
+	// Returns the (possibly lowered) cur_blk_error.
+	static uint64_t encode_cem10_dp_a(
+		uint32_t cem_index,
+		const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
+		uint32_t endpoint_ise_range, uint32_t weight_ise_range,
+		uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error)
+	{
+		assert(g_initialized);
+		assert(cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A);
+		assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
+		assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+		assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
+
+		// RGB uses plane0, alpha plane1. So solve RGB first.
+		uint8_t rgba_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+		uint8_t rgb_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+		uint8_t a_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+		// First just solve RGB, single plane.
+		uint64_t rgb_blk_error = encode_cem6_10(
+			astc_helpers::CEM_LDR_RGB_BASE_SCALE,
+			pixel_stats, enc_params,
+			endpoint_ise_range, weight_ise_range,
+			rgba_endpoint_vals, rgb_weight_vals, UINT64_MAX);
+
+		assert(rgb_blk_error != UINT64_MAX);
+
+		// Release builds: bail out without touching the outputs if the RGB solve produced nothing.
+		if (rgb_blk_error == UINT64_MAX)
+			return cur_blk_error;
+
+		const auto& endpoint_quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_val_to_ise;
+
+		// Seed the alpha endpoints (values [4] and [5]) with the quantized min/max alpha of the block.
+		rgba_endpoint_vals[4] = endpoint_quant_tab[pixel_stats.m_min[3]];
+		rgba_endpoint_vals[5] = endpoint_quant_tab[pixel_stats.m_max[3]];
+
+		uint64_t rgba_blk_error = eval_solution_dp(
+			pixel_stats,
+			cem_index, 3,
+			rgba_endpoint_vals, endpoint_ise_range,
+			rgb_weight_vals, a_weight_vals, weight_ise_range,
+			enc_params);
+
+		assert(rgba_blk_error != UINT64_MAX);
+
+		if (rgba_blk_error < cur_blk_error)
+		{
+			cur_blk_error = rgba_blk_error;
+			memcpy(pEndpoint_vals, rgba_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS);
+			memcpy(pWeight_vals0, rgb_weight_vals, pixel_stats.m_num_pixels);
+			memcpy(pWeight_vals1, a_weight_vals, pixel_stats.m_num_pixels);
+
+			// Zero error can't be improved on.
+			if (!cur_blk_error)
+				return cur_blk_error;
+		}
+
+		float bounds_min = pixel_stats.m_min_f[3], bounds_max = pixel_stats.m_max_f[3];
+		if (bounds_min != bounds_max)
+		{
+			float a_vals[ASTC_LDR_MAX_BLOCK_PIXELS];
+			for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++)
+				a_vals[i] = pixel_stats.m_pixels_f[i][3];
+
+			const uint32_t TOTAL_PASSES = 2;
+
+			uint8_t trial_rgba_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+			uint8_t trial_rgb_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+			uint8_t trial_a_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+			for (uint32_t pass = 0; pass < TOTAL_PASSES; pass++)
+			{
+				float xl = 0.0f, xh = 0.0f;
+
+				// Pass 0 fits against the seed solution's alpha weights, later passes against the
+				// previous pass's trial alpha weights.
+				if (compute_least_squares_endpoints_1D(
+					pixel_stats.m_num_pixels, pass ? trial_a_weight_vals : a_weight_vals, get_ls_weights_ise(weight_ise_range),
+					&xl, &xh, a_vals, bounds_min, bounds_max))
+				{
+					// Keep the RGB/scale endpoint values, replace only the two alpha endpoints.
+					memcpy(trial_rgba_endpoint_vals, rgba_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS);
+
+					trial_rgba_endpoint_vals[4] = precise_round_bise_endpoint_val(xl, endpoint_ise_range);
+					trial_rgba_endpoint_vals[5] = precise_round_bise_endpoint_val(xh, endpoint_ise_range);
+
+					uint64_t trial_rgba_blk_error = eval_solution_dp(
+						pixel_stats,
+						cem_index, 3,
+						trial_rgba_endpoint_vals, endpoint_ise_range,
+						trial_rgb_weight_vals, trial_a_weight_vals, weight_ise_range,
+						enc_params);
+
+					assert(trial_rgba_blk_error != UINT64_MAX);
+
+					if (trial_rgba_blk_error < cur_blk_error)
+					{
+						cur_blk_error = trial_rgba_blk_error;
+						memcpy(pEndpoint_vals, trial_rgba_endpoint_vals, astc_helpers::NUM_MODE10_ENDPOINTS);
+						memcpy(pWeight_vals0, trial_rgb_weight_vals, pixel_stats.m_num_pixels);
+						memcpy(pWeight_vals1, trial_a_weight_vals, pixel_stats.m_num_pixels);
+					}
+					else
+					{
+						break;
+					}
+				}
+				else
+				{
+					break;
+				}
+			} // pass
+		}
+
+		return cur_blk_error;
+	}
+
+	// rgb/rgba base+scale, dual plane rgb (not a!)
+	// Encodes base+scale (CEM 6/10) with the second plane on one of the RGB components (ccs_index 0..2).
+	//
+	// Steps:
+	//  1. Solve the block as a single plane with encode_cem6_10().
+	//  2. Evaluate those endpoints as a dual plane solution to get both planes' weights.
+	//  3. With those weights fixed, re-solve the scale s and the high color H per channel in the
+	//     least squares sense (each channel uses the weight of the plane it belongs to) and retry
+	//     through try_cem6_10_dp().
+	//
+	// Outputs are only written when a candidate's error is strictly lower than cur_blk_error.
+	// Returns the (possibly lowered) cur_blk_error.
+	static uint64_t encode_cem6_10_dp_rgb(
+		uint32_t cem_index, uint32_t ccs_index,
+		const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params,
+		uint32_t endpoint_ise_range, uint32_t weight_ise_range,
+		uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error)
+	{
+		assert(g_initialized);
+		assert((cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A));
+		assert(ccs_index <= 2);
+		assert((pixel_stats.m_num_pixels) && (pixel_stats.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS));
+		assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+		assert(((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || (weight_ise_range == astc_helpers::BISE_64_LEVELS));
+		assert(pWeight_vals0 && pWeight_vals1);
+
+		// First solve using a single plane, then we'll introduce the other plane's weights and tune the encoded H/s values
+		uint8_t sp_endpoint_vals[astc_helpers::NUM_MODE10_ENDPOINTS] = { 0 };
+		uint8_t sp_weight_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+
+		uint64_t sp_block_err = encode_cem6_10(
+			cem_index,
+			pixel_stats, enc_params,
+			endpoint_ise_range, weight_ise_range,
+			sp_endpoint_vals, sp_weight_vals, UINT64_MAX);
+
+		assert(sp_block_err != UINT64_MAX);
+		BASISU_NOTE_UNUSED(sp_block_err);
+
+		// Now compute both plane's weights using the initial H/s values
+		uint8_t trial_weights0_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+		uint8_t trial_weights1_vals[ASTC_LDR_MAX_BLOCK_PIXELS] = { 0 };
+		uint64_t dp_blk_error = eval_solution_dp(
+			pixel_stats,
+			cem_index, ccs_index,
+			sp_endpoint_vals, endpoint_ise_range,
+			trial_weights0_vals, trial_weights1_vals, weight_ise_range,
+			enc_params);
+
+		if (dp_blk_error < cur_blk_error)
+		{
+			cur_blk_error = dp_blk_error;
+			// Copy only the values this CEM uses (same count try_cem6_10_dp() copies), so the
+			// RGB-only mode does not write past its endpoint values in the caller's buffer.
+			memcpy(pEndpoint_vals, sp_endpoint_vals, astc_helpers::get_num_cem_values(cem_index));
+			memcpy(pWeight_vals0, trial_weights0_vals, pixel_stats.m_num_pixels);
+			memcpy(pWeight_vals1, trial_weights1_vals, pixel_stats.m_num_pixels);
+
+			// Zero error can't be improved on.
+			if (!cur_blk_error)
+				return cur_blk_error;
+		}
+
+		// Compute refined H/s values using the current weights.
+		const float MAX_S = 255.0f / 256.0f;
+		const float EPS = 1e-6f;
+
+		// Per channel c, with a = 1 - w_c and b = w_c:
+		//   Pa[c] = sum a * p[c],  Pb[c] = sum b * p[c],  A[c] = sum a^2,  B[c] = sum a * b,  C[c] = sum b^2
+		vec3F Pa(0.0f); // (Pa_r,Pa_g,Pa_b)
+		vec3F Pb(0.0f); // (Pb_r,Pb_g,Pb_b)
+		float A[3] = { 0 }, B[3] = { 0 }, C[3] = { 0 }; // per-channel
+
+		// Weight dequantization table. Only consulted for ISE weights; with raw [0,64] weights
+		// (BISE_64_LEVELS) the range is limited to BISE_32_LEVELS for the lookup and the table goes unused.
+		const auto& dequant_weights_tab = astc_helpers::g_dequant_tables.get_weight_tab(minimum(astc_helpers::BISE_32_LEVELS, weight_ise_range)).m_ISE_to_val;
+
+		for (uint32_t i = 0; i < pixel_stats.m_num_pixels; i++)
+		{
+			float w0, w1;
+			if (weight_ise_range == astc_helpers::BISE_64_LEVELS)
+			{
+				w0 = (float)trial_weights0_vals[i] * (1.0f / 64.0f);
+				w1 = (float)trial_weights1_vals[i] * (1.0f / 64.0f);
+			}
+			else
+			{
+				w0 = dequant_weights_tab[trial_weights0_vals[i]] * (1.0f / 64.0f);
+				w1 = dequant_weights_tab[trial_weights1_vals[i]] * (1.0f / 64.0f);
+			}
+
+			// The ccs_index channel is interpolated with plane 1's weight, the other two with plane 0's.
+			float w[3] = { w0, w0, w0 };
+			w[ccs_index] = w1;
+
+			const vec3F& p = pixel_stats.m_pixels_f[i];
+
+			for (int c = 0; c < 3; ++c)
+			{
+				const float a = 1.0f - w[c];
+				const float b = w[c];
+
+				Pa[c] += a * p[c];
+				Pb[c] += b * p[c];
+				A[c] += a * a;
+				B[c] += a * b;
+				C[c] += b * b;
+			} // c
+		} // i
+
+		color_rgba actual_l(0), actual_h(0);
+		float actual_scale = 0;
+		decode_endpoints(cem_index, sp_endpoint_vals, endpoint_ise_range, actual_l, actual_h, &actual_scale);
+
+		// Decoded high color, normalized to [0,1].
+		vec3F H((float)actual_h[0], (float)actual_h[1], (float)actual_h[2]);
+		H *= (1.0f / 255.0f);
+
+		// Solve s for the current H:  s = (S1 - S3) / S2, with
+		//   S1 = sum_c H[c] * Pa[c],  S2 = sum_c H[c]^2 * A[c],  S3 = sum_c H[c]^2 * B[c]
+		const float S1 = H[0] * Pa[0] + H[1] * Pa[1] + H[2] * Pa[2];
+		float S2 = 0.0f, S3 = 0.0f;
+		for (int c = 0; c < 3; c++)
+		{
+			const float H2 = H[c] * H[c];
+			S2 += H2 * A[c];
+			S3 += H2 * B[c];
+		}
+
+		float new_s = actual_scale;
+		if (S2 > EPS)
+			new_s = (S1 - S3) / S2;
+
+		new_s = clamp(new_s, 0.0f, MAX_S);
+
+		// Solve each H[c] for that s:  H[c] = (Pb[c] + s * Pa[c]) / (A[c] s^2 + 2 B[c] s + C[c]),
+		// falling back to 0 when the denominator is degenerate.
+		vec3F new_H(0.0f);
+		for (int c = 0; c < 3; ++c)
+		{
+			const float den = A[c] * new_s * new_s + 2.0f * B[c] * new_s + C[c];
+
+			float Hc = 0.0f;
+			if (den > EPS)
+			{
+				const float num = Pb[c] + new_s * Pa[c];
+				Hc = num / den;
+			}
+			new_H[c] = Hc;
+		}
+
+		// try_cem6_10_dp() writes the outputs and lowers cur_blk_error only if the refined values win.
+		bool improved_flag = try_cem6_10_dp(
+			cem_index, ccs_index,
+			pixel_stats, enc_params,
+			endpoint_ise_range, weight_ise_range,
+			new_s, (float)actual_l[3] * (1.0f / 255.0f), vec4F(new_H[0], new_H[1], new_H[2], (float)actual_h[3] * (1.0f / 255.0f)),
+			pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error);
+		(void)improved_flag;
+
+		return cur_blk_error;
+	}
+
+ // dispatcher + uint64_t cem_encode_pixels( + uint32_t cem_index, int ccs_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error, + bool use_blue_contraction, bool *pBase_ofs_clamped_flag) + { + assert(g_initialized); + assert((ccs_index >= -1) && (ccs_index <= 3)); + assert(astc_helpers::is_cem_ldr(cem_index)); + assert(pEndpoint_vals); + assert(pWeight_vals0); + + const bool dual_plane = (ccs_index >= 0); + + if (pBase_ofs_clamped_flag) + *pBase_ofs_clamped_flag = false; + + uint64_t blk_error = UINT64_MAX; + + switch (cem_index) + { + case astc_helpers::CEM_LDR_LUM_DIRECT: + { + assert(!dual_plane); + + blk_error = encode_cem0_4( + cem_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, cur_blk_error); + + break; + } + case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT: + { + if (dual_plane) + { + assert(ccs_index == 3); + assert(pWeight_vals1); + + blk_error = encode_cem4_dp_a( + cem_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error); + } + else + { + blk_error = encode_cem0_4( + cem_index, + pixel_stats, enc_params, +
endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, cur_blk_error); + } + break; + } + + case astc_helpers::CEM_LDR_RGB_DIRECT: + case astc_helpers::CEM_LDR_RGBA_DIRECT: + case astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET: + case astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET: + { + if (dual_plane) + { + assert(pWeight_vals1); + blk_error = encode_cem8_12_9_13_dp( + cem_index, ccs_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error, use_blue_contraction, pBase_ofs_clamped_flag); + } + else + { + blk_error = encode_cem8_12_9_13( + cem_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, cur_blk_error, use_blue_contraction, pBase_ofs_clamped_flag); + } + break; + } + case astc_helpers::CEM_LDR_RGB_BASE_SCALE: + { + if (dual_plane) + { + assert(ccs_index <= 2); + assert(pWeight_vals1); + + blk_error = encode_cem6_10_dp_rgb( + cem_index, ccs_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error); + } + else + { + blk_error = encode_cem6_10( + cem_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, cur_blk_error); + } + break; + } + case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + { + if (dual_plane) + { + assert(pWeight_vals1); + + if (ccs_index == 3) + { + blk_error = encode_cem10_dp_a( + cem_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error); + } + else + { + blk_error = encode_cem6_10_dp_rgb( + cem_index, ccs_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, pWeight_vals0, pWeight_vals1, cur_blk_error); + } + } + else + { + blk_error = encode_cem6_10( + cem_index, + pixel_stats, enc_params, + endpoint_ise_range, weight_ise_range, + pEndpoint_vals, 
pWeight_vals0, cur_blk_error); + } + break; + } + default: + { + assert(0); + break; + } + } + + return blk_error; + } + + //--------------------------------------------------------------------------------------------- + + float surrogate_evaluate_rgba_sp(const pixel_stats_t& ps, const vec4F& l, const vec4F& h, float* pWeights0, uint32_t num_weight_levels, + const cem_encode_params& enc_params, uint32_t flags) + { + assert(g_initialized); + assert((ps.m_num_pixels) && (ps.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert(pWeights0); + + const float wr = (float)enc_params.m_comp_weights[0], wg = (float)enc_params.m_comp_weights[1], + wb = (float)enc_params.m_comp_weights[2], wa = (float)enc_params.m_comp_weights[3]; + + float total_err = 0; + + const bool compute_error = ((flags & cFlagNoError) == 0); + + float lr = l[0], lg = l[1], lb = l[2], la = l[3]; + float dr = h[0] - lr, dg = h[1] - lg, db = h[2] - lb, da = h[3] - la; + float delta_col_nrm = dr * dr + dg * dg + db * db + da * da; + + if (flags & cFlagDisableQuant) + { + float f = (float)1.0f / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL); + + lr *= -dr; lg *= -dg; lb *= -db; la *= -da; + + dr *= f; dg *= f; db *= f; da *= f; + float l_sum = (lr + lg + lb + la) * f; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& p = ps.m_pixels_f[i]; + const float r = p[0], g = p[1], b = p[2], a = p[3]; + + float w = r * dr + g * dg + b * db + a * da + l_sum; + + if (w < 0.0f) + w = 0.0f; + else if (w > 1.0f) + w = 1.0f; + + pWeights0[i] = w; + + if (compute_error) + { + float one_minus_w = 1.0f - w; + + float dec_r = l[0] * one_minus_w + h[0] * w; + float dec_g = l[1] * one_minus_w + h[1] * w; + float dec_b = l[2] * one_minus_w + h[2] * w; + float dec_a = l[3] * one_minus_w + h[3] * w; + + float diff_r = r - dec_r; + float diff_g = g - dec_g; + float diff_b = b - dec_b; + float diff_a = a - dec_a; + + total_err += (wr * diff_r * diff_r) + (wg * diff_g * diff_g) + (wb * diff_b * diff_b) + (wa * diff_a * 
diff_a); + } + + } // i + } + else + { + const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1); + + float f = (float)(num_weight_levels - 1) / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL); + + lr *= -dr; lg *= -dg; lb *= -db; la *= -da; + + dr *= f; dg *= f; db *= f; da *= f; + float l_sum_biased = (lr + lg + lb + la) * f + .5f; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& p = ps.m_pixels_f[i]; + const float r = p[0], g = p[1], b = p[2], a = p[3]; + + float w = (float)fast_floorf_int(r * dr + g * dg + b * db + a * da + l_sum_biased) * inv_weight_levels; + + if (w < 0.0f) + w = 0.0f; + else if (w > 1.0f) + w = 1.0f; + + pWeights0[i] = w; + + if (compute_error) + { + float one_minus_w = 1.0f - w; + + float dec_r = l[0] * one_minus_w + h[0] * w; + float dec_g = l[1] * one_minus_w + h[1] * w; + float dec_b = l[2] * one_minus_w + h[2] * w; + float dec_a = l[3] * one_minus_w + h[3] * w; + + float diff_r = r - dec_r; + float diff_g = g - dec_g; + float diff_b = b - dec_b; + float diff_a = a - dec_a; + + total_err += (wr * diff_r * diff_r) + (wg * diff_g * diff_g) + (wb * diff_b * diff_b) + (wa * diff_a * diff_a); + } + + } // i + } + + return total_err; + + } + + float surrogate_evaluate_rgba_dp(uint32_t ccs_index, const pixel_stats_t& ps, const vec4F& l, const vec4F& h, float* pWeights0, float* pWeights1, uint32_t num_weight_levels, + const cem_encode_params& enc_params, uint32_t flags) + { + assert(g_initialized); + assert((ccs_index >= 0) && (ccs_index <= 3)); + assert((ps.m_num_pixels) && (ps.m_num_pixels <= ASTC_LDR_MAX_BLOCK_PIXELS)); + assert(pWeights0 && pWeights1); + + const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1); + + const uint32_t c0 = (ccs_index + 1) & 3, c1 = (ccs_index + 2) & 3, c2 = (ccs_index + 3) & 3; + + const float orig_lx = l[c0], orig_ly = l[c1], orig_lz = l[c2], orig_lw = l[ccs_index]; + const float orig_hx = h[c0], orig_hy = h[c1], orig_hz = h[c2], orig_hw = h[ccs_index]; + + const float 
wx = (float)enc_params.m_comp_weights[c0], wy = (float)enc_params.m_comp_weights[c1], + wz = (float)enc_params.m_comp_weights[c2], ww = (float)enc_params.m_comp_weights[ccs_index]; + + float total_err = 0; + + const bool compute_error = ((flags & cFlagNoError) == 0); + + if (flags & cFlagDisableQuant) + { + // Plane 0 + { + float dx = orig_hx - orig_lx, dy = orig_hy - orig_ly, dz = orig_hz - orig_lz; + + float delta_col_nrm = dx * dx + dy * dy + dz * dz; + + float f = (float)1.0f / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL); + + float lx = orig_lx, ly = orig_ly, lz = orig_lz; + lx *= -dx; ly *= -dy; lz *= -dz; + + dx *= f; dy *= f; dz *= f; + float l_sum = (lx + ly + lz) * f; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& p = ps.m_pixels_f[i]; + const float x = p[c0], y = p[c1], z = p[c2]; + + float weight = x * dx + y * dy + z * dz + l_sum; + + if (weight < 0.0f) + weight = 0.0f; + else if (weight > 1.0f) + weight = 1.0f; + + pWeights0[i] = weight; + + if (compute_error) + { + float one_minus_weight = 1.0f - weight; + + float dec_x = orig_lx * one_minus_weight + orig_hx * weight; + float dec_y = orig_ly * one_minus_weight + orig_hy * weight; + float dec_z = orig_lz * one_minus_weight + orig_hz * weight; + + float diff_x = x - dec_x; + float diff_y = y - dec_y; + float diff_z = z - dec_z; + + total_err += (wx * diff_x * diff_x) + (wy * diff_y * diff_y) + (wz * diff_z * diff_z); + } + + } // i + } + + // Plane 1 + { + const float delta_w = orig_hw - orig_lw; + const float f = (fabsf(delta_w) > REALLY_SMALL_FLOAT_VAL) ? (1.0f / delta_w) : 0.0f; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& p = ps.m_pixels_f[i]; + const float w = p[ccs_index]; + + float weight = (w - orig_lw) * f; + + if (weight < 0.0f) + weight = 0.0f; + else if (weight > 1.0f) + weight = 1.0f; + + pWeights1[i] = weight; + + if (compute_error) + { + // Error for DP here is 0 if there's no quant and L/H are sufficient to cover the entire span. 
+ if ((w < orig_lw) || (w > orig_hw)) + { + float one_minus_weight = 1.0f - weight; + + float dec_w = orig_lw * one_minus_weight + orig_hw * weight; + + float diff_w = w - dec_w; + + total_err += (ww * diff_w * diff_w); + } + } + + } // i + } + } + else + { + // Plane 0 + { + float dx = orig_hx - orig_lx, dy = orig_hy - orig_ly, dz = orig_hz - orig_lz; + + float delta_col_nrm = dx * dx + dy * dy + dz * dz; + + float f = (float)(num_weight_levels - 1) / (delta_col_nrm + REALLY_SMALL_FLOAT_VAL); + + float lx = orig_lx, ly = orig_ly, lz = orig_lz; + lx *= -dx; ly *= -dy; lz *= -dz; + + dx *= f; dy *= f; dz *= f; + float l_sum_biased = (lx + ly + lz) * f + .5f; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& p = ps.m_pixels_f[i]; + const float x = p[c0], y = p[c1], z = p[c2]; + + float weight = (float)fast_floorf_int(x * dx + y * dy + z * dz + l_sum_biased) * inv_weight_levels; + + if (weight < 0.0f) + weight = 0.0f; + else if (weight > 1.0f) + weight = 1.0f; + + pWeights0[i] = weight; + + if (compute_error) + { + float one_minus_weight = 1.0f - weight; + + float dec_x = orig_lx * one_minus_weight + orig_hx * weight; + float dec_y = orig_ly * one_minus_weight + orig_hy * weight; + float dec_z = orig_lz * one_minus_weight + orig_hz * weight; + + float diff_x = x - dec_x; + float diff_y = y - dec_y; + float diff_z = z - dec_z; + + total_err += (wx * diff_x * diff_x) + (wy * diff_y * diff_y) + (wz * diff_z * diff_z); + } + + } // i + } + + // Plane 1 + { + const float delta_w = orig_hw - orig_lw; + const float f = (fabs(delta_w) > REALLY_SMALL_FLOAT_VAL) ? 
((float)(num_weight_levels - 1) / delta_w) : 0.0f; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& p = ps.m_pixels_f[i]; + const float w = p[ccs_index]; + + float weight = (float)fast_floorf_int((w - orig_lw) * f + .5f) * inv_weight_levels; + + if (weight < 0.0f) + weight = 0.0f; + else if (weight > 1.0f) + weight = 1.0f; + + pWeights1[i] = weight; + + if (compute_error) + { + float one_minus_weight = 1.0f - weight; + + float dec_w = orig_lw * one_minus_weight + orig_hw * weight; + + float diff_w = w - dec_w; + + total_err += (ww * diff_w * diff_w); + } + + } // i + } + } + + return total_err; + } + + //--------------------------------------------------------------------------------------------- + + float surrogate_quant_endpoint_val(float e, uint32_t num_endpoint_levels, uint32_t flags) + { + assert((e >= 0.0f) && (e <= 1.0f)); + + if (flags & cFlagDisableQuant) + return e; + + const float endpoint_levels_minus_1 = (float)(num_endpoint_levels - 1); + const float inv_endpoint_levels = 1.0f / endpoint_levels_minus_1; + return (float)fast_roundf_pos_int(e * endpoint_levels_minus_1) * inv_endpoint_levels; + } + + vec4F surrogate_quant_endpoint(const vec4F& e, uint32_t num_endpoint_levels, uint32_t flags) + { + if (flags & cFlagDisableQuant) + return e; + + const float endpoint_levels_minus_1 = (float)(num_endpoint_levels - 1); + const float inv_endpoint_levels = 1.0f / endpoint_levels_minus_1; + + assert((e[0] >= 0.0f) && (e[0] <= 1.0f)); + assert((e[1] >= 0.0f) && (e[1] <= 1.0f)); + assert((e[2] >= 0.0f) && (e[2] <= 1.0f)); + assert((e[3] >= 0.0f) && (e[3] <= 1.0f)); + + vec4F res; + res[0] = (float)fast_roundf_pos_int(e[0] * endpoint_levels_minus_1) * inv_endpoint_levels; + res[1] = (float)fast_roundf_pos_int(e[1] * endpoint_levels_minus_1) * inv_endpoint_levels; + res[2] = (float)fast_roundf_pos_int(e[2] * endpoint_levels_minus_1) * inv_endpoint_levels; + res[3] = (float)fast_roundf_pos_int(e[3] * endpoint_levels_minus_1) * 
inv_endpoint_levels; + + return res; + } + + static uint32_t get_num_weight_levels(uint32_t weight_ise_range) + { + // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) + const uint32_t num_weight_levels = (weight_ise_range == astc_helpers::BISE_64_LEVELS) ? 65 : astc_helpers::get_ise_levels(weight_ise_range); + return num_weight_levels; + } + + //--------------------------------------------------------------------------------------------- + + static float cem_surrogate_encode_cem6_10_sp( + uint32_t cem_index, + const pixel_stats_t& ps, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F& low_endpoint, vec4F& high_endpoint, float &s, float* pWeights0, uint32_t flags) + { + const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + + // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) + const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A); + + float d_min = BIG_FLOAT_VAL, d_max = -BIG_FLOAT_VAL; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F p(ps.m_pixels_f[i]); + + float dot = p.dot3(ps.m_zero_rel_axis3); + + if (dot < d_min) + d_min = dot; + + if (dot > d_max) + d_max = dot; + } + + vec3F low_color3_f(d_min * ps.m_zero_rel_axis3); + low_color3_f.clamp(0.0f, 1.0f); + + vec3F high_color3_f(d_max * ps.m_zero_rel_axis3); + high_color3_f.clamp(0.0f, 1.0f); + + const float MAX_S = 255.0f / 256.0f; + + float scale = MAX_S; + + float d = low_color3_f.dot(high_color3_f); + float nrm = high_color3_f.norm(); + if (nrm > 0.0f) + scale = d / nrm; + + scale = clamp(scale, 0.0f, MAX_S); + + scale = surrogate_quant_endpoint_val(scale * (256.0f / 255.0f), num_endpoint_levels, flags); + + s = scale; + + high_endpoint = surrogate_quant_endpoint(vec4F(high_color3_f[0], high_color3_f[1], high_color3_f[2], cem_has_alpha ? 
ps.m_max_f[3] : 1.0f), num_endpoint_levels, flags); + + low_endpoint = vec4F(high_endpoint[0] * scale, high_endpoint[1] * scale, high_endpoint[2] * scale, cem_has_alpha ? ps.m_min_f[3] : 1.0f); + + return surrogate_evaluate_rgba_sp(ps, low_endpoint, high_endpoint, pWeights0, num_weight_levels, enc_params, flags); + } + + static float cem_surrogate_encode_cem6_10_dp( + uint32_t cem_index, uint32_t ccs_index, + const pixel_stats_t& ps, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F& low_endpoint, vec4F& high_endpoint, float& s, float* pWeights0, float* pWeights1, uint32_t flags) + { + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A); + BASISU_NOTE_UNUSED(cem_has_alpha); + + // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) + const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); + + assert(cem_has_alpha || (ccs_index <= 2)); + + float temp_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + cem_surrogate_encode_cem6_10_sp( + (ccs_index == 3) ? 
(uint32_t)astc_helpers::CEM_LDR_RGB_BASE_SCALE : cem_index, + ps, enc_params, endpoint_ise_range, weight_ise_range, low_endpoint, high_endpoint, s, temp_weights, flags); + + if (ccs_index == 3) + { + low_endpoint[3] = ps.m_min_f[3]; + high_endpoint[3] = ps.m_max_f[3]; + } + + return surrogate_evaluate_rgba_dp(ccs_index, ps, low_endpoint, high_endpoint, pWeights0, pWeights1, num_weight_levels, enc_params, flags); + } + + static float cem_surrogate_encode_cem8_12_sp( + uint32_t cem_index, + const pixel_stats_t& ps, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F& low_endpoint, vec4F& high_endpoint, float* pWeights0, uint32_t flags) + { + const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + + // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) + const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT); + const uint32_t num_comps = cem_has_alpha ? 4 : 3; + + float d_min = BIG_FLOAT_VAL, d_max = -BIG_FLOAT_VAL; + uint32_t l_idx = 0, h_idx = 0; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F p(ps.m_pixels_f[i] - ps.m_mean_f); + + float dot = cem_has_alpha ? 
p.dot(ps.m_mean_rel_axis4) : p.dot3(ps.m_mean_rel_axis3); + + if (dot < d_min) + { + d_min = dot; + l_idx = i; + } + + if (dot > d_max) + { + d_max = dot; + h_idx = i; + } + } + + low_endpoint = surrogate_quant_endpoint(ps.m_pixels_f[l_idx], num_endpoint_levels, flags); + high_endpoint = surrogate_quant_endpoint(ps.m_pixels_f[h_idx], num_endpoint_levels, flags); + + if (!cem_has_alpha) + { + low_endpoint[3] = 1.0f; + high_endpoint[3] = 1.0f; + } + + if (low_endpoint.dot(vec4F(1.0f)) > high_endpoint.dot(vec4F(1.0f))) + std::swap(low_endpoint, high_endpoint); + + if ((flags & cFlagDisableQuant) == 0) + { + for (uint32_t i = 0; i < num_comps; i++) + { + if ((low_endpoint[i] == high_endpoint[i]) && (ps.m_min_f[i] != ps.m_max_f[i])) + { + const float inv_endpoint_levels = 1.0f / (float)(num_endpoint_levels - 1); + + float best_dist = BIG_FLOAT_VAL; + float best_l = 0.0f, best_h = 0.0f; + + for (int ld = -2; ld <= 0; ld++) + { + float actual_l = saturate(low_endpoint[i] + (float)ld * inv_endpoint_levels); + + for (int hd = 0; hd <= 2; hd++) + { + float actual_h = saturate(high_endpoint[i] + (float)hd * inv_endpoint_levels); + + float v0 = lerp(actual_l, actual_h, 1.0f / 3.0f); + float v1 = lerp(actual_l, actual_h, 2.0f / 3.0f); + assert(v0 <= v1); + + float dist0 = v0 - ps.m_min_f[0]; + float dist1 = v1 - ps.m_max_f[0]; + + float total_dist = dist0 * dist0 + dist1 * dist1; + if (total_dist < best_dist) + { + best_dist = total_dist; + best_l = actual_l; + best_h = actual_h; + } + } // hd + } // ld + + low_endpoint[i] = best_l; + high_endpoint[i] = best_h; + } + } + } + + return surrogate_evaluate_rgba_sp(ps, low_endpoint, high_endpoint, pWeights0, num_weight_levels, enc_params, flags); + } + + static float cem_surrogate_encode_cem8_12_dp( + uint32_t cem_index, uint32_t ccs_index, + const pixel_stats_t& ps, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F& low_endpoint, vec4F& high_endpoint, float* pWeights0, float 
*pWeights1, uint32_t flags) + { + assert((ccs_index >= 0) && (ccs_index <= 3)); + const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + + // astc_helpers::BISE_64_LEVELS=raw weights ([0,64], NOT [0,63]) + const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); + + const bool cem_has_alpha = (cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT); + const uint32_t num_comps = cem_has_alpha ? 4 : 3; + + assert(cem_has_alpha || (ccs_index <= 2)); + + vec4F flattened_pixels[ASTC_LDR_MAX_BLOCK_PIXELS]; + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + flattened_pixels[i] = ps.m_pixels_f[i]; + + flattened_pixels[i][ccs_index] = 0.0f; + + if (!cem_has_alpha) + flattened_pixels[i][3] = 0.0f; + } + + vec4F flattened_pixels_mean(ps.m_mean_f); + flattened_pixels_mean[ccs_index] = 0.0f; + + if (!cem_has_alpha) + flattened_pixels_mean[3] = 0.0f; + + // suppress bogus gcc warning on flattened_pixels +#ifndef __clang__ +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif +#endif + const vec4F flattened_axis(calc_pca_4D(ps.m_num_pixels, flattened_pixels, flattened_pixels_mean)); + +#ifndef __clang__ +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +#endif + + float best_dl = BIG_FLOAT_VAL, best_dh = -BIG_FLOAT_VAL; + int best_l_index = 0, best_h_index = 0; + + for (uint32_t c = 0; c < ps.m_num_pixels; c++) + { + const vec4F px(flattened_pixels[c] - flattened_pixels_mean); + + float p = px.dot(flattened_axis); + if (p < best_dl) + { + best_dl = p; + best_l_index = c; + } + + if (p > best_dh) + { + best_dh = p; + best_h_index = c; + } + } // c + + vec4F low_color_f(ps.m_pixels_f[best_l_index]), high_color_f(ps.m_pixels_f[best_h_index]); + + low_color_f[ccs_index] = 0.0f; + high_color_f[ccs_index] = 0.0f; + + if (!cem_has_alpha) + { + low_color_f[3] = 1.0f; + high_color_f[3] = 1.0f; + } + + if (low_color_f.dot(vec4F(1.0f)) > high_color_f.dot(vec4F(1.0f))) + 
std::swap(low_color_f, high_color_f); + + low_color_f[ccs_index] = ps.m_min_f[ccs_index]; + high_color_f[ccs_index] = ps.m_max_f[ccs_index]; + + if (!cem_has_alpha) + { + low_color_f[3] = 1.0f; + high_color_f[3] = 1.0f; + } + + low_endpoint = surrogate_quant_endpoint(low_color_f, num_endpoint_levels, flags); + high_endpoint = surrogate_quant_endpoint(high_color_f, num_endpoint_levels, flags); + + if ((flags & cFlagDisableQuant) == 0) + { + for (uint32_t i = 0; i < num_comps; i++) + { + if ((low_endpoint[i] == high_endpoint[i]) && (ps.m_min_f[i] != ps.m_max_f[i])) + { + const float inv_endpoint_levels = 1.0f / (float)(num_endpoint_levels - 1); + + float best_dist = BIG_FLOAT_VAL; + float best_l = 0.0f, best_h = 0.0f; + + for (int ld = -2; ld <= 0; ld++) + { + float actual_l = saturate(low_endpoint[i] + (float)ld * inv_endpoint_levels); + + for (int hd = 0; hd <= 2; hd++) + { + float actual_h = saturate(high_endpoint[i] + (float)hd * inv_endpoint_levels); + + float v0 = lerp(actual_l, actual_h, 1.0f / 3.0f); + float v1 = lerp(actual_l, actual_h, 2.0f / 3.0f); + assert(v0 <= v1); + + //if (v0 > v1) + // std::swap(v0, v1); + + float dist0 = v0 - ps.m_min_f[0]; + float dist1 = v1 - ps.m_max_f[0]; + + float total_dist = dist0 * dist0 + dist1 * dist1; + if (total_dist < best_dist) + { + best_dist = total_dist; + best_l = actual_l; + best_h = actual_h; + } + } // hd + } // ld + + low_endpoint[i] = best_l; + high_endpoint[i] = best_h; + } + } + } + + return surrogate_evaluate_rgba_dp(ccs_index, ps, low_endpoint, high_endpoint, pWeights0, pWeights1, num_weight_levels, enc_params, flags); + } + + static float cem_surrogate_encode_cem0_4_sp_or_dp( + uint32_t cem_index, int ccs_index, + const pixel_stats_t& ps, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F& low_endpoint, vec4F& high_endpoint, float* pWeights0, float *pWeights1, uint32_t flags) + { + const bool cem_has_alpha = (cem_index == 
astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT); + const bool dual_plane = (ccs_index == 3); + + if (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT) + { + assert((ccs_index == -1) || (ccs_index == 3)); + } + else + { + assert(cem_index == astc_helpers::CEM_LDR_LUM_DIRECT); + assert(ccs_index == -1); + } + + const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + const uint32_t num_weight_levels = get_num_weight_levels(weight_ise_range); + + float lum_l = BIG_FLOAT_VAL, lum_h = -BIG_FLOAT_VAL; + + for (uint32_t i = 0; i < ps.m_num_pixels; i++) + { + const vec4F& px = ps.m_pixels_f[i]; + + float l = (px[0] + px[1] + px[2]) * (1.0f / 3.0f); + + lum_l = minimum(lum_l, l); + lum_h = maximum(lum_h, l); + } + + const float a_l = cem_has_alpha ? ps.m_min_f[3] : 1.0f; + const float a_h = cem_has_alpha ? ps.m_max_f[3] : 1.0f; + + low_endpoint.set(lum_l, lum_l, lum_l, a_l); + high_endpoint.set(lum_h, lum_h, lum_h, a_h); + + low_endpoint = surrogate_quant_endpoint(low_endpoint, num_endpoint_levels, flags); + high_endpoint = surrogate_quant_endpoint(high_endpoint, num_endpoint_levels, flags); + + if (dual_plane) + return surrogate_evaluate_rgba_dp(ccs_index, ps, low_endpoint, high_endpoint, pWeights0, pWeights1, num_weight_levels, enc_params, flags); + else + return surrogate_evaluate_rgba_sp(ps, low_endpoint, high_endpoint, pWeights0, num_weight_levels, enc_params, flags); + } + + float cem_surrogate_encode_pixels( + uint32_t cem_index, int ccs_index, + const pixel_stats_t& ps, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F &low_endpoint, vec4F &high_endpoint, float &s, float* pWeights0, float* pWeights1, uint32_t flags) + { + assert(g_initialized); + assert((ccs_index >= -1) && (ccs_index <= 3)); + assert(astc_helpers::is_cem_ldr(cem_index)); + assert(pWeights0 && pWeights1); + + const bool dual_plane = (ccs_index >= 0); + + switch (cem_index) + { + case astc_helpers::CEM_LDR_LUM_DIRECT: + 
case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT: + { + return cem_surrogate_encode_cem0_4_sp_or_dp( + cem_index, ccs_index, + ps, enc_params, + endpoint_ise_range, weight_ise_range, + low_endpoint, high_endpoint, pWeights0, pWeights1, flags); + } + case astc_helpers::CEM_LDR_RGB_BASE_SCALE: + case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + { + if (dual_plane) + { + return cem_surrogate_encode_cem6_10_dp( + cem_index, ccs_index, + ps, enc_params, + endpoint_ise_range, weight_ise_range, + low_endpoint, high_endpoint, s, pWeights0, pWeights1, flags); + } + else + { + return cem_surrogate_encode_cem6_10_sp( + cem_index, + ps, enc_params, + endpoint_ise_range, weight_ise_range, + low_endpoint, high_endpoint, s, pWeights0, flags); + } + break; + } + case astc_helpers::CEM_LDR_RGB_DIRECT: + case astc_helpers::CEM_LDR_RGBA_DIRECT: + { + if (dual_plane) + { + return cem_surrogate_encode_cem8_12_dp( + cem_index, ccs_index, + ps, enc_params, + endpoint_ise_range, weight_ise_range, + low_endpoint, high_endpoint, pWeights0, pWeights1, flags); + } + else + { + return cem_surrogate_encode_cem8_12_sp( + cem_index, + ps, enc_params, + endpoint_ise_range, weight_ise_range, + low_endpoint, high_endpoint, pWeights0, flags); + } + + break; + } + default: + assert(0); + break; + } + + return BIG_FLOAT_VAL; + } + + //--------------------------------------------------------------------------------------------- + + uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] = + { + { 0, 1, 2 }, + { 1, 2, 0 }, + { 2, 0, 1 }, + { 0, 2, 1 }, + { 1, 0, 2 }, + { 2, 1, 0 } + }; + + partition_pattern_vec::partition_pattern_vec() + { + clear(); + } + + partition_pattern_vec::partition_pattern_vec(const partition_pattern_vec& other) + { + *this = other; + } + + partition_pattern_vec::partition_pattern_vec(uint32_t width, uint32_t height, const uint8_t *pParts) : + m_width(width), m_height(height) + { + if (pParts) + { + memcpy(m_parts, pParts, get_total()); + } + } + + void 
partition_pattern_vec::init(uint32_t width, uint32_t height, const uint8_t* pParts) + { + m_width = width; + m_height = height; + if (pParts) + { + const uint32_t num_texels = get_total(); + memcpy(m_parts, pParts, num_texels); + } + } + + void partition_pattern_vec::clear() + { + m_width = 0; + m_height = 0; + memset(m_parts, 0, sizeof(m_parts)); + } + + partition_pattern_vec& partition_pattern_vec::operator= (const partition_pattern_vec& rhs) + { + if (this == &rhs) + return *this; + + m_width = rhs.m_width; + m_height = rhs.m_height; + memcpy(m_parts, rhs.m_parts, get_total()); + + return *this; + } + + // misnamed- just SAD distance, not square + int partition_pattern_vec::get_squared_distance(const partition_pattern_vec& other) const + { + const uint32_t total_pixels = get_total(); + + int total_dist = 0; + for (uint32_t i = 0; i < total_pixels; i++) + total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]); + + return total_dist; + } + + partition_pattern_vec partition_pattern_vec::get_permuted2(uint32_t permute_index) const + { + assert(permute_index <= 1); + const uint32_t total_pixels = get_total(); + + partition_pattern_vec res(m_width, m_height); + for (uint32_t i = 0; i < total_pixels; i++) + { + assert(m_parts[i] <= 1); + res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index); + } + + return res; + } + + partition_pattern_vec partition_pattern_vec::get_permuted3(uint32_t permute_index) const + { + assert(permute_index <= 5); + const uint32_t total_pixels = get_total(); + + partition_pattern_vec res(m_width, m_height); + for (uint32_t i = 0; i < total_pixels; i++) + { + assert(m_parts[i] <= 2); + res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]]; + } + + return res; + } + + partition_pattern_vec partition_pattern_vec::get_canonicalized() const + { + partition_pattern_vec res(m_width, m_height); + + const uint32_t total_pixels = get_total(); + + int new_labels[4] = { -1, -1, -1, -1 }; + + uint32_t next_index = 0; + for (uint32_t i = 0; i < 
total_pixels; i++) + { + uint32_t p = m_parts[i]; + assert(p <= 3); + + if (new_labels[p] == -1) + new_labels[p] = next_index++; + + res.m_parts[i] = (uint8_t)new_labels[p]; + } + + return res; + } + + // This requires no redundant patterns, i.e. all must be unique. + bool vp_tree::init(uint32_t n, const partition_pattern_vec* pUnique_pats) + { + clear(); + + uint_vec pat_indices(n); + for (uint32_t i = 0; i < n; i++) + pat_indices[i] = i; + + std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); + + if (root_idx.first == -1) + return false; + + m_nodes.resize(1); + m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first]; + m_nodes[0].m_point_index = root_idx.first; + m_nodes[0].m_dist = root_idx.second; + m_nodes[0].m_inner_node = -1; + m_nodes[0].m_outer_node = -1; + + uint_vec inner_list, outer_list; + + inner_list.reserve(n / 2); + outer_list.reserve(n / 2); + + for (uint32_t pat_index = 0; pat_index < n; pat_index++) + { + if ((int)pat_index == root_idx.first) + continue; + + const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]); + + if (dist <= root_idx.second) + inner_list.push_back(pat_index); + else + outer_list.push_back(pat_index); + } + + if (inner_list.size()) + { + m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list); + if (m_nodes[0].m_inner_node < 0) + return false; + } + + if (outer_list.size()) + { + m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list); + if (m_nodes[0].m_outer_node < 0) + return false; + } + + return true; + } + + void vp_tree::find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results) const + { + assert((num_subsets >= 2) && (num_subsets <= 3)); + + results.clear(); + + if (!m_nodes.size()) + return; + + uint32_t num_desired_pats; + partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS]; + + if (num_subsets == 2) + { + num_desired_pats = 2; + for (uint32_t i = 0; i < 2; i++) + 
desired_pats[i] = desired_pat.get_permuted2(i); + } + else + { + num_desired_pats = NUM_PART3_MAPPINGS; + for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++) + desired_pats[i] = desired_pat.get_permuted3(i); + } + +#if 0 + find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results); +#else + find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results); +#endif + } + + void vp_tree::find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) const + { + float best_dist_to_vantage = BIG_FLOAT_VAL; + uint32_t best_mapping = 0; + for (uint32_t i = 0; i < num_desired_pats; i++) + { + float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); + if (dist < best_dist_to_vantage) + { + best_dist_to_vantage = dist; + best_mapping = i; + } + } + + result r; + r.m_dist = best_dist_to_vantage; + r.m_mapping_index = best_mapping; + r.m_pat_index = m_nodes[node_index].m_point_index; + + results.insert(r, max_results); + + if (best_dist_to_vantage <= m_nodes[node_index].m_dist) + { + // inner first + if (m_nodes[node_index].m_inner_node >= 0) + find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); + + if (m_nodes[node_index].m_outer_node >= 0) + { + if ((results.get_size() < max_results) || + ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) + ) + { + find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); + } + } + } + else + { + // outer first + if (m_nodes[node_index].m_outer_node >= 0) + find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); + + if (m_nodes[node_index].m_inner_node >= 0) + { + if ((results.get_size() < max_results) || + ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) 
+ ) + { + find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); + } + } + } + } + + void vp_tree::find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) const + { + uint_vec node_stack; + node_stack.reserve(16); + node_stack.push_back(init_node_index); + + do + { + const uint32_t node_index = node_stack.back(); + node_stack.pop_back(); + + float best_dist_to_vantage = BIG_FLOAT_VAL; + uint32_t best_mapping = 0; + for (uint32_t i = 0; i < num_desired_pats; i++) + { + float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); + if (dist < best_dist_to_vantage) + { + best_dist_to_vantage = dist; + best_mapping = i; + } + } + + result r; + r.m_dist = best_dist_to_vantage; + r.m_mapping_index = best_mapping; + r.m_pat_index = m_nodes[node_index].m_point_index; + + results.insert(r, max_results); + + if (best_dist_to_vantage <= m_nodes[node_index].m_dist) + { + if (m_nodes[node_index].m_outer_node >= 0) + { + if ((results.get_size() < max_results) || + ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) + ) + { + node_stack.push_back(m_nodes[node_index].m_outer_node); + } + } + + // inner first + if (m_nodes[node_index].m_inner_node >= 0) + { + node_stack.push_back(m_nodes[node_index].m_inner_node); + } + } + else + { + if (m_nodes[node_index].m_inner_node >= 0) + { + if ((results.get_size() < max_results) || + ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) + ) + { + node_stack.push_back(m_nodes[node_index].m_inner_node); + } + } + + // outer first + if (m_nodes[node_index].m_outer_node >= 0) + { + node_stack.push_back(m_nodes[node_index].m_outer_node); + } + } + + } while (!node_stack.empty()); + } + + // returns the index of the new node, or -1 on error + int vp_tree::create_node(uint32_t n, const 
partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices) + { + std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); + + if (root_idx.first < 0) + return -1; + + m_nodes.resize(m_nodes.size() + 1); + const uint32_t new_node_index = m_nodes.size_u32() - 1; + + m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first]; + m_nodes[new_node_index].m_point_index = root_idx.first; + m_nodes[new_node_index].m_dist = root_idx.second; + m_nodes[new_node_index].m_inner_node = -1; + m_nodes[new_node_index].m_outer_node = -1; + + uint_vec inner_list, outer_list; + + inner_list.reserve(pat_indices.size_u32() / 2); + outer_list.reserve(pat_indices.size_u32() / 2); + + for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++) + { + const uint32_t pat_index = pat_indices[pat_indices_iter]; + + if ((int)pat_index == root_idx.first) + continue; + + const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]); + + if (dist <= root_idx.second) + inner_list.push_back(pat_index); + else + outer_list.push_back(pat_index); + } + + if (inner_list.size()) + m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list); + + if (outer_list.size()) + m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list); + + return new_node_index; + } + + // returns the pattern index of the vantage point (-1 on error), and the optimal split distance + std::pair vp_tree::find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices) + { + BASISU_NOTE_UNUSED(num_unique_pats); + + const uint32_t n = pat_indices.size_u32(); + + assert(n); + if (n == 1) + return std::pair(pat_indices[0], 0.0f); + + float best_split_metric = -1.0f; + int best_split_pat = -1; + float best_split_dist = 0.0f; + float best_split_var = 0.0f; + + basisu::vector< std::pair > dists; + dists.reserve(n); + + float_vec 
float_dists; + float_dists.reserve(n); + + for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++) + { + const uint32_t split_pat_index = pat_indices[pat_indices_iter]; + assert(split_pat_index < num_unique_pats); + + const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index]; + + dists.resize(0); + float_dists.resize(0); + + for (uint32_t j = 0; j < n; j++) + { + const uint32_t pat_index = pat_indices[j]; + assert(pat_index < num_unique_pats); + + if (pat_index == split_pat_index) + continue; + + float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); + dists.emplace_back(std::pair(dist, pat_index)); + + float_dists.push_back(dist); + } + + stats s; + s.calc(float_dists.size_u32(), float_dists.data()); + + std::sort(dists.begin(), dists.end(), [](const auto& a, const auto& b) { + return a.first < b.first; + }); + + const uint32_t num_dists = dists.size_u32(); + float split_dist = dists[num_dists / 2].first; + if ((num_dists & 1) == 0) + split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f; + + uint32_t total_inner = 0, total_outer = 0; + + for (uint32_t j = 0; j < n; j++) + { + const uint32_t pat_index = pat_indices[j]; + if (pat_index == split_pat_index) + continue; + + float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); + + if (dist <= split_dist) + total_inner++; + else + total_outer++; + } + + float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer); + + if ((split_metric > best_split_metric) || + ((split_metric == best_split_metric) && (s.m_var > best_split_var))) + { + best_split_metric = split_metric; + best_split_dist = split_dist; + best_split_pat = split_pat_index; + best_split_var = (float)s.m_var; + } + } + + return std::pair(best_split_pat, best_split_dist); + } + + void partitions_data::init(uint32_t num_partitions, uint32_t block_width, uint32_t block_height, bool init_vp_tree) + { + assert((num_partitions >= 2) && 
(num_partitions <= 4)); + + //const uint32_t total_texels = block_width * block_height; + + m_width = block_width; + m_height = block_height; + m_num_partitions = num_partitions; + + m_part_vp_tree.clear(); + + for (uint32_t i = 0; i < 1024; i++) + { + m_part_seed_to_unique_index[i] = -1; + m_unique_index_to_part_seed[i] = -1; + } + + //const bool is_small_block = astc_helpers::is_small_block(block_width, block_height); + + partition_hash_map part_hash; + part_hash.reserve(1024); + m_total_unique_patterns = 0; + + clear_obj(m_partition_pat_histograms); + + for (uint32_t seed_index = 0; seed_index < astc_helpers::NUM_PARTITION_PATTERNS; seed_index++) + { + partition_pattern_vec pat; + uint32_t part_hist[4] = { 0 }; + + pat.init(block_width, block_height); + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + //const uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, m_num_partitions, is_small_block); + const uint8_t p = (uint8_t)astc_helpers::get_precomputed_texel_partition(block_width, block_height, seed_index, x, y, num_partitions); + + assert((p < m_num_partitions) && (p < 4)); + + pat(x, y) = p; + + part_hist[p]++; + } // x + } // y + + bool skip_pat = false; + for (uint32_t i = 0; i < m_num_partitions; i++) + { + if (!part_hist[i]) + { + skip_pat = true; + break; + } + } + if (skip_pat) + continue; + + partition_pattern_vec std_pat(pat.get_canonicalized()); + + if (part_hash.contains(std_pat)) + continue; + + if (num_partitions == 2) + { + assert(!part_hash.contains(pat)); + assert(!part_hash.contains(pat.get_permuted2(1))); + } + else if (num_partitions == 3) + { + for (uint32_t i = 0; i < partition_pattern_vec::cMaxPermute3Index; i++) + { + assert(!part_hash.contains(pat.get_permuted3(i))); + } + } + + for (uint32_t c = 0; c < 4; c++) + m_partition_pat_histograms[m_total_unique_patterns].m_hist[c] = (uint8_t)part_hist[c]; + + part_hash.insert(std_pat, std::make_pair(seed_index, 
m_total_unique_patterns)); + + m_part_seed_to_unique_index[seed_index] = (int16_t)m_total_unique_patterns; + m_unique_index_to_part_seed[m_total_unique_patterns] = (int16_t)seed_index; + + m_partition_pats[m_total_unique_patterns] = pat; + + m_total_unique_patterns++; + + } // seed_index + + if (init_vp_tree) + m_part_vp_tree.init(m_total_unique_patterns, m_partition_pats); + } + +} // namespace astc_ldr + +} // namespace basisu diff --git a/encoder/basisu_astc_ldr_common.h b/encoder/basisu_astc_ldr_common.h new file mode 100644 index 0000000..76e7e3f --- /dev/null +++ b/encoder/basisu_astc_ldr_common.h @@ -0,0 +1,445 @@ +// File: basisu_astc_ldr_common.h +#pragma once +#include "basisu_enc.h" +#include "basisu_gpu_texture.h" +#include + +namespace basisu +{ + +namespace astc_ldr +{ + const uint32_t ASTC_LDR_MAX_BLOCK_WIDTH = astc_helpers::MAX_BLOCK_DIM; // 12 + const uint32_t ASTC_LDR_MAX_BLOCK_HEIGHT = astc_helpers::MAX_BLOCK_DIM; // 12 + const uint32_t ASTC_LDR_MAX_BLOCK_PIXELS = astc_helpers::MAX_BLOCK_PIXELS; // 144 + const uint32_t ASTC_LDR_MAX_RAW_WEIGHTS = astc_helpers::MAX_WEIGHT_INTERPOLANT_VALUE + 1; // 65 + + const uint32_t WEIGHT_REFINER_MAX_PASSES = 17; + + inline basist::color_rgba convert_to_basist_color_rgba(const color_rgba& c) + { + return basist::color_rgba(c.r, c.g, c.b, c.a); + } + + struct cem_encode_params + { + uint32_t m_comp_weights[4]; + bool m_decode_mode_srgb; // todo: store astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8 instead, also the alpha mode for srgb because the decoders are broken + + const uint8_t* m_pForced_weight_vals0; + const uint8_t* m_pForced_weight_vals1; + + uint32_t m_max_ls_passes, m_total_weight_refine_passes; + bool m_worst_weight_nudging_flag; + bool m_endpoint_refinement_flag; + + cem_encode_params() + { + init(); + } + + void init() + { + m_comp_weights[0] = 1; + m_comp_weights[1] = 1; + m_comp_weights[2] = 1; + m_comp_weights[3] = 1; + + m_decode_mode_srgb = true; + + m_pForced_weight_vals0 = 
nullptr; + m_pForced_weight_vals1 = nullptr; + + m_max_ls_passes = 3; + m_total_weight_refine_passes = 0; + m_worst_weight_nudging_flag = false; + m_endpoint_refinement_flag = false; + } + + float get_total_comp_weights() const + { + return (float)(m_comp_weights[0] + m_comp_weights[1] + m_comp_weights[2] + m_comp_weights[3]); + } + }; + + struct pixel_stats_t + { + uint32_t m_num_pixels; + + color_rgba m_pixels[ASTC_LDR_MAX_BLOCK_PIXELS]; + vec4F m_pixels_f[ASTC_LDR_MAX_BLOCK_PIXELS]; + + color_rgba m_min, m_max; + + vec4F m_min_f, m_max_f; + vec4F m_mean_f; + + // Always 3D, ignoring alpha + vec3F m_mean_rel_axis3; + vec3F m_zero_rel_axis3; + + // Always 4D + vec4F m_mean_rel_axis4; + + bool m_has_alpha; + + stats m_rgba_stats[4]; + + void clear() + { + clear_obj(*this); + } + + void init(uint32_t num_pixels, const color_rgba* pPixels); + + }; // struct struct pixel_stats + + void global_init(); + + void bit_transfer_signed_enc(int& a, int& b); + void bit_transfer_signed_dec(int& a, int& b); // transfers MSB from a to b, a is then [-32,31] + color_rgba blue_contract_enc(color_rgba orig, bool& did_clamp, int encoded_b); + int quant_preserve2(uint32_t ise_range, uint32_t v); + + uint32_t get_colors(const color_rgba& l, const color_rgba& h, uint32_t weight_ise_index, color_rgba* pColors, bool decode_mode_srgb); + uint32_t get_colors_raw_weights(const color_rgba& l, const color_rgba& h, color_rgba* pColors, bool decode_mode_srgb); + void decode_endpoints_ise20(uint32_t cem_index, const uint8_t* pEndpoint_vals, color_rgba& l, color_rgba& h); // assume BISE 20 + void decode_endpoints(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba& l, color_rgba& h, float* pScale = nullptr); + uint32_t get_colors(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, uint32_t weight_ise_index, color_rgba* pColors, bool decode_mode_srgb); + uint32_t get_colors_raw_weights(uint32_t cem_index, const uint8_t* 
pEndpoint_vals, uint32_t endpoint_ise_index, color_rgba* pColors, bool decode_mode_srgb); + + //int apply_delta_to_bise_endpoint_val(uint32_t endpoint_ise_range, int ise_val, int delta); + int apply_delta_to_bise_weight_val(uint32_t weight_ise_range, int ise_val, int delta); + + uint64_t eval_solution( + const pixel_stats_t& pixel_stats, + uint32_t total_weights, const color_rgba* pWeight_colors, + uint8_t* pWeight_vals, uint32_t weight_ise_index, + const cem_encode_params& params); + + uint64_t eval_solution( + const pixel_stats_t& pixel_stats, + uint32_t cem_index, + const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, + uint8_t* pWeight_vals, uint32_t weight_ise_index, + const cem_encode_params& params); + + uint64_t eval_solution_dp( + uint32_t ccs_index, + const pixel_stats_t& pixel_stats, + uint32_t total_weights, const color_rgba* pWeight_colors, + uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint32_t weight_ise_index, + const cem_encode_params& params); + + uint64_t eval_solution_dp( + const pixel_stats_t& pixel_stats, + uint32_t cem_index, uint32_t ccs_index, + const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index, + uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint32_t weight_ise_index, + const cem_encode_params& params); + + //bool cem8_or_12_used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index); + //bool cem9_or_13_used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index); + //bool used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index); + + uint64_t cem_encode_pixels( + uint32_t cem_index, int ccs_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint8_t* pEndpoint_vals, uint8_t* pWeight_vals0, uint8_t* pWeight_vals1, uint64_t cur_blk_error, + bool use_blue_contraction, bool* pBase_ofs_clamped_flag); + + // 
TODO: Rename, confusing vs. std::vector or basisu::vector or vec4F etc. + struct partition_pattern_vec + { + uint32_t m_width, m_height; + uint8_t m_parts[ASTC_LDR_MAX_BLOCK_PIXELS]; + + partition_pattern_vec(); + + partition_pattern_vec(const partition_pattern_vec& other); + + partition_pattern_vec(uint32_t width, uint32_t height, const uint8_t* pParts = nullptr); + + void init(uint32_t width, uint32_t height, const uint8_t* pParts = nullptr); + + void init_part_hist(); + + void clear(); + + partition_pattern_vec& operator= (const partition_pattern_vec& rhs); + + uint32_t get_width() const { return m_width; } + uint32_t get_height() const { return m_height; } + uint32_t get_total() const { return m_width * m_height; } + + uint8_t operator[] (uint32_t i) const { assert(i < get_total()); return m_parts[i]; } + uint8_t& operator[] (uint32_t i) { assert(i < get_total()); return m_parts[i]; } + + uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < m_width) && (y < m_height)); return m_parts[x + y * m_width]; } + uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < m_width) && (y < m_height)); return m_parts[x + y * m_width]; } + + int get_squared_distance(const partition_pattern_vec& other) const; + + float get_distance(const partition_pattern_vec& other) const + { + return sqrtf((float)get_squared_distance(other)); + } + + enum { cMaxPermute2Index = 1 }; + partition_pattern_vec get_permuted2(uint32_t permute_index) const; + + enum { cMaxPermute3Index = 5 }; + partition_pattern_vec get_permuted3(uint32_t permute_index) const; + + partition_pattern_vec get_canonicalized() const; + + bool operator== (const partition_pattern_vec& rhs) const + { + if ((m_width != rhs.m_width) || (m_height != rhs.m_height)) + return false; + + return memcmp(m_parts, rhs.m_parts, get_total()) == 0; + } + + operator size_t() const + { + return basist::hash_hsieh(m_parts, get_total()); + } + }; + + struct vp_tree_node + { + partition_pattern_vec m_vantage_point; + uint32_t 
m_point_index; + float m_dist; + + int m_inner_node, m_outer_node; + }; + + const uint32_t NUM_PART3_MAPPINGS = 6; + extern uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3]; + + class vp_tree + { + public: + vp_tree() + { + } + + void clear() + { + m_nodes.clear(); + } + + // This requires no redundant patterns, i.e. all must be unique. + bool init(uint32_t n, const partition_pattern_vec* pUnique_pats); + + struct result + { + uint32_t m_pat_index; + uint32_t m_mapping_index; + float m_dist; + + bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; } + bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; } + }; + + class result_queue + { + enum { MaxSupportedSize = 512 + 1 }; + + public: + result_queue() : + m_cur_size(0) + { + } + + size_t get_size() const + { + return m_cur_size; + } + + bool empty() const + { + return !m_cur_size; + } + + typedef std::array result_array_type; + + const result_array_type& get_elements() const { return m_elements; } + result_array_type& get_elements() { return m_elements; } + + void clear() + { + m_cur_size = 0; + } + + void reserve(uint32_t n) + { + BASISU_NOTE_UNUSED(n); + } + + const result& top() const + { + assert(m_cur_size); + return m_elements[1]; + } + + bool insert(const result& val, uint32_t max_size) + { + assert(max_size < MaxSupportedSize); + + if (m_cur_size >= MaxSupportedSize) + return false; + + m_elements[++m_cur_size] = val; + up_heap(m_cur_size); + + if (m_cur_size > max_size) + pop(); + + return true; + } + + bool pop() + { + if (m_cur_size == 0) + return false; + + m_elements[1] = m_elements[m_cur_size--]; + down_heap(1); + return true; + } + + float get_highest_dist() const + { + if (!m_cur_size) + return 0.0f; + + return top().m_dist; + } + + private: + result_array_type m_elements; + size_t m_cur_size; + + void up_heap(size_t index) + { + while ((index > 1) && (m_elements[index] > m_elements[index >> 1])) + { + std::swap(m_elements[index], m_elements[index >> 1]); + index 
>>= 1; + } + } + + void down_heap(size_t index) + { + for (; ; ) + { + size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1; + + if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest])) + largest = left_child; + + if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest])) + largest = right_child; + + if (largest == index) + break; + + std::swap(m_elements[index], m_elements[largest]); + index = largest; + } + } + }; + + void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results) const; + + private: + basisu::vector m_nodes; + + void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) const; + + void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) const; + + // returns the index of the new node, or -1 on error + int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices); + + // returns the pattern index of the vantage point (-1 on error), and the optimal split distance + std::pair find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices); + }; + + typedef basisu::hash_map > partition_hash_map; + + struct partition_pattern_hist + { + uint8_t m_hist[4]; + + partition_pattern_hist() { clear(); } + + void clear() { clear_obj(m_hist); } + }; + + struct partitions_data + { + uint32_t m_width, m_height, m_num_partitions; + partition_pattern_vec m_partition_pats[astc_helpers::NUM_PARTITION_PATTERNS]; // indexed by unique index, NOT the 10-bit ASTC seed/pattern index + + partition_pattern_hist m_partition_pat_histograms[astc_helpers::NUM_PARTITION_PATTERNS]; // indexed by unique index, histograms of each pattern + 
+ // ASTC seed to unique index and vice versa + int16_t m_part_seed_to_unique_index[astc_helpers::NUM_PARTITION_PATTERNS]; + int16_t m_unique_index_to_part_seed[astc_helpers::NUM_PARTITION_PATTERNS]; + + // Total number of unique patterns + uint32_t m_total_unique_patterns; + + // VP tree used to rapidly find nearby/similar patterns. + vp_tree m_part_vp_tree; + + void init(uint32_t num_partitions, uint32_t block_width, uint32_t block_height, bool init_vp_tree = true); + }; + + float surrogate_quant_endpoint_val(float e, uint32_t num_endpoint_levels, uint32_t flags); + vec4F surrogate_quant_endpoint(const vec4F& e, uint32_t num_endpoint_levels, uint32_t flags); + + float surrogate_evaluate_rgba_sp(const pixel_stats_t& ps, const vec4F& l, const vec4F& h, float* pWeights0, uint32_t num_weight_levels, const cem_encode_params& enc_params, uint32_t flags); + float surrogate_evaluate_rgba_dp(uint32_t ccs_index, const pixel_stats_t& ps, const vec4F& l, const vec4F& h, float* pWeights0, float* pWeights1, uint32_t num_weight_levels, const cem_encode_params& enc_params, uint32_t flags); + + enum + { + cFlagDisableQuant = 1, + cFlagNoError = 2 + } + ; + float cem_surrogate_encode_pixels( + uint32_t cem_index, int ccs_index, + const pixel_stats_t& pixel_stats, const cem_encode_params& enc_params, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + vec4F& low_endpoint, vec4F& high_endpoint, float& s, float* pWeights0, float* pWeights1, uint32_t flags = 0); + +#if 0 + bool requantize_ise_endpoints(uint32_t cem, + uint32_t src_ise_endpoint_range, const uint8_t* pSrc_endpoints, + uint32_t dst_ise_endpoint_range, uint8_t* pDst_endpoints); + + uint32_t get_base_cem_without_alpha(uint32_t cem); + + bool pack_base_offset( + uint32_t cem_index, uint32_t dst_ise_endpoint_range, uint8_t* pPacked_endpoints, + const color_rgba& l, const color_rgba& h, + bool use_blue_contraction, bool auto_disable_blue_contraction_if_clamped, + bool& blue_contraction_clamped_flag, bool& 
base_ofs_clamped_flag, bool& endpoints_swapped); + + bool convert_endpoints_across_cems( + uint32_t prev_cem, uint32_t prev_endpoint_ise_range, const uint8_t* pPrev_endpoints, + uint32_t dst_cem, uint32_t dst_endpoint_ise_range, uint8_t* pDst_endpoints, + bool always_repack, + bool use_blue_contraction, bool auto_disable_blue_contraction_if_clamped, + bool& blue_contraction_clamped_flag, bool& base_ofs_clamped_flag); +#endif + +} // namespace astc_ldr + +} // namespace basisu diff --git a/encoder/basisu_astc_ldr_encode.cpp b/encoder/basisu_astc_ldr_encode.cpp new file mode 100644 index 0000000..1710b1b --- /dev/null +++ b/encoder/basisu_astc_ldr_encode.cpp @@ -0,0 +1,11065 @@ +// File: basisu_astc_ldr_encode.cpp +// Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "basisu_enc.h" +#include "basisu_astc_ldr_encode.h" +#include "basisu_astc_hdr_common.h" +#include "basisu_astc_ldr_common.h" +#include "3rdparty/android_astc_decomp.h" +#include + +#include "../zstd/zstd.h" + +namespace basisu { +namespace astc_ldr { + +const bool g_devel_messages = true; +const bool ASTC_LDR_CONSISTENCY_CHECKING = true; + +bool g_initialized; + +const uint32_t EXPECTED_SUPERBUCKET_HASH_SIZE = 8192; +const uint32_t EXPECTED_SHORTLIST_HASH_SIZE = 4096; + +const uint32_t MAX_BASE_PARTS2 = 128; +const uint32_t MAX_BASE_PARTS3 = 128; + +const uint32_t PART_ESTIMATE_STAGE1_MULTIPLIER = 4; + +const uint32_t MAX_WIDTH = 65535, MAX_HEIGHT = 65535; + +void code_block_weights( + basist::astc_ldr_t::grid_weight_dct &gw_dct, + float q, uint32_t plane_index, + const astc_helpers::log_astc_block& log_blk, + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data, + basisu::bitwise_coder& c, + basist::astc_ldr_t::dct_syms& syms) +{ + assert(q > 0.0f); + + syms.clear(); + + const uint32_t grid_width = log_blk.m_grid_width, grid_height = log_blk.m_grid_height; + const uint32_t total_grid_samples = grid_width * grid_height; + const uint32_t num_planes = log_blk.m_dual_plane ? 
2 : 1; + + //const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range).m_ISE_to_val; + //const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range).m_val_to_ise; + + uint8_t dequantized_raw_weights0[astc_helpers::MAX_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < grid_width * grid_height; i++) + dequantized_raw_weights0[i] = astc_helpers::g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range).m_ISE_to_val[log_blk.m_weights[i * num_planes + plane_index]]; + + auto grid_dim_vals_iter = gw_dct.m_grid_dim_key_vals.find(basist::astc_ldr_t::grid_dim_key(grid_width, grid_height)); + assert(grid_dim_vals_iter != gw_dct.m_grid_dim_key_vals.end()); + + auto& grid_dim_vals = grid_dim_vals_iter->second; + + float orig_weights[astc_helpers::MAX_BLOCK_PIXELS]; + float weight_sum = 0; + for (uint32_t y = 0; y < grid_height; y++) + { + for (uint32_t x = 0; x < grid_width; x++) + { + orig_weights[x + y * grid_width] = dequantized_raw_weights0[x + y * grid_width]; + weight_sum += orig_weights[x + y * grid_width]; + } + } + + float scaled_weight_coding_scale = basist::astc_ldr_t::SCALED_WEIGHT_BASE_CODING_SCALE; + if (log_blk.m_weight_ise_range <= astc_helpers::BISE_8_LEVELS) + scaled_weight_coding_scale = 1.0f / 8.0f; + + float scaled_mean_weight = std::round((float)scaled_weight_coding_scale * (weight_sum / total_grid_samples)); + scaled_mean_weight = basisu::clamp(scaled_mean_weight, 0.0f, 64.0f * (float)scaled_weight_coding_scale); + + float mean_weight = scaled_mean_weight / (float)scaled_weight_coding_scale; + + for (uint32_t y = 0; y < grid_height; y++) + for (uint32_t x = 0; x < grid_width; x++) + orig_weights[x + y * grid_width] -= mean_weight; + + const float span_len = gw_dct.get_max_span_len(log_blk, plane_index); + + float dct_weights[astc_helpers::MAX_BLOCK_PIXELS]; + + // TODO - temp alloc + basist::astc_ldr_t::fvec dct_work; + grid_dim_vals.m_dct.forward(orig_weights, dct_weights, 
dct_work); + + const float level_scale = gw_dct.compute_level_scale(q, span_len, pGrid_data->m_weight_gamma, grid_width, grid_height, log_blk.m_weight_ise_range); + + int dct_quant_tab[astc_helpers::MAX_BLOCK_PIXELS]; + gw_dct.compute_quant_table(q, grid_width, grid_height, level_scale, dct_quant_tab); + +#if defined(DEBUG) || defined(_DEBUG) + // sanity checking + basist::astc_ldr_t::sample_quant_table_state quant_state; + quant_state.init(q, gw_dct.m_block_width, gw_dct.m_block_height, level_scale); +#endif + + c.put_truncated_binary((int)scaled_mean_weight, (uint32_t)(64.0f * scaled_weight_coding_scale) + 1); + + syms.m_dc_sym = (int)scaled_mean_weight; + syms.m_num_dc_levels = (uint32_t)(64.0f * scaled_weight_coding_scale) + 1; + assert(syms.m_num_dc_levels == gw_dct.get_num_weight_dc_levels(log_blk.m_weight_ise_range)); + + int dct_coeffs[astc_helpers::MAX_BLOCK_PIXELS]; + + for (uint32_t y = 0; y < grid_height; y++) + { + for (uint32_t x = 0; x < grid_width; x++) + { + if (!x && !y) + { + dct_coeffs[0] = 0; + continue; + } + + const int levels = dct_quant_tab[x + y * grid_width]; + +#if defined(DEBUG) || defined(_DEBUG) + // sanity checking + assert(levels == gw_dct.sample_quant_table(quant_state, x, y)); +#endif + + float d = dct_weights[x + y * grid_width]; + + int id = gw_dct.quantize_deadzone(d, levels, basist::astc_ldr_t::DEADZONE_ALPHA, x, y); + + dct_coeffs[x + y * grid_width] = id; + + } // x + + } // y + + const basisu::int_vec& zigzag = grid_dim_vals.m_zigzag; + assert(zigzag.size() == total_grid_samples); + + int total_zeros = 0; + for (uint32_t i = 0; i < total_grid_samples; i++) + { + uint32_t dct_idx = zigzag[i]; + if (!dct_idx) + continue; + + int coeff = dct_coeffs[dct_idx]; + if (!coeff) + { + total_zeros++; + continue; + } + + basist::astc_ldr_t::dct_syms::coeff cf; + cf.m_num_zeros = basisu::safe_cast_uint16(total_zeros); + cf.m_coeff = basisu::safe_cast_int16(coeff); + syms.m_coeffs.push_back(cf); + syms.m_max_coeff_mag = 
basisu::maximum(syms.m_max_coeff_mag, basisu::iabs(coeff)); + syms.m_max_zigzag_index = basisu::maximum(syms.m_max_zigzag_index, i); + + c.put_rice(total_zeros, gw_dct.m_zero_run); + total_zeros = 0; + + c.put_bits(coeff < 0 ? 1 : 0, 1); + + if (coeff < 0) + coeff = -coeff; + + c.put_rice(coeff, gw_dct.m_coeff); + } + + if (total_zeros) + { + basist::astc_ldr_t::dct_syms::coeff cf; + cf.m_num_zeros = basisu::safe_cast_uint16(total_zeros); + cf.m_coeff = INT16_MAX; + syms.m_coeffs.push_back(cf); + + c.put_rice(total_zeros, gw_dct.m_zero_run); + } +} + +void astc_ldr_requantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_ise_vals, uint32_t to_ise_range) +{ + if (from_ise_range == to_ise_range) + { + if (pDst_ise_vals != pSrc_ise_vals) + memcpy(pDst_ise_vals, pSrc_ise_vals, n); + return; + } + + // from/to BISE ranges not equal + if (from_ise_range == astc_helpers::BISE_64_LEVELS) + { + // from [0,64] + const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(to_ise_range).m_val_to_ise; + + for (uint32_t i = 0; i < n; i++) + pDst_ise_vals[i] = quant_tab[pSrc_ise_vals[i]]; + } + else if (to_ise_range == astc_helpers::BISE_64_LEVELS) + { + // to [0,64] + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val; + + for (uint32_t i = 0; i < n; i++) + pDst_ise_vals[i] = dequant_tab[pSrc_ise_vals[i]]; + } + else + { + // from/to any other + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val; + const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(to_ise_range).m_val_to_ise; + + for (uint32_t i = 0; i < n; i++) + pDst_ise_vals[i] = quant_tab[dequant_tab[pSrc_ise_vals[i]]]; + } +} + +void astc_ldr_downsample_ise_weights( + uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range, + uint32_t block_w, uint32_t block_h, + uint32_t grid_w, uint32_t grid_h, + const uint8_t* pSrc_weights, uint8_t* 
pDst_weights, + const float* pDownsample_matrix) +{ + assert((block_w <= astc_ldr::ASTC_LDR_MAX_BLOCK_WIDTH) && (block_h <= astc_ldr::ASTC_LDR_MAX_BLOCK_HEIGHT)); + assert((grid_w >= 2) && (grid_w <= block_w)); + assert((grid_h >= 2) && (grid_h <= block_h)); + + assert(((dequant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (dequant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || + (dequant_weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + assert(((quant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (quant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) || + (quant_weight_ise_range == astc_helpers::BISE_64_LEVELS)); + + assert(pDownsample_matrix); + + if ((block_w == grid_w) && (block_h == grid_h)) + { + if (dequant_weight_ise_range != quant_weight_ise_range) + { + astc_ldr_requantize_astc_weights(block_w * block_h, pSrc_weights, dequant_weight_ise_range, pDst_weights, quant_weight_ise_range); + } + else + { + if (pDst_weights != pSrc_weights) + memcpy(pDst_weights, pSrc_weights, block_w * block_h); + } + + return; + } + + uint8_t desired_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + if (dequant_weight_ise_range == astc_helpers::BISE_64_LEVELS) + { + memcpy(desired_weights, pSrc_weights, block_w * block_h); + } + else + { + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(dequant_weight_ise_range).m_ISE_to_val; + + for (uint32_t by = 0; by < block_h; by++) + for (uint32_t bx = 0; bx < block_w; bx++) + desired_weights[bx + by * block_w] = dequant_tab[pSrc_weights[bx + by * block_w]]; + } + + if (quant_weight_ise_range == astc_helpers::BISE_64_LEVELS) + { + downsample_weight_grid( + pDownsample_matrix, + block_w, block_h, // source/from dimension (block size) + grid_w, grid_h, // dest/to dimension (grid size) + desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + pDst_weights); // [wy][wx] + } + else + { + uint8_t 
raw_downsampled_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + downsample_weight_grid( + pDownsample_matrix, + block_w, block_h, // source/from dimension (block size) + grid_w, grid_h, // dest/to dimension (grid size) + desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + raw_downsampled_weights); // [wy][wx] + + const auto& weight_quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(quant_weight_ise_range).m_val_to_ise; + + for (uint32_t gy = 0; gy < grid_h; gy++) + for (uint32_t gx = 0; gx < grid_w; gx++) + pDst_weights[gx + gy * grid_w] = weight_quant_tab[raw_downsampled_weights[gx + gy * grid_w]]; + } +} + +void downsample_weight_residual_grid( + const float* pMatrix_weights, + uint32_t bx, uint32_t by, // source/from dimension (block size) + uint32_t wx, uint32_t wy, // dest/to dimension (grid size) + const int* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + float* pDst_weights) // [wy][wx] +{ + const uint32_t total_block_samples = bx * by; + + for (uint32_t y = 0; y < wy; y++) + { + for (uint32_t x = 0; x < wx; x++) + { + float total = 0.0f; + + for (uint32_t i = 0; i < total_block_samples; i++) + if (pMatrix_weights[i]) + total += pMatrix_weights[i] * (float)pSrc_weights[i]; + + pDst_weights[x + y * wx] = total; + + pMatrix_weights += total_block_samples; + } + } +} + +void downsample_weightsf( + const float* pMatrix_weights, + uint32_t bx, uint32_t by, // source/from dimension (block size) + uint32_t wx, uint32_t wy, // dest/to dimension (grid size) + const float* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + float* pDst_weights) // [wy][wx] +{ + const uint32_t total_block_samples = bx * by; + + for (uint32_t y = 0; y < wy; y++) + { + for (uint32_t x = 0; x < wx; x++) + { + float total = 0.0f; + + for (uint32_t i = 0; i < total_block_samples; i++) + if (pMatrix_weights[i]) + total += pMatrix_weights[i] * pSrc_weights[i]; + + pDst_weights[x + y * wx] = total; + + 
pMatrix_weights += total_block_samples; + } + } +} + +static inline uint32_t weighted_color_error(const color_rgba& a, const color_rgba& b, const astc_ldr::cem_encode_params& p) +{ + uint32_t total_e = 0; + for (uint32_t c = 0; c < 4; c++) + { + int av = a[c]; + int bv = b[c]; + int ev = av - bv; + total_e += (uint32_t)(ev * ev) * p.m_comp_weights[c]; + } + + return total_e; +} + +uint64_t eval_error( + uint32_t block_width, uint32_t block_height, + const astc_helpers::log_astc_block& enc_log_block, + const astc_ldr::pixel_stats_t& pixel_stats, + const astc_ldr::cem_encode_params& params) +{ + color_rgba dec_block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + bool status = astc_helpers::decode_block_xuastc_ldr(enc_log_block, dec_block_pixels, block_width, block_height, params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status) + { + // Shouldn't ever happen + assert(0); + return UINT64_MAX; + } + +#if defined(_DEBUG) || defined(DEBUG) + // Sanity check vs. unoptimized decoder + color_rgba dec_block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + bool alt_status = astc_helpers::decode_block(enc_log_block, dec_block_pixels_alt, block_width, block_height, params.m_decode_mode_srgb ? 
astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
	if (!alt_status)
	{
		// Shouldn't ever happen
		assert(0);
		return UINT64_MAX;
	}

	// Both decoders must produce byte-identical texels for the block.
	if (memcmp(dec_block_pixels, dec_block_pixels_alt, sizeof(color_rgba) * block_width * block_height) != 0)
	{
		// Very bad
		assert(0);
		return UINT64_MAX;
	}
#endif

	// Accumulate the per-texel weighted squared error against the source pixels.
	uint64_t total_err = 0;

	const uint32_t total_block_pixels = block_width * block_height;
	for (uint32_t i = 0; i < total_block_pixels; i++)
		total_err += weighted_color_error(dec_block_pixels[i], pixel_stats.m_pixels[i], params);

	return total_err;
}

// Overload taking loose encoding parameters instead of a logical block.
// Builds a temporary single-partition log_astc_block from the supplied CEM,
// ISE ranges, grid size, endpoint values and weight grid(s), decodes it with
// astc_helpers::decode_block() and returns the summed weighted_color_error()
// against pixel_stats.m_pixels. Returns UINT64_MAX if the decode fails.
// When dual_plane_flag is set, ccs_index must be in [0,3] and both weight
// grids are interleaved into the block; otherwise ccs_index must be negative
// and only pWeight_grid_vals0 is read.
uint64_t eval_error(
	uint32_t block_width, uint32_t block_height,
	const astc_ldr::pixel_stats_t& pixel_stats,
	uint32_t cem_index,
	bool dual_plane_flag, int ccs_index,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	uint32_t grid_width, uint32_t grid_height,
	const uint8_t* pEndpoint_vals, const uint8_t* pWeight_grid_vals0, const uint8_t* pWeight_grid_vals1,
	const astc_ldr::cem_encode_params& params)
{
	const uint32_t total_block_pixels = block_width * block_height;
	const uint32_t total_grid_pixels = grid_width * grid_height;

	astc_helpers::log_astc_block enc_log_block;

	enc_log_block.clear();
	enc_log_block.m_grid_width = (uint8_t)grid_width;
	enc_log_block.m_grid_height = (uint8_t)grid_height;
	enc_log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
	enc_log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
	enc_log_block.m_color_endpoint_modes[0] = (uint8_t)cem_index;
	enc_log_block.m_num_partitions = 1;

	memcpy(enc_log_block.m_endpoints, pEndpoint_vals, astc_helpers::get_num_cem_values(cem_index));

	if (dual_plane_flag)
	{
		assert((ccs_index >= 0) && (ccs_index <= 3));

		enc_log_block.m_dual_plane = true;
		enc_log_block.m_color_component_selector = (uint8_t)ccs_index;

		// Dual plane: the two planes' weights are stored interleaved (plane 0 at even, plane 1 at odd indices).
		for (uint32_t i = 0; i < total_grid_pixels; i++)
		{
			enc_log_block.m_weights[i * 2 + 0] = pWeight_grid_vals0[i];
			enc_log_block.m_weights[i * 2
+ 1] = pWeight_grid_vals1[i]; + } + } + else + { + assert(ccs_index < 0); + + memcpy(enc_log_block.m_weights, pWeight_grid_vals0, total_grid_pixels); + } + + color_rgba decoded_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + bool status = astc_helpers::decode_block(enc_log_block, decoded_pixels, block_width, block_height, params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + assert(status); + + if (!status) + return UINT64_MAX; + + uint64_t total_err = 0; + + for (uint32_t i = 0; i < total_block_pixels; i++) + total_err += weighted_color_error(pixel_stats.m_pixels[i], decoded_pixels[i], params); + + return total_err; +} + +float compute_psnr_from_wsse(uint32_t block_width, uint32_t block_height, uint64_t sse, float total_comp_weights) +{ + const uint32_t total_block_pixels = block_width * block_height; + const float wmse = (float)sse / (total_comp_weights * (float)total_block_pixels); + const float wpsnr = (wmse > 1e-5f) ? (20.0f * log10f(255.0f / sqrtf(wmse))) : 10000.0f; + return wpsnr; +} + +// quantized coordinate descent (QCD), quadratic objective +namespace qcd +{ + struct qcd_min_solver + { + // geometry / sizes + int m_N = 0; // texels + int m_K = 0; // controls + int m_Q = 0; // label count + + // inputs (not owned), (N x K) row-major + const float* m_pU = nullptr; // grid to texel upsample matrix + + // cached + float_vec m_ucols; // N*K, column k at &m_ucols[k*m_N] + float_vec m_alpha; // K, ||u_k||^2 (>= eps) + float_vec m_labels; // Q, sorted unique u-labels (ints in [0..64]), ASTC raw [0,64] weights + + bool m_ready_flag = false; + + // init: cache columns, norms, and label set + bool init(const float* pU_rowmajor, int N, int K, const int* pLabels_u, int Q) + { + if ((!pU_rowmajor) || (!pLabels_u) || (N <= 0) || (K <= 0) || (Q <= 0)) + return false; + + m_pU = pU_rowmajor; + m_N = N; + m_K = K; + m_Q = Q; + + // cache columns + m_ucols.assign(size_t(N) * K, 0.0f); + + for (int k = 0; k < K; ++k) + { + float* 
pDst = &m_ucols[size_t(k) * size_t(N)]; + const float* pSrc = m_pU + k; // first element of column k + for (int t = 0; t < N; ++t) + pDst[t] = pSrc[size_t(t) * size_t(K)]; + } + + // column norms + m_alpha.resize(K); + + for (int k = 0; k < K; ++k) + { + const float* pUK = &m_ucols[size_t(k) * size_t(N)]; + + float a = 0.0f; + for (int t = 0; t < N; ++t) + a += pUK[t] * pUK[t]; + + if (!(a > 0.0f)) + a = 1e-8f; + + m_alpha[k] = a; + } + + m_labels.assign(pLabels_u, pLabels_u + Q); + +#if defined(_DEBUG) || defined(DEBUG) + for (size_t i = 1; i < m_labels.size(); ++i) + { + assert(m_labels[i] > m_labels[i - 1]); // strictly increasing + assert((m_labels[i] >= 0) && (m_labels[i] <= 64)); + } +#endif + + m_Q = (int)m_labels.size(); + if (m_Q <= 0) + return false; + + m_ready_flag = true; + return true; + } + + // compute residual r = U*g - w* (uses label IDs -> u-values) + void build_residual(const int* pG_idx, const float* pW_star, float* pR_out) const + { + assert(m_ready_flag && pG_idx && pW_star && pR_out); + + // r = sum_k (u_label[pG_idx[k]] * ucol_k) - pW_star + std::fill(pR_out, pR_out + m_N, 0.0f); + + for (int k = 0; k < m_K; ++k) + { + const float* pUK = &m_ucols[size_t(k) * size_t(m_N)]; + const float s = m_labels[pG_idx[k]]; + + for (int t = 0; t < m_N; ++t) + pR_out[t] += s * pUK[t]; + } + + for (int t = 0; t < m_N; ++t) + pR_out[t] -= pW_star[t]; + } + + // one QCD sweep: returns num moves accepted (strict dE < -eps) + int sweep(int* pG_idx, float* pR_io, float accept_eps = 1e-6f) const + { + assert(m_ready_flag && pG_idx && pR_io); + int num_moved = 0; + + for (int k = 0; k < m_K; ++k) + { + const float* pUK = &m_ucols[size_t(k) * size_t(m_N)]; + + // beta = + float beta = 0.0f; + for (int t = 0; t < m_N; ++t) + beta += pR_io[t] * pUK[t]; + + const float a = m_alpha[k]; // >= 1e-8 + + const float cur_u = m_labels[pG_idx[k]]; + const float s_star = cur_u - beta / a; // continuous minimizer (u-domain) + + // nearest label index to s_star (binary search) 
+ const int j0 = nearest_label_idx(s_star); + + const int cand[3] = + { + j0, + (j0 + 1 < m_Q) ? (j0 + 1) : j0, + (j0 - 1 >= 0) ? (j0 - 1) : j0 + }; + + int best_j = pG_idx[k]; + float best_dE = 0.0f; + + for (int c = 0; c < 3; ++c) + { + const int j = cand[c]; + if (j == pG_idx[k]) + continue; + + const float s = m_labels[j]; + const float d = s - cur_u; // u-change at coord k + const float dE = 2.0f * d * beta + d * d * a; // exact delta E + + if ((best_j == pG_idx[k]) || (dE < best_dE)) + { + best_dE = dE; + best_j = j; + } + } + + if ((best_j != pG_idx[k]) && (best_dE < -accept_eps)) + { + // commit: update residual and label ID + const float d = m_labels[best_j] - cur_u; + + for (int t = 0; t < m_N; ++t) + pR_io[t] += d * pUK[t]; + + pG_idx[k] = best_j; + ++num_moved; + } + } // k + + return num_moved; + } + + // utility: energy from residual (sum r^2) + float residual_energy(const float* pR) const + { + assert(pR); + + float E = 0.0f; + for (int t = 0; t < m_N; ++t) + E += pR[t] * pR[t]; + + return E; + } + + private: + // nearest label index by u-value (handles non-uniform spacing) + int nearest_label_idx(float x) const + { + const int Q = m_Q; + + if (Q <= 1) + return 0; + if (x <= m_labels.front()) + return 0; + if (x >= m_labels.back()) + return Q - 1; + + int lo = 0, hi = Q - 1; + while (hi - lo > 1) + { + const int mid = (lo + hi) >> 1; + (x >= m_labels[mid]) ? lo = mid : hi = mid; + } + + const float dlo = std::fabs(x - m_labels[lo]); + const float dhi = std::fabs(x - m_labels[hi]); + return (dlo <= dhi) ? 
lo : hi; + } + }; + +} // namespace qcd + +// 1-3 subsets, requires initial weights +bool polish_block_weights( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixel_stats, + astc_helpers::log_astc_block& enc_log_block, // assumes there is already a good encoding to improve here + const astc_ldr::cem_encode_params& params, + const astc_ldr::partition_pattern_vec* pPat, + bool& improved_flag, + bool gradient_descent_flag, bool polish_weights_flag, bool qcd_enabled_flag) +{ + improved_flag = false; + + if (!gradient_descent_flag && !polish_weights_flag && !qcd_enabled_flag) + return true; + + const uint32_t grid_width = enc_log_block.m_grid_width, grid_height = enc_log_block.m_grid_height; + const uint32_t cem_index = enc_log_block.m_color_endpoint_modes[0]; + const uint32_t num_subsets = enc_log_block.m_num_partitions; + const bool dual_plane_flag = enc_log_block.m_dual_plane; + //const uint32_t num_planes = dual_plane_flag ? 2 : 1; + const int ccs_index = dual_plane_flag ? 
enc_log_block.m_color_component_selector : -1; + + const uint32_t endpoint_ise_range = enc_log_block.m_endpoint_ise_range; + const uint32_t weight_ise_range = enc_log_block.m_weight_ise_range; + + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val; + const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_val_to_ise; + + //const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height); + +#if defined(_DEBUG) || defined(DEBUG) + if (num_subsets > 1) + { + for (uint32_t i = 1; i < num_subsets; i++) + { + assert(enc_log_block.m_color_endpoint_modes[i] == cem_index); + } + } +#endif + + //const astc_block_grid_data* pBlock_grid_data = find_astc_block_grid_data(block_width, block_height, grid_width, grid_height); + + const uint32_t total_block_pixels = block_width * block_height; + const uint32_t total_grid_pixels = grid_width * grid_height; + + uint64_t cur_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params); + + uint8_t weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + astc_helpers::extract_weights(enc_log_block, weights0, 0); + + if (dual_plane_flag) + astc_helpers::extract_weights(enc_log_block, weights1, 1); + + const bool global_gradient_desc_enabled = true; + const bool global_qcd_enabled = true; + const bool global_polish_weights_enabled = true; + + const uint32_t NUM_WEIGHT_POLISH_PASSES = 1; + + // Gradient descent + if ((gradient_descent_flag) && (global_gradient_desc_enabled)) + { + // Downsample the residuals to grid res + vector2D upsample_matrix; + compute_upsample_matrix(upsample_matrix, block_width, block_height, grid_width, grid_height); + + // First compute the block's ideal raw weights given the current endpoints at full block/texel res + // TODO: Move to helper + uint8_t ideal_block_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], 
ideal_block_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + if (num_subsets == 1) + { + if (dual_plane_flag) + astc_ldr::eval_solution_dp(pixel_stats, cem_index, ccs_index, enc_log_block.m_endpoints, endpoint_ise_range, ideal_block_raw_weights0, ideal_block_raw_weights1, astc_helpers::BISE_64_LEVELS, params); + else + astc_ldr::eval_solution(pixel_stats, cem_index, enc_log_block.m_endpoints, endpoint_ise_range, ideal_block_raw_weights0, astc_helpers::BISE_64_LEVELS, params); + } + else + { + // Extract each subset's texels, compute the raw weights, place back into full res texel/block weight grid. + color_rgba part_pixels[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint32_t num_part_pixels[astc_helpers::MAX_PARTITIONS] = { 0 }; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba& px = pixel_stats.m_pixels[x + y * block_width]; + + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_subsets); + + // Sanity check + assert(part_index == (uint32_t)astc_helpers::compute_texel_partition(enc_log_block.m_partition_id, x, y, 0, num_subsets, astc_helpers::is_small_block(block_width, block_height))); + + part_pixels[part_index][num_part_pixels[part_index]] = px; + num_part_pixels[part_index]++; + } // x + } // y + + astc_ldr::pixel_stats_t part_pixel_stats[astc_helpers::MAX_PARTITIONS]; + + for (uint32_t i = 0; i < num_subsets; i++) + part_pixel_stats[i].clear(); + + uint8_t part_raw_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + for (uint32_t part_index = 0; part_index < num_subsets; part_index++) + { + part_pixel_stats[part_index].init(num_part_pixels[part_index], &part_pixels[part_index][0]); + + const uint8_t* pPart_endpoints = astc_helpers::get_endpoints(enc_log_block, part_index); + + astc_ldr::eval_solution(part_pixel_stats[part_index], cem_index, pPart_endpoints, endpoint_ise_range, &part_raw_weights[part_index][0], 
astc_helpers::BISE_64_LEVELS, params); + + } // part_index + + clear_obj(num_part_pixels); + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_subsets); + + ideal_block_raw_weights0[x + y * block_width] = part_raw_weights[part_index][num_part_pixels[part_index]]; + num_part_pixels[part_index]++; + } // x + } // y + } + +#if 1 + // Now compute the current block/texel res (upsampled) raw [0,64] weights given the current quantized grid weights. Dequant then upsample. + // This is what an ASTC decoder would use during unpacking. + uint8_t dequantized_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], dequantized_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t dequantized_block_weights_upsampled0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], dequantized_block_weights_upsampled1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + astc_ldr_requantize_astc_weights(total_grid_pixels, weights0, weight_ise_range, dequantized_grid_weights0, astc_helpers::BISE_64_LEVELS); + + if (dual_plane_flag) + astc_ldr_requantize_astc_weights(total_grid_pixels, weights1, weight_ise_range, dequantized_grid_weights1, astc_helpers::BISE_64_LEVELS); + + astc_helpers::upsample_weight_grid( + block_width, block_height, // destination/to dimension + grid_width, grid_height, // source/from dimension + dequantized_grid_weights0, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx] + dequantized_block_weights_upsampled0); // [by][bx] + + if (dual_plane_flag) + { + astc_helpers::upsample_weight_grid( + block_width, block_height, // destination/to dimension + grid_width, grid_height, // source/from dimension + dequantized_grid_weights1, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx] + dequantized_block_weights_upsampled1); // [by][bx] + } + + // Now compute residuals at the block res + int 
weight_block_raw_residuals0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], weight_block_raw_residuals1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < total_block_pixels; i++) + weight_block_raw_residuals0[i] = ideal_block_raw_weights0[i] - dequantized_block_weights_upsampled0[i]; + + if (dual_plane_flag) + { + for (uint32_t i = 0; i < total_block_pixels; i++) + weight_block_raw_residuals1[i] = ideal_block_raw_weights1[i] - dequantized_block_weights_upsampled1[i]; + } + + float weight_grid_residuals_downsampled0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], weight_grid_residuals_downsampled1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + basisu::vector unweighted_downsample_matrix; + + // TODO: precompute, store in weight grid data + compute_upsample_matrix_transposed(unweighted_downsample_matrix, block_width, block_height, grid_width, grid_height); + + basisu::vector diag_AtA(total_grid_pixels); + compute_diag_AtA_vector(block_width, block_height, grid_width, grid_height, upsample_matrix, diag_AtA.get_ptr()); + + downsample_weight_residual_grid( + unweighted_downsample_matrix.get_ptr(), + block_width, block_height, // source/from dimension (block size) + grid_width, grid_height, // dest/to dimension (grid size) + weight_block_raw_residuals0, // these are dequantized weights, NOT ISE symbols, [by][bx] + weight_grid_residuals_downsampled0); // [wy][wx] + + for (uint32_t i = 0; i < total_grid_pixels; i++) + weight_grid_residuals_downsampled0[i] /= diag_AtA[i]; + + if (dual_plane_flag) + { + downsample_weight_residual_grid( + unweighted_downsample_matrix.get_ptr(), + block_width, block_height, // source/from dimension (block size) + grid_width, grid_height, // dest/to dimension (grid size) + weight_block_raw_residuals1, // these are dequantized weights, NOT ISE symbols, [by][bx] + weight_grid_residuals_downsampled1); // [wy][wx] + + for (uint32_t i = 0; i < total_grid_pixels; i++) + weight_grid_residuals_downsampled1[i] /= diag_AtA[i]; + } + + // Apply the residuals at 
grid res and quantize + const float Q = 1.0f; + + uint8_t refined_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], refined_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < total_grid_pixels; i++) + { + float v = (float)dequant_tab[weights0[i]] + weight_grid_residuals_downsampled0[i] * Q; + int iv = clamp((int)std::roundf(v), 0, 64); + refined_grid_weights0[i] = quant_tab[iv]; + } + + if (dual_plane_flag) + { + for (uint32_t i = 0; i < total_grid_pixels; i++) + { + float v = (float)dequant_tab[weights1[i]] + weight_grid_residuals_downsampled1[i] * Q; + int iv = clamp((int)std::roundf(v), 0, 64); + refined_grid_weights1[i] = quant_tab[iv]; + } + } +#else + uint8_t refined_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], refined_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < total_grid_pixels; i++) + refined_grid_weights0[i] = weights0[i]; + + if (dual_plane_flag) + { + for (uint32_t i = 0; i < total_grid_pixels; i++) + refined_grid_weights1[i] = weights1[i]; + } +#endif + + astc_helpers::log_astc_block refined_log_block(enc_log_block); + + // TODO: This refines both weight planes simultanously, probably not optimal, could do individually. + astc_helpers::set_weights(refined_log_block, refined_grid_weights0, 0); + + if (dual_plane_flag) + astc_helpers::set_weights(refined_log_block, refined_grid_weights1, 1); + + uint64_t refined_err = eval_error(block_width, block_height, refined_log_block, pixel_stats, params); + + if (refined_err < cur_err) + { + cur_err = refined_err; + + memcpy(weights0, refined_grid_weights0, total_grid_pixels); + + if (dual_plane_flag) + memcpy(weights1, refined_grid_weights1, total_grid_pixels); + + improved_flag = true; + } + + // QCD - not a huge boost (.05-.75 dB), but on the toughest blocks it does help. 
+ if ((qcd_enabled_flag) && (global_qcd_enabled)) + { + float ideal_block_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], ideal_block_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + for (uint32_t i = 0; i < total_block_pixels; i++) + { + ideal_block_weights0[i] = (float)ideal_block_raw_weights0[i]; + + if (dual_plane_flag) + ideal_block_weights1[i] = (float)ideal_block_raw_weights1[i]; + } + + const float* pUpsample_matrix = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height)->m_upsample_matrix.get_ptr(); + + qcd::qcd_min_solver solver; + + const uint32_t num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range); + + assert(num_weight_levels <= 32); + int labels[32 + 1]; + + for (uint32_t i = 0; i < num_weight_levels; i++) + labels[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).get_rank_to_val(i); + + solver.init(pUpsample_matrix, total_block_pixels, total_grid_pixels, labels, num_weight_levels); + + int grid_idx0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], grid_idx1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_rank; + + for (uint32_t i = 0; i < total_grid_pixels; i++) + { + grid_idx0[i] = ise_to_rank[refined_grid_weights0[i]]; + + if (dual_plane_flag) + grid_idx1[i] = ise_to_rank[refined_grid_weights1[i]]; + } + + float resid0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], resid1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + solver.build_residual(grid_idx0, ideal_block_weights0, resid0); + + const uint32_t MAX_QCD_SWEEPS = 5; + for (uint32_t t = 0; t < MAX_QCD_SWEEPS; t++) + { + int moved0 = solver.sweep(grid_idx0, resid0); + if (!moved0) + break; + } + + if (dual_plane_flag) + { + solver.build_residual(grid_idx1, ideal_block_weights1, resid1); + + for (uint32_t t = 0; t < MAX_QCD_SWEEPS; t++) + { + int moved1 = solver.sweep(grid_idx1, resid1); + if (!moved1) + break; + } + } + + const auto& rank_to_ise = 
astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_rank_to_ISE; + + for (uint32_t i = 0; i < total_grid_pixels; i++) + { + refined_grid_weights0[i] = rank_to_ise[grid_idx0[i]]; + + if (dual_plane_flag) + refined_grid_weights1[i] = rank_to_ise[grid_idx1[i]]; + } + + refined_log_block = enc_log_block; + + astc_helpers::set_weights(refined_log_block, refined_grid_weights0, 0); + + if (dual_plane_flag) + astc_helpers::set_weights(refined_log_block, refined_grid_weights1, 1); + + refined_err = eval_error(block_width, block_height, refined_log_block, pixel_stats, params); + + if (refined_err < cur_err) + { + cur_err = refined_err; + + memcpy(weights0, refined_grid_weights0, total_grid_pixels); + + if (dual_plane_flag) + memcpy(weights1, refined_grid_weights1, total_grid_pixels); + + improved_flag = true; + } + } + } // if (qcd_enabled) + + if ((polish_weights_flag) && (global_polish_weights_enabled)) + { + // Final, expensive, weight polish. Much can be done to improve this, but it's hopefully not ran much in the first place. + // TODO: The dB gain from this is large, must optimize. + for (uint32_t polish_pass = 0; polish_pass < NUM_WEIGHT_POLISH_PASSES; polish_pass++) + { + for (uint32_t y = 0; y < grid_height; y++) + { + for (uint32_t x = 0; x < grid_width; x++) + { + for (uint32_t plane_iter = 0; plane_iter < (dual_plane_flag ? 
2u : 1u); plane_iter++) + { + uint8_t base_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], base_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + memcpy(base_grid_weights0, weights0, total_grid_pixels); + if (dual_plane_flag) + memcpy(base_grid_weights1, weights1, total_grid_pixels); + + for (int delta = -1; delta <= 1; delta += 2) + { + uint8_t trial_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], trial_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + memcpy(trial_grid_weights0, base_grid_weights0, total_grid_pixels); + + if (dual_plane_flag) + memcpy(trial_grid_weights1, base_grid_weights1, total_grid_pixels); + + if (plane_iter == 0) + trial_grid_weights0[x + y * grid_width] = (uint8_t)astc_ldr::apply_delta_to_bise_weight_val(weight_ise_range, base_grid_weights0[x + y * grid_width], delta); + else + trial_grid_weights1[x + y * grid_width] = (uint8_t)astc_ldr::apply_delta_to_bise_weight_val(weight_ise_range, base_grid_weights1[x + y * grid_width], delta); + + astc_helpers::log_astc_block trial_log_block(enc_log_block); + + astc_helpers::set_weights(trial_log_block, trial_grid_weights0, 0); + + if (dual_plane_flag) + astc_helpers::set_weights(trial_log_block, trial_grid_weights1, 1); + + uint64_t trial_err = eval_error(block_width, block_height, trial_log_block, pixel_stats, params); + + if (trial_err < cur_err) + { + cur_err = trial_err; + + memcpy(weights0, trial_grid_weights0, total_grid_pixels); + + if (dual_plane_flag) + memcpy(weights1, trial_grid_weights1, total_grid_pixels); + + improved_flag = true; + } + + } // delta + + } // plane_iter + + } // x + } // y + + } // polish_pass + + } // polish_flag + + astc_helpers::log_astc_block new_log_block(enc_log_block); + + astc_helpers::set_weights(new_log_block, weights0, 0); + + if (dual_plane_flag) + astc_helpers::set_weights(new_log_block, weights1, 1); + +#if defined(_DEBUG) || defined(DEBUG) + uint64_t new_err = eval_error(block_width, block_height, new_log_block, pixel_stats, params); + 
+ assert(cur_err == new_err); + + if (improved_flag) + { + uint64_t orig_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params); + + assert(new_err < orig_err); + } +#endif + + enc_log_block = new_log_block; + + return true; +} + +bool encode_trial_subsets( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixel_stats, + uint32_t cem_index, uint32_t num_parts, + uint32_t pat_seed_index, const astc_ldr::partition_pattern_vec* pPat, // seed index is a ASTC partition pattern index + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint32_t grid_width, uint32_t grid_height, + astc_helpers::log_astc_block& enc_log_block, + const astc_ldr::cem_encode_params& params, + bool refine_only_flag = false, + bool gradient_descent_flag = true, bool polish_weights_flag = true, bool qcd_enabled_flag = true, + bool use_blue_contraction = true, + bool* pBase_ofs_clamped_flag = nullptr) +{ + assert((num_parts >= 2) && (num_parts <= astc_helpers::MAX_PARTITIONS)); + assert(pPat); + assert(pat_seed_index < astc_helpers::NUM_PARTITION_PATTERNS); + + if (pBase_ofs_clamped_flag) + *pBase_ofs_clamped_flag = false; + + const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height); + //const uint32_t total_block_pixels = block_width * block_height; + const uint32_t total_grid_pixels = grid_width * grid_height; + + color_rgba part_pixels[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint32_t num_part_pixels[astc_helpers::MAX_PARTITIONS] = { 0 }; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba& px = pixel_stats.m_pixels[x + y * block_width]; + + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_parts); + + part_pixels[part_index][num_part_pixels[part_index]] = px; + num_part_pixels[part_index]++; + } // x + } // y + +#if defined(_DEBUG) || defined(DEBUG) + for (uint32_t i = 0; i < num_parts; 
i++)
		assert(num_part_pixels[i]);
#endif

	astc_ldr::pixel_stats_t part_pixel_stats[astc_helpers::MAX_PARTITIONS];

	for (uint32_t i = 0; i < num_parts; i++)
		part_pixel_stats[i].clear();

	uint8_t part_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS];
	uint8_t part_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];

	for (uint32_t part_index = 0; part_index < num_parts; part_index++)
	{
		part_pixel_stats[part_index].init(num_part_pixels[part_index], &part_pixels[part_index][0]);

		// In refine-only mode the per-subset stats are still built, but the initial encode is skipped
		// and the caller's enc_log_block is used as the starting point.
		if (!refine_only_flag)
		{
			bool base_ofs_clamped_flag = false;

			// Encode at block res, but with quantized weights
			uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, -1, part_pixel_stats[part_index], params,
				endpoint_ise_range, weight_ise_range,
				&part_endpoints[part_index][0], &part_weights[part_index][0], nullptr, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);

			if (block_err == UINT64_MAX)
				return false;

			if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
				*pBase_ofs_clamped_flag = true;
		}

	} // part_index

	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);

	if (!refine_only_flag)
	{
		uint8_t block_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];

		// Reuse the per-subset counters as read cursors while scattering the per-subset
		// weights back into block raster order.
		clear_obj(num_part_pixels);

		for (uint32_t y = 0; y < block_height; y++)
		{
			for (uint32_t x = 0; x < block_width; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				block_weights[x + y * block_width] = part_weights[part_index][num_part_pixels[part_index]];
				num_part_pixels[part_index]++;
			} // x
		} // y

		enc_log_block.clear();

		enc_log_block.m_grid_width = (uint8_t)grid_width;
		enc_log_block.m_grid_height = (uint8_t)grid_height;
		enc_log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
		enc_log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;

		// All subsets share the same CEM.
		enc_log_block.m_num_partitions = (uint8_t)num_parts;
		for (uint32_t i = 0; i < num_parts; i++)
			enc_log_block.m_color_endpoint_modes[i] = (uint8_t)cem_index;
		enc_log_block.m_partition_id = (uint16_t)pat_seed_index;

		if (is_downsampling)
		{
			// TODO: Make the downsample step faster
			const float* pDownsample_matrix = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height)->m_downsample_matrix.get_ptr();

			// Now downsample the weight grid (quantized to quantized)
			astc_ldr_downsample_ise_weights(
				weight_ise_range, weight_ise_range,
				block_width, block_height,
				grid_width, grid_height,
				block_weights, enc_log_block.m_weights,
				pDownsample_matrix);
		}
		else
		{
			// Grid matches the block: the block-res weights are the grid weights.
			memcpy(enc_log_block.m_weights, block_weights, total_grid_pixels);
		}

		// Endpoints are stored back to back, num_endpoint_vals per subset.
		for (uint32_t p = 0; p < num_parts; p++)
			memcpy(enc_log_block.m_endpoints + num_endpoint_vals * p, &part_endpoints[p][0], num_endpoint_vals);
	}

	// attempt endpoint refinement given the current weights
	// TODO: Expose to caller
	const uint32_t NUM_REFINEMENT_PASSES = 3;
	for (uint32_t refine_pass = 0; refine_pass < NUM_REFINEMENT_PASSES; refine_pass++)
	{
		uint8_t dequantized_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
		uint8_t upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE

		// Dequantize the current grid weights and upsample them to block res.
		for (uint32_t i = 0; i < total_grid_pixels; i++)
			dequantized_raw_weights0[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[enc_log_block.m_weights[i]];

		astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights0, upsampled_weights0);

		// Candidate block: same weights, endpoints re-solved below.
		astc_helpers::log_astc_block alt_enc_log_block(enc_log_block);

		uint8_t raw_part_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];

		clear_obj(num_part_pixels);

		// Gather each subset's upsampled weights in the same raster order as its pixels.
		for (uint32_t y = 0; y < block_height; y++)
		{
			for (uint32_t x = 0; x < block_width; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				raw_part_weights[part_index][num_part_pixels[part_index]] = upsampled_weights0[x + y * block_width];
				num_part_pixels[part_index]++;
			} // x
		} // y

		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
		{
			assert(num_part_pixels[part_index] == part_pixel_stats[part_index].m_num_pixels);

			astc_ldr::cem_encode_params temp_params(params);
			temp_params.m_pForced_weight_vals0 = &raw_part_weights[part_index][0];

			uint8_t temp_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];

			bool base_ofs_clamped_flag = false;

			// Re-solve this subset's endpoints with the weights forced to the upsampled values gathered above
			// (weight range BISE_64_LEVELS); the new endpoints go straight into the candidate block.
			uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, -1, part_pixel_stats[part_index], temp_params,
				endpoint_ise_range, astc_helpers::BISE_64_LEVELS,
				&alt_enc_log_block.m_endpoints[num_endpoint_vals * part_index], temp_weights, nullptr, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);

			if (block_err == UINT64_MAX)
				return false;

			if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
				*pBase_ofs_clamped_flag = true;

#if defined(_DEBUG) || defined(DEBUG)
			// The encoder must have returned exactly the forced weights.
			for (uint32_t i = 0; i < part_pixel_stats[part_index].m_num_pixels; i++)
			{
				assert(temp_weights[i] == temp_params.m_pForced_weight_vals0[i]);
			}
#endif

		} // part_index

		uint64_t cur_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params);
		uint64_t ref_err = eval_error(block_width, block_height, alt_enc_log_block, pixel_stats, params);

		// Adopt the re-solved endpoints only if they strictly lower the error.
		if (ref_err < cur_err)
		{
			memcpy(&enc_log_block, &alt_enc_log_block, sizeof(astc_helpers::log_astc_block));
		}

		// No weight polish after the final pass.
		if (refine_pass == (NUM_REFINEMENT_PASSES - 1))
			break;

		// Weight polishing is only attempted for downsampled grids. qcd_enabled_flag alone does not
		// enable this branch; it is only forwarded to polish_block_weights().
		if ((is_downsampling) && (gradient_descent_flag || polish_weights_flag))
		{
			bool improved_flag = false;
			bool status = polish_block_weights(block_width, block_height, pixel_stats, enc_log_block, params, pPat, improved_flag, gradient_descent_flag, polish_weights_flag, qcd_enabled_flag);
			if (!status)
			{
				assert(0);
			}

			// Weights unchanged, so another endpoint pass would see the same input.
			if (!improved_flag)
				break;
		}
		else
		{
			break;
		}
	} // refine_pass

	return true;
}

bool encode_trial(
	uint32_t block_width, uint32_t block_height,
	const astc_ldr::pixel_stats_t& pixel_stats,
	uint32_t cem_index,
	bool dual_plane_flag, int ccs_index,
	uint32_t endpoint_ise_range, uint32_t weight_ise_range,
	uint32_t grid_width, uint32_t grid_height,
	astc_helpers::log_astc_block& enc_log_block,
	const astc_ldr::cem_encode_params& params,
	bool gradient_descent_flag = true, bool polish_weights_flag = true, bool qcd_enabled_flag = true,
	bool use_blue_contraction = true,
	bool* pBase_ofs_clamped_flag = nullptr)
{
	assert(dual_plane_flag || (ccs_index == -1));

	if (pBase_ofs_clamped_flag)
		*pBase_ofs_clamped_flag = false;

	const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);

	const basist::astc_ldr_t::astc_block_grid_data* pBlock_grid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height);

	const float* pDownsample_matrix = nullptr;
	if (is_downsampling)
		pDownsample_matrix = pBlock_grid_data->m_downsample_matrix.get_ptr();

	//const uint32_t total_block_pixels = block_width * block_height;
	const uint32_t total_grid_pixels = grid_width * grid_height;

	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val;
	//const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_val_to_ise;

	enc_log_block.clear();

	enc_log_block.m_grid_width = (uint8_t)grid_width;
	enc_log_block.m_grid_height = (uint8_t)grid_height;
	enc_log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
	enc_log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;

	enc_log_block.m_dual_plane = dual_plane_flag;
	if (dual_plane_flag)
	{
		assert((ccs_index >= 0) && (ccs_index <= 3));
		enc_log_block.m_color_component_selector = (uint8_t)ccs_index;
	}
	else
	{
assert(ccs_index == -1); + } + + enc_log_block.m_num_partitions = 1; + enc_log_block.m_color_endpoint_modes[0] = (uint8_t)cem_index; + + uint8_t fullres_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; + uint8_t weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + if ((grid_width == block_width) && (grid_height == block_height)) + { + bool base_ofs_clamped_flag = false; + + uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, params, + endpoint_ise_range, weight_ise_range, + fullres_endpoints, weights0, weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag); + + if (block_err == UINT64_MAX) + return false; + + if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag)) + *pBase_ofs_clamped_flag = base_ofs_clamped_flag; + + if (dual_plane_flag) + { + for (uint32_t i = 0; i < total_grid_pixels; i++) + { + enc_log_block.m_weights[i * 2 + 0] = weights0[i]; + enc_log_block.m_weights[i * 2 + 1] = weights1[i]; + } + } + else + { + memcpy(enc_log_block.m_weights, weights0, total_grid_pixels); + } + + memcpy(enc_log_block.m_endpoints, fullres_endpoints, astc_helpers::get_num_cem_values(cem_index)); + + return true; + } + + // Handle downsampled weight grids case + + uint8_t fullres_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t fullres_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + bool base_ofs_clamped_flag = false; + + // Encode at block res, but with quantized weights + uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, params, + endpoint_ise_range, weight_ise_range, + fullres_endpoints, fullres_raw_weights0, fullres_raw_weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag); + + if (block_err == UINT64_MAX) + return false; + + if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag)) + *pBase_ofs_clamped_flag = base_ofs_clamped_flag; + + // Now downsample the weight grid (quantized to quantized) + 
astc_ldr_downsample_ise_weights( + weight_ise_range, weight_ise_range, + block_width, block_height, + grid_width, grid_height, + fullres_raw_weights0, weights0, + pDownsample_matrix); + + astc_helpers::set_weights(enc_log_block, weights0, 0); + + if (dual_plane_flag) + { + astc_ldr_downsample_ise_weights( + weight_ise_range, weight_ise_range, + block_width, block_height, + grid_width, grid_height, + fullres_raw_weights1, weights1, + pDownsample_matrix); + } + + if (dual_plane_flag) + astc_helpers::set_weights(enc_log_block, weights1, 1); + + memcpy(enc_log_block.m_endpoints, fullres_endpoints, astc_helpers::get_num_cem_values(cem_index)); + + // TODO: Expose to caller + const uint32_t NUM_OUTER_PASSES = 3; + for (uint32_t outer_pass = 0; outer_pass < NUM_OUTER_PASSES; outer_pass++) + { + // endpoint refinement, given current upsampled weights + { + astc_helpers::extract_weights(enc_log_block, weights0, 0); + + if (dual_plane_flag) + astc_helpers::extract_weights(enc_log_block, weights1, 1); + + // Plane 0 + uint8_t dequantized_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE + + for (uint32_t i = 0; i < total_grid_pixels; i++) + dequantized_raw_weights0[i] = dequant_tab[weights0[i]]; + + astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights0, upsampled_weights0); + + // Plane 1 + uint8_t dequantized_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t upsampled_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE + + if (dual_plane_flag) + { + for (uint32_t i = 0; i < total_grid_pixels; i++) + dequantized_raw_weights1[i] = dequant_tab[weights1[i]]; + astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights1, upsampled_weights1); + } + + // Jam in the weights to the actual raw [0,64] weights the decoder is going to use after upsampling the 
grid. + astc_ldr::cem_encode_params refine_params(params); + refine_params.m_pForced_weight_vals0 = upsampled_weights0; + if (dual_plane_flag) + refine_params.m_pForced_weight_vals1 = upsampled_weights1; + + uint8_t refined_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; + uint8_t refined_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t refined_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + uint64_t refined_block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, refine_params, + endpoint_ise_range, astc_helpers::BISE_64_LEVELS, + refined_endpoints, refined_weights0, refined_weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag); + assert(refined_block_err != UINT64_MAX); + + if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag)) + *pBase_ofs_clamped_flag = base_ofs_clamped_flag; + + if (refined_block_err != UINT64_MAX) + { + uint64_t cur_err = eval_error( + block_width, block_height, + pixel_stats, + cem_index, + dual_plane_flag, ccs_index, + endpoint_ise_range, weight_ise_range, + grid_width, grid_height, + enc_log_block.m_endpoints, weights0, weights1, + params); + + if (refined_block_err < cur_err) + { + memcpy(enc_log_block.m_endpoints, refined_endpoints, astc_helpers::get_num_cem_values(cem_index)); + } + } + } + + if (outer_pass == (NUM_OUTER_PASSES - 1)) + break; + + if ((!gradient_descent_flag) && (!polish_weights_flag)) + break; + + bool improved_flag = false; + + bool status = polish_block_weights( + block_width, block_height, + pixel_stats, + enc_log_block, // assumes there is already a good encoding to improve here + params, + nullptr, + improved_flag, + gradient_descent_flag, + polish_weights_flag, + qcd_enabled_flag); + + if (!status) + { + assert(0); + return false; + } + + if (!improved_flag) + break; + + } // outer_pass + + return true; +} + +// 1 part only, refines endpoints given current weights +bool encode_trial_refine_only( + uint32_t block_width, uint32_t block_height, + const 
astc_ldr::pixel_stats_t& pixel_stats, + astc_helpers::log_astc_block& enc_log_block, + const astc_ldr::cem_encode_params& params, + bool use_blue_contraction = true, + bool* pBase_ofs_clamped_flag = nullptr) +{ + assert(enc_log_block.m_num_partitions == 1); + + if (pBase_ofs_clamped_flag) + *pBase_ofs_clamped_flag = false; + + const uint32_t cem_index = enc_log_block.m_color_endpoint_modes[0]; + const bool dual_plane_flag = enc_log_block.m_dual_plane; + const int ccs_index = dual_plane_flag ? enc_log_block.m_color_component_selector : -1; + const uint32_t endpoint_ise_range = enc_log_block.m_endpoint_ise_range; + const uint32_t weight_ise_range = enc_log_block.m_weight_ise_range; + const uint32_t grid_width = enc_log_block.m_grid_width; + const uint32_t grid_height = enc_log_block.m_grid_height; + + //const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height); + + //const uint32_t total_block_pixels = block_width * block_height; + const uint32_t total_grid_pixels = grid_width * grid_height; + + uint8_t dequantized_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE + + for (uint32_t i = 0; i < total_grid_pixels; i++) + dequantized_raw_weights0[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[astc_helpers::get_weight(enc_log_block, 0, i)]; + + // suppress bogus gcc warning on dequantized_raw_weights0 +#ifndef __clang__ +#if defined(__GNUC__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif +#endif + + astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights0, upsampled_weights0); + +#ifndef __clang__ +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif +#endif + + uint8_t dequantized_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t upsampled_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, 
NOT ISE + + if (dual_plane_flag) + { + for (uint32_t i = 0; i < total_grid_pixels; i++) + dequantized_raw_weights1[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[astc_helpers::get_weight(enc_log_block, 1, i)]; + astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights1, upsampled_weights1); + } + + astc_ldr::cem_encode_params refine_params(params); + refine_params.m_pForced_weight_vals0 = upsampled_weights0; + if (dual_plane_flag) + refine_params.m_pForced_weight_vals1 = upsampled_weights1; + + uint8_t refined_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS]; + uint8_t refined_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint8_t refined_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + //bool use_blue_contraction = true; + + bool base_ofs_clamped_flag = false; + + uint64_t refined_block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, refine_params, + endpoint_ise_range, astc_helpers::BISE_64_LEVELS, + refined_endpoints, refined_weights0, refined_weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag); + assert(refined_block_err != UINT64_MAX); + + if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag)) + *pBase_ofs_clamped_flag = base_ofs_clamped_flag; + +#if defined(_DEBUG) || defined(DEBUG) + for (uint32_t i = 0; i < total_grid_pixels; i++) + { + assert(refined_weights0[i] == upsampled_weights0[i]); + + if (dual_plane_flag) + { + assert(refined_weights1[i] == upsampled_weights1[i]); + } + } +#endif + + if (refined_block_err != UINT64_MAX) + { + astc_helpers::log_astc_block alt_enc_log_block(enc_log_block); + memcpy(alt_enc_log_block.m_endpoints, refined_endpoints, astc_helpers::get_num_cem_values(cem_index)); + +#if defined(_DEBUG) || defined(DEBUG) + // refined_block_err was computed on the actual ASTC [0,64] upsampled weights the decoder would use. But double check this for sanity. 
+ { + uint64_t ref_err = eval_error(block_width, block_height, alt_enc_log_block, pixel_stats, params); + assert(ref_err == refined_block_err); + } +#endif + + uint64_t cur_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params); + + if (refined_block_err < cur_err) + { + memcpy(enc_log_block.m_endpoints, refined_endpoints, astc_helpers::get_num_cem_values(cem_index)); + } + } + + return true; +} + +struct log_surrogate_astc_blk +{ + int m_grid_width, m_grid_height; + + uint32_t m_cem_index; // base+scale or direct variants only + int m_ccs_index; // -1 for single plane + + uint32_t m_num_endpoint_levels; + uint32_t m_num_weight_levels; + + uint32_t m_num_parts; // 1-3 + uint32_t m_seed_index; // ASTC seed index, 10-bits if m_num_parts > 1 + + vec4F m_endpoints[astc_helpers::MAX_PARTITIONS][2]; // [subset_index][l/h endpoint] + float m_scales[astc_helpers::MAX_PARTITIONS]; // scale factor used for each subset + + float m_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + float m_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + void clear() + { + memset((void *)this, 0, sizeof(*this)); + } + + void decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partition_pattern_vec* pPat) const; + void decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partitions_data* pPat_data) const; +}; + +void upsample_surrogate_weights( + const astc_helpers::weighted_sample* pWeighted_samples, + const float* pSrc_weights, + float* pDst_weights, + uint32_t by, uint32_t bx, + uint32_t wx, uint32_t wy, + uint32_t num_weight_levels) +{ + const uint32_t total_src_weights = wx * wy; + const float weight_levels_minus_1 = (float)(num_weight_levels - 1) * (1.0f / 16.0f); + const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1); + + const astc_helpers::weighted_sample* pS = pWeighted_samples; + + for (uint32_t y = 0; y < by; y++) + { + for (uint32_t x = 0; x < bx; x++, ++pS) + { + const uint32_t 
w00 = pS->m_weights[0][0]; + const uint32_t w01 = pS->m_weights[0][1]; + const uint32_t w10 = pS->m_weights[1][0]; + const uint32_t w11 = pS->m_weights[1][1]; + + assert(w00 || w01 || w10 || w11); + + const uint32_t sx = pS->m_src_x, sy = pS->m_src_y; + + float total = 0.0f; + + if (w00) total += pSrc_weights[bounds_check(sx + sy * wx, 0U, total_src_weights)] * (float)w00; + if (w01) total += pSrc_weights[bounds_check(sx + 1 + sy * wx, 0U, total_src_weights)] * (float)w01; + if (w10) total += pSrc_weights[bounds_check(sx + (sy + 1) * wx, 0U, total_src_weights)] * (float)w10; + if (w11) total += pSrc_weights[bounds_check(sx + 1 + (sy + 1) * wx, 0U, total_src_weights)] * (float)w11; + + float w = (float)fast_roundf_pos_int(total * weight_levels_minus_1) * inv_weight_levels; + + pDst_weights[x + y * bx] = w; + } // x + } // y +} + +void log_surrogate_astc_blk::decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partition_pattern_vec* pPat) const +{ + const bool dual_plane = (m_ccs_index >= 0); + + const uint32_t total_block_pixels = block_width * block_height; + const uint32_t total_grid_pixels = m_grid_width * m_grid_height; + + const bool needs_upsampling = total_grid_pixels < total_block_pixels; + + const bool is_small_block = total_block_pixels < 31; // astc_helpers::is_small_block(block_width, block_height); + BASISU_NOTE_UNUSED(is_small_block); + + float upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], upsampled_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + const float* pWeights0 = m_weights0; + const float* pWeights1 = m_weights1; + + if (needs_upsampling) + { + // TODO: Precompute these in tables + astc_helpers::weighted_sample up_weights[astc_helpers::MAX_BLOCK_DIM * astc_helpers::MAX_BLOCK_DIM]; + astc_helpers::compute_upsample_weights(block_width, block_height, m_grid_width, m_grid_height, up_weights); + + upsample_surrogate_weights(up_weights, m_weights0, upsampled_weights0, block_width, block_height, 
m_grid_width, m_grid_height, m_num_weight_levels); + pWeights0 = upsampled_weights0; + + if (dual_plane) + { + upsample_surrogate_weights(up_weights, m_weights1, upsampled_weights1, block_width, block_height, m_grid_width, m_grid_height, m_num_weight_levels); + pWeights1 = upsampled_weights1; + } + } + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + uint32_t part_index = 0; + if (m_num_parts > 1) + { + part_index = (*pPat)(x, y); + assert(part_index < m_num_parts); + + assert(part_index == (uint32_t)astc_helpers::compute_texel_partition(m_seed_index, x, y, 0, m_num_parts, is_small_block)); + } + + const vec4F& l = m_endpoints[part_index][0]; + const vec4F& h = m_endpoints[part_index][1]; + + vec4F& dst = pPixels[x + y * block_width]; + + for (uint32_t c = 0; c < 4; c++) + { + float w = ((int)c == m_ccs_index) ? pWeights1[x + y * block_width] : pWeights0[x + y * block_width]; + + //dst[c] = lerp(l[c], h[c], w); + + const float one_minus_w = 1.0f - w; + dst[c] = l[c] * one_minus_w + h[c] * w; + } // c + + } // x + } // y +} + +void log_surrogate_astc_blk::decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partitions_data* pPat_data) const +{ + if (m_num_parts == 1) + return decode(block_width, block_height, pPixels, (const astc_ldr::partition_pattern_vec*)nullptr); + + uint32_t unique_pat_index = pPat_data->m_part_seed_to_unique_index[m_seed_index]; + assert(unique_pat_index < pPat_data->m_total_unique_patterns); + + return decode(block_width, block_height, pPixels, &pPat_data->m_partition_pats[unique_pat_index]); +} + +void downsample_float_weight_grid( + const float* pMatrix_weights, + uint32_t bx, uint32_t by, // source/from dimension (block size) + uint32_t wx, uint32_t wy, // dest/to dimension (grid size) + const float* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + float* pDst_weights, // [wy][wx] + uint32_t num_weight_levels) +{ + const uint32_t 
total_block_samples = bx * by; + const float weight_levels_minus_1 = (float)(num_weight_levels - 1); + const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1); + + for (uint32_t y = 0; y < wy; y++) + { + for (uint32_t x = 0; x < wx; x++) + { + float total = 0.0f; + + // TODO - optimize! + for (uint32_t i = 0; i < total_block_samples; i++) + if (pMatrix_weights[i]) + total += pMatrix_weights[i] * (float)pSrc_weights[i]; + + pDst_weights[x + y * wx] = (float)fast_roundf_pos_int(total * weight_levels_minus_1) * inv_weight_levels; + + pMatrix_weights += total_block_samples; + } + } +} + +float decode_surrogate_and_compute_error( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixel_stats, + log_surrogate_astc_blk& log_block, + const astc_ldr::partition_pattern_vec* pPat, + const astc_ldr::cem_encode_params& params) +{ + vec4F dec_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + log_block.decode(block_width, block_height, dec_pixels, pPat); + + const float wr = (float)params.m_comp_weights[0]; + const float wg = (float)params.m_comp_weights[1]; + const float wb = (float)params.m_comp_weights[2]; + const float wa = (float)params.m_comp_weights[3]; + + float total_err = 0.0f; + for (uint32_t by = 0; by < block_height; by++) + { + for (uint32_t bx = 0; bx < block_width; bx++) + { + const vec4F& s = pixel_stats.m_pixels_f[bx + by * block_width]; + const vec4F& d = dec_pixels[bx + by * block_width]; + + float dr = s[0] - d[0]; + float dg = s[1] - d[1]; + float db = s[2] - d[2]; + float da = s[3] - d[3]; + + total_err += (wr * dr * dr) + (wg * dg * dg) + (wb * db * db) + (wa * da * da); + } // bx + + } // by + + return total_err; +} + +// Returns WSSE error +float encode_surrogate_trial( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixel_stats, + uint32_t cem_index, + int ccs_index, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint32_t grid_width, uint32_t grid_height, + 
log_surrogate_astc_blk& log_block, + const astc_ldr::cem_encode_params& params, + uint32_t flags) +{ + const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height); + const bool dual_plane_flag = (ccs_index >= 0); + + const basist::astc_ldr_t::astc_block_grid_data* pBlock_grid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height); + + const float* pDownsample_matrix = nullptr; + if (is_downsampling) + pDownsample_matrix = pBlock_grid_data->m_downsample_matrix.get_ptr(); + + //const uint32_t total_block_pixels = block_width * block_height; + //const uint32_t total_grid_pixels = grid_width * grid_height; + + log_block.m_cem_index = cem_index; + log_block.m_ccs_index = ccs_index; + log_block.m_grid_width = grid_width; + log_block.m_grid_height = grid_height; + log_block.m_num_parts = 1; + log_block.m_seed_index = 0; + clear_obj(log_block.m_scales); + log_block.m_num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + log_block.m_num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range); + + float wsse_err = 0.0f; + + if (is_downsampling) + { + float temp_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], temp_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + astc_ldr::cem_surrogate_encode_pixels( + cem_index, ccs_index, + pixel_stats, params, + endpoint_ise_range, weight_ise_range, + log_block.m_endpoints[0][0], log_block.m_endpoints[0][1], log_block.m_scales[0], temp_weights0, temp_weights1, + flags); + + downsample_float_weight_grid( + pDownsample_matrix, + block_width, block_height, + grid_width, grid_height, + temp_weights0, + log_block.m_weights0, + log_block.m_num_weight_levels); + + if (dual_plane_flag) + { + downsample_float_weight_grid( + pDownsample_matrix, + block_width, block_height, + grid_width, grid_height, + temp_weights1, + log_block.m_weights1, + log_block.m_num_weight_levels); + } + + wsse_err = decode_surrogate_and_compute_error(block_width, 
block_height, pixel_stats, log_block, nullptr, params); + } + else + { + wsse_err = astc_ldr::cem_surrogate_encode_pixels( + cem_index, ccs_index, + pixel_stats, params, + endpoint_ise_range, weight_ise_range, + log_block.m_endpoints[0][0], log_block.m_endpoints[0][1], log_block.m_scales[0], log_block.m_weights0, log_block.m_weights1, + flags); + +#if defined(_DEBUG) || defined(DEBUG) + { + float alt_wsse_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, nullptr, params); + assert(fabs(wsse_err - alt_wsse_err) < .00125f); + } +#endif + } + + return wsse_err; +} + +float encode_surrogate_trial_subsets( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixel_stats, + uint32_t cem_index, + uint32_t num_subsets, uint32_t pat_seed_index, const astc_ldr::partition_pattern_vec* pPat, + uint32_t endpoint_ise_range, uint32_t weight_ise_range, + uint32_t grid_width, uint32_t grid_height, + log_surrogate_astc_blk& log_block, + const astc_ldr::cem_encode_params& params, + uint32_t flags) +{ + assert((num_subsets >= 2) && (num_subsets <= astc_helpers::MAX_PARTITIONS)); + + const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height); + //const uint32_t total_block_pixels = block_width * block_height; + //const uint32_t total_grid_pixels = grid_width * grid_height; + + const uint32_t num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range); + const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range); + + const basist::astc_ldr_t::astc_block_grid_data* pBlock_grid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height); + + const float* pDownsample_matrix = nullptr; + if (is_downsampling) + pDownsample_matrix = pBlock_grid_data->m_downsample_matrix.get_ptr(); + + color_rgba part_pixels[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint32_t 
num_part_pixels[astc_helpers::MAX_PARTITIONS] = { 0 }; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba& px = pixel_stats.m_pixels[x + y * block_width]; + + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_subsets); + + part_pixels[part_index][num_part_pixels[part_index]] = px; + num_part_pixels[part_index]++; + } // x + } // y + +#if defined(_DEBUG) || defined(DEBUG) + for (uint32_t i = 0; i < num_subsets; i++) + assert(num_part_pixels[i] > 0); +#endif + + astc_ldr::pixel_stats_t part_pixel_stats[astc_helpers::MAX_PARTITIONS]; + + for (uint32_t i = 0; i < num_subsets; i++) + part_pixel_stats[i].clear(); + + float part_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + float temp_block_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + double total_subset_err = 0.0f; + for (uint32_t part_index = 0; part_index < num_subsets; part_index++) + { + part_pixel_stats[part_index].init(num_part_pixels[part_index], &part_pixels[part_index][0]); + + float subset_err = astc_ldr::cem_surrogate_encode_pixels( + cem_index, -1, + part_pixel_stats[part_index], params, + endpoint_ise_range, weight_ise_range, + log_block.m_endpoints[part_index][0], log_block.m_endpoints[part_index][1], + log_block.m_scales[part_index], part_weights[part_index], temp_block_weights, + flags); + + total_subset_err += subset_err; + + } // part_index + + float* pDst_weights = is_downsampling ? 
temp_block_weights : log_block.m_weights0; + + clear_obj(num_part_pixels); + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_subsets); + + pDst_weights[x + y * block_width] = part_weights[part_index][num_part_pixels[part_index]]; + num_part_pixels[part_index]++; + } // x + } // y + + log_block.m_cem_index = cem_index; + log_block.m_ccs_index = -1; + log_block.m_num_endpoint_levels = num_endpoint_levels; + log_block.m_num_weight_levels = num_weight_levels; + log_block.m_grid_width = grid_width; + log_block.m_grid_height = grid_height; + log_block.m_num_parts = num_subsets; + log_block.m_seed_index = pat_seed_index; + + if (is_downsampling) + { + downsample_float_weight_grid( + pDownsample_matrix, + block_width, block_height, + grid_width, grid_height, + temp_block_weights, + log_block.m_weights0, + astc_helpers::get_ise_levels(weight_ise_range)); + + total_subset_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, pPat, params); + } + +#if defined(_DEBUG) || defined(DEBUG) + if (!is_downsampling) + { + float alt_subset_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, pPat, params); + + assert(fabs(total_subset_err - alt_subset_err) < .00125f); + } +#endif + + return (float)total_subset_err; +} + +#if 0 +static inline vec4F vec4F_norm_approx(vec4F axis) +{ + float l = axis.norm(); + axis = (fabs(l) >= SMALL_FLOAT_VAL) ? 
(axis * bu_math::inv_sqrt(l)) : vec4F(.5f); + return axis; +} +#endif + +static bool estimate_partition2( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixels, + int* pBest_parts, uint32_t num_best_parts, // unique indices, not ASTC seeds + const astc_ldr::partitions_data* pPart_data, bool brute_force_flag) +{ + assert(num_best_parts && (num_best_parts <= pPart_data->m_total_unique_patterns)); + + const uint32_t num_block_pixels = block_width * block_height; + + if (brute_force_flag) + { + int desired_parts[astc_ldr::ASTC_LDR_MAX_BLOCK_HEIGHT][astc_ldr::ASTC_LDR_MAX_BLOCK_WIDTH]; // [y][x] + + for (uint32_t i = 0; i < num_block_pixels; i++) + { + float proj = (pixels.m_pixels_f[i] - pixels.m_mean_f).dot(pixels.m_mean_rel_axis4); + + desired_parts[i / block_width][i % block_width] = proj < 0.0f; + } + + uint32_t part_similarity[astc_helpers::NUM_PARTITION_PATTERNS]; + + for (uint32_t part_index = 0; part_index < pPart_data->m_total_unique_patterns; part_index++) + { + const astc_ldr::partition_pattern_vec& pat_vec = pPart_data->m_partition_pats[part_index]; + + int total_sim_non_inv = 0; + int total_sim_inv = 0; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + int part = pat_vec[x + y * block_width]; + + if (part == desired_parts[y][x]) + total_sim_non_inv++; + + if ((part ^ 1) == desired_parts[y][x]) + total_sim_inv++; + } + } + + int total_sim = maximum(total_sim_non_inv, total_sim_inv); + + part_similarity[part_index] = (total_sim << 16) | part_index; + + } // part_index; + + std::sort(part_similarity, part_similarity + pPart_data->m_total_unique_patterns); + + for (uint32_t i = 0; i < num_best_parts; i++) + pBest_parts[i] = part_similarity[(pPart_data->m_total_unique_patterns - 1) - i] & 0xFFFF; + } + else + { + astc_ldr::partition_pattern_vec desired_part(block_width, block_height); + + for (uint32_t i = 0; i < num_block_pixels; i++) + { + float proj = (pixels.m_pixels_f[i] 
- pixels.m_mean_f).dot(pixels.m_mean_rel_axis4); + + desired_part.m_parts[i] = proj < 0.0f; + } + + astc_ldr::vp_tree::result_queue results; + results.reserve(num_best_parts); + + pPart_data->m_part_vp_tree.find_nearest(2, desired_part, results, num_best_parts); + + assert(results.get_size() == num_best_parts); + + const auto& elements = results.get_elements(); + + for (uint32_t i = 0; i < results.get_size(); i++) + /* NOTE(review): results are read starting at elements[1]; presumably the queue storage is 1-based - confirm against vp_tree::result_queue. */ pBest_parts[i] = elements[1 + i].m_pat_index; + } + + return true; +} + +/* Selects num_best_parts 3-subset partition patterns for the block. The pixels are clustered into three groups (seeded from the darkest pixel, the brightest pixel and a third far-away pixel, then refined by a fixed number of assign/update passes). The resulting labeling is matched against the unique patterns, either exhaustively (brute_force_flag) or through the partition vp-tree. Returns false when three distinct, non-empty clusters cannot be formed; otherwise writes unique pattern indices to pBest_parts and returns true. */ static bool estimate_partition3( + uint32_t block_width, uint32_t block_height, + const astc_ldr::pixel_stats_t& pixels, + int* pBest_parts, uint32_t num_best_parts, + const astc_ldr::partitions_data* pPart_data, bool brute_force_flag) +{ + assert(num_best_parts && (num_best_parts <= pPart_data->m_total_unique_patterns)); + + /* NOTE(review): the variable mean is declared here but is not referenced again in this function. */ vec4F training_vecs[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], mean(0.0f); + + const uint32_t num_block_pixels = block_width * block_height, NUM_SUBSETS = 3; + + float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL; + vec4F cluster_centroids[NUM_SUBSETS]; + clear_obj(cluster_centroids); + + /* Copy the pixels and seed centroids 0 and 1 with the pixels of lowest and highest component sum. */ for (uint32_t i = 0; i < num_block_pixels; i++) + { + vec4F& v = training_vecs[i]; + + v = pixels.m_pixels_f[i]; + + float inten = v.dot(vec4F(1.0f)); + if (inten < darkest_inten) + { + darkest_inten = inten; + cluster_centroids[0] = v; + } + + if (inten > brightest_inten) + { + brightest_inten = inten; + cluster_centroids[1] = v; + } + } + + if (cluster_centroids[0] == cluster_centroids[1]) + return false; + + /* Seed centroid 2 with the pixel whose summed squared distance to the first two seeds is largest, skipping pixels identical to either seed. */ float furthest_dist2 = 0.0f; + for (uint32_t i = 0; i < num_block_pixels; i++) + { + vec4F& v = training_vecs[i]; + + float dist_a = v.squared_distance(cluster_centroids[0]); + if (dist_a == 0.0f) + continue; + + float dist_b = v.squared_distance(cluster_centroids[1]); + if (dist_b == 0.0f) + continue; + + float dist2 = dist_a + dist_b; + if (dist2 > furthest_dist2) + { + furthest_dist2 = dist2; + cluster_centroids[2] = v; + } + } + + if ((cluster_centroids[0] ==
cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2])) + return false; + + uint32_t cluster_pixels[NUM_SUBSETS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + uint32_t num_cluster_pixels[NUM_SUBSETS]; + vec4F new_cluster_means[NUM_SUBSETS]; + + const uint32_t NUM_ITERS = 4; + + /* Fixed number of assign/update passes over the three centroids. */ for (uint32_t s = 0; s < NUM_ITERS; s++) + { + memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels)); + memset((void *)new_cluster_means, 0, sizeof(new_cluster_means)); + + for (uint32_t i = 0; i < num_block_pixels; i++) + { + float d[NUM_SUBSETS] = { + training_vecs[i].squared_distance(cluster_centroids[0]), + training_vecs[i].squared_distance(cluster_centroids[1]), + training_vecs[i].squared_distance(cluster_centroids[2]) }; + + /* Assign the pixel to its nearest centroid; the lowest index wins ties. */ float min_d = d[0]; + uint32_t min_idx = 0; + for (uint32_t j = 1; j < NUM_SUBSETS; j++) + { + if (d[j] < min_d) + { + min_d = d[j]; + min_idx = j; + } + } + + cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i; + new_cluster_means[min_idx] += training_vecs[i]; + num_cluster_pixels[min_idx]++; + } // i + + // Can skip updating the centroids on the last iteration - all we care about is the final partitioning.
+ if (s == (NUM_ITERS - 1)) + { + for (uint32_t j = 0; j < NUM_SUBSETS; j++) + { + /* An empty cluster means there is no usable 3-way split. */ if (!num_cluster_pixels[j]) + return false; + } + } + else + { + for (uint32_t j = 0; j < NUM_SUBSETS; j++) + { + if (!num_cluster_pixels[j]) + return false; + + cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j]; + } // j + } + + } // s + + /* Convert the cluster membership lists into a per-texel subset labeling. */ astc_ldr::partition_pattern_vec desired_part(block_width, block_height); + + for (uint32_t p = 0; p < NUM_SUBSETS; p++) + { + for (uint32_t i = 0; i < num_cluster_pixels[p]; i++) + { + const uint32_t pix_index = cluster_pixels[p][i]; + desired_part[pix_index] = (uint8_t)p; + } // i + } // p + + if (brute_force_flag) + { + /* Every unique pattern is compared against each get_permuted3() variant of the labeling and the smallest distance is kept. */ astc_ldr::partition_pattern_vec desired_parts[astc_ldr::NUM_PART3_MAPPINGS]; + for (uint32_t j = 0; j < astc_ldr::NUM_PART3_MAPPINGS; j++) + desired_parts[j] = desired_part.get_permuted3(j); + + uint32_t part_similarity[astc_helpers::NUM_PARTITION_PATTERNS]; + + for (uint32_t part_index = 0; part_index < pPart_data->m_total_unique_patterns; part_index++) + { + const astc_ldr::partition_pattern_vec& pat = pPart_data->m_partition_pats[part_index]; + + uint32_t lowest_pat_dist = UINT32_MAX; + for (uint32_t p = 0; p < astc_ldr::NUM_PART3_MAPPINGS; p++) + { + uint32_t dist = pat.get_squared_distance(desired_parts[p]); + if (dist < lowest_pat_dist) + lowest_pat_dist = dist; + } + + part_similarity[part_index] = (lowest_pat_dist << 16) | part_index; + + } // part_index; + + std::sort(part_similarity, part_similarity + pPart_data->m_total_unique_patterns); + + /* The distance sits in the high bits, so ascending order puts the closest patterns first. */ for (uint32_t i = 0; i < num_best_parts; i++) + pBest_parts[i] = part_similarity[i] & 0xFFFF; + } + else + { + astc_ldr::vp_tree::result_queue results; + results.reserve(num_best_parts); + + pPart_data->m_part_vp_tree.find_nearest(3, desired_part, results, num_best_parts); + + assert(results.get_size() == num_best_parts); + + const auto& elements = results.get_elements(); + + for (uint32_t i = 0; i < results.get_size(); i++) + pBest_parts[i] = elements[1 +
i].m_pat_index; + } + + return true; +} + +//--------------------------------------------------------------------- + +/* 3x3 Sobel kernel for the gradient along x. */ static const float g_sobel_x[3][3] = // [y][x] +{ + { -1.0f, 0.0f, 1.0f }, + { -2.0f, 0.0f, 2.0f }, + { -1.0f, 0.0f, 1.0f } +}; + +/* 3x3 Sobel kernel for the gradient along y. */ static const float g_sobel_y[3][3] = // [y][x] +{ + { -1.0f, -2.0f, -1.0f }, + { 0.0f, 0.0f, 0.0f }, + { 1.0f, 2.0f, 1.0f } +}; + +/* Convolves all four channels of orig with the row-major 3x3 kernel pMatrix_3x3 and stores the result in dest, which is resized to the dimensions of orig. Source texels are fetched with get_clamped(); each output accumulator starts at 128 and is rounded with fast_roundf_int() before being stored. */ void compute_sobel(const image& orig, image& dest, const float* pMatrix_3x3) +{ + const uint32_t width = orig.get_width(); + const uint32_t height = orig.get_height(); + + dest.resize(width, height); + + for (int y = 0; y < (int)height; y++) + { + for (int x = 0; x < (int)width; x++) + { + vec4F d(128.0f); + + for (int my = -1; my <= 1; my++) + { + for (int mx = -1; mx <= 1; mx++) + { + float w = pMatrix_3x3[(my + 1) * 3 + (mx + 1)]; + /* Zero taps contribute nothing, so the texel fetch is skipped. */ if (w == 0.0f) + continue; + + const color_rgba& s = orig.get_clamped(x + mx, y + my); + + for (uint32_t c = 0; c < 4; c++) + d[c] += w * (float)s[c]; + + } // mx + + } // my + + dest(x, y).set(fast_roundf_int(d[0]), fast_roundf_int(d[1]), fast_roundf_int(d[2]), fast_roundf_int(d[3])); + + } // x + } // y +} + +/* Converts block_width*block_height DCT coefficients in place to per-coefficient energy (each coefficient squared). The first entry is forced to zero. */ void compute_energy_from_dct(uint32_t block_width, uint32_t block_height, float* pDCT) +{ + const uint32_t num_texels = block_width * block_height; + + for (uint32_t i = 1; i < num_texels; i++) + pDCT[i] = square(pDCT[i]); + + pDCT[0] = 0.0f; +} + +// Results scaled by # block texels (block-SSE in weight space) +/* Sums the entries of the block_width x block_height table pEnergy whose x index is below grid_w and whose y index is below grid_h. */ float compute_preserved_dct_energy(uint32_t block_width, uint32_t block_height, const float* pEnergy, uint32_t grid_w, uint32_t grid_h) +{ + float tot = 0.0f; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + if ((x < grid_w) && (y < grid_h)) + tot += pEnergy[x + y * block_width]; + } + } + + return tot; +} + +// Results scaled by # block texels (block-SSE in weight space) +/* Complement of compute_preserved_dct_energy(): sums the entries of pEnergy that lie outside the grid_w x grid_h corner. */ inline float compute_lost_dct_energy(uint32_t block_width, uint32_t block_height, const float* pEnergy, uint32_t grid_w,
uint32_t grid_h) +{ + float tot = 0.0f; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + /* Entries inside the grid_w x grid_h corner are excluded from the sum. */ if ((x < grid_w) && (y < grid_h)) + continue; + + tot += pEnergy[x + y * block_width]; + } + } + + return tot; +} + +/* Per-block settings consumed by ldr_astc_lowlevel_block_encoder: block position and size, pointers to shared read-only data, and the tuning knobs of the triage and encode stages. The constructor calls clear(), which resets the object and then applies the defaults listed below. */ struct ldr_astc_lowlevel_block_encoder_params +{ + ldr_astc_lowlevel_block_encoder_params() + { + clear(); + } + + void clear() + { + /* clear_obj() resets the whole object first (presumably a zero fill - confirm); members not assigned below keep that reset value. */ clear_obj(*this); + + for (uint32_t i = 0; i < 4; i++) + m_dp_active_chans[i] = true; + + m_subsets_edge_filtering = true; + + m_use_superbuckets = true; + m_bucket_pruning_passes = true; + m_use_dual_planes = true; + + m_superbucket_max_to_retain[0] = 4; + m_superbucket_max_to_retain[1] = 8; + m_superbucket_max_to_retain[2] = 16; + + m_shortlist_buckets_to_examine_fract = 1.0f; // after high-level bucket surrogate encoding and pruning stages, 1.0=effectively disabled + m_shortlist_buckets_to_examine_min = 1; + m_shortlist_buckets_to_examine_max = 1024; + + // TODO: Expose these at a higher level. Add alpha specific?
+ m_num_similar_modes_in_bucket_to_shortlist_fract = .33f; + m_num_similar_modes_in_bucket_to_shortlist_fract_min = 2; + m_num_similar_modes_in_bucket_to_shortlist_fract_max = 4096; + + m_final_shortlist_fraction[0] = .2f; + m_final_shortlist_fraction[1] = .3f; + m_final_shortlist_fraction[2] = .5f; + m_final_shortlist_min_size[0] = 1; + m_final_shortlist_min_size[1] = 1; + m_final_shortlist_min_size[2] = 1; + m_final_shortlist_max_size[0] = 4096; + m_final_shortlist_max_size[1] = 4096; + m_final_shortlist_max_size[2] = 4096; + + m_gradient_descent_flag = true; + m_polish_weights_flag = true; + m_qcd_enabled_flag = true; + + m_final_encode_try_base_ofs = true; + m_final_encode_always_try_rgb_direct = false; // if true, even if base_ofs succeeds, we try RGB/RGBA direct too + + m_use_parts_std_dev_thresh = (8.0f / 255.0f); + m_use_parts_std_dev_thresh2 = (40.0f / 255.0f); + m_sobel_energy_thresh1 = 3200.0f; + m_sobel_energy_thresh2 = 30000.0f; + m_sobel_energy_thresh3 = 50000.0f; + + /* The fraction_to_keep values are used as divisors when partition_triage() trims its candidate lists, so 2 keeps roughly the best half. */ m_part2_fraction_to_keep = 2; + m_part3_fraction_to_keep = 2; + m_base_parts2 = 32; + m_base_parts3 = 32; + + // TODO: Perhaps expose this at a higher level.
+ m_use_blue_contraction = true; + } + + /* Block coordinates in units of blocks, block dimensions in texels, and the texel count of one block. */ uint32_t m_bx, m_by, m_block_width, m_block_height, m_total_block_pixels; + + /* Sobel image sampled by ldr_astc_lowlevel_block_encoder::init() to measure the edge energy of the block. */ const image* m_pOrig_img_sobel_xy_t; + + const astc_ldr::partitions_data* m_pPart_data_p2; + const astc_ldr::partitions_data* m_pPart_data_p3; + + const astc_ldr::cem_encode_params* m_pEnc_params; + + // RGB or alpha trial lists (shouldn't have both in same lists) + uint32_t m_num_trial_modes; + const basist::astc_ldr_t::trial_mode* m_pTrial_modes; + + const basist::astc_ldr_t::grouped_trial_modes* m_pGrouped_trial_modes; + + uint32_t m_superbucket_max_to_retain[3]; // [block_complexity_index] + + float m_shortlist_buckets_to_examine_fract; + uint32_t m_shortlist_buckets_to_examine_min; + uint32_t m_shortlist_buckets_to_examine_max; + + float m_num_similar_modes_in_bucket_to_shortlist_fract; + uint32_t m_num_similar_modes_in_bucket_to_shortlist_fract_min; + uint32_t m_num_similar_modes_in_bucket_to_shortlist_fract_max; + + float m_final_shortlist_fraction[3]; + uint32_t m_final_shortlist_min_size[3]; + uint32_t m_final_shortlist_max_size[3]; + + bool m_use_superbuckets; + bool m_bucket_pruning_passes; + + // true if this is a trial mode list containing alpha + bool m_alpha_cems; + + bool m_use_alpha_or_opaque_modes; // true for only alpha cems, false for only opaque cems + bool m_use_lum_direct_modes; + bool m_use_base_scale_modes; + bool m_use_direct_modes; + bool m_use_dual_planes; + + bool m_grid_hv_filtering; + bool m_filter_horizontally_flag; // = h_energy_lost < v_energy_lost, if true it's visually better to resample the block on the X axis vs. Y + bool m_use_small_grids_only; + + bool m_dp_active_chans[4]; + + bool m_subsets_enabled; + bool m_subsets_edge_filtering; + + // TODO: Make polishing controllable per superpass.
+ bool m_gradient_descent_flag; + bool m_polish_weights_flag; + bool m_qcd_enabled_flag; + + bool m_final_encode_try_base_ofs; + bool m_final_encode_always_try_rgb_direct; + + bool m_brute_force_est_parts; + bool m_disable_part_est_stage2; // only use single stage partition estimation + + bool m_use_blue_contraction; // currently global enable/disable + + /* Thresholds compared against the largest channel std dev and the mean Sobel energy of the block when the edge-strength flags are derived. */ float m_use_parts_std_dev_thresh; + float m_use_parts_std_dev_thresh2; + float m_sobel_energy_thresh1; + float m_sobel_energy_thresh2; + float m_sobel_energy_thresh3; + + uint32_t m_part2_fraction_to_keep; + uint32_t m_part3_fraction_to_keep; + uint32_t m_base_parts2; + uint32_t m_base_parts3; + + float m_early_stop_wpsnr; + float m_early_stop2_wpsnr; + + basist::astc_ldr_t::dct2f* m_pDCT2F; // at block size +}; + +/* A surrogate-encoded candidate: the trial mode index, its error and the surrogate block. operator< orders by ascending m_err. */ struct trial_surrogate +{ + uint32_t m_trial_mode_index; + float m_err; + + log_surrogate_astc_blk m_log_blk; + + void clear() + { + m_trial_mode_index = 0; + m_err = 0; + m_log_blk.clear(); + } + + bool operator < (const trial_surrogate& rhs) const + { + return m_err < rhs.m_err; + } +}; + +/* One encoded block result: the logical ASTC block, its SSE, the trial mode and blur id it came from, and packed DCT data for up to two planes. */ struct encode_block_output +{ + int16_t m_trial_mode_index; // -1 = solid, no trial mode + uint16_t m_blur_id; // blur index + + astc_helpers::log_astc_block m_log_blk; + + // Packed per-plane DCT data + basist::astc_ldr_t::dct_syms m_packed_dct_plane_data[2]; + + uint64_t m_sse; + + void clear() + { + /* NOTE(review): m_packed_dct_plane_data is not reset by this method. */ m_trial_mode_index = -1; + m_blur_id = 0; + m_log_blk.clear(); + m_sse = 0; + } +}; + +/* Per-block work counters; reset through clear_obj() on construction. */ struct encode_block_stats +{ + uint32_t m_total_superbuckets_created; + uint32_t m_total_buckets_created; + uint32_t m_total_surrogate_encodes; + uint32_t m_total_shortlist_candidates; + uint32_t m_total_full_encodes; + + encode_block_stats() { clear(); } + + void clear() + { + clear_obj(*this); + } +}; + +/* Pair of per-channel MSE estimates: m_ep for endpoint quantization and m_wp for weight quantization. The default constructor leaves both members unassigned. */ struct chan_mse_est +{ + float m_ep; + float m_wp; + + chan_mse_est() {} + chan_mse_est(float ep, float wp) : m_ep(ep), m_wp(wp) {} +}; + +/* Mean and variance of a set of [0,1] weights plus two derived factors that scale the endpoint and weight quantization MSE estimates. */ struct weight_terms +{ + float m_mean; + float m_var; + float m_endpoint_factor; + float
m_weight_spread_scale; + + /* Fills all four members from the n weights in pWeights. n must be non-zero and every weight is expected to lie in [0,1] (both are asserted). */ void calc(uint32_t n, const float* pWeights) + { + assert(n); + + float weight_total = 0.0f; + for (uint32_t i = 0; i < n; i++) + { + assert(is_in_range(pWeights[i], 0.0f, 1.0f)); + weight_total += pWeights[i]; + } + m_mean = weight_total / (float)n; + + float weight_var = 0.0f; + for (uint32_t i = 0; i < n; i++) + weight_var += squaref(pWeights[i] - m_mean); + m_var = weight_var / (float)n; + + // drops below 2/3 on smooth blocks and tends to 2/3 when weights are well spread + /* The numerator equals the mean of (1-w)^2 plus w^2 over the weights. Dividing by 2/3 normalizes the factor to 1.0 at mean 0.5 and variance 1/12; the result is then limited to [.25, 1.5]. */ m_endpoint_factor = (1.0f + 2.0f * m_var + 2.0f * m_mean * m_mean - 2.0f * m_mean) / (2.0f / 3.0f); + m_endpoint_factor = clamp(m_endpoint_factor, .25f, 1.50f); + + /* 1/12 is the variance of a uniform distribution on [0,1]. */ const float UNIFORM_VAR = 1.0f / 12.0f; + float s = m_var / UNIFORM_VAR; + + // shrinks the weight term on smooth blocks and is ~1 when weights are spread. + m_weight_spread_scale = saturate(s); + } +}; + +// weight_gamma is block size/grid size specific factor (0,1] (the amount of MSE quant error remaining taking into account bilinear smoothing) +/* Returns rough MSE estimates for one channel: endpoint quantization in m_ep and weight quantization in m_wp. Both start from a step*step/12 term; the weight term is also scaled by weight_gamma and span_size squared. When pWeight_terms is given, each estimate is multiplied by the matching factor from it. Both level counts must be at least 2 (asserted). */ inline chan_mse_est compute_quantized_channel_mse_estimates(uint32_t num_endpoint_levels, uint32_t num_weight_levels, float span_size, float weight_gamma, const weight_terms* pWeight_terms = nullptr) +{ + assert(num_endpoint_levels >= 2); + assert(num_weight_levels >= 2); + + const float Dep = 1.0f / (float)(num_endpoint_levels - 1); // endpoint quant step + const float Dw = 1.0f / (float)(num_weight_levels - 1); // weight quant step + + // Endpoint quant MSE estimate is not span dependent + float ep_lower = (Dep * Dep) / 12.0f * (2.0f / 3.0f); + + // Weight quant MSE estimate is span dependent + float wq_lower = (Dw * Dw) / 12.0f * weight_gamma * (span_size * span_size); + + if (pWeight_terms) + { + ep_lower *= pWeight_terms->m_endpoint_factor; + wq_lower *= pWeight_terms->m_weight_spread_scale; + } + + return chan_mse_est(ep_lower, wq_lower); +} + +/* Endpoint-only variant of compute_quantized_channel_mse_estimates(): returns just the endpoint quantization estimate. */ inline float compute_quantized_channel_endpoint_mse_estimate(uint32_t num_endpoint_levels, const weight_terms*
pWeight_terms = nullptr) +{ + assert(num_endpoint_levels >= 2); + + const float Dep = 1.0f / (float)(num_endpoint_levels - 1); // endpoint quant step + + // Endpoint quant MSE estimate is not span dependent + float ep_lower = (Dep * Dep) / 12.0f * (2.0f / 3.0f); + + if (pWeight_terms) + ep_lower *= pWeight_terms->m_endpoint_factor; + + return ep_lower; +} + +/* Weight-only variant: the step*step/12 weight quantization estimate scaled by weight_gamma and span_size squared, then by pWeight_terms->m_weight_spread_scale when pWeight_terms is given. num_weight_levels must be at least 2 (asserted). */ inline float compute_quantized_channel_weight_mse_estimate(uint32_t num_weight_levels, float span_size, float weight_gamma, const weight_terms* pWeight_terms = nullptr) +{ + assert(num_weight_levels >= 2); + + const float Dw = 1.0f / (float)(num_weight_levels - 1); // weight quant step + + // Weight quant MSE estimate is span dependent + float wq_lower = (Dw * Dw) / 12.0f * weight_gamma * (span_size * span_size); + + if (pWeight_terms) + wq_lower *= pWeight_terms->m_weight_spread_scale; + + return wq_lower; +} + +const float BLUE_CONTRACTION_BASE_OFS_DISCOUNT = .9f; +const float SKIP_IF_BUCKET_WORSE_MULTIPLIER = 5.0f; + +/* One candidate configuration (grid size, CCS index, CEM index, partition count, unique seed index) together with its surrogate block, SSE and an examined flag. Only the six configuration fields take part in hashing and equality. */ struct shortlist_bucket +{ + bool m_examined_flag; + int8_t m_grid_width, m_grid_height; + int8_t m_ccs_index; + + uint8_t m_cem_index; + uint8_t m_num_parts; + uint16_t m_unique_seed_index; + + log_surrogate_astc_blk m_surrogate_log_blk; + float m_sse; + + /* The default constructor assigns no members. */ shortlist_bucket() + { + } + + /* Narrows the arguments to the compact member types and resets the surrogate block, SSE and examined flag. */ shortlist_bucket(int grid_width, int grid_height, uint32_t cem_index, int ccs_index, uint32_t num_parts, uint32_t unique_seed_index) : + m_grid_width((int8_t)grid_width), m_grid_height((int8_t)grid_height), + m_ccs_index((int8_t)ccs_index), + m_cem_index((uint8_t)cem_index), + m_num_parts((uint8_t)num_parts), + m_unique_seed_index((uint16_t)unique_seed_index) + { + m_surrogate_log_blk.clear(); + m_sse = 0.0f; + m_examined_flag = false; + } + + /* Hash used by the hash map: the XOR of the hashes of the individual fields. NOTE(review): XOR is symmetric, so fields of equal size that hold equal bytes (for example grid width equal to grid height) cancel each other out; a chained or mixing combine would distribute better. */ operator size_t() const + { +#define ADD_HASH(H) h ^= basist::hash_hsieh((uint8_t*)&(H), sizeof(H)); + size_t h = 0; + ADD_HASH(m_grid_width); + ADD_HASH(m_grid_height); + ADD_HASH(m_ccs_index); + ADD_HASH(m_cem_index); + ADD_HASH(m_num_parts); +
ADD_HASH(m_unique_seed_index); +#undef ADD_HASH + return h; + } + + // equality for hashing + bool operator== (const shortlist_bucket& rhs) const + { + return (m_grid_width == rhs.m_grid_width) && (m_grid_height == rhs.m_grid_height) && (m_cem_index == rhs.m_cem_index) && (m_ccs_index == rhs.m_ccs_index) && + (m_num_parts == rhs.m_num_parts) && (m_unique_seed_index == rhs.m_unique_seed_index); + } +}; + +/* NOTE(review): the template argument lists of the typedefs and container members in this part of the file are missing from this copy of the text; restore them from the original source. */ typedef static_vector trial_mode_index_vec; +typedef basisu::hash_map shortlist_bucket_hash_t; + +#pragma pack(push, 1) +/* Byte-packed key identifying a superbucket: CEM index, CCS index, unique subset index and subset count. The hash covers the bytes that precede m_last, and operator== compares the same four fields. */ struct trial_mode_estimate_superbucket_key +{ + // All member vars from beginning to m_last will be hashed. Be careful of alignment. + uint8_t m_cem_index; + int8_t m_ccs_index; + uint16_t m_subset_unique_index; + + uint8_t m_num_subsets; + uint8_t m_last; + uint8_t m_unused[2]; + + /* The constructor assigns no members; call clear() before filling in a key. */ trial_mode_estimate_superbucket_key() + { + static_assert((sizeof(*this) % 4) == 0, "struct size must be divisible by 4"); + } + + void clear() + { + clear_obj(*this); + } + + operator size_t() const + { + return basist::hash_hsieh((const uint8_t*)this, BASISU_OFFSETOF(trial_mode_estimate_superbucket_key, m_last)); + } + + bool operator== (const trial_mode_estimate_superbucket_key& rhs) const + { +#define COMP(e) if (e != rhs.e) return false; + COMP(m_cem_index); + COMP(m_ccs_index); + COMP(m_subset_unique_index); + COMP(m_num_subsets); +#undef COMP + return true; + } +}; +#pragma pack(pop) + +/* Value stored per superbucket key: the list of trial mode indices gathered under that key. */ struct trial_mode_estimate_superbucket_value +{ + basisu::vector m_trial_mode_list; +}; + +typedef hash_map trial_mode_estimate_superbucket_hash; + +/* A trial mode index with its superbucket key and m_wsse estimate; operator< orders by ascending m_wsse. */ struct trial_mode_estimate +{ + trial_mode_estimate_superbucket_key m_superbucket_key; + + uint32_t m_trial_mode_index; + float m_wsse; + + bool operator< (const trial_mode_estimate& rhs) const + { + return m_wsse < rhs.m_wsse; + } +}; + +/* A shortlist bucket together with its trial mode indices; operator< orders by ascending bucket SSE. */ struct ranked_shortlist_bucket +{ + shortlist_bucket m_bucket; + trial_mode_index_vec m_trial_mode_indices; + + bool operator < (const ranked_shortlist_bucket& rhs) const { return m_bucket.m_sse <
rhs.m_bucket.m_sse; } +}; + +/* Reusable per-block encoder state: scratch containers plus the block statistics and partition estimates produced by the staged methods that follow (init, partition_triage, trivial_triage, analytic_triage). */ struct ldr_astc_lowlevel_block_encoder +{ + ldr_astc_lowlevel_block_encoder() : + m_used_flag(false) + { + clear(); + } + + // Warning: These objects can migrate between threads (be cautious of determinism issues with containers/hash tables!) + bool m_used_flag; + + // Thread-local data follows + uint_vec m_trial_modes_to_estimate; + + trial_mode_estimate_superbucket_hash m_superbucket_hash; + + std::priority_queue m_trial_mode_estimate_priority_queue; + + basist::astc_ldr_t::fvec m_dct_work; + + shortlist_bucket_hash_t m_shortlist_hash0; + shortlist_bucket_hash_t m_shortlist_hash1; + + basisu::vector m_trial_surrogates; + + float m_sobel_energy; + float m_max_std_dev; + + uint32_t m_block_complexity_index; // [0,2] + bool m_strong_edges; + bool m_very_strong_edges; + bool m_super_strong_edges; + + bool m_used_superbuckets; + + int m_best_parts2[2][MAX_BASE_PARTS2 * PART_ESTIMATE_STAGE1_MULTIPLIER]; // [rgb[a]direct/rgbs][est_part] + int m_num_est_parts2[2]; + + int m_best_parts3[2][MAX_BASE_PARTS3 * PART_ESTIMATE_STAGE1_MULTIPLIER]; // [rgb[a]direct/rgbs][est_part] + int m_num_est_parts3[2]; + + basisu::vector m_ranked_buckets; + + /* Resets the per-block results. m_used_flag, the two shortlist hashes, the priority queue and the DCT work buffer are not touched here. */ void clear() + { + m_trial_modes_to_estimate.resize(0); + m_superbucket_hash.reset(); + + m_trial_surrogates.resize(0); + + m_sobel_energy = 0; + m_max_std_dev = 0; + m_block_complexity_index = 0; + m_strong_edges = false; + m_very_strong_edges = false; + m_super_strong_edges = false; + + m_used_superbuckets = false; + + clear_obj(m_best_parts2); + clear_obj(m_num_est_parts2); + + clear_obj(m_best_parts3); + clear_obj(m_num_est_parts3); + + m_ranked_buckets.resize(0); + } + + /* Measures the block described by p: the sum of squared Sobel-image channel values averaged per block texel, the largest per-channel std dev from pixel_stats, the three edge-strength flags and the complexity index (0 to 2). out_blocks, blur_id and stats are unused. Always returns true. */ bool init( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(blur_id); + BASISU_NOTE_UNUSED(out_blocks); + BASISU_NOTE_UNUSED(stats); + + // TODO: This sums the *original* (not blurred)
block's energy - precompute this? Replace with DCT? + m_sobel_energy = 0.0f; + for (uint32_t y = 0; y < p.m_block_height; y++) + { + for (uint32_t x = 0; x < p.m_block_width; x++) + { + const color_rgba& s = p.m_pOrig_img_sobel_xy_t->get_clamped(p.m_bx * p.m_block_width + x, p.m_by * p.m_block_height + y); + + // TODO: sum max of all channels instead? + m_sobel_energy += s[0] * s[0] + s[1] * s[1] + s[2] * s[2] + s[3] * s[3]; + } // x + } // y + + m_sobel_energy /= (float)p.m_total_block_pixels; + + m_max_std_dev = 0.0f; + for (uint32_t i = 0; i < 4; i++) + m_max_std_dev = maximum(m_max_std_dev, pixel_stats.m_rgba_stats[i].m_std_dev); + + /* Each tier requires both the std dev and the Sobel energy to exceed its thresholds. */ m_strong_edges = (m_max_std_dev > p.m_use_parts_std_dev_thresh) && (m_sobel_energy > p.m_sobel_energy_thresh1); + m_very_strong_edges = (m_max_std_dev > p.m_use_parts_std_dev_thresh2) && (m_sobel_energy > p.m_sobel_energy_thresh2); + m_super_strong_edges = (m_max_std_dev > p.m_use_parts_std_dev_thresh2) && (m_sobel_energy > p.m_sobel_energy_thresh3); + + m_block_complexity_index = m_super_strong_edges ? 2 : (m_very_strong_edges ? 1 : 0); + + return true; + } + + /* Builds the candidate partition lists m_best_parts2 and m_best_parts3 and their counts m_num_est_parts2 and m_num_est_parts3 (index 0 is for the direct CEMs, index 1 for the base-scale CEMs). The counts stay zero when subsets are disabled or, with edge filtering enabled, when the block has no strong edges. blur_id and out_blocks are unused. Always returns true. */ bool partition_triage( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(blur_id); + BASISU_NOTE_UNUSED(out_blocks); + + clear_obj(m_num_est_parts2); + clear_obj(m_num_est_parts3); + + if (!p.m_subsets_enabled) + return true; + + if (p.m_subsets_edge_filtering) + { + if (!m_strong_edges) + return true; + } + + assert(p.m_base_parts2 <= MAX_BASE_PARTS2); + assert(p.m_base_parts3 <= MAX_BASE_PARTS3); + + // 2 subsets + /* The stage 1 candidate count grows with edge strength. */ int total_parts2 = m_super_strong_edges ? (p.m_base_parts2 * PART_ESTIMATE_STAGE1_MULTIPLIER) : (m_very_strong_edges ?
(p.m_base_parts2 * 2) : p.m_base_parts2); + /* Clamp the request to the scratch array size and to the number of unique patterns available. */ total_parts2 = minimum(total_parts2, MAX_BASE_PARTS2 * PART_ESTIMATE_STAGE1_MULTIPLIER); + total_parts2 = minimum(total_parts2, p.m_pPart_data_p2->m_total_unique_patterns); + + const uint32_t surrogate_encode_flags = 0; + + if (total_parts2) + { + int best_parts2_temp[MAX_BASE_PARTS2 * PART_ESTIMATE_STAGE1_MULTIPLIER]; + assert(total_parts2 <= (int)std::size(best_parts2_temp)); + + // Stage 1: kmeans+vptree + const bool has_est_parts2 = estimate_partition2( + p.m_block_width, p.m_block_height, + pixel_stats, + best_parts2_temp, total_parts2, + p.m_pPart_data_p2, p.m_brute_force_est_parts); + + if (has_est_parts2) + { + // Always try direct, optionally base+scale cem's + for (uint32_t s = 0; s < 2; s++) + { + if ((s) && (!p.m_use_base_scale_modes)) + continue; + + /* Single-stage mode: keep every stage 1 candidate in its original order. */ if (p.m_disable_part_est_stage2) + { + m_num_est_parts2[s] = total_parts2; + memcpy(m_best_parts2[s], best_parts2_temp, m_num_est_parts2[s] * sizeof(int)); + continue; + } + + uint32_t cem_to_surrogate_encode = p.m_alpha_cems ? astc_helpers::CEM_LDR_RGBA_DIRECT : astc_helpers::CEM_LDR_RGB_DIRECT; + if (s) + cem_to_surrogate_encode = p.m_alpha_cems ?
astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A : astc_helpers::CEM_LDR_RGB_BASE_SCALE; + + // Stage 2: Analytic surrogate WSSE + basisu::vector part_sses(total_parts2); + + for (int i = 0; i < total_parts2; i++) + { + const astc_ldr::partitions_data* pPart_data = p.m_pPart_data_p2; + + const uint32_t unique_seed_index = best_parts2_temp[i]; + const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[unique_seed_index]; + + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[unique_seed_index]; + + log_surrogate_astc_blk surrogate_log_blk; + /* Surrogate-encode this candidate; the block dimensions are passed for both size pairs and the returned error is what ranks the candidates. */ float sse = encode_surrogate_trial_subsets( + p.m_block_width, p.m_block_height, + pixel_stats, + cem_to_surrogate_encode, 2, part_seed_index, pPat, + astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS, + p.m_block_width, p.m_block_height, + surrogate_log_blk, + *p.m_pEnc_params, surrogate_encode_flags); + + stats.m_total_surrogate_encodes++; + + part_sses[i] = sse; + } // i + + basisu::vector part_sses_ranks(total_parts2); + + indirect_sort(total_parts2, part_sses_ranks.get_ptr(), part_sses.get_ptr()); + + /* Keep the first ceil(total_parts2 / m_part2_fraction_to_keep) ranked candidates, at least one. NOTE(review): m_part2_fraction_to_keep must be non-zero here. */ m_num_est_parts2[s] = maximum(1, (total_parts2 + p.m_part2_fraction_to_keep - 1) / p.m_part2_fraction_to_keep); + + for (int i = 0; i < m_num_est_parts2[s]; i++) + { + const uint32_t rank_index = part_sses_ranks[i]; + const uint32_t unique_seed_unique = best_parts2_temp[rank_index]; + m_best_parts2[s][i] = unique_seed_unique; + } // i + + } // s + + } // if (has_est_parts2) + + } // if (total_parts2) + + // 3 subsets + int total_parts3 = m_super_strong_edges ? (p.m_base_parts3 * PART_ESTIMATE_STAGE1_MULTIPLIER) : (m_very_strong_edges ?
(p.m_base_parts3 * 2) : p.m_base_parts3); + /* Same flow as the 2-subset case, using the 3-subset estimator and pattern data. */ total_parts3 = minimum(total_parts3, MAX_BASE_PARTS3 * PART_ESTIMATE_STAGE1_MULTIPLIER); + total_parts3 = minimum(total_parts3, p.m_pPart_data_p3->m_total_unique_patterns); + + if (total_parts3) + { + int best_parts3_temp[MAX_BASE_PARTS3 * PART_ESTIMATE_STAGE1_MULTIPLIER]; + assert(total_parts3 <= (int)std::size(best_parts3_temp)); + + // Stage 1: kmeans+vptree + const bool has_est_parts3 = estimate_partition3( + p.m_block_width, p.m_block_height, + pixel_stats, + best_parts3_temp, total_parts3, + p.m_pPart_data_p3, p.m_brute_force_est_parts); + + /* estimate_partition3() can return false; the 3-subset counts then stay zero. */ if (has_est_parts3) + { + // Always try direct, optionally base+scale cem's + for (uint32_t s = 0; s < 2; s++) + { + if ((s) && (!p.m_use_base_scale_modes)) + continue; + + if (p.m_disable_part_est_stage2) + { + m_num_est_parts3[s] = total_parts3; + memcpy(m_best_parts3[s], best_parts3_temp, m_num_est_parts3[s] * sizeof(int)); + continue; + } + + uint32_t cem_to_surrogate_encode = p.m_alpha_cems ? astc_helpers::CEM_LDR_RGBA_DIRECT : astc_helpers::CEM_LDR_RGB_DIRECT; + if (s) + cem_to_surrogate_encode = p.m_alpha_cems ?
astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A : astc_helpers::CEM_LDR_RGB_BASE_SCALE; + + // Stage 2: Analytic surrogate WSSE + basisu::vector part_sses(total_parts3); + for (int i = 0; i < total_parts3; i++) + { + const astc_ldr::partitions_data* pPart_data = p.m_pPart_data_p3; + + const uint32_t unique_seed_index = best_parts3_temp[i]; + const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[unique_seed_index]; + + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[unique_seed_index]; + + log_surrogate_astc_blk surrogate_log_blk; + float sse = encode_surrogate_trial_subsets( + p.m_block_width, p.m_block_height, + pixel_stats, + cem_to_surrogate_encode, 3, part_seed_index, pPat, + astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS, + p.m_block_width, p.m_block_height, + surrogate_log_blk, + *p.m_pEnc_params, surrogate_encode_flags); + + stats.m_total_surrogate_encodes++; + + part_sses[i] = sse; + } // i + + basisu::vector part_sses_ranks(total_parts3); + + indirect_sort(total_parts3, part_sses_ranks.get_ptr(), part_sses.get_ptr()); + + /* Keep the first ceil(total_parts3 / m_part3_fraction_to_keep) ranked candidates, at least one. NOTE(review): m_part3_fraction_to_keep must be non-zero here. */ m_num_est_parts3[s] = maximum(1, (total_parts3 + p.m_part3_fraction_to_keep - 1) / p.m_part3_fraction_to_keep); + + for (int i = 0; i < m_num_est_parts3[s]; i++) + { + const uint32_t rank_index = part_sses_ranks[i]; + const uint32_t unique_seed_unique = best_parts3_temp[rank_index]; + m_best_parts3[s][i] = unique_seed_unique; + } // i + + } // s + + } // if (has_est_parts3) + + } // if (total_parts3) + + return true; + } + + /* Rebuilds m_trial_modes_to_estimate by walking the CEM, subset, CCS, grid size and grid anisotropy indices of the grouped trial mode table and skipping every combination ruled out by the flags in p or by empty partition estimates. All other arguments are unused. Returns false (after asserting) when the list ends up empty, otherwise true. */ bool trivial_triage( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(pixel_stats); + BASISU_NOTE_UNUSED(stats); + BASISU_NOTE_UNUSED(out_blocks); + BASISU_NOTE_UNUSED(blur_id); + + if (m_trial_modes_to_estimate.capacity() < 1024) + m_trial_modes_to_estimate.reserve(1024); +
m_trial_modes_to_estimate.resize(0); + + assert((astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET + 1) == basist::astc_ldr_t::OTM_NUM_CEMS); + + for (uint32_t cem_index = astc_helpers::CEM_LDR_LUM_DIRECT; cem_index < basist::astc_ldr_t::OTM_NUM_CEMS; cem_index++) + { + if (astc_helpers::does_cem_have_alpha(cem_index) != p.m_alpha_cems) + continue; + + const bool cem_has_alpha = astc_helpers::does_cem_have_alpha(cem_index); + if (cem_has_alpha != p.m_use_alpha_or_opaque_modes) + continue; + + /* CEMs that are not listed in the switch fall into the default case and are rejected. */ bool accept_flag = false; + switch (cem_index) + { + case astc_helpers::CEM_LDR_LUM_DIRECT: + case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT: + { + accept_flag = p.m_use_lum_direct_modes; + break; + } + case astc_helpers::CEM_LDR_RGB_DIRECT: + case astc_helpers::CEM_LDR_RGBA_DIRECT: + { + accept_flag = p.m_use_direct_modes; + break; + } + case astc_helpers::CEM_LDR_RGB_BASE_SCALE: + case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + { + accept_flag = p.m_use_base_scale_modes; + break; + } + default: + break; + } + + if (!accept_flag) + continue; + + const uint32_t s = astc_helpers::cem_is_ldr_base_scale(cem_index) ? 1 : 0; + + for (uint32_t subsets_index = 0; subsets_index < basist::astc_ldr_t::OTM_NUM_SUBSETS; subsets_index++) + { + /* Subset indices 1 and 2 are only visited when the matching partition estimate count for this CEM family is non-zero. */ if (subsets_index == 1) + { + if (!m_num_est_parts2[s]) + continue; + } + else if (subsets_index == 2) + { + if (!m_num_est_parts3[s]) + continue; + } + + const uint32_t ccs_max_index = (p.m_use_dual_planes ?
basist::astc_ldr_t::OTM_NUM_CCS : 1); + for (uint32_t ccs_index = 0; ccs_index < ccs_max_index; ccs_index++) + { + /* Only index 0 is visited when dual planes are disabled; an index k above 0 is gated by m_dp_active_chans[k - 1]. */ if (ccs_index) + { + if (!p.m_dp_active_chans[ccs_index - 1]) + continue; + } + + for (uint32_t grid_size_index = 0; grid_size_index < basist::astc_ldr_t::OTM_NUM_GRID_SIZES; grid_size_index++) + { + if (grid_size_index) // if large grid + { + if (p.m_use_small_grids_only) + continue; + } + + for (uint32_t grid_anisos_index = 0; grid_anisos_index < basist::astc_ldr_t::OTM_NUM_GRID_ANISOS; grid_anisos_index++) + { + if (p.m_grid_hv_filtering) + { + if (grid_anisos_index == 1) + { + // W>=H + if (p.m_filter_horizontally_flag) + continue; + } + else if (grid_anisos_index == 2) + { + // Wm_tm_groups[cem_index][subsets_index][ccs_index][grid_size_index][grid_anisos_index]); + + /* NOTE(review): part of the original text between the W comment above and m_tm_groups appears to be missing from this copy (the braces no longer balance); restore it from the original source before relying on this loop body. */ } // grid_aniso_index + + } // grid_size_index + + } // ccs_index + + } // subsets_index + + } // cem_iter + + if (!m_trial_modes_to_estimate.size()) + { + assert(0); + return false; + } + + return true; + } + + bool analytic_triage( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(blur_id); + BASISU_NOTE_UNUSED(out_blocks); + + //--------------------------------- superbucket analytical estimation + + shortlist_bucket_hash_t& shortlist_buckets = m_shortlist_hash0; + + if (m_shortlist_hash0.get_table_size() != EXPECTED_SHORTLIST_HASH_SIZE) + { + const bool was_allocated = m_shortlist_hash0.get_table_size() > 0; + + m_shortlist_hash0.clear(); + m_shortlist_hash0.reserve(EXPECTED_SHORTLIST_HASH_SIZE / 2); + + if ((g_devel_messages) && (was_allocated)) + fmt_debug_printf("shortlist hash0 thrash\n"); + } + else + { + m_shortlist_hash0.reset(); + } + + m_used_superbuckets = false; + + if (p.m_use_superbuckets) + { + m_used_superbuckets = true; + + // This may thrash if it grows larger on another thread, but we must avoid determinism issues.
+ if (m_superbucket_hash.get_table_size() != EXPECTED_SUPERBUCKET_HASH_SIZE) + { + const bool was_allocated = m_superbucket_hash.get_table_size() > 0; + + m_superbucket_hash.clear(); + m_superbucket_hash.reserve(EXPECTED_SUPERBUCKET_HASH_SIZE >> 1); + + if ((g_devel_messages) && (was_allocated)) + fmt_debug_printf("superbucket hash thrash\n"); + } + else + { + m_superbucket_hash.reset(); + } + + trial_mode_estimate_superbucket_key new_key; + new_key.clear(); + + trial_mode_estimate_superbucket_value new_val; + + // Create superbuckets + uint32_t max_superbucket_tm_indices = 0; + for (uint32_t j = 0; j < m_trial_modes_to_estimate.size(); j++) + { + const uint32_t trial_mode_iter = m_trial_modes_to_estimate[j]; + + assert(trial_mode_iter < p.m_num_trial_modes); + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_iter]; + + new_key.m_cem_index = safe_cast_uint8(tm.m_cem); + new_key.m_ccs_index = safe_cast_int8(tm.m_ccs_index); + + new_key.m_subset_unique_index = 0; + new_key.m_num_subsets = (uint8_t)tm.m_num_parts; + + if (tm.m_num_parts == 1) + { + auto ins_res = m_superbucket_hash.insert(new_key, new_val); + const bool created_flag = ins_res.second; + + assert(ins_res.first->first.m_cem_index == tm.m_cem); + assert(ins_res.first->first.m_ccs_index == tm.m_ccs_index); + assert(ins_res.first->first.m_num_subsets == tm.m_num_parts); + + trial_mode_estimate_superbucket_value& v = (ins_res.first)->second; + + if (created_flag) + v.m_trial_mode_list.reserve(256); + + v.m_trial_mode_list.push_back(trial_mode_iter); + + max_superbucket_tm_indices = maximum(max_superbucket_tm_indices, v.m_trial_mode_list.size_u32()); + } + else + { + //const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t s = astc_helpers::cem_is_ldr_base_scale(tm.m_cem) ? 1 : 0; + const uint32_t num_est_parts_to_try = (tm.m_num_parts == 2) ? 
m_num_est_parts2[s] : m_num_est_parts3[s]; + + for (uint32_t est_part_iter = 0; est_part_iter < num_est_parts_to_try; est_part_iter++) + { + const uint32_t part_unique_index = (tm.m_num_parts == 2) ? m_best_parts2[s][est_part_iter] : m_best_parts3[s][est_part_iter]; + + new_key.m_subset_unique_index = safe_cast_uint16(part_unique_index); + + auto ins_res = m_superbucket_hash.insert(new_key, new_val); + const bool created_flag = ins_res.second; + + assert(ins_res.first->first.m_cem_index == tm.m_cem); + assert(ins_res.first->first.m_ccs_index == tm.m_ccs_index); + assert(ins_res.first->first.m_num_subsets == tm.m_num_parts); + + trial_mode_estimate_superbucket_value& v = (ins_res.first)->second; + if (created_flag) + v.m_trial_mode_list.reserve(256); + + v.m_trial_mode_list.push_back(trial_mode_iter); + + max_superbucket_tm_indices = maximum(max_superbucket_tm_indices, v.m_trial_mode_list.size_u32()); + + } // est_part_iter + } + + } // j + + //fmt_debug_printf("Total superbucket entries: {}\n", m_superbucket_hash.size()); + //fmt_debug_printf("Max superbucket tm indices: {}\n", max_superbucket_tm_indices); + + const uint32_t total_block_texels = p.m_total_block_pixels; + const float inv_total_block_texels = 1.0f / (float)total_block_texels; + + while (m_trial_mode_estimate_priority_queue.size()) + m_trial_mode_estimate_priority_queue.pop(); + + const uint32_t max_priority_queue_size = p.m_superbucket_max_to_retain[m_block_complexity_index]; + + // purposely downscale lost scale energy relative to the other error sources + // this biased the encoder towards smaller grids + const float SLAM_TO_LINE_WEIGHT = 1.5f; // upweight STL relative to other errors to give the estimator more of a signal especially for dual plane + const float QUANT_ERROR_WEIGHT = 1.0f; // quant error is naturally quite pessimistic + const float SCALE_ERROR_WEIGHT = 3.0f; // weight grid downsample (scale) error + + // Discount for blue contraction encoding and base+offset CEM's. 
+ const float BLUE_CONTRACTION_ENDPOINT_QUANT_DISCOUNT = .5f; + + // Iterate over all superbuckets, surrogate encode to compute slam to line error, DCT of weight grid(s) to estimate energy lost during weight grid downsampling. + // TODO: priority queue and aggressive early outs + for (auto superbucket_iter = m_superbucket_hash.begin(); superbucket_iter != m_superbucket_hash.end(); ++superbucket_iter) + { + const trial_mode_estimate_superbucket_key& key = superbucket_iter->first; + const trial_mode_estimate_superbucket_value& val = superbucket_iter->second; + + //const bool cem_has_alpha = astc_helpers::does_cem_have_alpha(key.m_cem_index); + + log_surrogate_astc_blk log_blk; + + const astc_ldr::partitions_data* pPart_data = nullptr; + const astc_ldr::partition_pattern_vec* pPat = nullptr; + + //const uint32_t num_planes = (key.m_ccs_index >= 0) ? 2 : 1; + + const float worst_wsse_found_so_far = (m_trial_mode_estimate_priority_queue.size() >= max_priority_queue_size) ? m_trial_mode_estimate_priority_queue.top().m_wsse : 1e+9f; + + float slam_to_line_wsse = 0; + if (key.m_num_subsets == 1) + { + slam_to_line_wsse = encode_surrogate_trial( + p.m_block_width, p.m_block_height, + pixel_stats, + key.m_cem_index, + key.m_ccs_index, + astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS, + p.m_block_width, p.m_block_height, + log_blk, + *p.m_pEnc_params, + astc_ldr::cFlagDisableQuant); + } + else + { + pPart_data = (key.m_num_subsets == 3) ? 
p.m_pPart_data_p3 : p.m_pPart_data_p2; + + const uint32_t unique_seed_index = key.m_subset_unique_index; + const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[unique_seed_index]; + + pPat = &pPart_data->m_partition_pats[unique_seed_index]; + + slam_to_line_wsse = encode_surrogate_trial_subsets( + p.m_block_width, p.m_block_height, + pixel_stats, + key.m_cem_index, key.m_num_subsets, part_seed_index, pPat, + astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS, + p.m_block_width, p.m_block_height, + log_blk, + *p.m_pEnc_params, + astc_ldr::cFlagDisableQuant); + } + + stats.m_total_surrogate_encodes++; + + // Early out: Slam to line error is so high it's impossible for any blocks in this bucket to win. + if ((SLAM_TO_LINE_WEIGHT * slam_to_line_wsse) >= worst_wsse_found_so_far) + continue; + + bool can_use_base_ofs = false; + if ((key.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (key.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)) + { + float max_span_size = 0.0f; + + for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++) + { + const vec4F subset_chan_spans(log_blk.m_endpoints[subset_index][1] - log_blk.m_endpoints[subset_index][0]); + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(subset_chan_spans[c]); + max_span_size = maximum(max_span_size, span_size); + } + } + + can_use_base_ofs = (max_span_size < .25f); + } + + assert(p.m_pDCT2F); + + assert((p.m_pDCT2F->rows() == p.m_block_height) && (p.m_pDCT2F->cols() == p.m_block_width)); + + float weight0_energy[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + float weight1_energy[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + basist::astc_ldr_t::fvec& dct_work = m_dct_work; + + // Forward DCT in normalized weight (surrogate) space + p.m_pDCT2F->forward(log_blk.m_weights0, weight0_energy, dct_work); + compute_energy_from_dct(p.m_block_width, p.m_block_height, weight0_energy); + + if (key.m_ccs_index >= 0) + { + p.m_pDCT2F->forward(log_blk.m_weights1, 
weight1_energy, dct_work); + compute_energy_from_dct(p.m_block_width, p.m_block_height, weight1_energy); + } + + weight_terms weight0_terms, weight1_terms; + weight_terms* pWeight0_terms = &weight0_terms; + weight_terms* pWeight1_terms = nullptr; + weight0_terms.calc(total_block_texels, log_blk.m_weights0); + if (key.m_ccs_index >= 0) + { + weight1_terms.calc(total_block_texels, log_blk.m_weights1); + pWeight1_terms = &weight1_terms; + } + + // Precompute subset span and total pixels info + vec4F subset_spans[astc_helpers::MAX_PARTITIONS]; + uint32_t subset_pixels[astc_helpers::MAX_PARTITIONS]; + + for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++) + { + subset_spans[subset_index] = log_blk.m_endpoints[subset_index][1] - log_blk.m_endpoints[subset_index][0]; + + uint32_t total_subset_pixels = p.m_total_block_pixels; + if (key.m_num_subsets > 1) + total_subset_pixels = pPart_data->m_partition_pat_histograms[key.m_subset_unique_index].m_hist[subset_index]; + + subset_pixels[subset_index] = total_subset_pixels; + } + + // Loop through all trial modes in this sueprbucket. TODO: Sort by endpoint levels? + for (uint32_t k = 0; k < val.m_trial_mode_list.size(); k++) + { + const uint32_t trial_mode_index = val.m_trial_mode_list[k]; + assert(trial_mode_index < p.m_num_trial_modes); + + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index]; + + assert(tm.m_cem == key.m_cem_index); + assert(tm.m_ccs_index == key.m_ccs_index); + assert(tm.m_num_parts == key.m_num_subsets); + + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(p.m_block_width, p.m_block_height, tm.m_grid_width, tm.m_grid_height); + + const uint32_t total_endpoint_levels = astc_helpers::get_ise_levels(tm.m_endpoint_ise_range); + const uint32_t total_weight_levels = astc_helpers::get_ise_levels(tm.m_weight_ise_range); + + const uint32_t num_effective_e_levels = can_use_base_ofs ? 
minimum(total_endpoint_levels * 2, 256) : total_endpoint_levels; + float qe0 = compute_quantized_channel_endpoint_mse_estimate(num_effective_e_levels); + const float qe1 = (key.m_ccs_index >= 0) ? (qe0 * pWeight1_terms->m_endpoint_factor) : 0.0f; + qe0 *= pWeight0_terms->m_endpoint_factor; + + float total_e_quant_wsse = 0.0f; + + for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++) + { + const vec4F& subset_chan_spans = subset_spans[subset_index]; + const uint32_t total_subset_pixels = subset_pixels[subset_index]; + + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(subset_chan_spans[c]); + + if ((span_size == 0.0f) && ((log_blk.m_endpoints[subset_index][1][c] == 0.0f) || (log_blk.m_endpoints[subset_index][1][c] == 1.0f))) + continue; + + // Scale channel MSE by chan weight and the # of subset pixels to get weighted SSE + const float chan_N = (float)p.m_pEnc_params->m_comp_weights[c] * (float)total_subset_pixels; + + total_e_quant_wsse += ((key.m_ccs_index == (int)c) ? 
qe1 : qe0) * chan_N; + + } // chan_index + } + + if ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT)) + total_e_quant_wsse *= BLUE_CONTRACTION_ENDPOINT_QUANT_DISCOUNT; + + float total_wsse_so_far = (SLAM_TO_LINE_WEIGHT * slam_to_line_wsse) + (QUANT_ERROR_WEIGHT * total_e_quant_wsse); + if (total_wsse_so_far >= worst_wsse_found_so_far) + continue; + + float lost_weight_energy0 = compute_lost_dct_energy(p.m_block_width, p.m_block_height, weight0_energy, tm.m_grid_width, tm.m_grid_height) * inv_total_block_texels; + + float lost_weight_energy1 = 0; + if (key.m_ccs_index >= 0) + lost_weight_energy1 = compute_lost_dct_energy(p.m_block_width, p.m_block_height, weight1_energy, tm.m_grid_width, tm.m_grid_height) * inv_total_block_texels; + + // Add up: + // slam to line error WSSE (weighted sum of squared errors) + // weight quant error WSSE + // endpoint quant error WSSE + // weight grid rescale error WSSE (scaled by span^2) + float total_scale_wsse = 0.0f; + + for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++) + { + const vec4F& subset_chan_spans = subset_spans[subset_index]; + const uint32_t total_subset_pixels = subset_pixels[subset_index]; + + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(subset_chan_spans[c]); + + if ((span_size == 0.0f) && ((log_blk.m_endpoints[subset_index][1][c] == 0.0f) || (log_blk.m_endpoints[subset_index][1][c] == 1.0f))) + { + // Won't have any E/W quant err at extremes (0.0 or 1.0 are always perfectly represented), no weight downsample error either. 
+ //chan_mse.m_ep = 0.0f; + //chan_mse.m_wp = 0.0f; + } + else + { + // Scale channel MSE by chan weight and the # of subset pixels to get weighted SSE + const float chan_N = (float)p.m_pEnc_params->m_comp_weights[c] * (float)total_subset_pixels; + + // sum in the plane's lost weight energy, scaled by span_size^2 * chan_weight * num_texels_covered + if (key.m_ccs_index == (int)c) + total_scale_wsse += lost_weight_energy1 * square(span_size) * chan_N; + else + total_scale_wsse += lost_weight_energy0 * square(span_size) * chan_N; + } + + } // chan_index + } + + total_wsse_so_far += (SCALE_ERROR_WEIGHT * total_scale_wsse); + if (total_wsse_so_far >= worst_wsse_found_so_far) + continue; + + float total_w_quant_wsse = 0.0f; + for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++) + { + const vec4F& subset_chan_spans = subset_spans[subset_index]; + const uint32_t total_subset_pixels = subset_pixels[subset_index]; + + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(subset_chan_spans[c]); + + if ((span_size == 0.0f) && ((log_blk.m_endpoints[subset_index][1][c] == 0.0f) || (log_blk.m_endpoints[subset_index][1][c] == 1.0f))) + { + // Won't have any E/W quant err at extremes (0.0 or 1.0 are always perfectly represented), no weight downsample error either. + //chan_mse.m_ep = 0.0f; + //chan_mse.m_wp = 0.0f; + } + else + { + // span_size != 0 here - estimate weight/endpoint quantization errors + float chan_w_mse = compute_quantized_channel_weight_mse_estimate( + total_weight_levels, span_size, + pGrid_data->m_weight_gamma, (key.m_ccs_index == (int)c) ? 
pWeight1_terms : pWeight0_terms); + + // Scale channel MSE by chan weight and the # of subset pixels to get weighted SSE + const float chan_N = (float)p.m_pEnc_params->m_comp_weights[c] * (float)total_subset_pixels; + + total_w_quant_wsse += chan_w_mse * chan_N; + } + + } // chan_index + + } // subset_index + + const float total_wsse = total_wsse_so_far + (QUANT_ERROR_WEIGHT * total_w_quant_wsse); + + if (m_trial_mode_estimate_priority_queue.size() >= max_priority_queue_size) + { + if (total_wsse < m_trial_mode_estimate_priority_queue.top().m_wsse) + { + m_trial_mode_estimate_priority_queue.pop(); + + trial_mode_estimate est; + est.m_superbucket_key = key; + est.m_trial_mode_index = trial_mode_index; + est.m_wsse = total_wsse; + + m_trial_mode_estimate_priority_queue.push(est); + } + } + else + { + trial_mode_estimate est; + est.m_superbucket_key = key; + est.m_trial_mode_index = trial_mode_index; + est.m_wsse = total_wsse; + + m_trial_mode_estimate_priority_queue.push(est); + } + + } // k + + } // superbucket_iter + + stats.m_total_superbuckets_created += m_superbucket_hash.size_u32(); + + const uint32_t total_estimates_to_retain = (uint32_t)m_trial_mode_estimate_priority_queue.size(); + assert(total_estimates_to_retain); + + for (uint32_t i = 0; i < total_estimates_to_retain; i++) + { + const trial_mode_estimate &est = m_trial_mode_estimate_priority_queue.top(); + + const trial_mode_estimate_superbucket_key& key = est.m_superbucket_key; + const uint32_t trial_mode_iter = est.m_trial_mode_index; + + assert(trial_mode_iter < p.m_num_trial_modes); + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_iter]; + + assert(tm.m_cem == key.m_cem_index); + assert(tm.m_ccs_index == key.m_ccs_index); + assert(tm.m_num_parts == key.m_num_subsets); + + const uint32_t part_unique_index = key.m_subset_unique_index; + + auto ins_res = shortlist_buckets.insert(shortlist_bucket(tm.m_grid_width, tm.m_grid_height, tm.m_cem, tm.m_ccs_index, tm.m_num_parts, 
part_unique_index)); + + ins_res.first->second.push_back(safe_cast_uint16(trial_mode_iter)); + + m_trial_mode_estimate_priority_queue.pop(); + } + } + else + { + for (uint32_t j = 0; j < m_trial_modes_to_estimate.size(); j++) + { + const uint32_t trial_mode_iter = m_trial_modes_to_estimate[j]; + + assert(trial_mode_iter < p.m_num_trial_modes); + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_iter]; + + if (tm.m_num_parts > 1) + { + //const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t s = astc_helpers::cem_is_ldr_base_scale(tm.m_cem) ? 1 : 0; + const uint32_t num_est_parts_to_try = (tm.m_num_parts == 2) ? m_num_est_parts2[s] : m_num_est_parts3[s]; + + for (uint32_t est_part_iter = 0; est_part_iter < num_est_parts_to_try; est_part_iter++) + { + const uint32_t part_unique_index = (tm.m_num_parts == 2) ? m_best_parts2[s][est_part_iter] : m_best_parts3[s][est_part_iter]; + + auto ins_res = shortlist_buckets.insert(shortlist_bucket(tm.m_grid_width, tm.m_grid_height, tm.m_cem, tm.m_ccs_index, tm.m_num_parts, part_unique_index)); + + ins_res.first->second.push_back(safe_cast_uint16(trial_mode_iter)); + + } // est_part_iter + + } + else + { + auto ins_res = shortlist_buckets.insert(shortlist_bucket(tm.m_grid_width, tm.m_grid_height, tm.m_cem, tm.m_ccs_index, 1, 0)); + ins_res.first->second.push_back(safe_cast_uint16(trial_mode_iter)); + + } + } + } + + stats.m_total_buckets_created += (uint32_t)shortlist_buckets.size(); + +#if 0 + // TEMP + uint32_t max_bucket_tm_indices = 0; + for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it) + { + shortlist_bucket& bucket = it->first; + trial_mode_index_vec& trial_mode_indices = it->second; + max_bucket_tm_indices = maximum(max_bucket_tm_indices, trial_mode_indices.size_u32()); + } + + fmt_debug_printf("max_bucket_tm_indices: {}\n", max_bucket_tm_indices); +#endif + + return true; + } + + bool 
surrogate_encode_shortlist_bucket_representatives( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(blur_id); + BASISU_NOTE_UNUSED(out_blocks); + + shortlist_bucket_hash_t& shortlist_buckets = m_shortlist_hash0; + + // Surrogate encode a representative for each bucket. + // The results are written into the bucket key object itself (m_sse and m_surrogate_log_blk), which is why it->first is bound to a non-const reference. + for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it) + { + shortlist_bucket& bucket = it->first; + //const uint_vec& trial_mode_indices = it->second; + const trial_mode_index_vec& trial_mode_indices = it->second; + + // Choose bucket's largest endpoint/weight ise ranges (finest quant levels) - anything in the bucket will quite likely encode to worse SSE, which we can rapidly estimate. + // Note the two maxima are taken independently, so they need not come from the same trial mode. + uint32_t max_endpoint_ise_range = 0, max_weight_ise_range = 0; + for (uint32_t i = 0; i < trial_mode_indices.size(); i++) + { + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_indices[i]]; + + max_endpoint_ise_range = maximum(max_endpoint_ise_range, tm.m_endpoint_ise_range); + max_weight_ise_range = maximum(max_weight_ise_range, tm.m_weight_ise_range); + } + + log_surrogate_astc_blk& log_block = bucket.m_surrogate_log_blk; + + if (bucket.m_num_parts == 1) + { + bucket.m_sse = encode_surrogate_trial( + p.m_block_width, p.m_block_height, + pixel_stats, + bucket.m_cem_index, + bucket.m_ccs_index, + max_endpoint_ise_range, max_weight_ise_range, + bucket.m_grid_width, bucket.m_grid_height, + log_block, + *p.m_pEnc_params, 0); + + stats.m_total_surrogate_encodes++; + } + else + { + const astc_ldr::partitions_data* pPart_data = (bucket.m_num_parts == 2) ?
p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[bucket.m_unique_seed_index]; + + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[bucket.m_unique_seed_index]; + + bucket.m_sse = encode_surrogate_trial_subsets( + p.m_block_width, p.m_block_height, + pixel_stats, + bucket.m_cem_index, bucket.m_num_parts, part_seed_index, pPat, + max_endpoint_ise_range, max_weight_ise_range, + bucket.m_grid_width, bucket.m_grid_height, + log_block, + *p.m_pEnc_params, 0); + + stats.m_total_surrogate_encodes++; + } + + if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)) + { + // blue contraction/base+offset discount + bucket.m_sse *= BLUE_CONTRACTION_BASE_OFS_DISCOUNT; + } + + } // it + + return true; + } + + // Optionally prunes the buckets in m_shortlist_hash0, then copies the survivors into m_ranked_buckets and sorts them. + // When p.m_bucket_pruning_passes is non-zero, three passes run in order, each rebuilding the hash via m_shortlist_hash1 and a swap: pass 0 keeps only the lowest-SSE CCS among dual plane buckets that otherwise share a configuration; pass 1 compares WxH against HxW grids; pass 2 compares direct CEMs against their base+scale counterparts. + // NOTE(review): p.m_bucket_pruning_passes is only tested for non-zero; the pass count is the local constant NUM_PRUNE_PASSES (3) - confirm this is intended. + // p is the only parameter read; pixel_stats, stats, blur_id and out_blocks are unused. Always returns true. + bool prune_shortlist_buckets( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(pixel_stats); + BASISU_NOTE_UNUSED(stats); + BASISU_NOTE_UNUSED(blur_id); + BASISU_NOTE_UNUSED(out_blocks); + + shortlist_bucket_hash_t& shortlist_buckets = m_shortlist_hash0; + + if (p.m_bucket_pruning_passes) + { + shortlist_bucket_hash_t& new_shortlist_buckets = m_shortlist_hash1; + + if (m_shortlist_hash1.get_table_size() != EXPECTED_SHORTLIST_HASH_SIZE) + { + const bool was_allocated = m_shortlist_hash1.get_table_size() > 0; + + m_shortlist_hash1.clear(); + m_shortlist_hash1.reserve(EXPECTED_SHORTLIST_HASH_SIZE / 2); + + if ((g_devel_messages) && (was_allocated)) + fmt_debug_printf("shortlist hash1 thrash\n"); + } + else + { + m_shortlist_hash1.reset(); + } + + const uint32_t NUM_PRUNE_PASSES = 3; + for (uint32_t prune_pass = 0; prune_pass < NUM_PRUNE_PASSES; prune_pass++) + { + for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it) + it->first.m_examined_flag =
false; + + new_shortlist_buckets.reset(); + + for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it) + { + shortlist_bucket& bucket = it->first; + + if (bucket.m_examined_flag) + continue; + + if (prune_pass == 0) + { + // Prune pass 0: Dual plane groups: only accept best CCS index + // All four CCS variants of this bucket's configuration (including this bucket itself) are looked up and marked examined; only the one with the lowest m_sse is carried over. + // NOTE(review): skip_bucket is never set to true below, so the best-CCS bucket is always inserted. + if (bucket.m_ccs_index >= 0) + { + shortlist_bucket_hash_t::iterator ccs_buckets[4]; + + int best_ccs_index = -1; + float best_ccs_err = BIG_FLOAT_VAL; + + bool skip_bucket = false; + for (uint32_t c = 0; c < 4; c++) + { + auto ccs_res_it = shortlist_buckets.find(shortlist_bucket(bucket.m_grid_width, bucket.m_grid_height, bucket.m_cem_index, c, bucket.m_num_parts, bucket.m_unique_seed_index)); + ccs_buckets[c] = ccs_res_it; + + if (ccs_res_it == shortlist_buckets.end()) + continue; + + assert(!ccs_res_it->first.m_examined_flag); + + ccs_res_it->first.m_examined_flag = true; + + float ccs_sse_err = ccs_res_it->first.m_sse; + if (ccs_sse_err < best_ccs_err) + { + best_ccs_err = ccs_sse_err; + best_ccs_index = c; + } + } // c + + if (!skip_bucket) + { + assert(best_ccs_index >= 0); + + shortlist_bucket_hash_t::iterator best_ccs_it = ccs_buckets[best_ccs_index]; + assert(best_ccs_it != shortlist_buckets.end()); + + new_shortlist_buckets.insert(best_ccs_it->first, best_ccs_it->second); + } + } + else + { + new_shortlist_buckets.insert(it->first, it->second); + } + } + else if (prune_pass == 1) + { + // Prune pass 1: Same # of weight samples, compare WxH vs. HxW + // fract is the alternate bucket's SSE divided by this bucket's SSE: below (1 - thresh) only the alternate is kept, above (1 + thresh) only this bucket is kept, otherwise both are kept. + // NOTE(review): when bucket.m_sse is 0 fract is forced to 0, which keeps only the alternate bucket even though this bucket has zero error - confirm this is intended. + if (bucket.m_grid_width != bucket.m_grid_height) + { + auto alt_res_it = shortlist_buckets.find(shortlist_bucket(bucket.m_grid_height, bucket.m_grid_width, bucket.m_cem_index, bucket.m_ccs_index, bucket.m_num_parts, bucket.m_unique_seed_index)); + if (alt_res_it == shortlist_buckets.end()) + { + new_shortlist_buckets.insert(it->first, it->second); + } + else + { + assert(!alt_res_it->first.m_examined_flag); + alt_res_it->first.m_examined_flag = true; + + const float fract = (bucket.m_sse > 0.0f) ?
(alt_res_it->first.m_sse / bucket.m_sse) : 0.0f; + + const float ALT_RES_SSE_THRESH = .2f; + if (fract < (1.0f - ALT_RES_SSE_THRESH)) + new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second); + else if (fract > (1.0f + ALT_RES_SSE_THRESH)) + new_shortlist_buckets.insert(it->first, it->second); + else + { + new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second); + new_shortlist_buckets.insert(it->first, it->second); + } + } + } + else + { + new_shortlist_buckets.insert(it->first, it->second); + } + + } + else if (prune_pass == 2) + { + // Prune pass 2: RGB Direct vs. Scale bucket groups + + if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) || + (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A)) + { + uint32_t alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_BASE_SCALE; + + // Check for pairs: CEM_LDR_RGB_DIRECT vs. CEM_LDR_RGB_BASE_SCALE, or CEM_LDR_RGBA_DIRECT vs. CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A.
+ switch (bucket.m_cem_index) + { + case astc_helpers::CEM_LDR_RGB_DIRECT: + alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_BASE_SCALE; + break; + case astc_helpers::CEM_LDR_RGB_BASE_SCALE: + alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_DIRECT; + break; + case astc_helpers::CEM_LDR_RGBA_DIRECT: + alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A; + break; + case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + alt_cem_index_to_find = astc_helpers::CEM_LDR_RGBA_DIRECT; + break; + default: + assert(0); + break; + } + + auto alt_res_it = shortlist_buckets.find(shortlist_bucket(bucket.m_grid_width, bucket.m_grid_height, alt_cem_index_to_find, bucket.m_ccs_index, bucket.m_num_parts, bucket.m_unique_seed_index)); + + if (alt_res_it == shortlist_buckets.end()) + { + new_shortlist_buckets.insert(it->first, it->second); + } + else + { + assert(!alt_res_it->first.m_examined_flag); + + alt_res_it->first.m_examined_flag = true; + + // Compare the two buckets, decide if one or another can be tossed as not worth it. + // Same ratio test as prune pass 1 but with a .1 threshold: fract is the alternate's SSE over this bucket's SSE, and is forced to 0 when this bucket's SSE is 0. + const float fract = (bucket.m_sse > 0.0f) ?
(alt_res_it->first.m_sse / bucket.m_sse) : 0.0f; + + const float ALT_RES_SSE_THRESH = .1f; + if (fract < (1.0f - ALT_RES_SSE_THRESH)) + new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second); + else if (fract > (1.0f + ALT_RES_SSE_THRESH)) + new_shortlist_buckets.insert(it->first, it->second); + else + { + new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second); + new_shortlist_buckets.insert(it->first, it->second); + } + } + } + else + { + new_shortlist_buckets.insert(it->first, it->second); + } + + } // if (prune_pass + + it->first.m_examined_flag = true; + } + + // The pruned set becomes the input to the next pass; shortlist_buckets and new_shortlist_buckets are references to m_shortlist_hash0 and m_shortlist_hash1, so the swap exchanges their contents. + new_shortlist_buckets.swap(shortlist_buckets); + } // prune_pass + } // if (p.m_bucket_pruning_passes) + + assert(shortlist_buckets.size()); + + // NOTE(review): m_ranked_buckets is appended to below without being cleared in this function - presumably it is reset by the caller; confirm. + if (m_ranked_buckets.capacity() < shortlist_buckets.size()) + m_ranked_buckets.reserve(shortlist_buckets.size()); + + for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it) + { + shortlist_bucket& bucket = it->first; + const trial_mode_index_vec& trial_mode_indices = it->second; + + ranked_shortlist_bucket* pDst = m_ranked_buckets.enlarge(1); + pDst->m_bucket = bucket; + pDst->m_trial_mode_indices = trial_mode_indices; + } + + assert(m_ranked_buckets.size()); + + // Sort the buckets by their surrogate encoded SSE to rank them. + std::sort(m_ranked_buckets.begin(), m_ranked_buckets.end()); + + return true; + } + + // Builds the shortlist of trial surrogates (m_trial_surrogates) from the leading entries of m_ranked_buckets. + // The number of buckets examined is m_ranked_buckets.size() * p.m_shortlist_buckets_to_examine_fract, clamped to the configured min/max and then to [1, size]; a bucket is skipped when its SSE exceeds the best SSE seen so far times SKIP_IF_BUCKET_WORSE_MULTIPLIER. + // A single-mode bucket reuses the bucket's own surrogate. Otherwise a subset of the bucket's modes is chosen (ordered by an analytic SSE estimate when not all are taken) and each chosen mode is surrogate encoded. + // Returns false if no trials were shortlisted, otherwise sorts the shortlist and returns true. blur_id and out_blocks are unused. + bool rank_and_sort_shortlist_buckets( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + BASISU_NOTE_UNUSED(blur_id); + BASISU_NOTE_UNUSED(out_blocks); + + basisu::vector& shortlist_trials = m_trial_surrogates; + + // TODO: Tune this further. Memory here adds up across all encoding threads. + { + //const float reserve_factor = (sizeof(void*) > 4) ?
.5f : .25f; + const uint32_t reserve_size = 64;// maximum(256, (int)(p.m_num_trial_modes * reserve_factor)); + + if (shortlist_trials.capacity() < reserve_size) + shortlist_trials.reserve(reserve_size); + + shortlist_trials.resize(0); + } + + uint32_t num_buckets_to_examine = fast_roundf_int((float)m_ranked_buckets.size_u32() * p.m_shortlist_buckets_to_examine_fract); + num_buckets_to_examine = clamp(num_buckets_to_examine, p.m_shortlist_buckets_to_examine_min, p.m_shortlist_buckets_to_examine_max); + + num_buckets_to_examine = clamp(num_buckets_to_examine, 1, m_ranked_buckets.size_u32()); + + float best_err_so_far = BIG_FLOAT_VAL; + + for (uint32_t bucket_index = 0; bucket_index < num_buckets_to_examine; bucket_index++) + { + const shortlist_bucket& bucket = m_ranked_buckets[bucket_index].m_bucket; + const trial_mode_index_vec& bucket_trial_mode_indices = m_ranked_buckets[bucket_index].m_trial_mode_indices; + + if (best_err_so_far != BIG_FLOAT_VAL) + { + if (bucket.m_sse > best_err_so_far * SKIP_IF_BUCKET_WORSE_MULTIPLIER) + continue; + } + best_err_so_far = minimum(best_err_so_far, bucket.m_sse); + + if (bucket_trial_mode_indices.size() == 1) + { + // Bucket only contains 1 mode, so we've already encoded its surrogate. + // NOTE(review): the pointer returned by try_enlarge(1) is dereferenced without a check, here and in the per-mode loop further down - confirm it cannot fail. + trial_surrogate& s = *shortlist_trials.try_enlarge(1); + + s.m_trial_mode_index = bucket_trial_mode_indices[0]; + s.m_err = bucket.m_sse; + s.m_log_blk = bucket.m_surrogate_log_blk; + continue; + } + + //----- + // We have a bucket sharing all config except for ISE weight/endpoint levels. Decide how many to place on the shortlist using analytic weighted MSE/SSE estimates.
+ + const uint32_t num_modes_in_bucket = bucket_trial_mode_indices.size_u32(); + + uint32_t num_modes_in_bucket_to_shortlist = fast_roundf_pos_int(num_modes_in_bucket * p.m_num_similar_modes_in_bucket_to_shortlist_fract); + + num_modes_in_bucket_to_shortlist = clamp(num_modes_in_bucket_to_shortlist, p.m_num_similar_modes_in_bucket_to_shortlist_fract_min, p.m_num_similar_modes_in_bucket_to_shortlist_fract_max); + + num_modes_in_bucket_to_shortlist = clamp(num_modes_in_bucket_to_shortlist, 1, num_modes_in_bucket); + + basisu::vector bucket_indices(num_modes_in_bucket); + for (uint32_t i = 0; i < num_modes_in_bucket; i++) + bucket_indices[i] = i; + + if (num_modes_in_bucket_to_shortlist < num_modes_in_bucket) + { + basisu::vector sse_estimates(num_modes_in_bucket); + + const uint32_t bucket_surrogate_endpoint_levels = bucket.m_surrogate_log_blk.m_num_endpoint_levels; + const uint32_t bucket_surrogate_weight_levels = bucket.m_surrogate_log_blk.m_num_weight_levels; + const float bucket_surrogate_base_sse = bucket.m_sse; + + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(p.m_block_width, p.m_block_height, bucket.m_grid_width, bucket.m_grid_height); + const astc_ldr::partitions_data* pBucket_part_data = (bucket.m_num_parts == 1) ? nullptr : ((bucket.m_num_parts == 2) ?
p.m_pPart_data_p2 : p.m_pPart_data_p3); + + bool can_use_base_ofs = false; + if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)) + { + float max_span_size = 0.0f; + for (uint32_t part_iter = 0; part_iter < bucket.m_num_parts; part_iter++) + { + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] - bucket.m_surrogate_log_blk.m_endpoints[part_iter][0][c]); + max_span_size = maximum(max_span_size, span_size); + } + } + + can_use_base_ofs = max_span_size < .25f; + } + + chan_mse_est bucket_sse_est(0.0f, 0.0f); + for (uint32_t part_iter = 0; part_iter < bucket.m_num_parts; part_iter++) + { + uint32_t total_texels_in_part = p.m_block_width * p.m_block_height; + if (bucket.m_num_parts > 1) + { + total_texels_in_part = pBucket_part_data->m_partition_pat_histograms[bucket.m_unique_seed_index].m_hist[part_iter]; + assert(total_texels_in_part && total_texels_in_part < p.m_block_width * p.m_block_height); + } + + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] - bucket.m_surrogate_log_blk.m_endpoints[part_iter][0][c]); + + chan_mse_est chan_mse_est(compute_quantized_channel_mse_estimates( + can_use_base_ofs ?
minimum(bucket_surrogate_endpoint_levels * 2, 256) : bucket_surrogate_endpoint_levels, + bucket_surrogate_weight_levels, + span_size, pGrid_data->m_weight_gamma)); + + if (span_size == 0.0f) + { + if ((bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 1.0f) || (bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 0.0f)) + { + chan_mse_est.m_ep = 0.0f; + chan_mse_est.m_wp = 0.0f; + } + } + + bucket_sse_est.m_ep += chan_mse_est.m_ep * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part; + bucket_sse_est.m_wp += chan_mse_est.m_wp * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part; + } // c + + } // part_iter + +#if 0 + fmt_debug_printf("----------------\n"); + + fmt_debug_printf("bucket endpoint levels: {}, weight levels: {}, surrogate sse: {}, ep_est: {}, wp_est: {}, avg RGB subset0 span: {}\n", + bucket_surrogate_endpoint_levels, bucket_surrogate_weight_levels, + bucket.m_sse, + bucket_sse_est.m_ep, bucket_sse_est.m_wp, + (fabs(bucket.m_surrogate_log_blk.m_endpoints[0][1][0] - bucket.m_surrogate_log_blk.m_endpoints[0][0][0]) + + fabs(bucket.m_surrogate_log_blk.m_endpoints[0][1][1] - bucket.m_surrogate_log_blk.m_endpoints[0][0][1]) + + fabs(bucket.m_surrogate_log_blk.m_endpoints[0][1][2] - bucket.m_surrogate_log_blk.m_endpoints[0][0][2])) / 3.0f); +#endif + + // For each mode in the bucket: estimate its endpoint/weight quant error at its own level counts, subtract the bucket representative's estimate, and add the remainder (floored at 0) to the representative's measured SSE. + for (uint32_t j = 0; j < bucket_trial_mode_indices.size(); j++) + { + const uint32_t trial_mode_index = bucket_trial_mode_indices[j]; + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index]; + + const uint32_t trial_mode_endpoint_levels = astc_helpers::get_ise_levels(tm.m_endpoint_ise_range); + const uint32_t trial_mode_weight_levels = astc_helpers::get_ise_levels(tm.m_weight_ise_range); + + assert(trial_mode_endpoint_levels <= bucket_surrogate_endpoint_levels); + assert(trial_mode_weight_levels <= bucket_surrogate_weight_levels); + + chan_mse_est mode_sse_est(0.0f, 0.0f); + for (uint32_t part_iter = 0; part_iter < bucket.m_num_parts;
part_iter++) + { + uint32_t total_texels_in_part = p.m_block_width * p.m_block_height; + if (bucket.m_num_parts > 1) + { + total_texels_in_part = pBucket_part_data->m_partition_pat_histograms[bucket.m_unique_seed_index].m_hist[part_iter]; + assert(total_texels_in_part && total_texels_in_part < p.m_block_width * p.m_block_height); + } + + for (uint32_t c = 0; c < 4; c++) + { + float span_size = fabs(bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] - bucket.m_surrogate_log_blk.m_endpoints[part_iter][0][c]); + + chan_mse_est chan_mse_est(compute_quantized_channel_mse_estimates( + can_use_base_ofs ? minimum(trial_mode_endpoint_levels * 2, 256) : trial_mode_endpoint_levels, + trial_mode_weight_levels, + span_size, pGrid_data->m_weight_gamma)); + + if (span_size == 0.0f) + { + if ((bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 1.0f) || (bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 0.0f)) + { + chan_mse_est.m_ep = 0.0f; + chan_mse_est.m_wp = 0.0f; + } + } + + mode_sse_est.m_ep += chan_mse_est.m_ep * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part; + mode_sse_est.m_wp += chan_mse_est.m_wp * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part; + } // c + + } // part_iter + + // Remove the bucket's base estimated endpoint/weight quant + if (trial_mode_endpoint_levels == bucket_surrogate_endpoint_levels) + { + mode_sse_est.m_ep = 0.0f; + } + else + { + mode_sse_est.m_ep -= bucket_sse_est.m_ep; + + if (mode_sse_est.m_ep < 0.0f) + mode_sse_est.m_ep = 0.0f; + } + + if (trial_mode_weight_levels == bucket_surrogate_weight_levels) + { + mode_sse_est.m_wp = 0.0f; + } + else + { + mode_sse_est.m_wp -= bucket_sse_est.m_wp; + + if (mode_sse_est.m_wp < 0.0f) + mode_sse_est.m_wp = 0.0f; + } + + float mode_total_sse_est = bucket_surrogate_base_sse + mode_sse_est.m_ep + mode_sse_est.m_wp; + + sse_estimates[j] = mode_total_sse_est; + +#if 0 + // TEMP comparison code + float actual_sse = 0.0f; + + { + log_surrogate_astc_blk
temp_surrogate_log_blk; + if (bucket.m_num_parts == 1) + { + actual_sse = encode_surrogate_trial( + p.m_block_width, p.m_block_height, + pixel_stats, + bucket.m_cem_index, + bucket.m_ccs_index, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + bucket.m_grid_width, bucket.m_grid_height, + temp_surrogate_log_blk, + *p.m_pEnc_params); + } + else + { + const astc_ldr::partitions_data* pPart_data = (bucket.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[bucket.m_unique_seed_index]; + + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[bucket.m_unique_seed_index]; + + actual_sse = encode_surrogate_trial_subsets( + p.m_block_width, p.m_block_height, + pixel_stats, + bucket.m_cem_index, bucket.m_num_parts, part_seed_index, pPat, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + bucket.m_grid_width, bucket.m_grid_height, + temp_surrogate_log_blk, + *p.m_pEnc_params, 0); + } + + stats.m_total_surrogate_encodes++; + } + + fmt_debug_printf("sse: {}, actual sse: {}, endpoint levels: {} weight levels: {}\n", sse_estimates[j], actual_sse, trial_mode_endpoint_levels, trial_mode_weight_levels); +#endif + + } // j + +#if 0 + fmt_debug_printf("\n"); +#endif + + // Reorder bucket_indices by sse_estimates (presumably ascending, since the first num_modes_in_bucket_to_shortlist entries are the ones encoded below - confirm against indirect_sort). + indirect_sort(num_modes_in_bucket, bucket_indices.get_ptr(), sse_estimates.get_ptr()); + + } // if (num_modes_in_bucket_to_shortlist < num_modes_in_bucket) + + // Surrogate encode the best looking buckets after factoring in estimate SSE errors.
+ + for (uint32_t q = 0; q < num_modes_in_bucket_to_shortlist; q++) + { + const uint32_t j = bucket_indices[q]; + + trial_surrogate& s = *shortlist_trials.try_enlarge(1); + + const uint32_t trial_mode_index = bucket_trial_mode_indices[j]; + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index]; + + s.m_trial_mode_index = trial_mode_index; + + if (bucket.m_num_parts == 1) + { + s.m_err = encode_surrogate_trial( + p.m_block_width, p.m_block_height, + pixel_stats, + bucket.m_cem_index, + bucket.m_ccs_index, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + bucket.m_grid_width, bucket.m_grid_height, + s.m_log_blk, + *p.m_pEnc_params, 0); + + stats.m_total_surrogate_encodes++; + } + else + { + const astc_ldr::partitions_data* pPart_data = (bucket.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[bucket.m_unique_seed_index]; + + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[bucket.m_unique_seed_index]; + + s.m_err = encode_surrogate_trial_subsets( + p.m_block_width, p.m_block_height, + pixel_stats, + bucket.m_cem_index, bucket.m_num_parts, part_seed_index, pPat, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + bucket.m_grid_width, bucket.m_grid_height, + s.m_log_blk, + *p.m_pEnc_params, 0); + + stats.m_total_surrogate_encodes++; + } + + if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT)) + { + // blue contraction/base+offset discount + s.m_err *= BLUE_CONTRACTION_BASE_OFS_DISCOUNT; + } + + } // j + + } // bucket_index + + // An empty shortlist is reported as failure; otherwise the trials are sorted and counted in stats. + if (!shortlist_trials.size()) + return false; + + shortlist_trials.sort(); + + stats.m_total_shortlist_candidates += shortlist_trials.size_u32(); + + return true; + } + + bool final_polish_encode_from_shortlist( + const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, +
uint32_t blur_id, + encode_block_stats& stats) + { + basisu::vector& shortlist_trials = m_trial_surrogates; + + // TODO: Diversity selection + const float shortlist_fract = p.m_final_shortlist_fraction[m_block_complexity_index]; + + uint32_t max_shortlist_trials = (uint32_t)std::roundf((float)shortlist_trials.size_u32() * shortlist_fract); + + max_shortlist_trials = clamp(max_shortlist_trials, p.m_final_shortlist_min_size[m_block_complexity_index], p.m_final_shortlist_max_size[m_block_complexity_index]); + + uint32_t total_shortlist_trials = clamp(max_shortlist_trials, 1, shortlist_trials.size_u32()); + + const uint32_t EARLY_STOP2_SHORTLIST_ITER_INDEX = 5; + + // Now do the real encodes on the top surrogate shortlist trials. + for (uint32_t shortlist_iter = 0; shortlist_iter < total_shortlist_trials; shortlist_iter++) + { + const uint32_t trial_mode_index = shortlist_trials[shortlist_iter].m_trial_mode_index; + const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index]; + + astc_helpers::log_astc_block log_astc_blk; + + bool base_ofs_succeeded_flag = false; + + if ((p.m_final_encode_try_base_ofs) && ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT))) + { + // Add RGB/RGBA BASE PLUS OFFSET variant. + astc_helpers::log_astc_block log_astc_blk_alt; + + const uint32_t base_ofs_cem_index = (tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) ? astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET : astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET; + + bool base_ofs_clamped_flag = false; + + bool alt_enc_trial_status; + if (tm.m_num_parts > 1) + { + const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? 
p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t part_seed_index = shortlist_trials[shortlist_iter].m_log_blk.m_seed_index; + const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index]; + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[part_unique_index]; + + alt_enc_trial_status = encode_trial_subsets( + p.m_block_width, p.m_block_height, pixel_stats, base_ofs_cem_index, tm.m_num_parts, + part_seed_index, pPat, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + tm.m_grid_width, tm.m_grid_height, log_astc_blk_alt, *p.m_pEnc_params, false, + p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag, + p.m_use_blue_contraction, &base_ofs_clamped_flag); + } + else + { + alt_enc_trial_status = encode_trial( + p.m_block_width, p.m_block_height, pixel_stats, base_ofs_cem_index, + tm.m_ccs_index != -1, tm.m_ccs_index, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + tm.m_grid_width, tm.m_grid_height, log_astc_blk_alt, *p.m_pEnc_params, + p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag, + p.m_use_blue_contraction, &base_ofs_clamped_flag); + } + + assert(alt_enc_trial_status); + + if (alt_enc_trial_status) + { + stats.m_total_full_encodes++; + + encode_block_output* pOut_block2 = out_blocks.enlarge(1); + pOut_block2->clear(); + pOut_block2->m_trial_mode_index = safe_cast_int16(trial_mode_index); + pOut_block2->m_log_blk = log_astc_blk_alt; + pOut_block2->m_blur_id = safe_cast_uint16(blur_id); + pOut_block2->m_sse = eval_error(p.m_block_width, p.m_block_height, log_astc_blk_alt, pixel_stats, *p.m_pEnc_params); + + if ((p.m_early_stop_wpsnr) || (p.m_early_stop2_wpsnr)) + { + const float wpsnr = compute_psnr_from_wsse(p.m_block_width, p.m_block_height, pOut_block2->m_sse, p.m_pEnc_params->get_total_comp_weights()); + + if ((p.m_early_stop_wpsnr) && (wpsnr >= p.m_early_stop_wpsnr)) + break; + + if (shortlist_iter >= EARLY_STOP2_SHORTLIST_ITER_INDEX) + { + if 
((p.m_early_stop2_wpsnr) && (wpsnr >= p.m_early_stop2_wpsnr)) + break; + } + } + + base_ofs_succeeded_flag = !base_ofs_clamped_flag; + } + + } // (p.m_final_encode_try_base_ofs) + + if ((p.m_final_encode_always_try_rgb_direct) || (!base_ofs_succeeded_flag)) + { + bool enc_trial_status; + + if (tm.m_num_parts > 1) + { + const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3; + + const uint32_t part_seed_index = shortlist_trials[shortlist_iter].m_log_blk.m_seed_index; + const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index]; + assert(part_unique_index < astc_helpers::NUM_PARTITION_PATTERNS); + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[part_unique_index]; + + enc_trial_status = encode_trial_subsets( + p.m_block_width, p.m_block_height, pixel_stats, tm.m_cem, tm.m_num_parts, + part_seed_index, pPat, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + tm.m_grid_width, tm.m_grid_height, log_astc_blk, *p.m_pEnc_params, false, + p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag, + p.m_use_blue_contraction); + } + else + { + enc_trial_status = encode_trial( + p.m_block_width, p.m_block_height, pixel_stats, tm.m_cem, + tm.m_ccs_index != -1, tm.m_ccs_index, + tm.m_endpoint_ise_range, tm.m_weight_ise_range, + tm.m_grid_width, tm.m_grid_height, log_astc_blk, *p.m_pEnc_params, + p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag, + p.m_use_blue_contraction); + } + + assert(enc_trial_status); + + if (!enc_trial_status) + return false; + + stats.m_total_full_encodes++; + + { + encode_block_output* pOut_block1 = out_blocks.enlarge(1); + pOut_block1->clear(); + pOut_block1->m_trial_mode_index = safe_cast_int16(trial_mode_index); + pOut_block1->m_log_blk = log_astc_blk; + pOut_block1->m_blur_id = safe_cast_uint16(blur_id); + pOut_block1->m_sse = eval_error(p.m_block_width, p.m_block_height, log_astc_blk, pixel_stats, 
*p.m_pEnc_params); + + if ((p.m_early_stop_wpsnr) || (p.m_early_stop2_wpsnr)) + { + const float wpsnr = compute_psnr_from_wsse(p.m_block_width, p.m_block_height, pOut_block1->m_sse, p.m_pEnc_params->get_total_comp_weights()); + + if ((p.m_early_stop_wpsnr) && (wpsnr >= p.m_early_stop_wpsnr)) + break; + + if (shortlist_iter >= EARLY_STOP2_SHORTLIST_ITER_INDEX) + { + if ((p.m_early_stop2_wpsnr) && (wpsnr >= p.m_early_stop2_wpsnr)) + break; + } + } + } + + } // if (!skip_encode_flag) + + } // shortlist_iter + + return true; + } + + bool full_encode(const ldr_astc_lowlevel_block_encoder_params& p, + const astc_ldr::pixel_stats_t& pixel_stats, + basisu::vector& out_blocks, + uint32_t blur_id, + encode_block_stats& stats) + { + clear(); + + if (!init(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!partition_triage(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!trivial_triage(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!analytic_triage(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!surrogate_encode_shortlist_bucket_representatives(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!prune_shortlist_buckets(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!rank_and_sort_shortlist_buckets(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + if (!final_polish_encode_from_shortlist(p, pixel_stats, out_blocks, blur_id, stats)) + return false; + + return true; + } +}; + +class ldr_astc_lowlevel_block_encoder_pool +{ +public: + ldr_astc_lowlevel_block_encoder_pool() + { + } + + void init(uint32_t total_threads) + { + std::lock_guard g(m_mutex); + + m_pool.resize(total_threads); + + for (uint32_t i = 0; i < total_threads; i++) + m_pool[i].m_used_flag = false; + } + + void deinit() + { + std::lock_guard g(m_mutex); + + for (uint32_t i = 0; i < m_pool.size(); i++) + { + if (m_pool[i].m_used_flag) + { + assert(0); + 
debug_printf("ldr_astc_lowlevel_block_encoder_pool::deinit: Pool entry still marked as used\n"); + } + + m_pool[i].m_used_flag = false; + } + + m_pool.resize(0); + } + + ldr_astc_lowlevel_block_encoder* acquire() + { + std::lock_guard g(m_mutex); + + assert(m_pool.size()); + + ldr_astc_lowlevel_block_encoder* pRes = nullptr; + + for (uint32_t i = 0; i < m_pool.size(); i++) + { + if (!m_pool[i].m_used_flag) + { + pRes = &m_pool[i]; + pRes->m_used_flag = true; + + break; + } + } + + assert(pRes); + + return pRes; + } + + bool release(ldr_astc_lowlevel_block_encoder* pTemps) + { + std::lock_guard g(m_mutex); + + assert(m_pool.size()); + + if ((pTemps < m_pool.begin()) || (pTemps >= m_pool.end())) + { + assert(0); + return false; + } + + size_t idx = pTemps - m_pool.begin(); + if (idx >= m_pool.size()) + { + assert(0); + return false; + } + + m_pool[idx].m_used_flag = false; + + return true; + } + +private: + std::mutex m_mutex; + basisu::vector m_pool; +}; + +class scoped_ldr_astc_lowlevel_block_encoder +{ +public: + scoped_ldr_astc_lowlevel_block_encoder(ldr_astc_lowlevel_block_encoder_pool& pool) : + m_pool(pool) + { + m_pTemps = pool.acquire(); + } + + ~scoped_ldr_astc_lowlevel_block_encoder() + { + m_pool.release(m_pTemps); + } + + ldr_astc_lowlevel_block_encoder_pool& get_pool() const + { + return m_pool; + } + + ldr_astc_lowlevel_block_encoder* get_ptr() + { + return m_pTemps; + } + +private: + ldr_astc_lowlevel_block_encoder_pool& m_pool; + ldr_astc_lowlevel_block_encoder* m_pTemps; +}; + + +//------------------------------------------------------------------- + +#pragma pack(push, 1) +struct trial_mode_desc +{ + uint8_t m_unique_cem_index; // LDR base CEM's, 0-5 + uint8_t m_ccs; // 0 if SP, 1-4 for DP + uint8_t m_subsets; // 1-3 + uint8_t m_eise; // endpoint ise range, 4-20 + uint8_t m_wise; // weight ise range, 0-11 + uint8_t m_grid_w, m_grid_h; // grid resolution, 4-12 +}; +#pragma pack(pop) + +static const int s_astc_cem_to_unique_ldr_index[16] = +{ + 0, // 
CEM_LDR_LUM_DIRECT + -1, // CEM_LDR_LUM_BASE_PLUS_OFS + -1, // CEM_HDR_LUM_LARGE_RANGE + -1, // CEM_HDR_LUM_SMALL_RANGE + 1, // CEM_LDR_LUM_ALPHA_DIRECT + -1, // CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS + 2, // CEM_LDR_RGB_BASE_SCALE + -1, // CEM_HDR_RGB_BASE_SCALE + 3, // CEM_LDR_RGB_DIRECT + -1, // CEM_LDR_RGB_BASE_PLUS_OFFSET + 4, // CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A + -1, // CEM_HDR_RGB + 5, // CEM_LDR_RGBA_DIRECT + -1, // CEM_LDR_RGBA_BASE_PLUS_OFFSET + -1, // CEM_HDR_RGB_LDR_ALPHA + -1, // CEM_HDR_RGB_HDR_ALPHA +}; + +#if 0 +static const int s_unique_ldr_index_to_astc_cem[6] = +{ + astc_helpers::CEM_LDR_LUM_DIRECT, + astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT, + astc_helpers::CEM_LDR_RGB_BASE_SCALE, + astc_helpers::CEM_LDR_RGB_DIRECT, + astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A, + astc_helpers::CEM_LDR_RGBA_DIRECT +}; +#endif + +static uint32_t pack_tm_desc( + uint32_t grid_width, uint32_t grid_height, + uint32_t cem_index, uint32_t ccs_index, uint32_t num_subsets, + uint32_t endpoint_ise_range, uint32_t weight_ise_range) +{ + assert((grid_width >= 2) && (grid_width <= 12)); + assert((grid_height >= 2) && (grid_height <= 12)); + assert((cem_index < 16) && astc_helpers::is_cem_ldr(cem_index)); + assert((num_subsets >= 1) && (num_subsets <= 3)); + assert(ccs_index <= 4); // 0 for SP, 1-4 for DP + assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)); + + grid_width -= 2; + grid_height -= 2; + assert((grid_width <= 10) && (grid_height <= 10)); + + const int unique_cem_index = s_astc_cem_to_unique_ldr_index[cem_index]; + assert((unique_cem_index >= 0) && (unique_cem_index <= 5)); + assert(basist::astc_ldr_t::s_unique_ldr_index_to_astc_cem[unique_cem_index] == (int)cem_index); + + num_subsets--; + + endpoint_ise_range -= 
astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE; + + uint32_t cur_bit_ofs = 0; + +#define BU_PACK_FIELD(val, bits) do { uint32_t v = (uint32_t)(val); assert(v < (1u << bits)); packed_id |= (v << cur_bit_ofs); cur_bit_ofs += (bits); } while(0) + + uint32_t packed_id = 0; + BU_PACK_FIELD(endpoint_ise_range, basist::astc_ldr_t::CFG_PACK_EISE_BITS); + BU_PACK_FIELD(weight_ise_range, basist::astc_ldr_t::CFG_PACK_WISE_BITS); + BU_PACK_FIELD(ccs_index, basist::astc_ldr_t::CFG_PACK_CCS_BITS); + BU_PACK_FIELD(num_subsets, basist::astc_ldr_t::CFG_PACK_SUBSETS_BITS); + BU_PACK_FIELD(unique_cem_index, basist::astc_ldr_t::CFG_PACK_CEM_BITS); + // must be at the top + BU_PACK_FIELD(grid_width * 11 + grid_height, basist::astc_ldr_t::CFG_PACK_GRID_BITS); +#undef BU_PACK_FIELD + + assert(cur_bit_ofs == 24); + + return packed_id; +} + +void create_encoder_trial_modes_full_eval(uint32_t block_width, uint32_t block_height, + basisu::vector& encoder_trial_modes, basist::astc_ldr_t::grouped_trial_modes& grouped_encoder_trial_modes, + bool print_debug_info = true, bool print_modes = false) +{ + interval_timer itm; + itm.start(); + + encoder_trial_modes.resize(0); + grouped_encoder_trial_modes.clear(); + + uint32_t max_grid_width = 0, max_grid_height = 0; + uint32_t total_evals = 0, total_partial_evals = 0, total_evals_succeeded = 0; + uint32_t mode_index = 0; + uint_vec packed_mode_ids; + + for (uint32_t alpha_iter = 0; alpha_iter < 2; alpha_iter++) + { + if (print_modes) + { + if (alpha_iter) + fmt_debug_printf("ALPHA TRIAL MODES\n"); + else + fmt_debug_printf("RGB TRIAL MODES\n"); + } + + astc_helpers::astc_block phys_block; + + for (uint32_t cem_mode_iter = 0; cem_mode_iter < 3; cem_mode_iter++) + { + const uint32_t s_rgb_cems[3] = { astc_helpers::CEM_LDR_LUM_DIRECT, astc_helpers::CEM_LDR_RGB_BASE_SCALE, astc_helpers::CEM_LDR_RGB_DIRECT }; + const uint32_t s_alpha_cems[3] = { astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT, astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A, 
astc_helpers::CEM_LDR_RGBA_DIRECT }; + + const uint32_t cem_index = alpha_iter ? s_alpha_cems[cem_mode_iter] : s_rgb_cems[cem_mode_iter]; + + uint32_t num_dp_chans = 0; + bool cem_supports_dual_plane = false; + bool cem_supports_subsets = false; + + // base+ofs variants are automatically used later as alternates to RGB/RGBA direct modes + switch (cem_index) + { + case astc_helpers::CEM_LDR_LUM_DIRECT: + num_dp_chans = 0; // only a single component, so only a single plane + cem_supports_dual_plane = false; + cem_supports_subsets = true; + break; + case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT: + num_dp_chans = 1; // CCS can only be 3 + cem_supports_dual_plane = true; + cem_supports_subsets = true; + break; + case astc_helpers::CEM_LDR_RGB_DIRECT: + num_dp_chans = 3; + cem_supports_dual_plane = true; + cem_supports_subsets = true; + break; + case astc_helpers::CEM_LDR_RGB_BASE_SCALE: + num_dp_chans = 3; + cem_supports_dual_plane = true; + cem_supports_subsets = true; + break; + case astc_helpers::CEM_LDR_RGBA_DIRECT: + num_dp_chans = 4; + cem_supports_dual_plane = true; + cem_supports_subsets = true; + break; + case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A: + num_dp_chans = 4; + cem_supports_dual_plane = true; + cem_supports_subsets = true; + break; + default: + assert(0); + break; + } + + for (int dp = 0; dp < (cem_supports_dual_plane ? 2 : 1); dp++) + { + const bool use_subsets = !dp && cem_supports_subsets; + + for (int subsets = 1; subsets <= (use_subsets ? 3 : 1); subsets++) + { + for (uint32_t grid_height = 2; grid_height <= block_height; grid_height++) + { + for (uint32_t grid_width = 2; grid_width <= block_width; grid_width++) + { + for (uint32_t dp_chan_index = 0; dp_chan_index < (dp ? 
num_dp_chans : 1); dp_chan_index++) + { + astc_helpers::log_astc_block log_block; + log_block.clear(); + + log_block.m_grid_width = (uint8_t)grid_width; + log_block.m_grid_height = (uint8_t)grid_height; + + log_block.m_num_partitions = (uint8_t)subsets; + + for (int i = 0; i < subsets; i++) + log_block.m_color_endpoint_modes[i] = (uint8_t)cem_index; + + log_block.m_dual_plane = dp > 0; + + if (log_block.m_dual_plane) + { + uint32_t ccs_index = dp_chan_index; + + if (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT) + { + // must be 3 for LA if DP is enabled + ccs_index = 3; + } + + log_block.m_color_component_selector = (uint8_t)ccs_index; + } + + for (uint32_t weight_ise_range = astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE; weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE; weight_ise_range++) + { + log_block.m_weight_ise_range = (uint8_t)weight_ise_range; + log_block.m_endpoint_ise_range = astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE; // dummy value + + total_partial_evals++; + + bool success = astc_helpers::pack_astc_block(phys_block, log_block, nullptr, nullptr, astc_helpers::cValidateEarlyOutAtEndpointISEChecks); + if (!success) + continue; + + // in reality only 1 endpoint ISE range is valid here + for (uint32_t endpoint_ise_range = astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE; endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE; endpoint_ise_range++) + { + log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range; + + total_evals++; + + success = astc_helpers::pack_astc_block(phys_block, log_block, nullptr, nullptr, astc_helpers::cValidateSkipFinalEndpointWeightPacking); + if (!success) + continue; + + total_evals_succeeded++; + + if (print_modes) + { + fmt_debug_printf("{}: CEM: {} DP: {}, CCS: {}, SUBSETS: {}, GRID: {}x{}, ENDPOINTS: {}, WEIGHTS: {}\n", + mode_index, + log_block.m_color_endpoint_modes[0], + log_block.m_dual_plane, + log_block.m_color_component_selector, + log_block.m_num_partitions, + log_block.m_grid_width, 
log_block.m_grid_height, + astc_helpers::get_ise_levels(log_block.m_endpoint_ise_range), + astc_helpers::get_ise_levels(log_block.m_weight_ise_range)); + } + + basist::astc_ldr_t::trial_mode m; + m.m_ccs_index = log_block.m_dual_plane ? log_block.m_color_component_selector : -1; + m.m_cem = log_block.m_color_endpoint_modes[0]; + m.m_endpoint_ise_range = log_block.m_endpoint_ise_range; + m.m_weight_ise_range = log_block.m_weight_ise_range; + m.m_grid_width = grid_width; + m.m_grid_height = grid_height; + m.m_num_parts = log_block.m_num_partitions; + + uint32_t packed_index = pack_tm_desc( + log_block.m_grid_width, log_block.m_grid_height, + log_block.m_color_endpoint_modes[0], log_block.m_dual_plane ? (log_block.m_color_component_selector + 1) : 0, log_block.m_num_partitions, + log_block.m_endpoint_ise_range, log_block.m_weight_ise_range); + + assert(packed_index <= 0xFFFFFF); + packed_mode_ids.push_back(packed_index); + + grouped_encoder_trial_modes.add(block_width, block_height, m, encoder_trial_modes.size_u32()); + + encoder_trial_modes.push_back(m); + + max_grid_width = maximum(max_grid_width, grid_width); + max_grid_height = maximum(max_grid_height, grid_height); + + ++mode_index; + + } // weight_ise_range + } // endpoint_ise_range + + } // ccs_index + + } // grid_width + + } // grid_height + + } // subsets + + } // dp + + } // cem_mode_iter + + } // alpha_iter + +#if 0 + packed_mode_ids.sort(); + + for (uint32_t i = 0; i < packed_mode_ids.size(); i++) + { + uint32_t packed_index = packed_mode_ids[i]; + + fmt_debug_printf("{},{},{},", packed_index & 0xFF, (packed_index >> 8) & 0xFF, (packed_index >> 16) & 0xFF); + if ((i & 15) == 15) + fmt_debug_printf("\n"); + } +#endif + + if (print_debug_info) + { + fmt_debug_printf("create_encoder_trial_modes_full_eval() time: {} secs\n", itm.get_elapsed_secs()); + + fmt_debug_printf("create_encoder_trial_modes_full_eval() - ASTC {}x{} modes\n", block_width, block_height); + fmt_debug_printf("total_evals: {}, 
// Number of distinct unordered pairs that can be formed from 4 channels.
const uint32_t TOTAL_RGBA_CHAN_PAIRS = 6;
//const uint32_t TOTAL_RGB_CHAN_PAIRS = 3;

// Every unordered pair of channel indices in [0, 3], lower index first.
// The first three rows only involve channels 0-2; the last three rows each
// pair one of those channels with channel 3.
static const uint8_t g_rgba_chan_pairs[TOTAL_RGBA_CHAN_PAIRS][2] =
{
    { 0, 1 }, { 0, 2 }, { 1, 2 },
    { 0, 3 }, { 1, 3 }, { 2, 3 }
};
(trial_mode_hash.find(tm) == trial_mode_hash.end()) + { + fmt_error_printf("trial mode test failed!\n"); + + assert(0); + return false; + } + } + + } // h + } // w + + fmt_debug_printf("trial mode test succeeded\n"); + return true; +} + +//---------------------------------------------------------------------------------- + +struct ldr_astc_block_encode_image_high_level_config +{ + uint32_t m_block_width = 6; + uint32_t m_block_height = 6; + + bool m_second_superpass_refinement = true; + float m_second_superpass_fract_to_recompress = .075f; + + bool m_third_superpass_try_neighbors = true; + + float m_base_q = 75.0f; + bool m_use_dct = false; + + bool m_subsets_enabled = true; + bool m_subsets_edge_filtering = true; + + bool m_filter_by_pca_angles_flag = true; + float m_use_direct_angle_thresh = 2.0f; + float m_use_base_scale_angle_thresh = 7.0f; + + bool m_force_all_dual_plane_chan_evals = false; // much slower, test on base + bool m_disable_rgb_dual_plane = false; // DP can be on alpha only, if block has alpha + float m_strong_dp_decorr_thresh_rgb = .998f; + + bool m_use_base_ofs = true; + bool m_use_blue_contraction = true; + + bool m_grid_hv_filtering = true; + bool m_low_freq_block_filtering = true; + + uint32_t m_superbucket_max_to_retain[3] = { 4, 8, 16 }; + + float m_final_shortlist_fraction[3] = { .25f, .33f, .5f }; + uint32_t m_final_shortlist_min_size[3] = { 1, 1, 1 }; + uint32_t m_final_shortlist_max_size[3] = { 4096, 4096, 4096 }; + + uint32_t m_part2_fraction_to_keep = 2; + uint32_t m_part3_fraction_to_keep = 2; + uint32_t m_base_parts2 = 32; + uint32_t m_base_parts3 = 32; + + float m_early_stop_wpsnr = 0.0f; + float m_early_stop2_wpsnr = 0.0f; + + bool m_blurring_enabled = false; + bool m_blurring_enabled_p2 = false; + + bool m_gradient_descent_flag = true; + bool m_polish_weights_flag = true; + bool m_qcd_enabled_flag = true; // gradient descent must be enabled too + bool m_bucket_pruning_passes = true; + + // 2nd superpass options + uint32_t 
m_base_parts2_p2 = 64; + uint32_t m_base_parts3_p2 = 64; + uint32_t m_superbucket_max_to_retain_p2[3] = { 16, 32, 256 }; + uint32_t m_final_shortlist_max_size_p2[3] = { 4096, 4096, 4096 }; + uint32_t m_second_pass_total_weight_refine_passes = astc_ldr::WEIGHT_REFINER_MAX_PASSES; + bool m_second_pass_force_subsets_enabled = true; + bool m_force_all_dp_chans_p2 = false; + bool m_final_encode_always_try_rgb_direct = false; + bool m_filter_by_pca_angles_flag_p2 = true; + + // only store the single best result per block + //bool m_save_single_result = false; + + bool m_debug_images = false; + bool m_debug_output = false; + + std::string m_debug_file_prefix; + + job_pool* m_pJob_pool; + + //saliency_map m_saliency_map; + + astc_ldr::cem_encode_params m_cem_enc_params; +}; + +struct ldr_astc_block_encode_image_output +{ + ldr_astc_block_encode_image_output() + { + } + + ~ldr_astc_block_encode_image_output() + { + interval_timer itm; + itm.start(); + + const int num_blocks_x = m_image_block_info.get_width(); + const int num_blocks_y = m_image_block_info.get_height(); + + for (int y = num_blocks_y - 1; y >= 0; --y) + { + for (int x = num_blocks_x - 1; x >= 0; --x) + { + auto& out_blocks = m_image_block_info(x, y).m_out_blocks; + out_blocks.clear(); + } + } // y + + //fmt_debug_printf("Cleared enc_out image block info: {3.3} secs\n", itm.get_elapsed_secs()); + } + + astc_ldr::partitions_data m_part_data_p2; + astc_ldr::partitions_data m_part_data_p3; + + basisu::vector m_encoder_trial_modes; + basist::astc_ldr_t::grouped_trial_modes m_grouped_encoder_trial_modes; + + vector2D m_packed_phys_blocks; + + struct block_info + { + block_info() + { + m_pixel_stats.clear(); + } + + astc_ldr::pixel_stats_t m_pixel_stats; // of original/input block + + basisu::vector m_out_blocks; + + uint32_t m_packed_out_block_index = 0; // index of best out block by WSSE + + bool m_low_freq_block_flag = false; + bool m_super_strong_edges = false; + bool m_very_strong_edges = false; + bool 
m_strong_edges = false; + }; + + vector2D m_image_block_info; + + struct block_info_superpass1 + { + int m_config_reuse_neighbor_out_block_indices[basist::astc_ldr_t::cMaxConfigReuseNeighbors] = { cInvalidIndex, cInvalidIndex, cInvalidIndex }; + + bool m_config_reuse_new_neighbor_out_block_flags[basist::astc_ldr_t::cMaxConfigReuseNeighbors] = { false, false, false }; + + basisu::vector m_new_out_config_reuse_blocks; + basisu::vector m_new_out_config_endpoint_reuse_blocks; + }; + + vector2D m_image_block_info_superpass2; + +private: + ldr_astc_block_encode_image_output(const ldr_astc_block_encode_image_output&); + ldr_astc_block_encode_image_output& operator= (const ldr_astc_block_encode_image_output&); +}; + +constexpr bool selective_blurring = true; + +bool ldr_astc_block_encode_image( + const image& orig_img, + const ldr_astc_block_encode_image_high_level_config& enc_cfg, + ldr_astc_block_encode_image_output& enc_out) +{ + if (enc_cfg.m_debug_output) + fmt_debug_printf("ldr_astc_block_encode_image:\n"); + + const uint32_t block_width = enc_cfg.m_block_width, block_height = enc_cfg.m_block_height; + const uint32_t width = orig_img.get_width(), height = orig_img.get_height(); + const uint32_t total_pixels = width * height; + const uint32_t total_block_pixels = enc_cfg.m_block_width * enc_cfg.m_block_height; + const uint32_t num_blocks_x = orig_img.get_block_width(enc_cfg.m_block_width); + const uint32_t num_blocks_y = orig_img.get_block_height(enc_cfg.m_block_height); + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + + if (enc_cfg.m_debug_output) + { + fmt_debug_printf("ASTC base bitrate: {3.3} bpp\n", 128.0f / (float)(enc_cfg.m_block_width * enc_cfg.m_block_height)); + + fmt_debug_printf("ASTC block size: {}x{}\n", enc_cfg.m_block_width, enc_cfg.m_block_height); + } + + if (enc_cfg.m_debug_output) + fmt_debug_printf("Image has alpha: {}\n", orig_img.has_alpha()); + + astc_ldr::partitions_data* pPart_data_p2 = &enc_out.m_part_data_p2; + 
pPart_data_p2->init(2, enc_cfg.m_block_width, enc_cfg.m_block_height); + + astc_ldr::partitions_data* pPart_data_p3 = &enc_out.m_part_data_p3; + pPart_data_p3->init(3, enc_cfg.m_block_width, enc_cfg.m_block_height); + + // blurring coefficients + const float bw0 = 1.15f; + const float bw1 = 1.25f, bw1_a = 1.0f; + const float bw2 = 1.25f; + + // TODO: Make this optional/tune this, add only 2 level blurring support + image orig_img_blurred2, orig_img_blurred3, orig_img_blurred4, orig_img_blurred5; + + if ((enc_cfg.m_blurring_enabled) || (enc_cfg.m_blurring_enabled_p2)) + { + orig_img_blurred2.resize(orig_img.get_width(), orig_img.get_height()); + orig_img_blurred3.resize(orig_img.get_width(), orig_img.get_height()); + orig_img_blurred4.resize(orig_img.get_width(), orig_img.get_height()); + orig_img_blurred5.resize(orig_img.get_width(), orig_img.get_height()); + + image_resample(orig_img, orig_img_blurred2, true, "gaussian", bw0); + image_resample(orig_img, orig_img_blurred3, true, "gaussian", bw1, false, 0, 4, bw1_a); + image_resample(orig_img, orig_img_blurred4, true, "gaussian", bw1_a, false, 0, 4, bw1); + image_resample(orig_img, orig_img_blurred5, true, "gaussian", bw2, false); + } + + if (enc_cfg.m_debug_images) + { + save_png(enc_cfg.m_debug_file_prefix + "dbg_astc_ldr_orig_img.png", orig_img); + + if ((enc_cfg.m_blurring_enabled) || (enc_cfg.m_blurring_enabled_p2)) + { + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred2.png", orig_img_blurred2); + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred3.png", orig_img_blurred3); + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred4.png", orig_img_blurred4); + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred5.png", orig_img_blurred5); + } + } + + if (enc_cfg.m_debug_output) + fmt_debug_printf("Dimensions: {}x{}, Blocks: {}x{}, Total blocks: {}\n", width, height, num_blocks_x, num_blocks_y, total_blocks); + + image orig_img_sobel_x, orig_img_sobel_y; + compute_sobel(orig_img, 
orig_img_sobel_x, &g_sobel_x[0][0]); + compute_sobel(orig_img, orig_img_sobel_y, &g_sobel_y[0][0]); + + if (enc_cfg.m_debug_images) + { + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_sobel_x.png", orig_img_sobel_x); + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_sobel_y.png", orig_img_sobel_y); + } + + image orig_img_sobel_xy(width, height); + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_rgba& sx = orig_img_sobel_x(x, y); + const color_rgba& sy = orig_img_sobel_y(x, y); + + orig_img_sobel_xy(x, y).set( + iabs((int)sx.r - 128) + iabs((int)sy.r - 128), + iabs((int)sx.g - 128) + iabs((int)sy.g - 128), + iabs((int)sx.b - 128) + iabs((int)sy.b - 128), + iabs((int)sx.a - 128) + iabs((int)sy.a - 128)); + } + } + + if (enc_cfg.m_debug_images) + save_png(enc_cfg.m_debug_file_prefix + "vis_orig_sobel_xy.png", orig_img_sobel_xy); + + vector2D& packed_blocks = enc_out.m_packed_phys_blocks; + packed_blocks.resize(num_blocks_x, num_blocks_y); + memset(packed_blocks.get_ptr(), 0, packed_blocks.size_in_bytes()); + + assert(enc_cfg.m_pJob_pool); + job_pool& job_pool = *enc_cfg.m_pJob_pool; + + std::atomic encoder_failed_flag; + encoder_failed_flag.store(false); + + std::mutex global_mutex; + + basisu::vector& encoder_trial_modes = enc_out.m_encoder_trial_modes; + encoder_trial_modes.reserve(4096); + + basist::astc_ldr_t::grouped_trial_modes& grouped_encoder_trial_modes = enc_out.m_grouped_encoder_trial_modes; + basist::astc_ldr_t::create_encoder_trial_modes_table(block_width, block_height, encoder_trial_modes, grouped_encoder_trial_modes, enc_cfg.m_debug_output, false); + + if (enc_cfg.m_debug_output) + { + uint32_t total_actual_modes = encoder_trial_modes.size_u32(); + + if (enc_cfg.m_use_base_ofs) + { + for (uint32_t i = 0; i < encoder_trial_modes.size(); i++) + { + const auto& tm = encoder_trial_modes[i]; + + switch (tm.m_cem) + { + case astc_helpers::CEM_LDR_RGBA_DIRECT: + case astc_helpers::CEM_LDR_RGB_DIRECT: + // 
add base+ofs variant + total_actual_modes++; + break; + default: + break; + } + } // i + } + + fmt_debug_printf("Base encoder trial modes: {}, grand total including base+ofs CEM's: {}\n", encoder_trial_modes.size_u32(), total_actual_modes); + } + + uint32_t total_used_bc = 0; + + uint_vec used_rgb_direct_count; + used_rgb_direct_count.resize(encoder_trial_modes.size()); + + uint_vec used_base_offset_count; + used_base_offset_count.resize(encoder_trial_modes.size()); + + uint32_t total_void_extent_blocks_skipped = 0; + + uint32_t total_superbuckets_created = 0; + uint32_t total_buckets_created = 0; + uint32_t total_surrogate_encodes = 0; + uint32_t total_full_encodes = 0; + uint32_t total_shortlist_candidates = 0; + uint32_t total_full_encodes_pass1 = 0; + uint32_t total_full_encodes_pass2 = 0; + + uint32_t total_blur_encodes = 0; + uint32_t total_blurred_blocks1 = 0; + uint32_t total_blurred_blocks2 = 0; + uint32_t total_blurred_blocks3 = 0; + uint32_t total_blurred_blocks4 = 0; + + basist::astc_ldr_t::dct2f dct; + dct.init(enc_cfg.m_block_height, enc_cfg.m_block_width); + + image vis_part_usage_img, vis_part_pat_img, vis_strong_edge, vis_dct_low_freq_block, vis_dp_img, vis_base_ofs_img; + if (enc_cfg.m_debug_images) + { + vis_part_usage_img.resize(block_width * num_blocks_x, block_height * num_blocks_y); + vis_part_pat_img.resize(block_width * num_blocks_x, block_height * num_blocks_y); + vis_strong_edge.resize(block_width * num_blocks_x, block_height * num_blocks_y); + vis_dct_low_freq_block.resize(block_width * num_blocks_x, block_height * num_blocks_y); + vis_dp_img.resize(block_width * num_blocks_x, block_height * num_blocks_y); + vis_base_ofs_img.resize(block_width * num_blocks_x, block_height * num_blocks_y); + } + + ldr_astc_lowlevel_block_encoder_pool encoder_pool; + assert(job_pool.get_total_threads()); + encoder_pool.init((uint32_t)job_pool.get_total_threads()); + + basist::astc_ldr_t::grid_weight_dct grid_coder; + grid_coder.init(block_width, 
block_height); + + struct output_block_devel_desc + { + const basist::astc_ldr_t::trial_mode* m_pTrial_modes; + int m_trial_mode_index; // this is the index of the mode it tried to encode, but the actual output/enc block could have used base+ofs + bool m_had_alpha; + + bool m_low_freq_block_flag; + bool m_super_strong_edges; + bool m_very_strong_edges; + bool m_strong_edges; + + void clear() + { + clear_obj(*this); + } + }; + + enc_out.m_image_block_info.resize(0, 0); + enc_out.m_image_block_info.resize(num_blocks_x, num_blocks_y); + +#if 0 + for (uint32_t y = 0; y < num_blocks_y; y++) + { + for (uint32_t x = 0; x < num_blocks_x; x++) + { + auto& out_blocks = enc_out.m_image_block_info(x, y).m_out_blocks; + out_blocks.reserve(16); + out_blocks.resize(0); + } + } // y +#endif + + vector2D superpass2_recompress_block_flags; + + if (enc_cfg.m_second_superpass_refinement) + superpass2_recompress_block_flags.resize(num_blocks_x, num_blocks_y); + + if (enc_cfg.m_third_superpass_try_neighbors) + enc_out.m_image_block_info_superpass2.resize(num_blocks_x, num_blocks_y); + + interval_timer itm; + itm.start(); + + //-------------------------------------------------------------------------------------- + // ASTC compression loop + + vector2D output_block_devel_info(num_blocks_x, num_blocks_y); + + uint32_t total_superpasses = 1; + if (enc_cfg.m_third_superpass_try_neighbors) + total_superpasses = 3; + else if (enc_cfg.m_second_superpass_refinement) + total_superpasses = 2; + + uint32_t total_blocks_to_recompress = 0; + + for (uint32_t superpass_index = 0; superpass_index < total_superpasses; superpass_index++) + { + if (superpass_index == 1) + { + if (!enc_cfg.m_second_superpass_refinement) + continue; + if (!total_blocks_to_recompress) + continue; + } + + if (enc_cfg.m_debug_output) + fmt_debug_printf("ASTC packing superpass: {}\n", 1 + superpass_index); + + uint32_t total_blocks_done = 0; + float last_printed_progress_val = -100.0f; + + for (uint32_t by = 0; by < 
num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + job_pool.add_job([superpass_index, + //width, height, + bx, by, + //num_blocks_x, num_blocks_y, + total_blocks, block_width, block_height, total_block_pixels, &packed_blocks, &global_mutex, + &orig_img, &orig_img_sobel_xy, &orig_img_blurred2, &orig_img_blurred3, &orig_img_blurred4, &orig_img_blurred5, + &enc_cfg, &encoder_failed_flag, pPart_data_p2, pPart_data_p3, + &total_blocks_done, &total_superbuckets_created, &total_buckets_created, &total_surrogate_encodes, &total_full_encodes, &total_shortlist_candidates, + &encoder_trial_modes, + &total_blur_encodes, &total_blurred_blocks1, + &total_full_encodes_pass1, &total_full_encodes_pass2, + &dct, &vis_dct_low_freq_block, + &encoder_pool, &grid_coder, &grouped_encoder_trial_modes, + &enc_out, &output_block_devel_info, &total_void_extent_blocks_skipped, &superpass2_recompress_block_flags, &total_blocks_to_recompress, &last_printed_progress_val] + { + if (encoder_failed_flag) + return; + + //const uint32_t base_x = bx * block_width, base_y = by * block_height; + + color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + orig_img.extract_block_clamped(block_pixels, bx * block_width, by * block_height, block_width, block_height); + + if (superpass_index == 2) + { + // Superpass 2: Encode to best neighbor configurations + const ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by); + + ldr_astc_block_encode_image_output::block_info_superpass1& out_block_info_superpass1 = enc_out.m_image_block_info_superpass2(bx, by); + + const astc_ldr::pixel_stats_t& pixel_stats = out_block_info.m_pixel_stats; + + const bool is_purely_solid_block = (pixel_stats.m_min == pixel_stats.m_max); + + // if void extent, just skip + if (is_purely_solid_block) + return; + + //const basisu::vector& out_blocks = out_block_info.m_out_blocks; + + for (uint32_t neighbor_index = 0; neighbor_index < 
basist::astc_ldr_t::cMaxConfigReuseNeighbors; neighbor_index++) + { + const ldr_astc_block_encode_image_output::block_info* pNeighbor_out_block_info = nullptr; + + if (neighbor_index == 0) + { + // Left + if (bx) + pNeighbor_out_block_info = &enc_out.m_image_block_info(bx - 1, by); + } + else if (neighbor_index == 1) + { + // Up + if (by) + pNeighbor_out_block_info = &enc_out.m_image_block_info(bx, by - 1); + } + else + { + assert(neighbor_index == 2); + + // Diagonal + if ((bx) && (by)) + pNeighbor_out_block_info = &enc_out.m_image_block_info(bx - 1, by - 1); + } + + if (!pNeighbor_out_block_info) + continue; + + const encode_block_output& neighbor_output = pNeighbor_out_block_info->m_out_blocks[pNeighbor_out_block_info->m_packed_out_block_index]; + + // Best neighbor was solid, skip it (TODO: reusing it is possible) + if (neighbor_output.m_log_blk.m_solid_color_flag_ldr) + continue; + + const uint32_t neighbor_tm_index = neighbor_output.m_trial_mode_index; + assert(neighbor_tm_index < encoder_trial_modes.size()); + + //const trial_mode& neighbor_tm = encoder_trial_modes[neighbor_tm_index]; // do not use the tm's cem, it may be base+ofs, use the log blk instead + + const astc_helpers::log_astc_block& neighbor_log_blk = neighbor_output.m_log_blk; + assert(!neighbor_log_blk.m_solid_color_flag_ldr); + + const uint32_t neighbor_actual_cem = neighbor_log_blk.m_color_endpoint_modes[0]; + const uint32_t neighbor_partition_id = neighbor_log_blk.m_partition_id; + + // See if we've already encoded this full config + int already_existing_out_block_index = cInvalidIndex; + for (uint32_t i = 0; i < out_block_info.m_out_blocks.size(); i++) + { + if ((out_block_info.m_out_blocks[i].m_trial_mode_index == (int)neighbor_tm_index) && + (out_block_info.m_out_blocks[i].m_log_blk.m_color_endpoint_modes[0] == neighbor_actual_cem) && + (out_block_info.m_out_blocks[i].m_log_blk.m_partition_id == neighbor_partition_id)) + { + already_existing_out_block_index = i; + break; + } + } + + if 
(already_existing_out_block_index != cInvalidIndex) + { + // We already have an output block using this neighbor trial mode, skip + out_block_info_superpass1.m_config_reuse_neighbor_out_block_indices[neighbor_index] = (uint32_t)already_existing_out_block_index; + out_block_info_superpass1.m_config_reuse_new_neighbor_out_block_flags[neighbor_index] = false; + } + else + { + // Re-encode using the neighbor's full config (tm, base+ofs, partition ID) + astc_helpers::log_astc_block new_log_block; + + bool status = false; + + if (neighbor_log_blk.m_num_partitions > 1) + { + const astc_ldr::partitions_data* pPart_data = (neighbor_log_blk.m_num_partitions == 2) ? pPart_data_p2 : pPart_data_p3; + + const uint32_t part_seed_index = neighbor_log_blk.m_partition_id; + const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index]; + + assert(part_unique_index < astc_helpers::NUM_PARTITION_PATTERNS); + const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[part_unique_index]; + + bool refine_only_flag = false; + + status = encode_trial_subsets( + block_width, block_height, + pixel_stats, + neighbor_log_blk.m_color_endpoint_modes[0], neighbor_log_blk.m_num_partitions, neighbor_log_blk.m_partition_id, pPat, + neighbor_log_blk.m_endpoint_ise_range, neighbor_log_blk.m_weight_ise_range, + neighbor_log_blk.m_grid_width, neighbor_log_blk.m_grid_height, + new_log_block, + enc_cfg.m_cem_enc_params, + refine_only_flag, + enc_cfg.m_gradient_descent_flag, enc_cfg.m_polish_weights_flag, enc_cfg.m_qcd_enabled_flag, + enc_cfg.m_use_blue_contraction); + } + else + { + status = encode_trial( + block_width, block_height, + pixel_stats, + neighbor_log_blk.m_color_endpoint_modes[0], + neighbor_log_blk.m_dual_plane, neighbor_log_blk.m_dual_plane ? 
neighbor_log_blk.m_color_component_selector : -1, + neighbor_log_blk.m_endpoint_ise_range, neighbor_log_blk.m_weight_ise_range, + neighbor_log_blk.m_grid_width, neighbor_log_blk.m_grid_height, + new_log_block, + enc_cfg.m_cem_enc_params, + enc_cfg.m_gradient_descent_flag, enc_cfg.m_polish_weights_flag, enc_cfg.m_qcd_enabled_flag, + enc_cfg.m_use_blue_contraction); + } + + if (!status) + { + fmt_debug_printf("encode_trial/encode_trial_subsets failed in superpass 1!\n"); + encoder_failed_flag.store(true); + return; + } + + out_block_info_superpass1.m_config_reuse_neighbor_out_block_indices[neighbor_index] = out_block_info_superpass1.m_new_out_config_reuse_blocks.size_u32(); + out_block_info_superpass1.m_config_reuse_new_neighbor_out_block_flags[neighbor_index] = true; + + encode_block_output& new_output_blk = *out_block_info_superpass1.m_new_out_config_reuse_blocks.enlarge(1); + + new_output_blk.clear(); + + if (enc_cfg.m_use_dct) + { + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, new_log_block.m_grid_width, new_log_block.m_grid_height); + + const uint32_t num_planes = (new_log_block.m_dual_plane ? 
2 : 1); + + for (uint32_t plane_index = 0; plane_index < num_planes; plane_index++) + { + bitwise_coder c; + basist::astc_ldr_t::dct_syms syms; + code_block_weights(grid_coder, enc_cfg.m_base_q, plane_index, new_log_block, pGrid_data, c, syms); + + new_output_blk.m_packed_dct_plane_data[plane_index] = syms; + + c.flush(); + + basist::bitwise_decoder d; + d.init(c.get_bytes().data(), c.get_bytes().size_u32()); + + // ensure existing weights get blown away + for (uint32_t i = 0; i < (uint32_t)(new_log_block.m_grid_width * new_log_block.m_grid_height); i++) + new_log_block.m_weights[i * num_planes + plane_index] = 0; + + basist::astc_ldr_t::fvec dct_temp; + bool dec_status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, new_log_block, &d, pGrid_data, nullptr, dct_temp, nullptr); + + assert(dec_status); + if (!dec_status) + { + error_printf("grid_coder.decode_block_weights() failed!\n"); + + encoder_failed_flag.store(true); + return; + } + } + } // if (enc_cfg.m_use_dct) + + new_output_blk.m_trial_mode_index = safe_cast_int16(neighbor_tm_index); + new_output_blk.m_log_blk = new_log_block; + //new_output_blk.m_trial_surrogate.clear(); + + new_output_blk.m_sse = eval_error(block_width, block_height, new_log_block, pixel_stats, enc_cfg.m_cem_enc_params); + + { + std::lock_guard g(global_mutex); + + total_full_encodes_pass2++; + } + } // if (already_existing_out_block_index != cInvalidIndex) + + { + // Re-encode using the neighbor's full config (tm, base+ofs, partition ID) AND its endpoints + astc_helpers::log_astc_block new_log_block(neighbor_log_blk); + + // Start with fresh 0 weights, then polish them. + clear_obj(new_log_block.m_weights); + + //const bool use_blue_contraction = enc_cfg.m_use_blue_contraction; + + bool improved_flag = false; + + const astc_ldr::partition_pattern_vec* pPat = nullptr; + if (neighbor_log_blk.m_num_partitions > 1) + { + const astc_ldr::partitions_data* pPart_data = (neighbor_log_blk.m_num_partitions == 2) ? 
pPart_data_p2 : pPart_data_p3; + + const uint32_t part_seed_index = neighbor_log_blk.m_partition_id; + const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index]; + + assert(part_unique_index < astc_helpers::NUM_PARTITION_PATTERNS); + pPat = &pPart_data->m_partition_pats[part_unique_index]; + } + + bool status = polish_block_weights( + block_width, block_height, + pixel_stats, + new_log_block, + enc_cfg.m_cem_enc_params, pPat, improved_flag, + enc_cfg.m_gradient_descent_flag, enc_cfg.m_polish_weights_flag, enc_cfg.m_qcd_enabled_flag); + + if (!status) + { + fmt_error_printf("polish_block_weights failed in superpass 1!\n"); + encoder_failed_flag.store(true); + return; + } + + encode_block_output& new_output_blk = *out_block_info_superpass1.m_new_out_config_endpoint_reuse_blocks.enlarge(1); + + new_output_blk.clear(); + + if (enc_cfg.m_use_dct) + { + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, new_log_block.m_grid_width, new_log_block.m_grid_height); + + const uint32_t num_planes = (new_log_block.m_dual_plane ? 
2 : 1); + + for (uint32_t plane_index = 0; plane_index < num_planes; plane_index++) + { + bitwise_coder c; + basist::astc_ldr_t::dct_syms syms; + code_block_weights(grid_coder, enc_cfg.m_base_q, plane_index, new_log_block, pGrid_data, c, syms); + + new_output_blk.m_packed_dct_plane_data[plane_index] = syms; + + c.flush(); + + basist::bitwise_decoder d; + d.init(c.get_bytes().data(), c.get_bytes().size_u32()); + + // ensure existing weights get blown away + for (uint32_t i = 0; i < (uint32_t)(new_log_block.m_grid_width * new_log_block.m_grid_height); i++) + new_log_block.m_weights[i * num_planes + plane_index] = 0; + + basist::astc_ldr_t::fvec dct_temp; + bool dec_status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, new_log_block, &d, pGrid_data, nullptr, dct_temp, nullptr); + + assert(dec_status); + if (!dec_status) + { + error_printf("grid_coder.decode_block_weights() failed!\n"); + + encoder_failed_flag.store(true); + return; + } + } + } // if (enc_cfg.m_use_dct) + + new_output_blk.m_trial_mode_index = safe_cast_int16(neighbor_tm_index); + new_output_blk.m_log_blk = new_log_block; + //new_output_blk.m_trial_surrogate.clear(); + + new_output_blk.m_sse = eval_error(block_width, block_height, new_log_block, pixel_stats, enc_cfg.m_cem_enc_params); + + { + std::lock_guard g(global_mutex); + + total_full_encodes_pass2++; + } + } + + } // neighbor_index + } + else + { + if (superpass_index == 1) + { + if (!superpass2_recompress_block_flags(bx, by)) + return; + } + + // Superpass 0/2: core ASTC encoding + basisu::vector& out_blocks = enc_out.m_image_block_info(bx, by).m_out_blocks; + out_blocks.resize(0); + + astc_ldr::pixel_stats_t& pixel_stats = enc_out.m_image_block_info(bx, by).m_pixel_stats; + + if (superpass_index == 0) + pixel_stats.init(total_block_pixels, block_pixels); + + const bool is_purely_solid_block = (pixel_stats.m_min == pixel_stats.m_max); + + // early out on totally solid blocks + if (is_purely_solid_block) + { + 
encode_block_output* pOut = out_blocks.enlarge(1); + pOut->clear(); + + astc_helpers::log_astc_block& log_blk = pOut->m_log_blk; + + log_blk.clear(); + log_blk.m_solid_color_flag_ldr = true; + + for (uint32_t c = 0; c < 4; c++) + log_blk.m_solid_color[c] = pixel_stats.m_min[c]; + + // Expand each component to 16-bits + for (uint32_t c = 0; c < 4; c++) + log_blk.m_solid_color[c] |= (uint16_t)(log_blk.m_solid_color[c]) << 8u; + + pOut->m_sse = eval_error(block_width, block_height, log_blk, pixel_stats, enc_cfg.m_cem_enc_params); + + ldr_astc_block_encode_image_output::block_info& block_info_out = enc_out.m_image_block_info(bx, by); + + block_info_out.m_low_freq_block_flag = true; + block_info_out.m_super_strong_edges = false; + block_info_out.m_very_strong_edges = false; + block_info_out.m_strong_edges = false; + block_info_out.m_packed_out_block_index = 0; + + // Create packed ASTC block + astc_helpers::astc_block& best_phys_block = packed_blocks(bx, by); + bool pack_success = astc_helpers::pack_astc_block(best_phys_block, log_blk); + if (!pack_success) + { + encoder_failed_flag.store(true); + return; + } + + output_block_devel_desc& out_devel_desc = output_block_devel_info(bx, by); + out_devel_desc.m_low_freq_block_flag = true; + out_devel_desc.m_super_strong_edges = false; + out_devel_desc.m_very_strong_edges = false; + out_devel_desc.m_strong_edges = false; + + { + std::lock_guard g(global_mutex); + + total_void_extent_blocks_skipped++; + + total_blocks_done++; + } + + return; + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < 4; i++) + max_std_dev = maximum(max_std_dev, pixel_stats.m_rgba_stats[i].m_std_dev); + + // Determine whether every texel of the block is grayscale (r == g == b). + bool is_lum_only = true; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba& c = pixel_stats.m_pixels[x + y * block_width]; + bool is_lum_texel = (c.r == c.g) && (c.r == c.b); + if (!is_lum_texel) + { + is_lum_only = false; + break; + } + } + // Stop scanning rows only once a non-grayscale texel has been found; otherwise every remaining row must still be checked. + if (!is_lum_only) + break; + } + + 
basisu::vector block_dct_energy(total_block_pixels); + + bool filter_horizontally_flag = false; + bool low_freq_block_flag = 0; + + { + basisu::vector block_floats(total_block_pixels); + basisu::vector block_dct(total_block_pixels); + basist::astc_ldr_t::fvec work; + + for (uint32_t c = 0; c < 4; c++) + { + for (uint32_t i = 0; i < total_block_pixels; i++) + block_floats[i] = pixel_stats.m_pixels_f[i][c]; + + dct.forward(block_floats.data(), block_dct.data(), work); + + for (uint32_t y = 0; y < block_height; y++) + for (uint32_t x = 0; x < block_width; x++) + block_dct_energy[x + y * block_width] += (float)enc_cfg.m_cem_enc_params.m_comp_weights[c] * squaref(block_dct[x + y * block_width]); + + } // c + + // Wipe DC + block_dct_energy[0] = 0.0f; + + float tot_energy = compute_preserved_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), block_width, block_height); + + float h_energy_lost = compute_lost_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), block_width / 2, block_height); + float v_energy_lost = compute_lost_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), block_width, block_height / 2); + + filter_horizontally_flag = h_energy_lost < v_energy_lost; + + float hv2_lost_energy_fract = compute_lost_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), 2, 2); + if (tot_energy) + hv2_lost_energy_fract /= tot_energy; + + if ((hv2_lost_energy_fract < .03f) || (max_std_dev < (1.0f / 255.0f))) + low_freq_block_flag = true; + } + + if (enc_cfg.m_debug_images) + vis_dct_low_freq_block.fill_box(bx * block_width, by * block_height, block_width, block_height, low_freq_block_flag ? color_rgba(255, 0, 0, 255) : g_black_color); + + bool active_chan_flags[4] = { }; + + // The number of channels with non-zero spans + uint32_t total_active_chans = 0; + // The indices of the channels with non-zero spans. 
+ //uint32_t active_chan_list[4] = { 0 }; + + for (uint32_t i = 0; i < 4; i++) + { + if (pixel_stats.m_rgba_stats[i].m_range > 0.0f) + { + assert(pixel_stats.m_max[i] != pixel_stats.m_min[i]); + + active_chan_flags[i] = true; + + //active_chan_list[total_active_chans] = i; + total_active_chans++; + } + else + { + assert(pixel_stats.m_max[i] == pixel_stats.m_min[i]); + } + } + + basisu::comparative_stats cross_chan_stats[TOTAL_RGBA_CHAN_PAIRS]; + + // def=max correlation for each channel pair (or 1 if one of the channels is inactive) + float chan_pair_correlations[6] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f }; + // 0=0, 1 + // 1=0, 2 + // 2=1, 2 + // 3=0, 3 + // 4=1, 3 + // 5=2, 3 + + float min_corr = 1.0f, max_corr = 0.0f; + + for (uint32_t pair_index = 0; pair_index < TOTAL_RGBA_CHAN_PAIRS; pair_index++) + { + const uint32_t chanA = g_rgba_chan_pairs[pair_index][0]; + const uint32_t chanB = g_rgba_chan_pairs[pair_index][1]; + + // If both channels were active, we've got usable correlation statistics. + if (active_chan_flags[chanA] && active_chan_flags[chanB]) + { + // TODO: This can be directly derived from the 3D/4D covariance matrix entries. 
+ cross_chan_stats[pair_index].calc_pearson(total_block_pixels, + &pixel_stats.m_pixels_f[0][chanA], + &pixel_stats.m_pixels_f[0][chanB], + 4, 4, + &pixel_stats.m_rgba_stats[chanA], + &pixel_stats.m_rgba_stats[chanB]); + + chan_pair_correlations[pair_index] = fabsf(cross_chan_stats[pair_index].m_pearson); + + const float c = fabsf((float)cross_chan_stats[pair_index].m_pearson); + min_corr = minimum(min_corr, c); + max_corr = maximum(max_corr, c); + } + } + + // min_cor will be 1.0f if all channels inactive (solid) + + // Pixel the trial modes the encoder will use: RGB or RGBA (we don't currently support trying both) + + const bool used_alpha_encoder_modes = pixel_stats.m_has_alpha; + + float sobel_energy = 0.0f; + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_rgba& s = orig_img_sobel_xy.get_clamped(bx * block_width + x, by * block_height + y); + sobel_energy += s[0] * s[0] + s[1] * s[1] + s[2] * s[2] + s[3] * s[3]; + } // x + } // y + + sobel_energy /= (float)total_block_pixels; + + // Configure low-level block encoder. 
+ ldr_astc_lowlevel_block_encoder_params enc_blk_params; + + enc_blk_params.m_block_width = block_width; + enc_blk_params.m_block_height = block_height; + enc_blk_params.m_total_block_pixels = total_block_pixels; + enc_blk_params.m_bx = bx; + enc_blk_params.m_by = by; + + enc_blk_params.m_pOrig_img_sobel_xy_t = &orig_img_sobel_xy; + + enc_blk_params.m_num_trial_modes = encoder_trial_modes.size_u32(); + enc_blk_params.m_pTrial_modes = encoder_trial_modes.get_ptr(); + enc_blk_params.m_pGrouped_trial_modes = &grouped_encoder_trial_modes; + + enc_blk_params.m_pPart_data_p2 = pPart_data_p2; + enc_blk_params.m_pPart_data_p3 = pPart_data_p3; + enc_blk_params.m_pEnc_params = &enc_cfg.m_cem_enc_params; + + float ang_dot = saturate(pixel_stats.m_zero_rel_axis3.dot3(pixel_stats.m_mean_rel_axis3)); + const float pca_axis_angles = acosf(ang_dot) * (180.0f / (float)cPiD); + + enc_blk_params.m_use_alpha_or_opaque_modes = used_alpha_encoder_modes; + enc_blk_params.m_use_lum_direct_modes = is_lum_only; + + const bool filter_by_pca_angles_flag = (superpass_index == 1) ? enc_cfg.m_filter_by_pca_angles_flag_p2 : enc_cfg.m_filter_by_pca_angles_flag; + if (!filter_by_pca_angles_flag) + { + enc_blk_params.m_use_direct_modes = true; + enc_blk_params.m_use_base_scale_modes = true; + } + else + { + // TODO: Make selective based off edge blocks? 
+ enc_blk_params.m_use_direct_modes = (!total_active_chans) || (pca_axis_angles > enc_cfg.m_use_direct_angle_thresh); + enc_blk_params.m_use_base_scale_modes = (pca_axis_angles <= enc_cfg.m_use_base_scale_angle_thresh); + } + + enc_blk_params.m_grid_hv_filtering = enc_cfg.m_grid_hv_filtering; + enc_blk_params.m_filter_horizontally_flag = filter_horizontally_flag; + + enc_blk_params.m_use_small_grids_only = low_freq_block_flag && enc_cfg.m_low_freq_block_filtering; + + enc_blk_params.m_subsets_enabled = enc_cfg.m_subsets_enabled && (!low_freq_block_flag || !enc_cfg.m_subsets_edge_filtering); + + enc_blk_params.m_subsets_edge_filtering = enc_cfg.m_subsets_edge_filtering; + + enc_blk_params.m_use_blue_contraction = enc_cfg.m_use_blue_contraction; + enc_blk_params.m_final_encode_try_base_ofs = enc_cfg.m_use_base_ofs; + + memcpy(enc_blk_params.m_superbucket_max_to_retain, enc_cfg.m_superbucket_max_to_retain, sizeof(enc_cfg.m_superbucket_max_to_retain)); + + memcpy(enc_blk_params.m_final_shortlist_fraction, enc_cfg.m_final_shortlist_fraction, sizeof(enc_blk_params.m_final_shortlist_fraction)); + memcpy(enc_blk_params.m_final_shortlist_min_size, enc_cfg.m_final_shortlist_min_size, sizeof(enc_cfg.m_final_shortlist_min_size)); + memcpy(enc_blk_params.m_final_shortlist_max_size, enc_cfg.m_final_shortlist_max_size, sizeof(enc_blk_params.m_final_shortlist_max_size)); + + enc_blk_params.m_part2_fraction_to_keep = enc_cfg.m_part2_fraction_to_keep; + enc_blk_params.m_part3_fraction_to_keep = enc_cfg.m_part3_fraction_to_keep; + enc_blk_params.m_base_parts2 = enc_cfg.m_base_parts2; + enc_blk_params.m_base_parts3 = enc_cfg.m_base_parts3; + enc_blk_params.m_gradient_descent_flag = enc_cfg.m_gradient_descent_flag; + enc_blk_params.m_polish_weights_flag = enc_cfg.m_polish_weights_flag; + enc_blk_params.m_qcd_enabled_flag = enc_cfg.m_qcd_enabled_flag; + enc_blk_params.m_bucket_pruning_passes = enc_cfg.m_bucket_pruning_passes; + + enc_blk_params.m_alpha_cems = used_alpha_encoder_modes; + 
+ enc_blk_params.m_early_stop_wpsnr = enc_cfg.m_early_stop_wpsnr; + enc_blk_params.m_early_stop2_wpsnr = enc_cfg.m_early_stop2_wpsnr; + + enc_blk_params.m_final_encode_always_try_rgb_direct = enc_cfg.m_final_encode_always_try_rgb_direct; + + enc_blk_params.m_pDCT2F = &dct; + + // Determine DP usage + if (enc_cfg.m_force_all_dual_plane_chan_evals) + { + for (uint32_t i = 0; i < 4; i++) + enc_blk_params.m_dp_active_chans[i] = active_chan_flags[i]; + } + else + { + for (uint32_t i = 0; i < 3; i++) + enc_blk_params.m_dp_active_chans[i] = false; + + // Being very conservative with alpha here - always let the analytical evaluator consider it. + enc_blk_params.m_dp_active_chans[3] = pixel_stats.m_has_alpha; + + if (!enc_cfg.m_disable_rgb_dual_plane) + { + const float rg_corr = chan_pair_correlations[0]; + const float rb_corr = chan_pair_correlations[1]; + const float gb_corr = chan_pair_correlations[2]; + + int desired_dp_chan_rgb = -1; + + float min_p = minimum(rg_corr, rb_corr, gb_corr); + + if (min_p < enc_cfg.m_strong_dp_decorr_thresh_rgb) + { + const bool has_r = active_chan_flags[0], has_g = active_chan_flags[1]; + //const bool has_b = active_chan_flags[2]; + + uint32_t total_active_chans_rgb = 0; + for (uint32_t i = 0; i < 3; i++) + total_active_chans_rgb += active_chan_flags[i]; + + if (total_active_chans_rgb == 2) + { + if (!has_r) + desired_dp_chan_rgb = 1; + else if (!has_g) + desired_dp_chan_rgb = 0; + else + desired_dp_chan_rgb = 0; + } + else if (total_active_chans_rgb == 3) + { + // see if rg/rb is weakly correlated vs. gb + if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) + desired_dp_chan_rgb = 0; + // see if gr/gb is weakly correlated vs. 
rb + else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) + desired_dp_chan_rgb = 1; + // assume b is weakest + else + desired_dp_chan_rgb = 2; + } + } + + if (desired_dp_chan_rgb != -1) + { + assert(active_chan_flags[desired_dp_chan_rgb]); + enc_blk_params.m_dp_active_chans[desired_dp_chan_rgb] = true; + } + } + } + + if (!enc_blk_params.m_dp_active_chans[0] && !enc_blk_params.m_dp_active_chans[1] && !enc_blk_params.m_dp_active_chans[2] && !enc_blk_params.m_dp_active_chans[3]) + { + enc_blk_params.m_use_dual_planes = false; + } + + astc_ldr::cem_encode_params temp_cem_enc_params; + if (superpass_index == 1) + { + enc_blk_params.m_base_parts2 = enc_cfg.m_base_parts2_p2; + enc_blk_params.m_base_parts3 = enc_cfg.m_base_parts3_p2; + enc_blk_params.m_part2_fraction_to_keep = 1; + enc_blk_params.m_part3_fraction_to_keep = 1; + + memcpy(enc_blk_params.m_superbucket_max_to_retain, enc_cfg.m_superbucket_max_to_retain_p2, sizeof(enc_cfg.m_superbucket_max_to_retain_p2)); + memcpy(enc_blk_params.m_final_shortlist_max_size, enc_cfg.m_final_shortlist_max_size_p2, sizeof(enc_cfg.m_final_shortlist_max_size_p2)); + + if (enc_cfg.m_second_pass_force_subsets_enabled) + enc_blk_params.m_subsets_enabled = true; + enc_blk_params.m_subsets_edge_filtering = false; + + if (enc_cfg.m_force_all_dp_chans_p2) + { + enc_blk_params.m_dp_active_chans[0] = active_chan_flags[0]; + enc_blk_params.m_dp_active_chans[1] = active_chan_flags[1]; + enc_blk_params.m_dp_active_chans[2] = active_chan_flags[2]; + enc_blk_params.m_dp_active_chans[3] = active_chan_flags[3]; + enc_blk_params.m_use_dual_planes = true; + + if (!enc_blk_params.m_dp_active_chans[0] && !enc_blk_params.m_dp_active_chans[1] && !enc_blk_params.m_dp_active_chans[2] && !enc_blk_params.m_dp_active_chans[3]) + { + enc_blk_params.m_use_dual_planes = false; + } + } + + enc_blk_params.m_gradient_descent_flag = true; + enc_blk_params.m_polish_weights_flag = true; + + enc_blk_params.m_use_direct_modes = true; + 
enc_blk_params.m_use_base_scale_modes = true; + + enc_blk_params.m_early_stop_wpsnr = enc_cfg.m_early_stop_wpsnr + 2.0f; + enc_blk_params.m_early_stop2_wpsnr = enc_cfg.m_early_stop2_wpsnr + 2.0f; + + if (enc_cfg.m_second_pass_total_weight_refine_passes) + { + temp_cem_enc_params = enc_cfg.m_cem_enc_params; + enc_blk_params.m_pEnc_params = &temp_cem_enc_params; + + temp_cem_enc_params.m_total_weight_refine_passes = enc_cfg.m_second_pass_total_weight_refine_passes; + temp_cem_enc_params.m_worst_weight_nudging_flag = true; + temp_cem_enc_params.m_endpoint_refinement_flag = true; + } + } + + scoped_ldr_astc_lowlevel_block_encoder scoped_block_encoder(encoder_pool); + if (scoped_block_encoder.get_ptr() == nullptr) + { + error_printf("Failed allocating thread local encode block temps\n"); + encoder_failed_flag.store(true); + return; + } + + // solid color + { + encode_block_output* pOut = out_blocks.enlarge(1); + pOut->clear(); + + astc_helpers::log_astc_block& log_blk = pOut->m_log_blk; + + log_blk.clear(); + log_blk.m_solid_color_flag_ldr = true; + + for (uint32_t c = 0; c < 4; c++) + log_blk.m_solid_color[c] = (uint16_t)clamp((int)std::round(pixel_stats.m_mean_f[c] * 255.0f), 0, 255); + + // Expand each component to 16-bits + for (uint32_t c = 0; c < 4; c++) + log_blk.m_solid_color[c] |= (uint16_t)(log_blk.m_solid_color[c]) << 8u; + + pOut->m_sse = eval_error(block_width, block_height, log_blk, pixel_stats, enc_cfg.m_cem_enc_params); + } + + encode_block_stats enc_block_stats; + + bool enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats, out_blocks, 0, enc_block_stats); + if (!enc_status) + { + encoder_failed_flag.store(true); + return; + } + +#if 1 + // --------------------- BLOCK BLURRING + // TODO - very slow, needs more configuration and tuning, experimental + const float BLUR_STD_DEV_THRESH = (15.0f / 255.0f); + const float BLUR_SOBEL_ENERGY_THRESH = 15000.0f; + + const bool use_blurs = (enc_cfg.m_blurring_enabled && 
(!selective_blurring || ((max_std_dev > BLUR_STD_DEV_THRESH) && (sobel_energy > BLUR_SOBEL_ENERGY_THRESH)))) || + (enc_cfg.m_blurring_enabled_p2 && (superpass_index == 1)); + + if (use_blurs) + { + { + assert(orig_img_blurred2.get_width()); + + color_rgba block_pixels_blurred2[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + orig_img_blurred2.extract_block_clamped(block_pixels_blurred2, bx * block_width, by * block_height, block_width, block_height); + + astc_ldr::pixel_stats_t pixel_stats_blurred2; + pixel_stats_blurred2.init(total_block_pixels, block_pixels_blurred2); + + enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred2, out_blocks, 1, enc_block_stats); + if (!enc_status) + { + encoder_failed_flag.store(true); + return; + } + } + + { + assert(orig_img_blurred3.get_width()); + + color_rgba block_pixels_blurred3[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + orig_img_blurred3.extract_block_clamped(block_pixels_blurred3, bx * block_width, by * block_height, block_width, block_height); + + astc_ldr::pixel_stats_t pixel_stats_blurred3; + pixel_stats_blurred3.init(total_block_pixels, block_pixels_blurred3); + + enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred3, out_blocks, 2, enc_block_stats); + if (!enc_status) + { + encoder_failed_flag.store(true); + return; + } + } + + { + assert(orig_img_blurred4.get_width()); + + color_rgba block_pixels_blurred4[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + orig_img_blurred4.extract_block_clamped(block_pixels_blurred4, bx * block_width, by * block_height, block_width, block_height); + + astc_ldr::pixel_stats_t pixel_stats_blurred4; + pixel_stats_blurred4.init(total_block_pixels, block_pixels_blurred4); + + enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred4, out_blocks, 3, enc_block_stats); + if (!enc_status) + { + encoder_failed_flag.store(true); + return; + } + } + + { + assert(orig_img_blurred5.get_width()); + + 
color_rgba block_pixels_blurred5[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + orig_img_blurred5.extract_block_clamped(block_pixels_blurred5, bx * block_width, by * block_height, block_width, block_height); + + astc_ldr::pixel_stats_t pixel_stats_blurred5; + pixel_stats_blurred5.init(total_block_pixels, block_pixels_blurred5); + + enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred5, out_blocks, 4, enc_block_stats); + if (!enc_status) + { + encoder_failed_flag.store(true); + return; + } + } + } +#endif + + // --------------------- WEIGHT GRID DCT CODING + if (enc_cfg.m_use_dct) + { + // apply DCT to weights + for (uint32_t out_block_iter = 0; out_block_iter < out_blocks.size_u32(); out_block_iter++) + { + if (out_blocks[out_block_iter].m_trial_mode_index < 0) + continue; + + astc_helpers::log_astc_block& log_astc_blk = out_blocks[out_block_iter].m_log_blk; + + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, log_astc_blk.m_grid_width, log_astc_blk.m_grid_height); + + const uint32_t num_planes = (log_astc_blk.m_dual_plane ? 
2 : 1); + for (uint32_t plane_index = 0; plane_index < num_planes; plane_index++) + { + bitwise_coder c; + basist::astc_ldr_t::dct_syms syms; + code_block_weights(grid_coder, enc_cfg.m_base_q, plane_index, log_astc_blk, pGrid_data, c, syms); + + out_blocks[out_block_iter].m_packed_dct_plane_data[plane_index] = syms; + + c.flush(); + + basist::bitwise_decoder d; + d.init(c.get_bytes().data(), c.get_bytes().size_u32()); + + // ensure existing weights get blown away + for (uint32_t i = 0; i < (uint32_t)(log_astc_blk.m_grid_width * log_astc_blk.m_grid_height); i++) + log_astc_blk.m_weights[i * num_planes + plane_index] = 0; + + basist::astc_ldr_t::fvec dct_temp; + bool status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, log_astc_blk, &d, pGrid_data, nullptr, dct_temp, nullptr); + + assert(status); + if (!status) + { + error_printf("grid_coder.decode_block_weights() failed!\n"); + + encoder_failed_flag.store(true); + return; + } + +#if 0 + { + astc_helpers::log_astc_block alt_log_astc_blk(log_astc_blk); + + for (uint32_t i = 0; i < (uint32_t)(log_astc_blk.m_grid_width * log_astc_blk.m_grid_height); i++) + alt_log_astc_blk.m_weights[i * num_planes + plane_index] = 0; + + status = grid_coder.decode_block_weights(q, plane_index, alt_log_astc_blk, nullptr, pGrid_data, &out_block_dct_stats[out_block_iter], &syms); + assert(status); + + for (uint32_t i = 0; i < (uint32_t)(log_astc_blk.m_grid_width * log_astc_blk.m_grid_height); i++) + { + assert(log_astc_blk.m_weights[i * num_planes + plane_index] == alt_log_astc_blk.m_weights[i * num_planes + plane_index]); + } + + } +#endif + // TODO: in theory, endpoints can be refined if they don't change the DCT span. 
+ } + + out_blocks[out_block_iter].m_sse = eval_error(block_width, block_height, log_astc_blk, pixel_stats, enc_cfg.m_cem_enc_params); + + } // for + + } // use_dct + + // Find best output block + uint64_t best_out_blocks_err = UINT64_MAX; + uint32_t best_out_blocks_index = 0; + astc_helpers::log_astc_block best_out_blocks_log_astc_blk; + + for (uint32_t out_block_iter = 0; out_block_iter < out_blocks.size_u32(); out_block_iter++) + { + const astc_helpers::log_astc_block& log_astc_blk = out_blocks[out_block_iter].m_log_blk; + + color_rgba dec_pixels[astc_helpers::MAX_BLOCK_DIM * astc_helpers::MAX_BLOCK_DIM]; + bool dec_status = astc_helpers::decode_block(log_astc_blk, dec_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + + assert(dec_status); + if (!dec_status) + { + encoder_failed_flag.store(true); + return; + } + + uint64_t total_err = 0; + for (uint32_t i = 0; i < total_block_pixels; i++) + total_err += weighted_color_error(block_pixels[i], dec_pixels[i], enc_cfg.m_cem_enc_params); + + // if not blurred + if (out_blocks[out_block_iter].m_blur_id == 0) + { + if (out_blocks[out_block_iter].m_sse != total_err) + { + assert(0); + fmt_error_printf("output block SSE invalid\n"); + encoder_failed_flag.store(true); + return; + } + } + + // Replace m_sse with the actual WSSE vs. 
the original source block (in case it was blurred) + out_blocks[out_block_iter].m_sse = total_err; + + if (total_err < best_out_blocks_err) + { + best_out_blocks_err = total_err; + best_out_blocks_log_astc_blk = log_astc_blk; + best_out_blocks_index = out_block_iter; + } + } // out_block_iter + +#if 0 + // TODO: Save memory, only minimally tested + if (enc_cfg.m_save_single_result) + { + basisu::vector new_out_blocks(1); + new_out_blocks[0] = out_blocks[best_out_blocks_index]; + + std::swap(out_blocks, new_out_blocks); + + best_out_blocks_index = 0; + } +#endif + + ldr_astc_block_encode_image_output::block_info& block_info_out = enc_out.m_image_block_info(bx, by); + + block_info_out.m_low_freq_block_flag = low_freq_block_flag; + block_info_out.m_super_strong_edges = scoped_block_encoder.get_ptr()->m_super_strong_edges; + block_info_out.m_very_strong_edges = scoped_block_encoder.get_ptr()->m_very_strong_edges; + block_info_out.m_strong_edges = scoped_block_encoder.get_ptr()->m_strong_edges; + block_info_out.m_packed_out_block_index = best_out_blocks_index; + + // Create packed ASTC block + astc_helpers::astc_block& best_phys_block = packed_blocks(bx, by); + bool pack_success = astc_helpers::pack_astc_block(best_phys_block, best_out_blocks_log_astc_blk); + if (!pack_success) + { + encoder_failed_flag.store(true); + return; + } + + output_block_devel_desc& out_devel_desc = output_block_devel_info(bx, by); + out_devel_desc.m_low_freq_block_flag = low_freq_block_flag; + out_devel_desc.m_super_strong_edges = scoped_block_encoder.get_ptr()->m_super_strong_edges; + out_devel_desc.m_very_strong_edges = scoped_block_encoder.get_ptr()->m_very_strong_edges; + out_devel_desc.m_strong_edges = scoped_block_encoder.get_ptr()->m_strong_edges; + + // Critical Section + { + std::lock_guard g(global_mutex); + + if (use_blurs) + total_blur_encodes++; + + if (out_blocks[best_out_blocks_index].m_blur_id) + total_blurred_blocks1++; + + if (superpass_index == 0) + { + // TODO: Add 2nd pass 
statistics + total_superbuckets_created += enc_block_stats.m_total_superbuckets_created; + total_buckets_created += enc_block_stats.m_total_buckets_created; + total_surrogate_encodes += enc_block_stats.m_total_surrogate_encodes; + total_full_encodes += enc_block_stats.m_total_full_encodes; + total_shortlist_candidates += enc_block_stats.m_total_shortlist_candidates; + } + else if (superpass_index == 1) + { + total_full_encodes_pass1 += enc_block_stats.m_total_full_encodes; + } + + total_blocks_done++; + if (enc_cfg.m_debug_output) + { + if (superpass_index == 1) + { + if ((total_blocks_done & 63) == 63) + { + float new_val = ((float)total_blocks_done * 100.0f) / (float)total_blocks_to_recompress; + if ((new_val - last_printed_progress_val) >= 5.0f) + { + last_printed_progress_val = new_val; + fmt_printf("{3.2}%\n", new_val); + } + } + } + else if ((total_blocks_done & 255) == 255) + { + float new_val = ((float)total_blocks_done * 100.0f) / (float)total_blocks; + if ((new_val - last_printed_progress_val) >= 5.0f) + { + last_printed_progress_val = new_val; + fmt_printf("{3.2}%\n", new_val); + } + } + } + + } // lock_guard (global_mutex) + + } // if (superpass_index == ...) 
+ + }); + + if (encoder_failed_flag) + break; + + } // bx + + if (encoder_failed_flag) + break; + + } // by + + if (encoder_failed_flag) + { + fmt_error_printf("Main compressor block loop failed!\n"); + return false; + } + + job_pool.wait_for_all(); + + if (encoder_failed_flag) + { + fmt_error_printf("Main compressor block loop failed!\n"); + return false; + } + + if ((superpass_index == 0) && (enc_cfg.m_second_superpass_refinement) && (enc_cfg.m_second_superpass_fract_to_recompress > 0.0f)) + { + uint_vec block_wsse_indices(total_blocks); + + float_vec block_wsses(total_blocks); + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by); + + float wsse = (float)out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse; + + block_wsses[bx + by * num_blocks_x] = wsse; + } // bx + } // by + + indirect_sort(total_blocks, block_wsse_indices.data(), block_wsses.data()); + + if (block_wsses[block_wsse_indices[total_blocks - 1]] > 0.0f) + { + total_blocks_to_recompress = clamp((uint32_t)std::round((float)total_blocks * enc_cfg.m_second_superpass_fract_to_recompress), 0, total_blocks); + + image vis_recomp_img; + if (enc_cfg.m_debug_images) + vis_recomp_img.resize(width, height); + + for (uint32_t i = 0; i < total_blocks_to_recompress; i++) + { + const uint32_t block_index = block_wsse_indices[total_blocks - 1 - i]; + + const uint32_t block_x = block_index % num_blocks_x; + const uint32_t block_y = block_index / num_blocks_x; + + superpass2_recompress_block_flags(block_x, block_y) = true; + + if (enc_cfg.m_debug_images) + vis_recomp_img.fill_box(block_x * block_width, block_y * block_height, block_width, block_height, color_rgba(255, 255, 255, 255)); + } + + if (enc_cfg.m_debug_images) + save_png(enc_cfg.m_debug_file_prefix + "vis_recomp_img.png", vis_recomp_img); + } + } + + } // superpass_index + + if 
(enc_cfg.m_third_superpass_try_neighbors) + { + uint32_t total_superpass1_improved_blocks1 = 0; + uint32_t total_superpass1_improved_blocks2 = 0; + + // Merge pass 2's output into pass 0's/1's output, which can be done safely now. + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by); + + const ldr_astc_block_encode_image_output::block_info_superpass1& out_block_info_superpass1 = enc_out.m_image_block_info_superpass2(bx, by); + + for (uint32_t neighbor_index = 0; neighbor_index < basist::astc_ldr_t::cMaxConfigReuseNeighbors; neighbor_index++) + { + const int new_neighbor_index = out_block_info_superpass1.m_config_reuse_neighbor_out_block_indices[neighbor_index]; + + if (new_neighbor_index == cInvalidIndex) + { + // Can't reuse neighbor's best output block + continue; + } + + if (!out_block_info_superpass1.m_config_reuse_new_neighbor_out_block_flags[neighbor_index]) + { + // Reuses an existing, already encoded output block which matches the neighbor + assert((size_t)new_neighbor_index < out_block_info.m_out_blocks.size()); + continue; + } + + const uint32_t new_out_block_index = out_block_info.m_out_blocks.size_u32(); + + const encode_block_output& new_output_blk = out_block_info_superpass1.m_new_out_config_reuse_blocks[new_neighbor_index]; + + out_block_info.m_out_blocks.push_back(new_output_blk); + +#define BU_CHECK_NEIGHBOR_BEST (1) + +#if BU_CHECK_NEIGHBOR_BEST + // See if the solution has improved + if (new_output_blk.m_sse < out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse) + { + total_superpass1_improved_blocks1++; + + // Warning: This invalidate the neighbor indices + out_block_info.m_packed_out_block_index = new_out_block_index; + + //astc_helpers::astc_block& packed_block = enc_out.m_packed_phys_blocks(bx, by); + + bool pack_success = 
astc_helpers::pack_astc_block((astc_helpers::astc_block&)packed_blocks(bx, by), new_output_blk.m_log_blk); + if (!pack_success) + { + fmt_error_printf("astc_helpers::pack_astc_block failed\n"); + + return false; + } + } +#endif + + } // neighbor_index + + for (uint32_t j = 0; j < out_block_info_superpass1.m_new_out_config_endpoint_reuse_blocks.size(); j++) + { + const uint32_t new_out_block_index = out_block_info.m_out_blocks.size_u32(); + + const encode_block_output& new_output_blk = out_block_info_superpass1.m_new_out_config_endpoint_reuse_blocks[j]; + + out_block_info.m_out_blocks.push_back(new_output_blk); + +#define BU_CHECK_NEIGHBOR_BEST (1) + +#if BU_CHECK_NEIGHBOR_BEST + // See if the solution has improved + if (new_output_blk.m_sse < out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse) + { + total_superpass1_improved_blocks2++; + + // Warning: This invalidate the neighbor indices + out_block_info.m_packed_out_block_index = new_out_block_index; + + //astc_helpers::astc_block& packed_block = enc_out.m_packed_phys_blocks(bx, by); + + bool pack_success = astc_helpers::pack_astc_block((astc_helpers::astc_block&)packed_blocks(bx, by), new_output_blk.m_log_blk); + if (!pack_success) + { + fmt_error_printf("astc_helpers::pack_astc_block failed\n"); + + return false; + } + } +#endif + + } // j + + } // bx + } // by + + if (enc_cfg.m_debug_output) + { + fmt_debug_printf("Total superpass 1 improved blocks 1: {} {3.2}%\n", total_superpass1_improved_blocks1, ((float)total_superpass1_improved_blocks1 * 100.0f) / (float)(total_blocks)); + fmt_debug_printf("Total superpass 1 improved blocks 2: {} {3.2}%\n", total_superpass1_improved_blocks2, ((float)total_superpass1_improved_blocks2 * 100.0f) / (float)(total_blocks)); + } + } + + if (ASTC_LDR_CONSISTENCY_CHECKING) + { + if (enc_cfg.m_debug_output) + fmt_debug_printf("consistency checking\n"); + + // Consistency/sanity cross checking + //uint32_t total_blocks_using_neighbor_config = 0; + for 
(uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by); + +#if BU_CHECK_NEIGHBOR_BEST + uint64_t best_sse = UINT64_MAX; + uint32_t best_out_block_index = 0; + + for (uint32_t i = 0; i < out_block_info.m_out_blocks.size(); i++) + { + if (out_block_info.m_out_blocks[i].m_sse < best_sse) + { + best_sse = out_block_info.m_out_blocks[i].m_sse; + best_out_block_index = i; + } + } // i + + if (best_out_block_index != out_block_info.m_packed_out_block_index) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } +#endif + + if (out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse != + eval_error(block_width, block_height, out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_log_blk, out_block_info.m_pixel_stats, enc_cfg.m_cem_enc_params)) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } + + // Ensure packed output block matches the expected best WSSE block. 
+ astc_helpers::astc_block packed_block; + bool pack_success = astc_helpers::pack_astc_block(packed_block, out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_log_blk); + if (!pack_success) + { + fmt_error_printf("astc_helpers::pack_astc_block failed\n"); + return false; + } + + if (memcmp(&packed_block, &enc_out.m_packed_phys_blocks(bx, by), sizeof(astc_helpers::astc_block)) != 0) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } + + // DCT check + if ((enc_cfg.m_use_dct) && (out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_trial_mode_index >= 0)) + { + const auto& best_log_blk = out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_log_blk; + if (best_log_blk.m_solid_color_flag_ldr) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } + + const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, best_log_blk.m_grid_width, best_log_blk.m_grid_height); + const uint32_t total_planes = best_log_blk.m_num_partitions ? (best_log_blk.m_dual_plane ? 
2 : 1) : 0; + + astc_helpers::log_astc_block verify_log_blk(best_log_blk); + + for (uint32_t plane_index = 0; plane_index < total_planes; plane_index++) + { + if (!out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_packed_dct_plane_data[plane_index].m_coeffs.size()) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } + + basist::astc_ldr_t::fvec dct_temp; + bool dec_status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, verify_log_blk, nullptr, pGrid_data, nullptr, dct_temp, + &out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_packed_dct_plane_data[plane_index]); + + if (!dec_status) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } + + for (uint32_t i = 0; i < (uint32_t)(best_log_blk.m_grid_width * best_log_blk.m_grid_height); i++) + { + if (best_log_blk.m_weights[i * total_planes + plane_index] != verify_log_blk.m_weights[i * total_planes + plane_index]) + { + fmt_error_printf("consistency check failed\n"); + assert(0); + return false; + } + } + + } // plane_index + } + + } // bx + } // by + + if (enc_cfg.m_debug_output) + fmt_debug_printf("consistency checking PASSED\n"); + } + + //fmt_debug_printf("Total blocks using neighbor config: {} {3.2}%\n", total_blocks_using_neighbor_config, ((float)total_blocks_using_neighbor_config * 100.0f) / (float)(total_blocks)); + + // Debug output + uint_vec trial_mode_hist; + trial_mode_hist.resize(encoder_trial_modes.size()); + uint32_t total_alpha_blocks = 0; + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by); + const astc_ldr::pixel_stats_t& pixel_stats = out_block_info.m_pixel_stats; + + const encode_block_output& best_out_block = out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index]; + const 
astc_helpers::log_astc_block& best_out_blocks_log_astc_blk = best_out_block.m_log_blk; + + if (pixel_stats.m_has_alpha) + total_alpha_blocks++; + + output_block_devel_desc& out_devel_desc = output_block_devel_info(bx, by); + out_devel_desc.m_had_alpha = pixel_stats.m_has_alpha; + out_devel_desc.m_trial_mode_index = best_out_block.m_trial_mode_index; + out_devel_desc.m_pTrial_modes = encoder_trial_modes.data(); + + if (out_devel_desc.m_trial_mode_index >= 0) + trial_mode_hist[out_devel_desc.m_trial_mode_index]++; + + //const float total_astc_weight_bits = log2f((float)astc_helpers::get_ise_levels(best_out_block.m_log_blk.m_weight_ise_range)) * + // best_out_block.m_log_blk.m_grid_width * best_out_block.m_log_blk.m_grid_height * (best_out_block.m_log_blk.m_dual_plane ? 2 : 1); + + //bool used_blue_contraction = astc_ldr::used_blue_contraction(best_out_blocks_log_astc_blk.m_color_endpoint_modes[0], best_out_blocks_log_astc_blk.m_endpoints, best_out_blocks_log_astc_blk.m_endpoint_ise_range); + + if (enc_cfg.m_debug_images) + { + color_rgba vis_col(g_black_color); + color_rgba vis2_col(g_black_color); + color_rgba dp_vis(g_black_color); + color_rgba base_ofs_vis(g_black_color); + //color_rgba dct_bits_abs_vis(g_black_color); + //color_rgba dct_bits_vs_astc_vis(g_black_color); + + const astc_ldr::partition_pattern_vec* pPat = nullptr; + + if (best_out_blocks_log_astc_blk.m_num_partitions == 2) + { + vis_col.set(0, 255, 0, 255); + + const astc_ldr::partitions_data* pPart_data = pPart_data_p2; + + const uint32_t part_seed_index = best_out_blocks_log_astc_blk.m_partition_id; + const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index]; + + pPat = &pPart_data->m_partition_pats[part_unique_index]; + } + else if (best_out_blocks_log_astc_blk.m_num_partitions == 3) + { + vis_col.set(0, 0, 255, 255); + + const astc_ldr::partitions_data* pPart_data = pPart_data_p3; + + const uint32_t part_seed_index = best_out_blocks_log_astc_blk.m_partition_id; + 
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index]; + + pPat = &pPart_data->m_partition_pats[part_unique_index]; + } + + // vis_col.r = enc_blk_params.m_use_base_scale_modes ? 255 : 0; + // vis_col.g = enc_blk_params.m_use_direct_modes ? 255 : 0; + + if (!out_devel_desc.m_low_freq_block_flag) + { + if (out_devel_desc.m_super_strong_edges) + vis2_col.set(255, 0, 255, 255); + else if (out_devel_desc.m_very_strong_edges) + vis2_col.set(255, 0, 0, 255); + else if (out_devel_desc.m_strong_edges) + vis2_col.set(0, 255, 0, 255); + } + + if (pPat) + { + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const uint32_t subset_idx = (*pPat)(x, y); + + color_rgba c(g_black_color); + + if (best_out_blocks_log_astc_blk.m_num_partitions == 2) + { + assert(subset_idx < 2); + c = subset_idx ? color_rgba(255, 0, 0, 255) : color_rgba(0, 255, 0, 255); + } + else + { + assert(best_out_blocks_log_astc_blk.m_num_partitions == 3); + assert(subset_idx < 3); + + if (subset_idx == 2) + c = color_rgba(0, 0, 255, 255); + else if (subset_idx == 1) + c = color_rgba(32, 0, 190, 255); + else + c = color_rgba(64, 0, 64, 255); + } + + vis_part_pat_img.set_clipped(bx * block_width + x, by * block_height + y, c); + } + } + } + + if (best_out_blocks_log_astc_blk.m_dual_plane) + dp_vis.g = 255; + + if ((best_out_blocks_log_astc_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || + (best_out_blocks_log_astc_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET)) + { + base_ofs_vis.b = 255; + } + + vis_part_usage_img.fill_box(bx * block_width, by * block_height, block_width, block_height, vis_col); + vis_strong_edge.fill_box(bx * block_width, by * block_height, block_width, block_height, vis2_col); + vis_dp_img.fill_box(bx * block_width, by * block_height, block_width, block_height, dp_vis); + vis_base_ofs_img.fill_box(bx * block_width, by * block_height, 
block_width, block_height, base_ofs_vis); + } + + } // bx + + } // by + + const double total_enc_time = itm.get_elapsed_secs(); + + if (enc_cfg.m_debug_output) + fmt_debug_printf("ASTC packing complete\n"); + + image unpacked_img(width, height); + + // Unpack packed image, validate ASTC data with several decoders. + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const astc_helpers::astc_block* pPhys_block = &packed_blocks(bx, by); + + astc_helpers::log_astc_block log_blk; + bool status = astc_helpers::unpack_block(pPhys_block, log_blk, block_width, block_height); + if (!status) + { + fmt_error_printf("unpack_block() failed\n"); + return false; + } + + // Decode with our generic ASTC decoder. + color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + status = astc_helpers::decode_block(log_blk, block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status) + { + fmt_error_printf("decode_block() failed\n"); + return false; + } + + unpacked_img.set_block_clipped(block_pixels, bx * block_width, by * block_height, block_width, block_height); + + // Decode with the Android testing framework ASTC decoder + { + uint8_t dec_pixels_android[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS * 4]; + + bool android_success = basisu_astc::astc::decompress_ldr(dec_pixels_android, (const uint8_t*)pPhys_block, enc_cfg.m_cem_enc_params.m_decode_mode_srgb, block_width, block_height); + if (!android_success) + { + fmt_error_printf("Android ASTC decoder failed!\n"); + return false; + } + + if (memcmp(dec_pixels_android, block_pixels, total_block_pixels * 4) != 0) + { + fmt_error_printf("Android ASTC decoder mismatch!\n"); + return false; + } + } + + // Decode with our optimized XUASTC LDR decoder + { + color_rgba block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + status = astc_helpers::decode_block_xuastc_ldr(log_blk, 
block_pixels_alt, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status) + { + fmt_error_printf("decode_block_xuastc_ldr() failed\n"); + return false; + } + + if (memcmp(block_pixels, block_pixels_alt, total_block_pixels * 4) != 0) + { + fmt_error_printf("XUASTC LDR ASTC decoder mismatch!\n"); + return false; + } + } + + } // bx + } // by + + if (enc_cfg.m_debug_images) + { + save_png(enc_cfg.m_debug_file_prefix + "dbg_astc_ldr_unpacked_img.png", unpacked_img); + + if (vis_part_usage_img.is_valid()) + save_png(enc_cfg.m_debug_file_prefix + "vis_part_usage.png", vis_part_usage_img); + + if (vis_part_pat_img.is_valid()) + save_png(enc_cfg.m_debug_file_prefix + "vis_part_pat_img.png", vis_part_pat_img); + + if (vis_strong_edge.is_valid()) + save_png(enc_cfg.m_debug_file_prefix + "vis_strong_edge.png", vis_strong_edge); + + if (vis_dct_low_freq_block.is_valid()) + save_png(enc_cfg.m_debug_file_prefix + "vis_dct_low_freq_block.png", vis_dct_low_freq_block); + + if (vis_dp_img.is_valid()) + save_png(enc_cfg.m_debug_file_prefix + "vis_dp.png", vis_dp_img); + + if (vis_base_ofs_img.is_valid()) + save_png(enc_cfg.m_debug_file_prefix + "vis_base_ofs.png", vis_base_ofs_img); + } + + if (enc_cfg.m_debug_output) + { + uint32_t cem_used_hist[16] = { 0 }; + uint32_t cem_used_bc[16] = { 0 }; + uint32_t cem_used_subsets[16] = { 0 }; + uint32_t cem_used_dp[16] = { 0 }; + uint32_t total_dp = 0, total_base_ofs = 0; + uint32_t subset_used_hist[4] = { 0 }; + uint32_t grid_usage_hist[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS * astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS + 1] = { 0 }; + + uint32_t total_header_bits = 0; + uint32_t total_weight_bits = 0; + uint32_t total_endpoint_bits = 0; + + uint32_t total_void_extent = 0; + + uint32_t used_endpoint_levels_hist[astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + 1] = { 0 }; + uint32_t 
used_weight_levels_hist[astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE + 1] = { 0 }; + + uint32_t total_blocks_using_subsets = 0; + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const output_block_devel_desc& desc = output_block_devel_info(bx, by); + + const astc_helpers::astc_block* pPhys_block = &packed_blocks(bx, by); + + astc_helpers::log_astc_block log_blk; + bool status = astc_helpers::unpack_block(pPhys_block, log_blk, block_width, block_height); + if (!status) + { + fmt_error_printf("unpack_block() failed\n"); + return false; + } + + if (desc.m_trial_mode_index < 0) + { + total_void_extent++; + continue; + } + else + { + const basist::astc_ldr_t::trial_mode& tm = desc.m_pTrial_modes[desc.m_trial_mode_index]; + + const uint32_t actual_cem = log_blk.m_color_endpoint_modes[0]; + //assert(tm.m_cem == log_blk.m_color_endpoint_modes[0]); // may differ due to base+ofs usage + + assert((tm.m_ccs_index >= 0) == log_blk.m_dual_plane); + assert((!log_blk.m_dual_plane) || (tm.m_ccs_index == log_blk.m_color_component_selector)); + assert(tm.m_endpoint_ise_range == log_blk.m_endpoint_ise_range); + assert(tm.m_weight_ise_range == log_blk.m_weight_ise_range); + assert(tm.m_grid_width == log_blk.m_grid_width); + assert(tm.m_grid_height == log_blk.m_grid_height); + assert(tm.m_num_parts == log_blk.m_num_partitions); + + used_weight_levels_hist[open_range_check(tm.m_weight_ise_range - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE, std::size(used_weight_levels_hist))]++; + used_endpoint_levels_hist[open_range_check(tm.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE, std::size(used_endpoint_levels_hist))]++; + + cem_used_hist[actual_cem]++; + if (log_blk.m_dual_plane) + total_dp++; + + subset_used_hist[open_range_check(log_blk.m_num_partitions - 1, std::size(subset_used_hist))]++; + + bool used_bc = false; + for (uint32_t i = 0; i < tm.m_num_parts; i++) + { + 
if (astc_helpers::used_blue_contraction(actual_cem, log_blk.m_endpoints + i * astc_helpers::get_num_cem_values(actual_cem), log_blk.m_endpoint_ise_range)) + { + used_bc = true; + } + } + + if (used_bc) + cem_used_bc[actual_cem]++; + + if (tm.m_num_parts > 1) + cem_used_subsets[actual_cem]++; + + // TODO: add CCS index histogram per CEM + if (log_blk.m_dual_plane) + cem_used_dp[actual_cem]++; + + if ((actual_cem == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || + (actual_cem == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET)) + { + total_base_ofs++; + } + + grid_usage_hist[open_range_check(log_blk.m_grid_width * log_blk.m_grid_height, std::size(grid_usage_hist))]++; + + if (tm.m_num_parts > 1) + total_blocks_using_subsets++; + } + + astc_helpers::pack_stats pack_stats; + pack_stats.clear(); + + astc_helpers::astc_block temp_phys_block; + int expected_endpoint_range = 0; + status = astc_helpers::pack_astc_block(temp_phys_block, log_blk, &expected_endpoint_range, &pack_stats); + assert(status); + + total_header_bits += pack_stats.m_header_bits; + total_weight_bits += pack_stats.m_weight_bits; + total_endpoint_bits += pack_stats.m_endpoint_bits; + + } // bx + } // by + + uint32_t total_used_modes = 0; + + fmt_debug_printf("--------------------- Trial Modes:\n"); + + for (uint32_t i = 0; i < trial_mode_hist.size(); i++) + { + if (!trial_mode_hist[i]) + continue; + + if (trial_mode_hist[i]) + total_used_modes++; + +#if 0 + const uint32_t total_mode_blocks = trial_mode_hist[i]; + + const uint32_t num_subsets = encoder_trial_modes[i].m_num_parts; + const uint32_t cem_index = encoder_trial_modes[i].m_cem; + + fmt_debug_printf("{}: {} {3.2}%: cem: {}, grid {}x{}, e: {} w: {}, ccs: {}, parts: {}, total base+ofs: {}, total direct: {}\n", i, total_mode_blocks, (float)total_mode_blocks * 100.0f / (float)total_blocks, + encoder_trial_modes[i].m_cem, + encoder_trial_modes[i].m_grid_width, encoder_trial_modes[i].m_grid_height, + 
astc_helpers::get_ise_levels(encoder_trial_modes[i].m_endpoint_ise_range), astc_helpers::get_ise_levels(encoder_trial_modes[i].m_weight_ise_range), + encoder_trial_modes[i].m_ccs_index, + encoder_trial_modes[i].m_num_parts, + used_base_offset_count[i], + used_rgb_direct_count[i]); +#endif + } + + fmt_debug_printf("\n"); + + fmt_debug_printf("Used endpoint ISE levels:\n"); + for (uint32_t i = 0; i < std::size(used_endpoint_levels_hist); i++) + fmt_debug_printf("{} levels: {}\n", astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + i), used_endpoint_levels_hist[i]); + + fmt_debug_printf("\nUsed weight ISE levels:\n"); + for (uint32_t i = 0; i < std::size(used_weight_levels_hist); i++) + fmt_debug_printf("{} levels: {}\n", astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE + i), used_weight_levels_hist[i]); + + const uint32_t total_blocks_excluding_void_extent = total_blocks - total_void_extent; + + fmt_debug_printf("\nTotal blocks: {}, excluding void extent: {}\n", total_blocks, total_blocks_excluding_void_extent); + fmt_debug_printf("Total void extent blocks skipped by compressor: {}\n", total_void_extent_blocks_skipped); + fmt_debug_printf("Total final void extent blocks: {}\n", total_void_extent); + fmt_debug_printf("Total input blocks with alpha: {} {3.1}%\n", total_alpha_blocks, (float)total_alpha_blocks * 100.0f / (float)total_blocks); + + fmt_debug_printf("\nASTC phys avg block stats (including void extent):\n"); + fmt_debug_printf("Total header bits: {}, {} per block, {} per pixel\n", total_header_bits, (float)total_header_bits / (float)total_blocks, (float)total_header_bits / (float)(total_pixels)); + fmt_debug_printf("Total weight bits: {}, {} per block, {} per pixel\n", total_weight_bits, (float)total_weight_bits / (float)total_blocks, (float)total_weight_bits / (float)(total_pixels)); + fmt_debug_printf("Total endpoint bits: {}, {} per block, {} per pixel\n", total_endpoint_bits, (float)total_endpoint_bits / 
(float)total_blocks, (float)total_endpoint_bits / (float)(total_pixels)); + fmt_debug_printf("Total header+endpoint bits: {}, {} per block, {} per pixel\n", total_header_bits + total_endpoint_bits, + (float)(total_header_bits + total_endpoint_bits) / (float)total_blocks, (float)(total_header_bits + total_endpoint_bits) / (float)(total_pixels)); + fmt_debug_printf("Total header+endpoint+weight bits: {}, {} per block, {} per pixel\n", total_header_bits + total_endpoint_bits + total_weight_bits, + (float)(total_header_bits + total_endpoint_bits + total_weight_bits) / (float)total_blocks, (float)(total_header_bits + total_endpoint_bits + total_weight_bits) / (float)(total_pixels)); + + fmt_debug_printf("\nEncoder stats:\n"); + fmt_debug_printf("Total utilized encoder trial modes: {} {3.2}%\n", total_used_modes, (float)total_used_modes * 100.0f / (float)encoder_trial_modes.size()); + + const uint32_t total_blurred_blocks = total_blurred_blocks1 + total_blurred_blocks2 + total_blurred_blocks3 + total_blurred_blocks4; + + fmt_debug_printf("\nTotal blur encodes: {} ({3.2}%)\n", total_blur_encodes, (float)total_blur_encodes * 100.0f / (float)total_blocks); + fmt_debug_printf("Total blurred blocks: {} ({3.2}%)\n", total_blurred_blocks, (float)total_blurred_blocks * 100.0f / (float)total_blocks); + fmt_debug_printf("Total blurred1 blocks: {} ({3.2}%)\n", total_blurred_blocks1, (float)total_blurred_blocks1 * 100.0f / (float)total_blocks); + fmt_debug_printf("Total blurred2 blocks: {} ({3.2}%)\n", total_blurred_blocks2, (float)total_blurred_blocks2 * 100.0f / (float)total_blocks); + fmt_debug_printf("Total blurred3 blocks: {} ({3.2}%)\n", total_blurred_blocks3, (float)total_blurred_blocks3 * 100.0f / (float)total_blocks); + fmt_debug_printf("Total blurred4 blocks: {} ({3.2}%)\n", total_blurred_blocks4, (float)total_blurred_blocks4 * 100.0f / (float)total_blocks); + + fmt_debug_printf("\nTotal superbuckets created: {} ({4.1} per block)\n", total_superbuckets_created, 
(float)total_superbuckets_created / (float)total_blocks); + fmt_debug_printf("Total shortlist buckets created: {} ({4.1} per block)\n", total_buckets_created, (float)total_buckets_created / (float)total_blocks); + fmt_debug_printf("Total surrogate encodes: {} ({4.1} per block)\n", total_surrogate_encodes, (float)total_surrogate_encodes / (float)total_blocks); + fmt_debug_printf("Total shortlist candidates (before full encoding): {} ({4.1} per block)\n", total_shortlist_candidates, (float)total_shortlist_candidates / (float)total_blocks); + fmt_debug_printf("Total full encodes on superpass 0: {} ({4.1} per block)\n", total_full_encodes, (float)total_full_encodes / (float)total_blocks); + fmt_debug_printf("Total full encodes on superpass 1: {} ({4.1} per block)\n", total_full_encodes_pass1, (float)total_full_encodes_pass1 / (float)total_blocks); + fmt_debug_printf("Total full encodes on superpass 2: {} ({4.1} per block)\n", total_full_encodes_pass2, (float)total_full_encodes_pass2 / (float)total_blocks); + + debug_printf("\nTotal final encoded ASTC blocks using blue contraction: %u (%.2f%%)\n", total_used_bc, 100.0f * (float)total_used_bc / (float)total_blocks); + + fmt_debug_printf("Total final encoded ASTC blocks using dual planes: {} {3.2}%\n", total_dp, (float)total_dp * 100.0f / (float)total_blocks); + fmt_debug_printf("Total final encoded ASTC blocks using base+ofs: {} {3.2}%\n", total_dp, (float)total_base_ofs * 100.0f / (float)total_blocks); + fmt_debug_printf("Total final encoded ASTC blocks using subsets: {} {3.2}%\n", total_blocks_using_subsets, (float)total_blocks_using_subsets * 100.0f / (float)total_blocks); + + debug_printf("\nSubset usage histogram:\n"); + for (uint32_t i = 0; i < 4; i++) + fmt_debug_printf("{} subsets: {} {3.2}%\n", i + 1, subset_used_hist[i], (float)subset_used_hist[i] * 100.0f / (float)total_blocks); + debug_printf("\n"); + + debug_printf("CEM usage histogram:\n"); + for (uint32_t i = 0; i < 16; i++) + { + if 
(astc_helpers::is_cem_hdr(i)) + continue; + + std::string n(astc_helpers::get_cem_name(i)); + while (n.size() < 40) + n.push_back(' '); + + fmt_debug_printf("{}: {} {3.2}%, Used BC: {3.2}%, Used subsets: {3.2}%, Used DP: {3.2}%\n", + n, + cem_used_hist[i], + (float)cem_used_hist[i] * 100.0f / (float)total_blocks, + (float)cem_used_bc[i] * 100.0f / (float)total_blocks, + (float)cem_used_subsets[i] * 100.0f / (float)total_blocks, + (float)cem_used_dp[i] * 100.0f / (float)total_blocks); + } + debug_printf("\n"); + + debug_printf("Grid samples histogram:\n"); + for (uint32_t i = 1; i <= block_width * block_height; i++) + { + if (grid_usage_hist[i]) + fmt_debug_printf("{} samples: {} {3.2}%\n", i, grid_usage_hist[i], (float)grid_usage_hist[i] * 100.0f / (float)total_blocks); + } + debug_printf("\n"); + + fmt_debug_printf("orig vs. ASTC compressed:\n"); + print_image_metrics(orig_img, unpacked_img); + + fmt_debug_printf("Total encode time: {.3} secs, {.3} ms per block, {.1} blocks/sec\n", total_enc_time, total_enc_time * 1000.0f / total_blocks, total_blocks / total_enc_time); + + fmt_debug_printf("OK\n"); + } + + return true; +} + +//const uint32_t rice_zero_run_m = 3, rice_dct_coeff_m = 2; + +const uint_vec& separate_tm_index(uint32_t block_width, uint32_t block_height, const basist::astc_ldr_t::grouped_trial_modes& grouped_enc_trial_modes, const basist::astc_ldr_t::trial_mode& tm, + uint32_t& cem_index, uint32_t& subset_index, uint32_t& ccs_index, uint32_t& grid_size, uint32_t& grid_aniso) +{ + cem_index = tm.m_cem; + assert(cem_index < basist::astc_ldr_t::OTM_NUM_CEMS); + + subset_index = tm.m_num_parts - 1; + assert(subset_index < basist::astc_ldr_t::OTM_NUM_SUBSETS); + + ccs_index = tm.m_ccs_index + 1; + assert(ccs_index < basist::astc_ldr_t::OTM_NUM_CCS); + + grid_size = (tm.m_grid_width >= (block_width - 1)) && (tm.m_grid_height >= (block_height - 1)); + grid_aniso = basist::astc_ldr_t::calc_grid_aniso_val(tm.m_grid_width, tm.m_grid_height, block_width, 
block_height); + + const uint_vec& modes = grouped_enc_trial_modes.m_tm_groups[cem_index][subset_index][ccs_index][grid_size][grid_aniso]; + return modes; +} + +static bool compare_log_block_configs(const astc_helpers::log_astc_block& trial_log_blk, const astc_helpers::log_astc_block& neighbor_log_blk) +{ + assert(!trial_log_blk.m_solid_color_flag_ldr); + + if (neighbor_log_blk.m_solid_color_flag_ldr) + return false; + + if ((trial_log_blk.m_color_endpoint_modes[0] == neighbor_log_blk.m_color_endpoint_modes[0]) && + (trial_log_blk.m_dual_plane == neighbor_log_blk.m_dual_plane) && (trial_log_blk.m_color_component_selector == neighbor_log_blk.m_color_component_selector) && + (trial_log_blk.m_num_partitions == neighbor_log_blk.m_num_partitions) && (trial_log_blk.m_partition_id == neighbor_log_blk.m_partition_id) && + (trial_log_blk.m_grid_width == neighbor_log_blk.m_grid_width) && (trial_log_blk.m_grid_height == neighbor_log_blk.m_grid_height) && + (trial_log_blk.m_endpoint_ise_range == neighbor_log_blk.m_endpoint_ise_range) && (trial_log_blk.m_weight_ise_range == neighbor_log_blk.m_weight_ise_range)) + { + return true; + } + + return false; +} + +static bool compare_log_block_configs_and_endpoints(const astc_helpers::log_astc_block& trial_log_blk, const astc_helpers::log_astc_block& neighbor_log_blk) +{ + if (!compare_log_block_configs(trial_log_blk, neighbor_log_blk)) + return false; + + const uint32_t total_endpoint_vals = trial_log_blk.m_num_partitions * astc_helpers::get_num_cem_values(trial_log_blk.m_color_endpoint_modes[0]); + if (memcmp(trial_log_blk.m_endpoints, neighbor_log_blk.m_endpoints, total_endpoint_vals) == 0) + return true; + + return false; +} + +static bool compare_log_blocks_for_equality(const astc_helpers::log_astc_block& trial_log_blk, const astc_helpers::log_astc_block& neighbor_log_blk) +{ + if (trial_log_blk.m_solid_color_flag_ldr) + { + if (!neighbor_log_blk.m_solid_color_flag_ldr) + return false; + + for (uint32_t i = 0; i < 4; i++) + if 
(trial_log_blk.m_solid_color[i] != neighbor_log_blk.m_solid_color[i]) + return false; + + return true; + } + else if (neighbor_log_blk.m_solid_color_flag_ldr) + { + return false; + } + + assert(!trial_log_blk.m_solid_color_flag_ldr && !neighbor_log_blk.m_solid_color_flag_ldr); + + if ((trial_log_blk.m_color_endpoint_modes[0] == neighbor_log_blk.m_color_endpoint_modes[0]) && + (trial_log_blk.m_dual_plane == neighbor_log_blk.m_dual_plane) && (trial_log_blk.m_color_component_selector == neighbor_log_blk.m_color_component_selector) && + (trial_log_blk.m_num_partitions == neighbor_log_blk.m_num_partitions) && (trial_log_blk.m_partition_id == neighbor_log_blk.m_partition_id) && + (trial_log_blk.m_grid_width == neighbor_log_blk.m_grid_width) && (trial_log_blk.m_grid_height == neighbor_log_blk.m_grid_height) && + (trial_log_blk.m_endpoint_ise_range == neighbor_log_blk.m_endpoint_ise_range) && (trial_log_blk.m_weight_ise_range == neighbor_log_blk.m_weight_ise_range)) + { + const uint32_t total_endpoint_vals = trial_log_blk.m_num_partitions * astc_helpers::get_num_cem_values(trial_log_blk.m_color_endpoint_modes[0]); + if (memcmp(trial_log_blk.m_endpoints, neighbor_log_blk.m_endpoints, total_endpoint_vals) == 0) + { + const uint32_t total_weights = (trial_log_blk.m_dual_plane ? 
2 : 1) * (trial_log_blk.m_grid_width * trial_log_blk.m_grid_height); + return memcmp(trial_log_blk.m_weights, neighbor_log_blk.m_weights, total_weights) == 0; + } + } + + return false; +} + +void configure_encoder_effort_level(int level, ldr_astc_block_encode_image_high_level_config& cfg) +{ + switch (level) + { + case 10: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + cfg.m_force_all_dual_plane_chan_evals = true; + cfg.m_filter_by_pca_angles_flag = false; + + cfg.m_superbucket_max_to_retain[0] = 256; + cfg.m_superbucket_max_to_retain[1] = 256; + cfg.m_superbucket_max_to_retain[2] = 256; + + cfg.m_base_parts2 = 128; + cfg.m_base_parts3 = 128; + cfg.m_part2_fraction_to_keep = 1; + cfg.m_part3_fraction_to_keep = 1; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 128; + cfg.m_final_shortlist_max_size[1] = 128; + cfg.m_final_shortlist_max_size[2] = 128; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 1024; + cfg.m_superbucket_max_to_retain_p2[1] = 1024; + cfg.m_superbucket_max_to_retain_p2[2] = 1024; + cfg.m_final_shortlist_max_size_p2[0] = 256; + cfg.m_final_shortlist_max_size_p2[1] = 256; + cfg.m_final_shortlist_max_size_p2[2] = 256; + cfg.m_base_parts2_p2 = 128; + cfg.m_base_parts3_p2 = 128; + cfg.m_force_all_dp_chans_p2 = true; + cfg.m_filter_by_pca_angles_flag_p2 = false; + + cfg.m_final_encode_always_try_rgb_direct = true; + + cfg.m_early_stop_wpsnr = 90.0f; + cfg.m_early_stop2_wpsnr = 90.0f; + cfg.m_grid_hv_filtering = false; + cfg.m_low_freq_block_filtering = false; + + break; + } + case 9: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + 
cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 8; + cfg.m_superbucket_max_to_retain[1] = 16; + cfg.m_superbucket_max_to_retain[2] = 32; + + cfg.m_base_parts2 = 32; + cfg.m_base_parts3 = 32; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 4; + cfg.m_final_shortlist_max_size[1] = 12; + cfg.m_final_shortlist_max_size[2] = 24; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 16; + cfg.m_superbucket_max_to_retain_p2[1] = 64; + cfg.m_superbucket_max_to_retain_p2[2] = 256; + cfg.m_final_shortlist_max_size_p2[0] = 8; + cfg.m_final_shortlist_max_size_p2[1] = 16; + cfg.m_final_shortlist_max_size_p2[2] = 32; + cfg.m_base_parts2_p2 = 64; + cfg.m_base_parts3_p2 = 64; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = false; + + cfg.m_final_encode_always_try_rgb_direct = false; + + cfg.m_early_stop_wpsnr = 75.0f; + cfg.m_early_stop2_wpsnr = 70.0f; + + break; + } + case 8: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 4; + cfg.m_superbucket_max_to_retain[1] = 8; + cfg.m_superbucket_max_to_retain[2] = 16; + + cfg.m_base_parts2 = 16; + cfg.m_base_parts3 = 16; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + 
cfg.m_final_shortlist_max_size[0] = 3; + cfg.m_final_shortlist_max_size[1] = 8; + cfg.m_final_shortlist_max_size[2] = 12; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 16; + cfg.m_superbucket_max_to_retain_p2[1] = 64; + cfg.m_superbucket_max_to_retain_p2[2] = 256; + cfg.m_final_shortlist_max_size_p2[0] = 8; + cfg.m_final_shortlist_max_size_p2[1] = 16; + cfg.m_final_shortlist_max_size_p2[2] = 32; + cfg.m_base_parts2_p2 = 64; + cfg.m_base_parts3_p2 = 64; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = false; + + cfg.m_final_encode_always_try_rgb_direct = false; + + cfg.m_early_stop_wpsnr = 75.0f; + cfg.m_early_stop2_wpsnr = 70.0f; + break; + } + case 7: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + cfg.m_disable_rgb_dual_plane = false; + cfg.m_strong_dp_decorr_thresh_rgb = .9f; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 3; + cfg.m_superbucket_max_to_retain[1] = 7; + cfg.m_superbucket_max_to_retain[2] = 12; + + cfg.m_base_parts2 = 12; + cfg.m_base_parts3 = 12; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 2; + cfg.m_final_shortlist_max_size[1] = 4; + cfg.m_final_shortlist_max_size[2] = 8; + + cfg.m_gradient_descent_flag = true; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = true; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 4; + 
cfg.m_superbucket_max_to_retain_p2[1] = 16; + cfg.m_superbucket_max_to_retain_p2[2] = 32; + cfg.m_final_shortlist_max_size_p2[0] = 4; + cfg.m_final_shortlist_max_size_p2[1] = 16; + cfg.m_final_shortlist_max_size_p2[2] = 32; + cfg.m_base_parts2_p2 = 32; + cfg.m_base_parts3_p2 = 8; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = true; + + cfg.m_early_stop_wpsnr = 65.0f; + cfg.m_early_stop2_wpsnr = 60.0f; + break; + } + case 6: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + cfg.m_disable_rgb_dual_plane = false; + cfg.m_strong_dp_decorr_thresh_rgb = .75f; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 2; + cfg.m_superbucket_max_to_retain[1] = 5; + cfg.m_superbucket_max_to_retain[2] = 10; + + cfg.m_base_parts2 = 12; + cfg.m_base_parts3 = 10; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 4; + cfg.m_final_shortlist_max_size[2] = 8; + + cfg.m_gradient_descent_flag = true; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = true; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 2; + cfg.m_superbucket_max_to_retain_p2[1] = 8; + cfg.m_superbucket_max_to_retain_p2[2] = 16; + cfg.m_final_shortlist_max_size_p2[0] = 2; + cfg.m_final_shortlist_max_size_p2[1] = 8; + cfg.m_final_shortlist_max_size_p2[2] = 16; + cfg.m_base_parts2_p2 = 32; + cfg.m_base_parts3_p2 = 8; + cfg.m_force_all_dp_chans_p2 = false; + 
cfg.m_filter_by_pca_angles_flag_p2 = true; + + cfg.m_early_stop_wpsnr = 65.0f; + cfg.m_early_stop2_wpsnr = 60.0f; + break; + } + case 5: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + cfg.m_disable_rgb_dual_plane = false; + cfg.m_strong_dp_decorr_thresh_rgb = .75f; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 1; + cfg.m_superbucket_max_to_retain[1] = 4; + cfg.m_superbucket_max_to_retain[2] = 8; + + cfg.m_base_parts2 = 12; + cfg.m_base_parts3 = 8; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 4; + cfg.m_final_shortlist_max_size[2] = 8; + + cfg.m_gradient_descent_flag = true; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = false; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 2; + cfg.m_superbucket_max_to_retain_p2[1] = 8; + cfg.m_superbucket_max_to_retain_p2[2] = 16; + cfg.m_final_shortlist_max_size_p2[0] = 2; + cfg.m_final_shortlist_max_size_p2[1] = 8; + cfg.m_final_shortlist_max_size_p2[2] = 16; + cfg.m_base_parts2_p2 = 32; + cfg.m_base_parts3_p2 = 8; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = true; + + cfg.m_early_stop_wpsnr = 65.0f; + cfg.m_early_stop2_wpsnr = 60.0f; + break; + } + case 4: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = true; + + 
cfg.m_disable_rgb_dual_plane = false; + cfg.m_strong_dp_decorr_thresh_rgb = .75f; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 1; + cfg.m_superbucket_max_to_retain[1] = 4; + cfg.m_superbucket_max_to_retain[2] = 8; + + cfg.m_base_parts2 = 8; + cfg.m_base_parts3 = 4; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 4; + cfg.m_final_shortlist_max_size[2] = 8; + + cfg.m_gradient_descent_flag = true; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = false; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 2; + cfg.m_superbucket_max_to_retain_p2[1] = 8; + cfg.m_superbucket_max_to_retain_p2[2] = 16; + cfg.m_final_shortlist_max_size_p2[0] = 2; + cfg.m_final_shortlist_max_size_p2[1] = 8; + cfg.m_final_shortlist_max_size_p2[2] = 16; + cfg.m_base_parts2_p2 = 32; + cfg.m_base_parts3_p2 = 8; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = true; + + cfg.m_early_stop_wpsnr = 65.0f; + cfg.m_early_stop2_wpsnr = 60.0f; + break; + } + default: + case 3: + { + cfg.m_second_superpass_refinement = true; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = false; + + cfg.m_disable_rgb_dual_plane = false; + cfg.m_strong_dp_decorr_thresh_rgb = .75f; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 1; + cfg.m_superbucket_max_to_retain[1] = 4; + cfg.m_superbucket_max_to_retain[2] = 8; + + cfg.m_base_parts2 = 
4; + cfg.m_base_parts3 = 2; + cfg.m_part2_fraction_to_keep = 2; + cfg.m_part3_fraction_to_keep = 2; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 4; + cfg.m_final_shortlist_max_size[2] = 8; + + cfg.m_gradient_descent_flag = true; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = false; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .075f; + cfg.m_superbucket_max_to_retain_p2[0] = 2; + cfg.m_superbucket_max_to_retain_p2[1] = 8; + cfg.m_superbucket_max_to_retain_p2[2] = 16; + cfg.m_final_shortlist_max_size_p2[0] = 2; + cfg.m_final_shortlist_max_size_p2[1] = 8; + cfg.m_final_shortlist_max_size_p2[2] = 16; + cfg.m_base_parts2_p2 = 32; + cfg.m_base_parts3_p2 = 8; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = true; + + cfg.m_early_stop_wpsnr = 65.0f; + cfg.m_early_stop2_wpsnr = 60.0f; + break; + } + case 2: + { + // Level 2+ have subsets and RGB dual-plane enabled + cfg.m_second_superpass_refinement = false; + cfg.m_third_superpass_try_neighbors = true; + + cfg.m_subsets_enabled = true; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = false; + cfg.m_disable_rgb_dual_plane = false; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 1; + cfg.m_superbucket_max_to_retain[1] = 2; + cfg.m_superbucket_max_to_retain[2] = 3; + + cfg.m_base_parts2 = 1; + cfg.m_base_parts3 = 0; + cfg.m_part2_fraction_to_keep = 1; + cfg.m_part3_fraction_to_keep = 1; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 2; + 
cfg.m_final_shortlist_max_size[2] = 3; + + cfg.m_gradient_descent_flag = false; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = false; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + // Second superpass + cfg.m_second_superpass_fract_to_recompress = .04f; + cfg.m_second_pass_force_subsets_enabled = true; + cfg.m_superbucket_max_to_retain_p2[0] = 1; + cfg.m_superbucket_max_to_retain_p2[1] = 2; + cfg.m_superbucket_max_to_retain_p2[2] = 8; + cfg.m_final_shortlist_max_size_p2[0] = 1; + cfg.m_final_shortlist_max_size_p2[1] = 2; + cfg.m_final_shortlist_max_size_p2[2] = 8; + cfg.m_base_parts2_p2 = 16; + cfg.m_base_parts3_p2 = 0; + cfg.m_force_all_dp_chans_p2 = false; + cfg.m_filter_by_pca_angles_flag_p2 = true; + + cfg.m_early_stop_wpsnr = 45.0f; + cfg.m_early_stop2_wpsnr = 40.0f; + break; + } + case 1: + { + cfg.m_second_superpass_refinement = false; + cfg.m_third_superpass_try_neighbors = false; + + cfg.m_subsets_enabled = false; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = false; + cfg.m_disable_rgb_dual_plane = true; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 1; + cfg.m_superbucket_max_to_retain[1] = 1; + cfg.m_superbucket_max_to_retain[2] = 1; + + cfg.m_base_parts2 = 0; + cfg.m_base_parts3 = 0; + cfg.m_part2_fraction_to_keep = 1; + cfg.m_part3_fraction_to_keep = 1; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 1; + cfg.m_final_shortlist_max_size[2] = 1; + + cfg.m_gradient_descent_flag = false; + cfg.m_polish_weights_flag = true; + cfg.m_qcd_enabled_flag = false; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + cfg.m_early_stop_wpsnr = 45.0f; + cfg.m_early_stop2_wpsnr = 40.0f; + break; + } + case 0: + { 
+ cfg.m_second_superpass_refinement = false; + cfg.m_third_superpass_try_neighbors = false; + + cfg.m_subsets_enabled = false; + cfg.m_use_blue_contraction = true; + cfg.m_use_base_ofs = false; + cfg.m_disable_rgb_dual_plane = true; + + cfg.m_force_all_dual_plane_chan_evals = false; + cfg.m_filter_by_pca_angles_flag = true; + + cfg.m_superbucket_max_to_retain[0] = 1; + cfg.m_superbucket_max_to_retain[1] = 1; + cfg.m_superbucket_max_to_retain[2] = 1; + + cfg.m_base_parts2 = 0; + cfg.m_base_parts3 = 0; + cfg.m_part2_fraction_to_keep = 1; + cfg.m_part3_fraction_to_keep = 1; + + cfg.m_final_shortlist_fraction[0] = 1.0f; + cfg.m_final_shortlist_fraction[1] = 1.0f; + cfg.m_final_shortlist_fraction[2] = 1.0f; + + cfg.m_final_shortlist_max_size[0] = 1; + cfg.m_final_shortlist_max_size[1] = 1; + cfg.m_final_shortlist_max_size[2] = 1; + + cfg.m_gradient_descent_flag = false; + cfg.m_polish_weights_flag = false; + cfg.m_qcd_enabled_flag = false; + + cfg.m_bucket_pruning_passes = false; + cfg.m_cem_enc_params.m_max_ls_passes = 1; + + cfg.m_early_stop_wpsnr = 45.0f; + cfg.m_early_stop2_wpsnr = 40.0f; + break; + } + } +} + +static bool zstd_compress(const uint8_t* pData, size_t data_len, uint8_vec& comp_data, int zstd_level) +{ + if (!data_len) + { + comp_data.resize(0); + return true; + } + + assert(pData); + + comp_data.resize(ZSTD_compressBound(data_len)); + + size_t result = ZSTD_compress(comp_data.data(), comp_data.size(), pData, data_len, zstd_level); + + if (ZSTD_isError(result)) + { + comp_data.resize(0); + return false; + } + + if (result > UINT32_MAX) + { + comp_data.resize(0); + return false; + } + + comp_data.resize(result); + return true; +} + +static bool zstd_compress(const bitwise_coder& coder, uint8_vec& comp_data, int zstd_level) +{ + return zstd_compress(coder.get_bytes().data(), coder.get_bytes().size(), comp_data, zstd_level); +} + +static bool zstd_compress(const uint8_vec& vec, uint8_vec& comp_data, int zstd_level) +{ + return zstd_compress(vec.data(), 
vec.size(), comp_data, zstd_level); +} + +static uint32_t encode_values(bitwise_coder& coder, uint32_t total_values, const uint8_t* pVals, uint32_t endpoint_range) +{ + const uint32_t MAX_VALS = 64; + uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3]; + uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1; + + assert((total_values) && (total_values <= MAX_VALS)); + + const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0]; + const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1]; + const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2]; + + for (uint32_t i = 0; i < total_values; i++) + { + uint32_t val = pVals[i]; + + uint32_t bits = val & ((1 << ep_bits) - 1); + uint32_t tq = val >> ep_bits; + + bit_values[i] = bits; + + if (ep_trits) + { + assert(tq < 3); + tq_accum += tq * tq_mul; + tq_mul *= 3; + if (tq_mul == 243) + { + assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values)); + tq_values[total_tq_values++] = tq_accum; + tq_accum = 0; + tq_mul = 1; + } + } + else if (ep_quints) + { + assert(tq < 5); + tq_accum += tq * tq_mul; + tq_mul *= 5; + if (tq_mul == 125) + { + assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values)); + tq_values[total_tq_values++] = tq_accum; + tq_accum = 0; + tq_mul = 1; + } + } + } + + uint32_t total_bits_output = 0; + + for (uint32_t i = 0; i < total_tq_values; i++) + { + const uint32_t num_bits = ep_trits ? 
8 : 7; + coder.put_bits(tq_values[i], num_bits); + total_bits_output += num_bits; + } + + if (tq_mul > 1) + { + uint32_t num_bits; + if (ep_trits) + { + if (tq_mul == 3) + num_bits = 2; + else if (tq_mul == 9) + num_bits = 4; + else if (tq_mul == 27) + num_bits = 5; + else //if (tq_mul == 81) + num_bits = 7; + } + else + { + if (tq_mul == 5) + num_bits = 3; + else //if (tq_mul == 25) + num_bits = 5; + } + coder.put_bits(tq_accum, num_bits); + total_bits_output += num_bits; + } + + for (uint32_t i = 0; i < total_values; i++) + { + coder.put_bits(bit_values[i], ep_bits); + total_bits_output += ep_bits; + } + + return total_bits_output; +} + +static bool compress_image_full_zstd( + const image& orig_img, uint8_vec& comp_data, vector2D& coded_blocks, + const astc_ldr_encode_config& global_cfg, + job_pool& job_pool, + ldr_astc_block_encode_image_high_level_config& enc_cfg, const ldr_astc_block_encode_image_output& enc_out) +{ + BASISU_NOTE_UNUSED(job_pool); + + const uint32_t width = orig_img.get_width(), height = orig_img.get_height(); + + const uint32_t block_width = global_cfg.m_astc_block_width; + const uint32_t block_height = global_cfg.m_astc_block_height; + const uint32_t total_block_pixels = block_width * block_height; + + const uint32_t total_pixels = width * height; + const uint32_t num_blocks_x = (width + block_width - 1) / block_width; + const uint32_t num_blocks_y = (height + block_height - 1) / block_height; + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + const bool has_alpha = orig_img.has_alpha(); + + // Mode + uint8_vec mode_bytes; + mode_bytes.reserve(8192); + + bitwise_coder raw_bits; + raw_bits.init(8192); + + uint8_vec solid_dpcm_bytes; + solid_dpcm_bytes.reserve(8192); + + // Endpoints + uint8_vec endpoint_dpcm_reuse_indices; + endpoint_dpcm_reuse_indices.reserve(8192); + + bitwise_coder use_bc_bits; + use_bc_bits.init(1024); + + bitwise_coder endpoint_dpcm_3bit; + endpoint_dpcm_3bit.init(1024); + + bitwise_coder endpoint_dpcm_4bit; 
+ endpoint_dpcm_4bit.init(1024); + + uint8_vec endpoint_dpcm_5bit; + endpoint_dpcm_5bit.reserve(8192); + + uint8_vec endpoint_dpcm_6bit; + endpoint_dpcm_6bit.reserve(8192); + + uint8_vec endpoint_dpcm_7bit; + endpoint_dpcm_7bit.reserve(8192); + + uint8_vec endpoint_dpcm_8bit; + endpoint_dpcm_8bit.reserve(8192); + + // Weights + bitwise_coder mean0_bits; + uint8_vec mean1_bytes; + uint8_vec run_bytes; + uint8_vec coeff_bytes; + bitwise_coder sign_bits; + bitwise_coder weight2_bits; + bitwise_coder weight3_bits; + bitwise_coder weight4_bits; + uint8_vec weight8_bits; + + mean0_bits.init(1024); + mean1_bytes.reserve(1024); + run_bytes.reserve(8192); + coeff_bytes.reserve(8192); + sign_bits.init(1024); + weight2_bits.init(1024); + weight3_bits.init(1024); + weight4_bits.init(1024); + weight8_bits.reserve(8192); + + const float replacement_min_psnr = has_alpha ? global_cfg.m_replacement_min_psnr_alpha : global_cfg.m_replacement_min_psnr; + const float psnr_trial_diff_thresh = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_alpha : global_cfg.m_psnr_trial_diff_thresh; + const float psnr_trial_diff_thresh_edge = has_alpha ? 
global_cfg.m_psnr_trial_diff_thresh_edge_alpha : global_cfg.m_psnr_trial_diff_thresh_edge; + const float total_comp_weights = enc_cfg.m_cem_enc_params.get_total_comp_weights(); + + basist::astc_ldr_t::grid_weight_dct grid_dct; + grid_dct.init(block_width, block_height); + + coded_blocks.resize(num_blocks_x, num_blocks_y); + for (uint32_t y = 0; y < num_blocks_y; y++) + for (uint32_t x = 0; x < num_blocks_x; x++) + coded_blocks(x, y).clear(); + + vector2D prev_block_states(num_blocks_x, num_blocks_y); + + int part2_hash[basist::astc_ldr_t::PART_HASH_SIZE]; + std::fill(part2_hash, part2_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1); + + int part3_hash[basist::astc_ldr_t::PART_HASH_SIZE]; + std::fill(part3_hash, part3_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1); + + int tm_hash[basist::astc_ldr_t::TM_HASH_SIZE]; + std::fill(tm_hash, tm_hash + basist::astc_ldr_t::TM_HASH_SIZE, -1); + + const bool use_run_commands_global_enable = true; + const bool endpoint_dpcm_global_enable = true; + + uint32_t cur_run_len = 0; + + uint32_t total_runs = 0, total_run_blocks = 0, total_nonrun_blocks = 0; + uint32_t total_lossy_replacements = 0; + uint32_t total_solid_blocks = 0; + uint32_t total_full_reuse_commands = 0; + uint32_t total_raw_commands = 0; + uint32_t total_reuse_full_cfg_emitted = 0; + uint32_t total_full_cfg_emitted = 0; + uint32_t num_part_hash_probes = 0; + uint32_t num_part_hash_hits = 0; + uint32_t total_used_endpoint_dpcm = 0; + uint32_t total_used_endpoint_raw = 0; + uint32_t total_used_dct = 0; + uint32_t total_used_weight_dpcm = 0; + uint32_t num_tm_hash_hits = 0, num_tm_hash_probes = 0; + + raw_bits.put_bits(basist::astc_ldr_t::FULL_ZSTD_HEADER_MARKER, basist::astc_ldr_t::FULL_ZSTD_HEADER_MARKER_BITS); + + const int block_dim_index = astc_helpers::find_astc_block_size_index(block_width, block_height); + assert((block_dim_index >= 0) && (block_dim_index < (int)astc_helpers::NUM_ASTC_BLOCK_SIZES)); + + raw_bits.put_bits(block_dim_index, 4); + + 
raw_bits.put_bits(enc_cfg.m_cem_enc_params.m_decode_mode_srgb, 1); + + raw_bits.put_bits(width, 16); + raw_bits.put_bits(height, 16); + + raw_bits.put_bits(has_alpha, 1); + + raw_bits.put_bits(enc_cfg.m_use_dct, 1); + if (enc_cfg.m_use_dct) + { + const int int_q = clamp((int)std::round(global_cfg.m_dct_quality * 2.0f), 0, 200); + raw_bits.put_bits(int_q, 8); + } + + const uint32_t FULL_ZSTD_MAX_RUN_LEN = 64; + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + //const uint32_t base_y = by * block_height; + + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + //const uint32_t base_x = bx * block_width; + //raw_bits.put_bits(0xA1, 8); + + basist::astc_ldr_t::prev_block_state_full_zstd& prev_state = prev_block_states(bx, by); + + const basist::astc_ldr_t::prev_block_state_full_zstd* pLeft_state = bx ? &prev_block_states(bx - 1, by) : nullptr; + const basist::astc_ldr_t::prev_block_state_full_zstd* pUpper_state = by ? &prev_block_states(bx, by - 1) : nullptr; + const basist::astc_ldr_t::prev_block_state_full_zstd* pDiag_state = (bx && by) ? &prev_block_states(bx - 1, by - 1) : nullptr; + + const ldr_astc_block_encode_image_output::block_info& blk_info = enc_out.m_image_block_info(bx, by); + + uint32_t best_packed_out_block_index = blk_info.m_packed_out_block_index; + + // check for run + if ((use_run_commands_global_enable) && (bx || by)) + { + const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index]; + const astc_helpers::log_astc_block& cur_log_blk = blk_out.m_log_blk; + + const astc_helpers::log_astc_block& prev_log_blk = bx ? coded_blocks(bx - 1, by) : coded_blocks(0, by - 1); + const basist::astc_ldr_t::prev_block_state_full_zstd* pPrev_block_state = bx ? pLeft_state : pUpper_state; + + assert(pPrev_block_state); + + if (compare_log_blocks_for_equality(cur_log_blk, prev_log_blk)) + { + // Left or upper is exactly the same logical block, so expand the run. 
+ cur_run_len++; + + // Accept the previous block (left or upper) as if it's been coded normally. + + coded_blocks(bx, by) = prev_log_blk; + + //prev_state.m_was_solid_color = pPrev_block_state->m_was_solid_color; + prev_state.m_tm_index = pPrev_block_state->m_tm_index; + //prev_state.m_base_cem_index = pPrev_block_state->m_base_cem_index; + + if (cur_run_len == FULL_ZSTD_MAX_RUN_LEN) + { + total_runs++; + total_run_blocks += cur_run_len; + mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RUN | ((cur_run_len - 1) << 2))); + cur_run_len = 0; + } + + continue; + } + } + + if (cur_run_len) + { + assert(cur_run_len <= FULL_ZSTD_MAX_RUN_LEN); + + total_runs++; + total_run_blocks += cur_run_len; + mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RUN | ((cur_run_len - 1) << 2))); + cur_run_len = 0; + } + + total_nonrun_blocks++; + + // TODO: Move this to a prepass that's shared between arith/zstd + const float ref_wmse = (float)blk_info.m_out_blocks[best_packed_out_block_index].m_sse / (total_comp_weights * (float)total_block_pixels); + const float ref_wpsnr = (ref_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(ref_wmse)) : 10000.0f; + + if ((global_cfg.m_lossy_supercompression) && (ref_wpsnr >= replacement_min_psnr) && + (!blk_info.m_out_blocks[blk_info.m_packed_out_block_index].m_log_blk.m_solid_color_flag_ldr)) + { + const float psnr_thresh = blk_info.m_strong_edges ? 
psnr_trial_diff_thresh_edge : psnr_trial_diff_thresh; + + float best_alt_wpsnr = 0.0f; + bool found_alternative = false; + + // Pass: 0 consider full config+part ID endpoint reuse + // Pass: 1 fall back to just full config+part ID reuse (no endpoints) + for (uint32_t pass = 0; pass < 2; pass++) + { + // Iterate through all available alternative candidates + for (uint32_t out_block_iter = 0; out_block_iter < blk_info.m_out_blocks.size(); out_block_iter++) + { + if (out_block_iter == blk_info.m_packed_out_block_index) + continue; + + const float trial_wmse = (float)blk_info.m_out_blocks[out_block_iter].m_sse / (total_comp_weights * (float)total_block_pixels); + const float trial_wpsnr = (trial_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(trial_wmse)) : 10000.0f; + + // Reject if PSNR too low + if (trial_wpsnr < (ref_wpsnr - psnr_thresh)) + continue; + + // Reject if inferior than best found so far + if (trial_wpsnr < best_alt_wpsnr) + continue; + + const astc_helpers::log_astc_block& trial_log_blk = blk_info.m_out_blocks[out_block_iter].m_log_blk; + + if (trial_log_blk.m_solid_color_flag_ldr) + continue; + + // Examine nearby neighbors + for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++) + { + int dx = 0, dy = 0; + switch (i) + { + case 0: dx = -1; break; + case 1: dy = -1; break; + case 2: dx = -1; dy = -1; break; + default: assert(0); break; + } + + const int n_bx = bx + dx, n_by = by + dy; + if ((n_bx < 0) || (n_by < 0)) + continue; + + astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by); + + if (neighbor_log_blk.m_solid_color_flag_ldr) + continue; + + bool accept_flag = false; + if (pass == 0) + { + // prefer full config+endpoint equality first + accept_flag = compare_log_block_configs_and_endpoints(trial_log_blk, neighbor_log_blk); + } + else + { + // next check for just config equality + accept_flag = compare_log_block_configs(trial_log_blk, neighbor_log_blk); + } + + if (accept_flag) + { + best_alt_wpsnr = 
trial_wpsnr; + best_packed_out_block_index = out_block_iter; + found_alternative = true; + break; + } + + } // i + + } // out_block_iter + + if (found_alternative) + break; + + } // pass + + if (best_packed_out_block_index != blk_info.m_packed_out_block_index) + total_lossy_replacements++; + + } // global_cfg.m_lossy_supercompression + + const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index]; + + astc_helpers::log_astc_block& cur_log_blk = coded_blocks(bx, by); + + cur_log_blk = blk_out.m_log_blk; + + // Solid color/void extent + if (blk_out.m_trial_mode_index < 0) + { + assert(cur_log_blk.m_solid_color_flag_ldr); + + total_solid_blocks++; + + mode_bytes.push_back((uint8_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_SOLID); + + uint32_t cur_solid_color[4]; + for (uint32_t i = 0; i < 4; i++) + cur_solid_color[i] = blk_out.m_log_blk.m_solid_color[i] >> 8; + + uint32_t prev_solid_color[4] = { 0 }; + + const uint32_t num_comps = has_alpha ? 4 : 3; + + astc_helpers::log_astc_block* pPrev_log_blk = bx ? &coded_blocks(bx - 1, by) : (by ? &coded_blocks(bx, by - 1) : nullptr); + if (pPrev_log_blk) + { + if (pPrev_log_blk->m_solid_color_flag_ldr) + { + prev_solid_color[0] = pPrev_log_blk->m_solid_color[0] >> 8; + prev_solid_color[1] = pPrev_log_blk->m_solid_color[1] >> 8; + prev_solid_color[2] = pPrev_log_blk->m_solid_color[2] >> 8; + prev_solid_color[3] = pPrev_log_blk->m_solid_color[3] >> 8; + } + else + { + // Decode previous block's first CEM, use the halfway point as the predictor. 
+ color_rgba prev_l, prev_h; + decode_endpoints(pPrev_log_blk->m_color_endpoint_modes[0], pPrev_log_blk->m_endpoints, pPrev_log_blk->m_endpoint_ise_range, prev_l, prev_h); + + prev_solid_color[0] = (prev_l[0] + prev_h[0] + 1) >> 1; + prev_solid_color[1] = (prev_l[1] + prev_h[1] + 1) >> 1; + prev_solid_color[2] = (prev_l[2] + prev_h[2] + 1) >> 1; + prev_solid_color[3] = (prev_l[3] + prev_h[3] + 1) >> 1; + } + } + + for (uint32_t i = 0; i < num_comps; i++) + { + const uint32_t delta = (cur_solid_color[i] - prev_solid_color[i]) & 0xFF; + solid_dpcm_bytes.push_back((uint8_t)delta); + } + + //prev_state.m_was_solid_color = true; + prev_state.m_tm_index = -1; + //prev_state.m_base_cem_index = astc_helpers::CEM_LDR_RGB_DIRECT; + + continue; + } + + assert(!cur_log_blk.m_solid_color_flag_ldr); + + int full_cfg_endpoint_reuse_index = -1; + + for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++) + { + int dx = 0, dy = 0; + switch (i) + { + case 0: dx = -1; break; + case 1: dy = -1; break; + case 2: dx = -1; dy = -1; break; + default: assert(0); break; + } + + const int n_bx = bx + dx, n_by = by + dy; + if ((n_bx < 0) || (n_by < 0)) + continue; + + astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by); + + if (neighbor_log_blk.m_solid_color_flag_ldr) + continue; + + if (compare_log_block_configs_and_endpoints(cur_log_blk, neighbor_log_blk)) + { + full_cfg_endpoint_reuse_index = i; + break; + } + } // i + + if (full_cfg_endpoint_reuse_index >= 0) + { + // Reused full config, part ID and endpoint values from an immediate neighbor + mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_REUSE_CFG_ENDPOINTS_LEFT + (full_cfg_endpoint_reuse_index << 2))); + + total_full_reuse_commands++; + + const basist::astc_ldr_t::prev_block_state_full_zstd* pReused_cfg_state = nullptr; + + switch (full_cfg_endpoint_reuse_index) + { + case 0: pReused_cfg_state = pLeft_state; break; + case 1: pReused_cfg_state = pUpper_state; 
break; + case 2: pReused_cfg_state = pDiag_state; break; + default: assert(0); break; + } + + if (!pReused_cfg_state) + { + assert(0); + fmt_error_printf("encoding internal failure\n"); + return false; + } + + assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index); + + prev_state.m_tm_index = blk_out.m_trial_mode_index; + } + else + { + // No nearby full config+part ID+endpoint reuse, so send raw command + // Must send endpoints too. + total_raw_commands++; + + // Format of mode byte (UD bit used in modes other than raw) + // 7 6 5 4 3 2 1 0 + // UD C ED HH BO I I M + + // MMM=mode + // II=neighbor reuse index [0,3], 3=no reuse + // BO=base offset flag + // HH=partition hash hit flag + // ED=endpoint DPCM flag + // C=config hash table hit + // UD=use DCT flag + + mode_bytes.push_back((uint8_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RAW); + + const uint32_t cur_actual_cem = cur_log_blk.m_color_endpoint_modes[0]; + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cur_actual_cem); + + // DO NOT use tm.m_cem because the encoder may have selected a base+ofs variant instead. Use cur_actual_cem. 
+ const basist::astc_ldr_t::trial_mode& tm = enc_out.m_encoder_trial_modes[blk_out.m_trial_mode_index]; + + // Check for config+part ID neighbor reuse (partial refuse) + int neighbor_cfg_match_index = -1; + for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++) + { + const basist::astc_ldr_t::prev_block_state_full_zstd* pNeighbor_state = nullptr; + + int dx = 0, dy = 0; + switch (i) + { + case 0: dx = -1; pNeighbor_state = pLeft_state; break; + case 1: dy = -1; pNeighbor_state = pUpper_state; break; + case 2: dx = -1; dy = -1; pNeighbor_state = pDiag_state; break; + default: assert(0); break; + } + + if (!pNeighbor_state) + continue; + + const int n_bx = bx + dx, n_by = by + dy; + assert((n_bx >= 0) && (n_by >= 0)); + + astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by); + + if (pNeighbor_state->m_tm_index != blk_out.m_trial_mode_index) + continue; + + if (neighbor_log_blk.m_color_endpoint_modes[0] != cur_log_blk.m_color_endpoint_modes[0]) + continue; + + if (neighbor_log_blk.m_partition_id != cur_log_blk.m_partition_id) + continue; + + assert(neighbor_log_blk.m_dual_plane == cur_log_blk.m_dual_plane); + assert(neighbor_log_blk.m_color_component_selector == cur_log_blk.m_color_component_selector); + assert(neighbor_log_blk.m_num_partitions == cur_log_blk.m_num_partitions); + assert(neighbor_log_blk.m_grid_width == cur_log_blk.m_grid_width); + assert(neighbor_log_blk.m_grid_height == cur_log_blk.m_grid_height); + assert(neighbor_log_blk.m_endpoint_ise_range == cur_log_blk.m_endpoint_ise_range); + assert(neighbor_log_blk.m_weight_ise_range == cur_log_blk.m_weight_ise_range); + + neighbor_cfg_match_index = i; + break; + } + + if (neighbor_cfg_match_index >= 0) + { + // Partial reuse (config+partition ID, but not endpoints). 
+ // OR 2-bits into the mode byte + mode_bytes.back() |= (uint8_t)(neighbor_cfg_match_index << 1); + + const basist::astc_ldr_t::prev_block_state_full_zstd* pReused_cfg_state = nullptr; + + switch (neighbor_cfg_match_index) + { + case 0: pReused_cfg_state = pLeft_state; break; + case 1: pReused_cfg_state = pUpper_state; break; + case 2: pReused_cfg_state = pDiag_state; break; + default: assert(0); break; + } + + if (!pReused_cfg_state) + { + assert(0); + fmt_error_printf("encoding internal failure\n"); + return false; + } + + assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index); + + prev_state.m_tm_index = blk_out.m_trial_mode_index; + + total_reuse_full_cfg_emitted++; + } + else + { + // No reuse - must send config, so pack it. Then send endpoints. + total_full_cfg_emitted++; + + // OR 2-bits into the mode byte (so now 5 bits total) + mode_bytes.back() |= (uint8_t)(((uint32_t)basist::astc_ldr_t::cMaxConfigReuseNeighbors) << 1); + + // Pack tm index (ASTC base config) + { + num_tm_hash_probes++; + + uint32_t tm_h = basist::astc_ldr_t::tm_hash_index(blk_out.m_trial_mode_index); + + if (tm_hash[tm_h] == blk_out.m_trial_mode_index) + { + num_tm_hash_hits++; + + mode_bytes.back() |= (uint8_t)basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_TM_HASH_HIT_FLAG; // tm hash hit flag + + raw_bits.put_bits(tm_h, basist::astc_ldr_t::TM_HASH_BITS); + } + else + { + raw_bits.put_truncated_binary(blk_out.m_trial_mode_index, (uint32_t)enc_out.m_encoder_trial_modes.size()); + + tm_hash[tm_h] = blk_out.m_trial_mode_index; + } + } + + prev_state.m_tm_index = blk_out.m_trial_mode_index; + + // Send base_ofs bit if the tm is direct + if ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT)) + { + const bool is_base_ofs = (cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || + (cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); + + if (is_base_ofs) + 
mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_IS_BASE_OFS_FLAG; // base_ofs bit + } + + if (tm.m_num_parts > 1) + { + // Send unique part pattern ID + const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? &enc_out.m_part_data_p2 : &enc_out.m_part_data_p3; + + const uint32_t astc_pat_index = cur_log_blk.m_partition_id; + const uint32_t unique_pat_index = pPart_data->m_part_seed_to_unique_index[astc_pat_index]; + const uint32_t total_unique_indices = pPart_data->m_total_unique_patterns; + assert(unique_pat_index < total_unique_indices); + + num_part_hash_probes++; + + int* pPart_hash = (tm.m_num_parts == 2) ? part2_hash : part3_hash; + + const uint32_t h = basist::astc_ldr_t::part_hash_index(unique_pat_index); + + if (pPart_hash[h] != (int)unique_pat_index) + { +#if defined(_DEBUG) || defined(DEBUG) + // sanity + for (uint32_t i = 0; i < basist::astc_ldr_t::PART_HASH_SIZE; i++) + { + assert(pPart_hash[i] != (int)unique_pat_index); + } +#endif + + raw_bits.put_truncated_binary(unique_pat_index, total_unique_indices); + } + else + { + num_part_hash_hits++; + + mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_PART_HASH_HIT; // hash pat_index hit bit + raw_bits.put_bits(h, basist::astc_ldr_t::PART_HASH_BITS); + } + + pPart_hash[basist::astc_ldr_t::part_hash_index(unique_pat_index)] = unique_pat_index; + } + } + + // Send endpoints + const int num_endpoint_levels = astc_helpers::get_ise_levels(cur_log_blk.m_endpoint_ise_range); + const auto& endpoint_ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(cur_log_blk.m_endpoint_ise_range).m_ISE_to_rank; + + bool endpoints_use_bc[astc_helpers::MAX_PARTITIONS] = { false }; + + if (astc_helpers::cem_supports_bc(cur_actual_cem)) + { + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + const bool cur_uses_bc = astc_helpers::used_blue_contraction(cur_actual_cem, cur_log_blk.m_endpoints + part_iter * total_endpoint_vals, 
cur_log_blk.m_endpoint_ise_range); + + endpoints_use_bc[part_iter] = cur_uses_bc; + + } // part_iter + } + + int best_reuse_bx = -1, best_reuse_by = -1; + uint32_t best_reuse_index = 0; + const astc_helpers::log_astc_block* pEndpoint_pred_log_blk = nullptr; + + if (endpoint_dpcm_global_enable) + { + int64_t best_trial_delta2 = INT64_MAX; + float best_trial_bits = BIG_FLOAT_VAL; + + // TODO: Decide if DPCM is even worth it. + const float N = (float)(total_endpoint_vals * tm.m_num_parts); + + for (uint32_t reuse_index = 0; reuse_index < basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS; reuse_index++) + { + const int rx = (int)bx + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_x; + const int ry = (int)by + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_y; + if ((rx < 0) || (ry < 0) || (rx >= (int)num_blocks_x) || (ry >= (int)num_blocks_y)) + continue; + + const astc_helpers::log_astc_block* pTrial_log_blk = &coded_blocks(rx, ry); + if (pTrial_log_blk->m_solid_color_flag_ldr) + continue; + + uint8_t trial_predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { }; + + uint32_t part_iter; + for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + const bool always_repack_flag = false; + bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false; + + bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems( + pTrial_log_blk->m_color_endpoint_modes[0], pTrial_log_blk->m_endpoint_ise_range, pTrial_log_blk->m_endpoints, + cur_actual_cem, cur_log_blk.m_endpoint_ise_range, trial_predicted_endpoints[part_iter], + always_repack_flag, + endpoints_use_bc[part_iter], false, + blue_contraction_clamped_flag, base_ofs_clamped_flag); + + if (!conv_status) + break; + } // part_iter + + if (part_iter < tm.m_num_parts) + continue; // failed + + int64_t trial_endpoint_delta2 = 0; + for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; 
val_iter++) + { + int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]]; + int prev_e_rank = endpoint_ise_to_rank[trial_predicted_endpoints[part_iter][val_iter]]; + + int e_delta = cur_e_rank - prev_e_rank; + + trial_endpoint_delta2 += e_delta * e_delta; + + } // val_iter + + } // part_iter + + const float mse = (float)trial_endpoint_delta2 / N; + + // Gaussian entropy estimate - precomputed 0.5 * log2(2*pi*e) = ~2.0470956f + const float k_const = 2.0470956f; + + float bits_per_sym = 0.5f * log2f(basisu::maximum(mse, 1e-9f)) + k_const; + + bits_per_sym = clamp(bits_per_sym, 0.05f, 8.0f); + + // total est bits for this block’s endpoints + float total_est_bits = bits_per_sym * N; + + if (total_est_bits < best_trial_bits) + { + best_trial_delta2 = trial_endpoint_delta2; + best_trial_bits = total_est_bits; + + best_reuse_bx = rx; + best_reuse_by = ry; + best_reuse_index = reuse_index; + + if (!best_trial_delta2) + break; + } + + } // reuse_index + + if (best_reuse_bx >= 0) + { + pEndpoint_pred_log_blk = &coded_blocks(best_reuse_bx, best_reuse_by); + + assert(!pEndpoint_pred_log_blk->m_solid_color_flag_ldr); + } + + } // if (endpoint_dpcm_global_enable) + + uint8_t predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { }; + + bool use_dpcm_endpoints = false; + + if (pEndpoint_pred_log_blk) + { + use_dpcm_endpoints = true; + + assert(cur_log_blk.m_num_partitions == tm.m_num_parts); + + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + const bool always_repack_flag = false; + bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false; + + bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems( + pEndpoint_pred_log_blk->m_color_endpoint_modes[0], pEndpoint_pred_log_blk->m_endpoint_ise_range, pEndpoint_pred_log_blk->m_endpoints, + cur_actual_cem, cur_log_blk.m_endpoint_ise_range, predicted_endpoints[part_iter], + always_repack_flag, 
+ endpoints_use_bc[part_iter], false, + blue_contraction_clamped_flag, base_ofs_clamped_flag); + + if (!conv_status) + { + // In practice, should never happen + use_dpcm_endpoints = false; + break; + } + } + } + + // TODO: Decide what is cheaper, endpoint DPCM vs. raw + + if (use_dpcm_endpoints) + { + // DPCM flag bit + mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_DPCM_ENDPOINTS_FLAG; + + endpoint_dpcm_reuse_indices.push_back((uint8_t)best_reuse_index); + + if (astc_helpers::cem_supports_bc(cur_actual_cem)) + { + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + use_bc_bits.put_bits(endpoints_use_bc[part_iter], 1); + + } // part_iter + } + + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++) + { + int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]]; + int prev_e_rank = endpoint_ise_to_rank[predicted_endpoints[part_iter][val_iter]]; + + int e_val = imod(cur_e_rank - prev_e_rank, num_endpoint_levels); + + if (num_endpoint_levels <= 8) + endpoint_dpcm_3bit.put_bits(e_val, 4); + else if (num_endpoint_levels <= 16) + endpoint_dpcm_4bit.put_bits(e_val, 4); + else if (num_endpoint_levels <= 32) + endpoint_dpcm_5bit.push_back((uint8_t)e_val); + else if (num_endpoint_levels <= 64) + endpoint_dpcm_6bit.push_back((uint8_t)e_val); + else if (num_endpoint_levels <= 128) + endpoint_dpcm_7bit.push_back((uint8_t)e_val); + else if (num_endpoint_levels <= 256) + endpoint_dpcm_8bit.push_back((uint8_t)e_val); + + } // val_iter + + } // part_iter + + total_used_endpoint_dpcm++; + } + else + { + encode_values(raw_bits, tm.m_num_parts * total_endpoint_vals, cur_log_blk.m_endpoints, cur_log_blk.m_endpoint_ise_range); + + total_used_endpoint_raw++; + } // if (use_dpcm_endpoints) + + } // if (full_cfg_endpoint_reuse_index >= 0) + + // ------------------------------------ Send weights + + const 
uint32_t total_planes = cur_log_blk.m_dual_plane ? 2 : 1; + const uint32_t total_weights = cur_log_blk.m_grid_width * cur_log_blk.m_grid_height; + + const int num_weight_levels = astc_helpers::get_ise_levels(cur_log_blk.m_weight_ise_range); + const auto& weight_ise_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(cur_log_blk.m_weight_ise_range).m_ISE_to_rank; + + bool use_dct = enc_cfg.m_use_dct; + + // TODO - tune this threshold + const uint32_t SWITCH_TO_DPCM_NUM_COEFF_THRESH = (cur_log_blk.m_grid_width * cur_log_blk.m_grid_height * 45 + 64) >> 7; + + if (use_dct) + { + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + if (syms.m_max_coeff_mag > basist::astc_ldr_t::DCT_MAX_ARITH_COEFF_MAG) + { + use_dct = false; + break; + } + + if (syms.m_coeffs.size() > SWITCH_TO_DPCM_NUM_COEFF_THRESH) + { + use_dct = false; + break; + } + } + } + + // MSB of mode byte=use DCT + if (enc_cfg.m_use_dct) + { + assert((mode_bytes.back() & basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_USE_DCT) == 0); + + if (use_dct) + mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_USE_DCT; + } + + if (use_dct) + { + total_used_dct++; + + if (total_planes > 1) + { + assert(blk_out.m_packed_dct_plane_data[0].m_num_dc_levels == blk_out.m_packed_dct_plane_data[1].m_num_dc_levels); + } + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + + if (syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1) + mean1_bytes.push_back((uint8_t)syms.m_dc_sym); + else + { + assert(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS0); + mean0_bits.put_bits(syms.m_dc_sym, 4); + } + + for (uint32_t i = 0; i < syms.m_coeffs.size(); i++) + { + if (syms.m_coeffs[i].m_coeff == INT16_MAX) + { + run_bytes.push_back(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX); + 
} + else + { + run_bytes.push_back((uint8_t)syms.m_coeffs[i].m_num_zeros); + + sign_bits.put_bits(syms.m_coeffs[i].m_coeff < 0, 1); + + assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255)); + + coeff_bytes.push_back((uint8_t)(iabs(syms.m_coeffs[i].m_coeff) - 1)); + } + } + + } // plane_iter + } + else + { + total_used_weight_dpcm++; + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + int prev_w = num_weight_levels / 2; + + for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++) + { + int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes]; + int w = weight_ise_to_rank[ise_w]; + + int w_to_code = w; + w_to_code = imod(w - prev_w, num_weight_levels); + + prev_w = w; + + if (num_weight_levels <= 4) + weight2_bits.put_bits((uint8_t)w_to_code, 2); + else if (num_weight_levels <= 8) + weight3_bits.put_bits((uint8_t)w_to_code, 4); + else if (num_weight_levels <= 16) + weight4_bits.put_bits((uint8_t)w_to_code, 4); + else + weight8_bits.push_back((uint8_t)w_to_code); + + } // weight_iter + + } // plane_iter + } + + } // bx + + if (cur_run_len) + { + assert(cur_run_len <= FULL_ZSTD_MAX_RUN_LEN); + + total_runs++; + total_run_blocks += cur_run_len; + mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RUN | ((cur_run_len - 1) << 2))); + cur_run_len = 0; + } + + } // by + + raw_bits.put_bits(basist::astc_ldr_t::FINAL_SYNC_MARKER, basist::astc_ldr_t::FINAL_SYNC_MARKER_BITS); + + raw_bits.flush(); + endpoint_dpcm_3bit.flush(); + endpoint_dpcm_4bit.flush(); + use_bc_bits.flush(); + + mean0_bits.flush(); + sign_bits.flush(); + weight2_bits.flush(); + weight3_bits.flush(); + weight4_bits.flush(); + + const uint32_t zstd_level = 9; + + uint8_vec comp_mode, comp_solid_dpcm, comp_endpoint_dpcm_reuse_indices; + uint8_vec comp_use_bc_bits, comp_endpoint_dpcm_3bit, comp_endpoint_dpcm_4bit, comp_endpoint_dpcm_5bit, comp_endpoint_dpcm_6bit, 
comp_endpoint_dpcm_7bit, comp_endpoint_dpcm_8bit; + + // Mode + if (!zstd_compress(mode_bytes, comp_mode, zstd_level)) return false; + if (!zstd_compress(solid_dpcm_bytes, comp_solid_dpcm, zstd_level)) return false; + + // Endpoints + if (!zstd_compress(endpoint_dpcm_reuse_indices, comp_endpoint_dpcm_reuse_indices, zstd_level)) return false; + if (!zstd_compress(use_bc_bits, comp_use_bc_bits, zstd_level)) return false; + if (!zstd_compress(endpoint_dpcm_3bit, comp_endpoint_dpcm_3bit, zstd_level)) return false; + if (!zstd_compress(endpoint_dpcm_4bit, comp_endpoint_dpcm_4bit, zstd_level)) return false; + if (!zstd_compress(endpoint_dpcm_5bit, comp_endpoint_dpcm_5bit, zstd_level)) return false; + if (!zstd_compress(endpoint_dpcm_6bit, comp_endpoint_dpcm_6bit, zstd_level)) return false; + if (!zstd_compress(endpoint_dpcm_7bit, comp_endpoint_dpcm_7bit, zstd_level)) return false; + if (!zstd_compress(endpoint_dpcm_8bit, comp_endpoint_dpcm_8bit, zstd_level)) return false; + + // Weights + uint8_vec comp_mean0, comp_mean1, comp_run, comp_coeff, comp_weight2, comp_weight3, comp_weight4, comp_weight8; + + if (!zstd_compress(mean0_bits, comp_mean0, zstd_level)) return false; + if (!zstd_compress(mean1_bytes, comp_mean1, zstd_level)) return false; + if (!zstd_compress(run_bytes, comp_run, zstd_level)) return false; + if (!zstd_compress(coeff_bytes, comp_coeff, zstd_level)) return false; + if (!zstd_compress(weight2_bits, comp_weight2, zstd_level)) return false; + if (!zstd_compress(weight3_bits, comp_weight3, zstd_level)) return false; + if (!zstd_compress(weight4_bits, comp_weight4, zstd_level)) return false; + if (!zstd_compress(weight8_bits, comp_weight8, zstd_level)) return false; + + basist::astc_ldr_t::xuastc_ldr_full_zstd_header hdr; + clear_obj(hdr); + + hdr.m_flags = (uint8_t)basist::astc_ldr_t::xuastc_ldr_syntax::cFullZStd; + + hdr.m_raw_bits_len = (uint32_t)raw_bits.get_bytes().size(); + hdr.m_mode_bytes_len = (uint32_t)comp_mode.size(); + 
hdr.m_solid_dpcm_bytes_len = (uint32_t)comp_solid_dpcm.size(); + + hdr.m_endpoint_dpcm_reuse_indices_len = (uint32_t)comp_endpoint_dpcm_reuse_indices.size(); + hdr.m_use_bc_bits_len = (uint32_t)comp_use_bc_bits.size(); + hdr.m_endpoint_dpcm_3bit_len = (uint32_t)comp_endpoint_dpcm_3bit.size(); + hdr.m_endpoint_dpcm_4bit_len = (uint32_t)comp_endpoint_dpcm_4bit.size(); + hdr.m_endpoint_dpcm_5bit_len = (uint32_t)comp_endpoint_dpcm_5bit.size(); + hdr.m_endpoint_dpcm_6bit_len = (uint32_t)comp_endpoint_dpcm_6bit.size(); + hdr.m_endpoint_dpcm_7bit_len = (uint32_t)comp_endpoint_dpcm_7bit.size(); + hdr.m_endpoint_dpcm_8bit_len = (uint32_t)comp_endpoint_dpcm_8bit.size(); + + hdr.m_mean0_bits_len = (uint32_t)comp_mean0.size(); + hdr.m_mean1_bytes_len = (uint32_t)comp_mean1.size(); + hdr.m_run_bytes_len = (uint32_t)comp_run.size(); + hdr.m_coeff_bytes_len = (uint32_t)comp_coeff.size(); + hdr.m_sign_bits_len = (uint32_t)sign_bits.get_bytes().size(); + hdr.m_weight2_bits_len = (uint32_t)comp_weight2.size(); + hdr.m_weight3_bits_len = (uint32_t)comp_weight3.size(); + hdr.m_weight4_bits_len = (uint32_t)comp_weight4.size(); + hdr.m_weight8_bytes_len = (uint32_t)comp_weight8.size(); + + comp_data.reserve(8192); + + comp_data.resize(sizeof(hdr)); + memcpy(comp_data.data(), &hdr, sizeof(hdr)); + + comp_data.append(raw_bits.get_bytes()); + comp_data.append(comp_mode); + comp_data.append(comp_solid_dpcm); + + comp_data.append(comp_endpoint_dpcm_reuse_indices); + comp_data.append(comp_use_bc_bits); + comp_data.append(comp_endpoint_dpcm_3bit); + comp_data.append(comp_endpoint_dpcm_4bit); + comp_data.append(comp_endpoint_dpcm_5bit); + comp_data.append(comp_endpoint_dpcm_6bit); + comp_data.append(comp_endpoint_dpcm_7bit); + comp_data.append(comp_endpoint_dpcm_8bit); + + comp_data.append(comp_mean0); + comp_data.append(comp_mean1); + comp_data.append(comp_run); + comp_data.append(comp_coeff); + comp_data.append(sign_bits.get_bytes()); + comp_data.append(comp_weight2); + 
comp_data.append(comp_weight3); + comp_data.append(comp_weight4); + comp_data.append(comp_weight8); + + if (comp_data.size() > UINT32_MAX) + return false; + + if ((global_cfg.m_debug_images) || (global_cfg.m_debug_output)) + { + image coded_img(width, height); + + vector2D phys_blocks(num_blocks_x, num_blocks_y); + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const astc_helpers::log_astc_block& log_blk = coded_blocks(bx, by); + + color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + bool status = astc_helpers::decode_block(log_blk, block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status) + { + fmt_error_printf("astc_helpers::decode_block() failed\n"); + return false; + } + + // Be positive the logical block can be unpacked correctly as XUASTC LDR. + color_rgba block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + bool status_alt = astc_helpers::decode_block_xuastc_ldr(log_blk, block_pixels_alt, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status_alt) + { + fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() failed\n"); + return false; + } + + if (memcmp(block_pixels, block_pixels_alt, sizeof(color_rgba) * block_width * block_height) != 0) + { + fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() decode pixel mismatch\n"); + return false; + } + + coded_img.set_block_clipped(block_pixels, bx * block_width, by * block_height, block_width, block_height); + + } // bx + + } //by + + if (global_cfg.m_debug_images) + save_png(global_cfg.m_debug_file_prefix + "coded_img.png", coded_img); + + if (global_cfg.m_debug_output) + { + debug_printf("Orig image vs. 
coded img:\n"); + print_image_metrics(orig_img, coded_img); + } + } + + if (global_cfg.m_debug_output) + { + fmt_debug_printf("Zstd compressed sizes:\n"); + + fmt_debug_printf(" Raw bytes: {}\n", (uint64_t)raw_bits.get_bytes().size()); + fmt_debug_printf(" Mode bytes: {}, comp size: {}\n", (uint64_t)mode_bytes.size(), (uint64_t)comp_mode.size()); + fmt_debug_printf(" Solid DPCM bytes: {}, comp size: {}\n", (uint64_t)solid_dpcm_bytes.size(), (uint64_t)comp_solid_dpcm.size()); + + fmt_debug_printf(" \n Endpoint DPCM Reuse Bytes: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_reuse_indices.size(), (uint64_t)comp_endpoint_dpcm_reuse_indices.size()); + fmt_debug_printf(" Use BC bits bytes: {}, comp_size: {}\n", (uint64_t)use_bc_bits.get_bytes().size(), (uint64_t)comp_use_bc_bits.size()); + fmt_debug_printf(" Endpoint DPCM 3 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_3bit.get_bytes().size(), (uint64_t)comp_endpoint_dpcm_3bit.size()); + fmt_debug_printf(" Endpoint DPCM 4 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_4bit.get_bytes().size(), (uint64_t)comp_endpoint_dpcm_4bit.size()); + fmt_debug_printf(" Endpoint DPCM 5 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_5bit.size(), (uint64_t)comp_endpoint_dpcm_5bit.size()); + fmt_debug_printf(" Endpoint DPCM 6 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_6bit.size(), (uint64_t)comp_endpoint_dpcm_6bit.size()); + fmt_debug_printf(" Endpoint DPCM 7 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_7bit.size(), (uint64_t)comp_endpoint_dpcm_7bit.size()); + fmt_debug_printf(" Endpoint DPCM 8 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_8bit.size(), (uint64_t)comp_endpoint_dpcm_8bit.size()); + + fmt_debug_printf(" \n Mean0 bytes: {} comp size: {}\n", (uint64_t)mean0_bits.get_bytes().size(), (uint64_t)comp_mean0.size()); + fmt_debug_printf(" Mean1 bytes: {} comp size: {}\n", (uint64_t)mean1_bytes.size(), (uint64_t)comp_mean1.size()); + fmt_debug_printf(" Run bytes: {} comp size: {}\n", 
(uint64_t)run_bytes.size(), (uint64_t)comp_run.size()); + fmt_debug_printf(" Coeff bytes: {} comp size: {}\n", (uint64_t)coeff_bytes.size(), (uint64_t)comp_coeff.size()); + fmt_debug_printf(" Sign bytes: {}\n", (uint64_t)sign_bits.get_bytes().size()); + fmt_debug_printf(" Weight2 bytes: {} comp size: {}\n", (uint64_t)weight2_bits.get_bytes().size(), (uint64_t)comp_weight2.size()); + fmt_debug_printf(" Weight3 bytes: {} comp size: {}\n", (uint64_t)weight3_bits.get_bytes().size(), (uint64_t)comp_weight3.size()); + fmt_debug_printf(" Weight4 bytes: {} comp size: {}\n", (uint64_t)weight4_bits.get_bytes().size(), (uint64_t)comp_weight4.size()); + fmt_debug_printf(" Weight8 bytes: {} comp size: {}\n", (uint64_t)weight8_bits.size(), (uint64_t)comp_weight8.size()); + + fmt_debug_printf("\nTotal blocks: {}\n", total_blocks); + fmt_debug_printf("Total runs: {}, run blocks: {}, non-run blocks: {}\n", total_runs, total_run_blocks, total_nonrun_blocks); + fmt_debug_printf("Total lossy replacements: {}\n", total_lossy_replacements); + fmt_debug_printf("Total solid blocks: {}\n", total_solid_blocks); + fmt_debug_printf("Total full reuse commands: {}\n", total_full_reuse_commands); + fmt_debug_printf("Total raw commands: {}\n", total_raw_commands); + fmt_debug_printf("Total reuse full cfg emitted: {}\n", total_reuse_full_cfg_emitted); + fmt_debug_printf("Total full cfg emitted: {}\n", total_full_cfg_emitted); + fmt_debug_printf("Num part hash probes: {}, num part hash hits: {}\n", num_part_hash_probes, num_part_hash_hits); + fmt_debug_printf("Total used endpoint dpcm: {}, total used endpoint raw: {}\n", total_used_endpoint_dpcm, total_used_endpoint_raw); + fmt_debug_printf("Total used weight DCT: {}, total used weight DPCM: {}\n", total_used_dct, total_used_weight_dpcm); + fmt_debug_printf("Total tm hash probes: {}, total tm hash_hits: {}\n", num_tm_hash_probes, num_tm_hash_hits); + + fmt_debug_printf("\nCompressed to {} bytes, {3.3}bpp\n\n", comp_data.size_u32(), 
((float)comp_data.size() * 8.0f) / (float)total_pixels); + } + + return true; +} + +bool compress_image( + const image& orig_img, uint8_vec& comp_data, vector2D& coded_blocks, + const astc_ldr_encode_config& global_cfg, + job_pool& job_pool) +{ + assert(g_initialized); + + if (global_cfg.m_debug_output) + { + fmt_debug_printf("\n------------------- astc_ldr::compress_image\n"); + + fmt_debug_printf("\nglobal_cfg:\n"); + global_cfg.debug_print(); + fmt_debug_printf("\n"); + } + + comp_data.resize(0); + + if (!g_initialized) + return false; + + const uint32_t width = orig_img.get_width(), height = orig_img.get_height(); + + if (!is_in_range(width, 1, (int)MAX_WIDTH) || !is_in_range(height, 1, (int)MAX_HEIGHT)) + return false; + + if (!astc_helpers::is_valid_block_size(global_cfg.m_astc_block_width, global_cfg.m_astc_block_height)) + return false; + + const uint32_t block_width = global_cfg.m_astc_block_width; + const uint32_t block_height = global_cfg.m_astc_block_height; + const uint32_t total_block_pixels = block_width * block_height; + + const uint32_t total_pixels = width * height; + const uint32_t num_blocks_x = (width + block_width - 1) / block_width; + const uint32_t num_blocks_y = (height + block_height - 1) / block_height; + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + const bool has_alpha = orig_img.has_alpha(); + + if (global_cfg.m_debug_output) + fmt_debug_printf("Encoding image dimensions {}x{}, has alpha: {}\n", orig_img.get_width(), orig_img.get_height(), has_alpha); + + ldr_astc_block_encode_image_high_level_config enc_cfg; + + enc_cfg.m_block_width = block_width; + enc_cfg.m_block_height = block_height; + enc_cfg.m_pJob_pool = &job_pool; + + enc_cfg.m_use_dct = global_cfg.m_use_dct; + + if (!is_in_range(global_cfg.m_dct_quality, 1.0f, 100.0f)) + return false; + + const int int_q = clamp((int)std::round(global_cfg.m_dct_quality * 2.0f), 0, 200); + enc_cfg.m_base_q = (float)int_q / 2.0f; + + if (global_cfg.m_debug_output) + 
fmt_debug_printf("Use DCT: {}, base q: {}, lossy supercompression: {}\n", enc_cfg.m_use_dct, enc_cfg.m_base_q, global_cfg.m_lossy_supercompression); + + const float replacement_min_psnr = has_alpha ? global_cfg.m_replacement_min_psnr_alpha : global_cfg.m_replacement_min_psnr; + const float psnr_trial_diff_thresh = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_alpha : global_cfg.m_psnr_trial_diff_thresh; + const float psnr_trial_diff_thresh_edge = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_edge_alpha : global_cfg.m_psnr_trial_diff_thresh_edge; + + enc_cfg.m_blurring_enabled = global_cfg.m_block_blurring_p1; + enc_cfg.m_blurring_enabled_p2 = global_cfg.m_block_blurring_p2; + + for (uint32_t i = 0; i < 4; i++) + { + enc_cfg.m_cem_enc_params.m_comp_weights[i] = global_cfg.m_comp_weights[i]; + + if (!is_in_range(global_cfg.m_comp_weights[i], 1, 256)) + return false; + } + + int cfg_effort_level = global_cfg.m_effort_level; + if (global_cfg.m_debug_output) + fmt_debug_printf("Using cfg effort level: {}\n", cfg_effort_level); + + configure_encoder_effort_level(cfg_effort_level, enc_cfg); + + if (global_cfg.m_force_disable_subsets) + { + enc_cfg.m_subsets_enabled = false; + enc_cfg.m_second_pass_force_subsets_enabled = false; + } + + if (global_cfg.m_force_disable_rgb_dual_plane) + { + enc_cfg.m_disable_rgb_dual_plane = true; + enc_cfg.m_force_all_dp_chans_p2 = false; + } + + enc_cfg.m_cem_enc_params.m_decode_mode_srgb = global_cfg.m_astc_decode_mode_srgb; + + enc_cfg.m_debug_output = global_cfg.m_debug_output; + enc_cfg.m_debug_images = global_cfg.m_debug_images; + enc_cfg.m_debug_file_prefix = global_cfg.m_debug_file_prefix; + + ldr_astc_block_encode_image_output enc_out; + + const bool enc_status = ldr_astc_block_encode_image(orig_img, enc_cfg, enc_out); + + if (global_cfg.m_debug_output) + fmt_debug_printf("ldr_astc_block_encode_image: {}\n", enc_status); + + if (!enc_status) + return false; + + basist::astc_ldr_t::xuastc_ldr_syntax syntax = 
global_cfg.m_compressed_syntax; + + if (syntax >= basist::astc_ldr_t::xuastc_ldr_syntax::cTotal) + { + assert(0); + return false; + } + + // Switch to full adaptive arithmetic coding on the smallest mipmaps to avoid ZStd overhead. + const uint32_t DISABLE_FASTER_FORMAT_TOTAL_BLOCKS_THRESH = 64; + if (total_blocks <= DISABLE_FASTER_FORMAT_TOTAL_BLOCKS_THRESH) + syntax = basist::astc_ldr_t::xuastc_ldr_syntax::cFullArith; + + if (syntax == basist::astc_ldr_t::xuastc_ldr_syntax::cFullZStd) + { + // Full ZStd syntax is so different we'll move that to another function. + return compress_image_full_zstd( + orig_img, comp_data, coded_blocks, + global_cfg, + job_pool, + enc_cfg, enc_out); + } + + const bool use_faster_format = (syntax == basist::astc_ldr_t::xuastc_ldr_syntax::cHybridArithZStd); + + // Either full arithmetic, or hybrid arithmetic+ZStd for weight symbols. + basist::astc_ldr_t::xuastc_ldr_arith_header hdr; + clear_obj(hdr); + + bitwise_coder mean0_bits; + uint8_vec mean1_bytes; + uint8_vec run_bytes; + uint8_vec coeff_bytes; + bitwise_coder sign_bits; + bitwise_coder weight2_bits; + bitwise_coder weight3_bits; + bitwise_coder weight4_bits; + uint8_vec weight8_bits; + + if (use_faster_format) + { + mean0_bits.init(1024); + mean1_bytes.reserve(1024); + run_bytes.reserve(8192); + coeff_bytes.reserve(8192); + sign_bits.init(1024); + weight2_bits.init(1024); + weight3_bits.init(1024); + weight4_bits.init(1024); + weight8_bits.reserve(8192); + } + + interval_timer itm; + itm.start(); + + basist::arith::arith_enc enc; + enc.init(1024 * 1024); + + enc.put_bits(basist::astc_ldr_t::ARITH_HEADER_MARKER, basist::astc_ldr_t::ARITH_HEADER_MARKER_BITS); + + const int block_dim_index = astc_helpers::find_astc_block_size_index(block_width, block_height); + assert((block_dim_index >= 0) && (block_dim_index < (int)astc_helpers::NUM_ASTC_BLOCK_SIZES)); + + enc.put_bits(block_dim_index, 4); + + enc.put_bit(enc_cfg.m_cem_enc_params.m_decode_mode_srgb); + + enc.put_bits(width, 16); 
+ enc.put_bits(height, 16); + + enc.put_bit(has_alpha); + + enc.put_bits(enc_cfg.m_use_dct, 1); + if (enc_cfg.m_use_dct) + enc.put_bits(int_q, 8); + + basist::arith::arith_data_model mode_model((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_TOTAL); + + basist::arith::arith_data_model solid_color_dpcm_model[4]; + for (uint32_t i = 0; i < 4; i++) + solid_color_dpcm_model[i].init(256, true); + + basist::arith::arith_data_model raw_endpoint_models[astc_helpers::TOTAL_ENDPOINT_ISE_RANGES]; + for (uint32_t i = 0; i < astc_helpers::TOTAL_ENDPOINT_ISE_RANGES; i++) + raw_endpoint_models[i].init(astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + i)); + + basist::arith::arith_data_model dpcm_endpoint_models[astc_helpers::TOTAL_ENDPOINT_ISE_RANGES]; + for (uint32_t i = 0; i < astc_helpers::TOTAL_ENDPOINT_ISE_RANGES; i++) + dpcm_endpoint_models[i].init(astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + i)); + + basist::arith::arith_data_model raw_weight_models[astc_helpers::TOTAL_WEIGHT_ISE_RANGES]; + for (uint32_t i = 0; i < astc_helpers::TOTAL_WEIGHT_ISE_RANGES; i++) + raw_weight_models[i].init(astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE + i)); + + basist::arith::arith_bit_model is_base_ofs_model; + basist::arith::arith_bit_model use_dct_model[4]; + basist::arith::arith_bit_model use_dpcm_endpoints_model; + + basist::arith::arith_data_model cem_index_model[8]; + for (uint32_t i = 0; i < 8; i++) + cem_index_model[i].init(basist::astc_ldr_t::OTM_NUM_CEMS); + + basist::arith::arith_data_model subset_index_model[basist::astc_ldr_t::OTM_NUM_SUBSETS]; + for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_SUBSETS; i++) + subset_index_model[i].init(basist::astc_ldr_t::OTM_NUM_SUBSETS); + + basist::arith::arith_data_model ccs_index_model[basist::astc_ldr_t::OTM_NUM_CCS]; + for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_CCS; i++) + ccs_index_model[i].init(basist::astc_ldr_t::OTM_NUM_CCS); + + 
basist::arith::arith_data_model grid_size_model[basist::astc_ldr_t::OTM_NUM_GRID_SIZES]; + for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_GRID_SIZES; i++) + grid_size_model[i].init(basist::astc_ldr_t::OTM_NUM_GRID_SIZES); + + basist::arith::arith_data_model grid_aniso_model[basist::astc_ldr_t::OTM_NUM_GRID_ANISOS]; + for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_GRID_ANISOS; i++) + grid_aniso_model[i].init(basist::astc_ldr_t::OTM_NUM_GRID_ANISOS); + + basist::arith::arith_data_model dct_run_len_model(65); // [0,63] or 64=EOB + basist::arith::arith_data_model dct_coeff_mag(255); // [1,255] (blocks with larger mags go DPCM) + + double total_header_bits = 0.0f, total_weight_bits = 0.0f, total_endpoint_bits = 0.0f; + + uint32_t total_solid_blocks = 0, total_used_dct = 0, total_used_weight_dpcm = 0; + + basist::astc_ldr_t::grid_weight_dct grid_dct; + grid_dct.init(block_width, block_height); + + vector2D prev_block_states(num_blocks_x, num_blocks_y); + + coded_blocks.resize(num_blocks_x, num_blocks_y); + for (uint32_t y = 0; y < num_blocks_y; y++) + for (uint32_t x = 0; x < num_blocks_x; x++) + coded_blocks(x, y).clear(); + + const bool endpoint_dpcm_global_enable = true; + uint32_t total_used_endpoint_dpcm = 0, total_used_endpoint_raw = 0; + + basist::arith::arith_data_model submode_models[basist::astc_ldr_t::OTM_NUM_CEMS][basist::astc_ldr_t::OTM_NUM_SUBSETS][basist::astc_ldr_t::OTM_NUM_CCS][basist::astc_ldr_t::OTM_NUM_GRID_SIZES][basist::astc_ldr_t::OTM_NUM_GRID_ANISOS]; + + basist::arith::arith_bit_model endpoints_use_bc_models[4]; + + basist::arith::arith_data_model endpoint_reuse_delta_model(basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS); + + basist::arith::arith_data_model weight_mean_models[2]; + weight_mean_models[0].init(basist::astc_ldr_t::DCT_MEAN_LEVELS0); + weight_mean_models[1].init(basist::astc_ldr_t::DCT_MEAN_LEVELS1); + + basist::arith::arith_data_model config_reuse_model[4]; + for (uint32_t i = 0; i < 4; i++) + 
config_reuse_model[i].init(basist::astc_ldr_t::cMaxConfigReuseNeighbors + 1); + + uint32_t total_reuse_full_cfg_emitted = 0, total_full_cfg_emitted = 0; + + // TODO: check weights for >= 0 + const float total_comp_weights = enc_cfg.m_cem_enc_params.get_total_comp_weights(); + + uint32_t total_lossy_replacements = 0; + uint32_t total_full_reuse_commands = 0; + uint32_t total_raw_commands = 0; + + if (global_cfg.m_debug_output) + fmt_debug_printf("Supercompressor init time: {} secs\n", itm.get_elapsed_secs()); + + uint32_t total_runs = 0, total_run_blocks = 0; + uint32_t cur_run_len = 0; + const bool use_run_commands = true; + uint32_t total_nonrun_blocks = 0; + + int part2_hash[basist::astc_ldr_t::PART_HASH_SIZE]; + std::fill(part2_hash, part2_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1); + + int part3_hash[basist::astc_ldr_t::PART_HASH_SIZE]; + std::fill(part3_hash, part3_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1); + + basist::arith::arith_bit_model use_part_hash_model[4]; + basist::arith::arith_data_model part2_hash_index_model(basist::astc_ldr_t::PART_HASH_SIZE, true); + basist::arith::arith_data_model part3_hash_index_model(basist::astc_ldr_t::PART_HASH_SIZE, true); + + uint32_t num_part_hash_probes = 0, num_part_hash_hits = 0; + uint32_t total_dct_syms = 0, total_dpcm_syms = 0; + + basist::arith::arith_gamma_contexts m_run_len_contexts; + + image vis_img; + if (global_cfg.m_debug_images) + { + vis_img.resize(width, height); + } + + itm.start(); + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + const uint32_t base_y = by * block_height; + + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const uint32_t base_x = bx * block_width; + + basist::astc_ldr_t::prev_block_state& prev_state = prev_block_states(bx, by); + const basist::astc_ldr_t::prev_block_state* pLeft_state = bx ? &prev_block_states(bx - 1, by) : nullptr; + const basist::astc_ldr_t::prev_block_state* pUpper_state = by ? 
&prev_block_states(bx, by - 1) : nullptr; + const basist::astc_ldr_t::prev_block_state* pDiag_state = (bx && by) ? &prev_block_states(bx - 1, by - 1) : nullptr; + const basist::astc_ldr_t::prev_block_state* pPred_state = pLeft_state ? pLeft_state : pUpper_state; // left or upper, or nullptr on first block + + const ldr_astc_block_encode_image_output::block_info& blk_info = enc_out.m_image_block_info(bx, by); + + uint32_t best_packed_out_block_index = blk_info.m_packed_out_block_index; + + // check for run + if ((use_run_commands) && (bx || by)) + { + const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index]; + const astc_helpers::log_astc_block& cur_log_blk = blk_out.m_log_blk; + + const astc_helpers::log_astc_block& prev_log_blk = bx ? coded_blocks(bx - 1, by) : coded_blocks(0, by - 1); + const basist::astc_ldr_t::prev_block_state* pPrev_block_state = bx ? pLeft_state : pUpper_state; + + assert(pPrev_block_state); + + if (compare_log_blocks_for_equality(cur_log_blk, prev_log_blk)) + { + // Left or upper is exactly the same logical block, so expand the run. + cur_run_len++; + + // Accept the previous block (left or upper) as if it's been coded normally. 
+ + coded_blocks(bx, by) = prev_log_blk; + + prev_state.m_was_solid_color = pPrev_block_state->m_was_solid_color; + prev_state.m_used_weight_dct = pPrev_block_state->m_used_weight_dct; + prev_state.m_first_endpoint_uses_bc = pPrev_block_state->m_first_endpoint_uses_bc; + prev_state.m_reused_full_cfg = true; + prev_state.m_used_part_hash = pPrev_block_state->m_used_part_hash; + prev_state.m_tm_index = pPrev_block_state->m_tm_index; + prev_state.m_base_cem_index = pPrev_block_state->m_base_cem_index; + prev_state.m_subset_index = pPrev_block_state->m_subset_index; + prev_state.m_ccs_index = pPrev_block_state->m_ccs_index; + prev_state.m_grid_size = pPrev_block_state->m_grid_size; + prev_state.m_grid_aniso = pPrev_block_state->m_grid_aniso; + + continue; + } + } + + if (cur_run_len) + { + total_runs++; + total_run_blocks += cur_run_len; + + total_header_bits += enc.encode_and_return_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RUN, mode_model); + total_header_bits += enc.put_gamma_and_return_price(cur_run_len, m_run_len_contexts); + cur_run_len = 0; + } + + total_nonrun_blocks++; + + const float ref_wmse = (float)blk_info.m_out_blocks[best_packed_out_block_index].m_sse / (total_comp_weights * (float)total_block_pixels); + const float ref_wpsnr = (ref_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(ref_wmse)) : 10000.0f; + + if ((global_cfg.m_lossy_supercompression) && (ref_wpsnr >= replacement_min_psnr) && + (!blk_info.m_out_blocks[blk_info.m_packed_out_block_index].m_log_blk.m_solid_color_flag_ldr)) + { + const float psnr_thresh = blk_info.m_strong_edges ? 
psnr_trial_diff_thresh_edge : psnr_trial_diff_thresh; + + float best_alt_wpsnr = 0.0f; + bool found_alternative = false; + + // Pass: 0 consider full config+part ID endpoint reuse + // Pass: 1 fall back to just full config+part ID reuse (no endpoints) + for (uint32_t pass = 0; pass < 2; pass++) + { + // Iterate through all available alternative candidates + for (uint32_t out_block_iter = 0; out_block_iter < blk_info.m_out_blocks.size(); out_block_iter++) + { + if (out_block_iter == blk_info.m_packed_out_block_index) + continue; + + const float trial_wmse = (float)blk_info.m_out_blocks[out_block_iter].m_sse / (total_comp_weights * (float)total_block_pixels); + const float trial_wpsnr = (trial_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(trial_wmse)) : 10000.0f; + + // Reject if PSNR too low + if (trial_wpsnr < (ref_wpsnr - psnr_thresh)) + continue; + + // Reject if inferior than best found so far + if (trial_wpsnr < best_alt_wpsnr) + continue; + + const astc_helpers::log_astc_block& trial_log_blk = blk_info.m_out_blocks[out_block_iter].m_log_blk; + + if (trial_log_blk.m_solid_color_flag_ldr) + continue; + + // Examine nearby neighbors + for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++) + { + int dx = 0, dy = 0; + switch (i) + { + case 0: dx = -1; break; + case 1: dy = -1; break; + case 2: dx = -1; dy = -1; break; + default: assert(0); break; + } + + const int n_bx = bx + dx, n_by = by + dy; + if ((n_bx < 0) || (n_by < 0)) + continue; + + astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by); + + if (neighbor_log_blk.m_solid_color_flag_ldr) + continue; + + bool accept_flag = false; + if (pass == 0) + { + // prefer full config+endpoint equality first + accept_flag = compare_log_block_configs_and_endpoints(trial_log_blk, neighbor_log_blk); + } + else + { + // next check for just config equality + accept_flag = compare_log_block_configs(trial_log_blk, neighbor_log_blk); + } + + if (accept_flag) + { + best_alt_wpsnr = 
trial_wpsnr; + best_packed_out_block_index = out_block_iter; + found_alternative = true; + break; + } + + } // i + + } // out_block_iter + + if (found_alternative) + break; + + } // pass + + if (best_packed_out_block_index != blk_info.m_packed_out_block_index) + total_lossy_replacements++; + + } // global_cfg.m_lossy_supercompression + + const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index]; + + astc_helpers::log_astc_block& cur_log_blk = coded_blocks(bx, by); + + cur_log_blk = blk_out.m_log_blk; + + // TODO: Add mode model context + + if (blk_out.m_trial_mode_index < 0) + { + assert(cur_log_blk.m_solid_color_flag_ldr); + + total_solid_blocks++; + + //total_header_bits += mode_model.get_price(cMODE_SOLID) + (float)(8 * (has_alpha ? 4 : 3)); + total_header_bits += mode_model.get_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_SOLID); + enc.encode((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_SOLID, mode_model); + + uint32_t cur_solid_color[4]; + for (uint32_t i = 0; i < 4; i++) + cur_solid_color[i] = blk_out.m_log_blk.m_solid_color[i] >> 8; + + uint32_t prev_solid_color[4] = { 0 }; + + const uint32_t num_comps = has_alpha ? 4 : 3; + + astc_helpers::log_astc_block* pPrev_log_blk = bx ? &coded_blocks(bx - 1, by) : (by ? &coded_blocks(bx, by - 1) : nullptr); + if (pPrev_log_blk) + { + if (pPrev_log_blk->m_solid_color_flag_ldr) + { + prev_solid_color[0] = pPrev_log_blk->m_solid_color[0] >> 8; + prev_solid_color[1] = pPrev_log_blk->m_solid_color[1] >> 8; + prev_solid_color[2] = pPrev_log_blk->m_solid_color[2] >> 8; + prev_solid_color[3] = pPrev_log_blk->m_solid_color[3] >> 8; + } + else + { +#if 0 + color_rgba prev_block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + bool dec_status = astc_helpers::decode_block(*pPrev_log_blk, prev_block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? 
astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!dec_status) + { + fmt_error_printf("decode_block() failed\n"); + return false; + } + + for (uint32_t i = 0; i < total_block_pixels; i++) + { + for (uint32_t j = 0; j < num_comps; j++) + prev_solid_color[j] += prev_block_pixels[i][j]; + } + + for (uint32_t j = 0; j < num_comps; j++) + prev_solid_color[j] = (prev_solid_color[j] + (total_block_pixels / 2)) / total_block_pixels; +#endif + // Decode previous block's first CEM, use the halfway point as the predictor. + color_rgba prev_l, prev_h; + decode_endpoints(pPrev_log_blk->m_color_endpoint_modes[0], pPrev_log_blk->m_endpoints, pPrev_log_blk->m_endpoint_ise_range, prev_l, prev_h); + + prev_solid_color[0] = (prev_l[0] + prev_h[0] + 1) >> 1; + prev_solid_color[1] = (prev_l[1] + prev_h[1] + 1) >> 1; + prev_solid_color[2] = (prev_l[2] + prev_h[2] + 1) >> 1; + prev_solid_color[3] = (prev_l[3] + prev_h[3] + 1) >> 1; + } + } + + for (uint32_t i = 0; i < num_comps; i++) + { + const uint32_t delta = (cur_solid_color[i] - prev_solid_color[i]) & 0xFF; + + total_header_bits += enc.encode_and_return_price(delta, solid_color_dpcm_model[i]); + } + + // Bias the statistics towards using DCT (most common case). 
+ prev_state.m_was_solid_color = true; + prev_state.m_used_weight_dct = enc_cfg.m_use_dct; + prev_state.m_first_endpoint_uses_bc = true; + prev_state.m_tm_index = -1; + prev_state.m_base_cem_index = astc_helpers::CEM_LDR_RGB_DIRECT; + prev_state.m_subset_index = 0; + prev_state.m_ccs_index = 0; + prev_state.m_grid_size = 0; + prev_state.m_grid_aniso = 0; + prev_state.m_reused_full_cfg = false; + prev_state.m_used_part_hash = true; // bias to true + + continue; + } + + //-------------------------------------------- + // for (uint32_t out_block_iter = 0; out_block_iter < blk_info.m_out_blocks.size(); out_block_iter++) + int full_cfg_endpoint_reuse_index = -1; + + for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++) + { + int dx = 0, dy = 0; + switch (i) + { + case 0: dx = -1; break; + case 1: dy = -1; break; + case 2: dx = -1; dy = -1; break; + default: assert(0); break; + } + + const int n_bx = bx + dx, n_by = by + dy; + if ((n_bx < 0) || (n_by < 0)) + continue; + + astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by); + + if (neighbor_log_blk.m_solid_color_flag_ldr) + continue; + + if (compare_log_block_configs_and_endpoints(cur_log_blk, neighbor_log_blk)) + { + full_cfg_endpoint_reuse_index = i; + break; + } + } // i + //-------------------------------------------- + + if (full_cfg_endpoint_reuse_index >= 0) + { + // Reused full config, part ID and endpoint values from an immediate neighbor + total_header_bits += enc.encode_and_return_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_REUSE_CFG_ENDPOINTS_LEFT + full_cfg_endpoint_reuse_index, mode_model); + + total_full_reuse_commands++; + + const basist::astc_ldr_t::prev_block_state* pReused_cfg_state = nullptr; + + switch (full_cfg_endpoint_reuse_index) + { + case 0: pReused_cfg_state = pLeft_state; break; + case 1: pReused_cfg_state = pUpper_state; break; + case 2: pReused_cfg_state = pDiag_state; break; + default: assert(0); break; + } + + if (!pReused_cfg_state) 
+ { + assert(0); + fmt_error_printf("encoding internal failure\n"); + return false; + } + + assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index); + + prev_state.m_tm_index = blk_out.m_trial_mode_index; + prev_state.m_base_cem_index = pReused_cfg_state->m_base_cem_index; + prev_state.m_subset_index = pReused_cfg_state->m_subset_index; + prev_state.m_ccs_index = pReused_cfg_state->m_ccs_index; + prev_state.m_grid_size = pReused_cfg_state->m_grid_size; + prev_state.m_grid_aniso = pReused_cfg_state->m_grid_aniso; + prev_state.m_used_part_hash = pReused_cfg_state->m_used_part_hash; + prev_state.m_reused_full_cfg = true; + + const uint32_t cur_actual_cem = cur_log_blk.m_color_endpoint_modes[0]; + + if (astc_helpers::cem_supports_bc(cur_actual_cem)) + { + prev_state.m_first_endpoint_uses_bc = astc_helpers::used_blue_contraction(cur_actual_cem, cur_log_blk.m_endpoints, cur_log_blk.m_endpoint_ise_range); + assert(prev_state.m_first_endpoint_uses_bc == pReused_cfg_state->m_first_endpoint_uses_bc); + } + } + else + { + total_raw_commands++; + + // Send mode + total_header_bits += mode_model.get_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RAW); + enc.encode((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RAW, mode_model); + + const uint32_t cur_actual_cem = cur_log_blk.m_color_endpoint_modes[0]; + //const bool actual_cem_supports_bc = astc_helpers::cem_supports_bc(cur_actual_cem); + const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cur_actual_cem); + + // DO NOT use tm.m_cem because the encoder may have selected a base+ofs variant instead. Use cur_actual_cem. 
+ const basist::astc_ldr_t::trial_mode& tm = enc_out.m_encoder_trial_modes[blk_out.m_trial_mode_index]; + + // Check for config+part ID neighbor reuse + int neighbor_cfg_match_index = -1; + for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++) + { + const basist::astc_ldr_t::prev_block_state* pNeighbor_state = nullptr; + + int dx = 0, dy = 0; + switch (i) + { + case 0: dx = -1; pNeighbor_state = pLeft_state; break; + case 1: dy = -1; pNeighbor_state = pUpper_state; break; + case 2: dx = -1; dy = -1; pNeighbor_state = pDiag_state; break; + default: assert(0); break; + } + + if (!pNeighbor_state) + continue; + + const int n_bx = bx + dx, n_by = by + dy; + assert((n_bx >= 0) && (n_by >= 0)); + + astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by); + + if (pNeighbor_state->m_tm_index != blk_out.m_trial_mode_index) + continue; + + if (neighbor_log_blk.m_color_endpoint_modes[0] != cur_log_blk.m_color_endpoint_modes[0]) + continue; + + if (neighbor_log_blk.m_partition_id != cur_log_blk.m_partition_id) + continue; + + assert(neighbor_log_blk.m_dual_plane == cur_log_blk.m_dual_plane); + assert(neighbor_log_blk.m_color_component_selector == cur_log_blk.m_color_component_selector); + assert(neighbor_log_blk.m_num_partitions == cur_log_blk.m_num_partitions); + assert(neighbor_log_blk.m_grid_width == cur_log_blk.m_grid_width); + assert(neighbor_log_blk.m_grid_height == cur_log_blk.m_grid_height); + assert(neighbor_log_blk.m_endpoint_ise_range == cur_log_blk.m_endpoint_ise_range); + assert(neighbor_log_blk.m_weight_ise_range == cur_log_blk.m_weight_ise_range); + + neighbor_cfg_match_index = i; + break; + } + + uint32_t reuse_full_cfg_model_index = 0; + if (pLeft_state) + reuse_full_cfg_model_index = pLeft_state->m_reused_full_cfg; + else + reuse_full_cfg_model_index = 1; + + if (pUpper_state) + reuse_full_cfg_model_index |= pUpper_state->m_reused_full_cfg ? 
2 : 0; + else + reuse_full_cfg_model_index |= 2; + + if (neighbor_cfg_match_index >= 0) + { + total_header_bits += enc.encode_and_return_price(neighbor_cfg_match_index, config_reuse_model[reuse_full_cfg_model_index]); + + const basist::astc_ldr_t::prev_block_state* pReused_cfg_state = nullptr; + + switch (neighbor_cfg_match_index) + { + case 0: pReused_cfg_state = pLeft_state; break; + case 1: pReused_cfg_state = pUpper_state; break; + case 2: pReused_cfg_state = pDiag_state; break; + default: assert(0); break; + } + + if (!pReused_cfg_state) + { + assert(0); + fmt_error_printf("encoding internal failure\n"); + return false; + } + + assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index); + + prev_state.m_tm_index = blk_out.m_trial_mode_index; + prev_state.m_base_cem_index = pReused_cfg_state->m_base_cem_index; + prev_state.m_subset_index = pReused_cfg_state->m_subset_index; + prev_state.m_ccs_index = pReused_cfg_state->m_ccs_index; + prev_state.m_grid_size = pReused_cfg_state->m_grid_size; + prev_state.m_grid_aniso = pReused_cfg_state->m_grid_aniso; + prev_state.m_used_part_hash = pReused_cfg_state->m_used_part_hash; + prev_state.m_reused_full_cfg = true; + + total_reuse_full_cfg_emitted++; + } + else + { + total_full_cfg_emitted++; + + total_header_bits += enc.encode_and_return_price(basist::astc_ldr_t::cMaxConfigReuseNeighbors, config_reuse_model[reuse_full_cfg_model_index]); + + // ------------------------------------------- Set TM index + { + uint32_t cem_index, subset_index, ccs_index, grid_size, grid_aniso; + + const uint_vec& submodes = separate_tm_index(block_width, block_height, enc_out.m_grouped_encoder_trial_modes, tm, + cem_index, subset_index, ccs_index, grid_size, grid_aniso); + + // TODO: sort this + uint32_t submode_index; + for (submode_index = 0; submode_index < submodes.size(); submode_index++) + if (submodes[submode_index] == (uint32_t)blk_out.m_trial_mode_index) + break; + + if (submode_index == submodes.size_u32()) + { + 
assert(0); + fmt_error_printf("Failed finding mode\n"); + return false; + } + + uint32_t prev_cem_index = astc_helpers::CEM_LDR_RGB_DIRECT; + uint32_t prev_subset_index = 0; + uint32_t prev_ccs_index = 0; + uint32_t prev_grid_size = 0; + uint32_t prev_grid_aniso = 0; + + if (pPred_state) + { + prev_cem_index = pPred_state->m_base_cem_index; + prev_subset_index = pPred_state->m_subset_index; + prev_ccs_index = pPred_state->m_ccs_index; + prev_grid_size = pPred_state->m_grid_size; + prev_grid_aniso = pPred_state->m_grid_aniso; + } + + const uint32_t ldrcem_index = basist::astc_ldr_t::cem_to_ldrcem_index(prev_cem_index); + + total_header_bits += cem_index_model[ldrcem_index].get_price(cem_index); + enc.encode(cem_index, cem_index_model[ldrcem_index]); + + total_header_bits += subset_index_model[prev_subset_index].get_price(subset_index); + enc.encode(subset_index, subset_index_model[prev_subset_index]); + + total_header_bits += ccs_index_model[prev_ccs_index].get_price(ccs_index); + enc.encode(ccs_index, ccs_index_model[prev_ccs_index]); + + total_header_bits += grid_size_model[prev_grid_size].get_price(grid_size); + enc.encode(grid_size, grid_size_model[prev_grid_size]); + + total_header_bits += grid_aniso_model[prev_grid_aniso].get_price(grid_aniso); + enc.encode(grid_aniso, grid_aniso_model[prev_grid_aniso]); + + if (submodes.size() > 1) + { + basist::arith::arith_data_model& submode_model = submode_models[cem_index][subset_index][ccs_index][grid_size][grid_aniso]; + if (!submode_model.get_num_data_syms()) + submode_model.init(submodes.size_u32(), true); + + total_header_bits += submode_model.get_price(submode_index); + enc.encode(submode_index, submode_model); + } + + prev_state.m_tm_index = blk_out.m_trial_mode_index; + prev_state.m_base_cem_index = cem_index; + prev_state.m_subset_index = subset_index; + prev_state.m_ccs_index = ccs_index; + prev_state.m_grid_size = grid_size; + prev_state.m_grid_aniso = grid_aniso; + prev_state.m_reused_full_cfg = false; + } + 
+ // Send base_ofs bit if the tm is direct + if ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT)) + { + const bool is_base_ofs = (cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) || + (cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET); + + total_header_bits += is_base_ofs_model.get_price(is_base_ofs); + enc.encode(is_base_ofs, is_base_ofs_model); + } + + if (tm.m_num_parts > 1) + { + // Send unique part pattern ID + astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? &enc_out.m_part_data_p2 : &enc_out.m_part_data_p3; + + const uint32_t astc_pat_index = cur_log_blk.m_partition_id; + const uint32_t unique_pat_index = pPart_data->m_part_seed_to_unique_index[astc_pat_index]; + const uint32_t total_unique_indices = pPart_data->m_total_unique_patterns; + assert(unique_pat_index < total_unique_indices); + + num_part_hash_probes++; + + uint32_t use_part_model_index = 0; + if (pLeft_state) + use_part_model_index = pLeft_state->m_used_part_hash; + else + use_part_model_index = 1; + if (pUpper_state) + use_part_model_index |= pUpper_state->m_used_part_hash ? 2 : 0; + else + use_part_model_index |= 2; + + int* pPart_hash = (tm.m_num_parts == 2) ? 
part2_hash : part3_hash; + + const uint32_t h = basist::astc_ldr_t::part_hash_index(unique_pat_index); + + if (pPart_hash[h] != (int)unique_pat_index) + { +#if defined(_DEBUG) || defined(DEBUG) + // sanity + for (uint32_t i = 0; i < basist::astc_ldr_t::PART_HASH_SIZE; i++) + { + assert(pPart_hash[i] != (int)unique_pat_index); + } +#endif + + total_header_bits += enc.encode_and_return_price(0, use_part_hash_model[use_part_model_index]); + total_header_bits += enc.put_truncated_binary(unique_pat_index, total_unique_indices); + + if (global_cfg.m_debug_images) + { + vis_img.fill_box(base_x, base_y, block_width, block_height, color_rgba(0, 0, 255, 255)); + } + + prev_state.m_used_part_hash = false; + } + else + { + num_part_hash_hits++; + + if (global_cfg.m_debug_images) + { + vis_img.fill_box(base_x, base_y, block_width, block_height, color_rgba(255, 0, 0, 255)); + } + + total_header_bits += enc.encode_and_return_price(1, use_part_hash_model[use_part_model_index]); + total_header_bits += enc.encode_and_return_price(h, (tm.m_num_parts == 2) ? part2_hash_index_model : part3_hash_index_model); + + prev_state.m_used_part_hash = true; + } + + pPart_hash[basist::astc_ldr_t::part_hash_index(unique_pat_index)] = unique_pat_index; + } + else + { + prev_state.m_used_part_hash = true; // bias to true + } + + } // if (neighbor_cfg_match_index >= 0) + + // ----------------------------------------- Send endpoints + const int num_endpoint_levels = astc_helpers::get_ise_levels(cur_log_blk.m_endpoint_ise_range); + const auto& endpoint_ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(cur_log_blk.m_endpoint_ise_range).m_ISE_to_rank; + + uint32_t bc_model_index = 0; + if (pLeft_state) + bc_model_index = pLeft_state->m_first_endpoint_uses_bc; + else + bc_model_index = 1; + + if (pUpper_state) + bc_model_index |= pUpper_state->m_first_endpoint_uses_bc ? 
2 : 0; + else + bc_model_index |= 2; + + bool endpoints_use_bc[astc_helpers::MAX_PARTITIONS] = { false }; + + if (astc_helpers::cem_supports_bc(cur_actual_cem)) + { + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + const bool cur_uses_bc = astc_helpers::used_blue_contraction(cur_actual_cem, cur_log_blk.m_endpoints + part_iter * total_endpoint_vals, cur_log_blk.m_endpoint_ise_range); + + endpoints_use_bc[part_iter] = cur_uses_bc; + + } // part_iter + + prev_state.m_first_endpoint_uses_bc = endpoints_use_bc[0]; + } + + int best_reuse_bx = -1, best_reuse_by = -1; + uint32_t best_reuse_index = 0; + const astc_helpers::log_astc_block* pEndpoint_pred_log_blk = nullptr; + + if (endpoint_dpcm_global_enable) + { + int64_t best_trial_delta2 = INT64_MAX; + float best_trial_bits = BIG_FLOAT_VAL; + + //auto& trial_dpcm_model = dpcm_endpoint_models[cur_log_blk.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE]; + + for (uint32_t reuse_index = 0; reuse_index < basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS; reuse_index++) + { + const int rx = (int)bx + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_x; + const int ry = (int)by + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_y; + if ((rx < 0) || (ry < 0) || (rx >= (int)num_blocks_x) || (ry >= (int)num_blocks_y)) + continue; + + const astc_helpers::log_astc_block* pTrial_log_blk = &coded_blocks(rx, ry); + if (pTrial_log_blk->m_solid_color_flag_ldr) + continue; + + uint8_t trial_predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { }; + + uint32_t part_iter; + for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + const bool always_repack_flag = false; + bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false; + + bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems( + pTrial_log_blk->m_color_endpoint_modes[0], pTrial_log_blk->m_endpoint_ise_range, pTrial_log_blk->m_endpoints, + cur_actual_cem, 
cur_log_blk.m_endpoint_ise_range, trial_predicted_endpoints[part_iter], + always_repack_flag, + endpoints_use_bc[part_iter], false, + blue_contraction_clamped_flag, base_ofs_clamped_flag); + + if (!conv_status) + break; + } // part_iter + + if (part_iter < tm.m_num_parts) + continue; // failed + + int64_t trial_endpoint_delta2 = 0; + for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++) + { + int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]]; + int prev_e_rank = endpoint_ise_to_rank[trial_predicted_endpoints[part_iter][val_iter]]; + + int e_delta = cur_e_rank - prev_e_rank; + + trial_endpoint_delta2 += e_delta * e_delta; + + } // val_iter + + } // part_iter + + const float N = (float)(total_endpoint_vals * tm.m_num_parts); + const float mse = (float)trial_endpoint_delta2 / N; + + // Gaussian entropy estimate - precomputed 0.5 * log2(2*pi*e) = ~2.0470956f + const float k_const = 2.0470956f; + + float bits_per_sym = 0.5f * log2f(basisu::maximum(mse, 1e-9f)) + k_const; + + bits_per_sym = clamp(bits_per_sym, 0.05f, 8.0f); + + // total est bits for this block’s endpoints + float total_est_bits = bits_per_sym * N; + + total_est_bits += endpoint_reuse_delta_model.get_price(reuse_index); + + if (total_est_bits < best_trial_bits) + { + best_trial_delta2 = trial_endpoint_delta2; + best_trial_bits = total_est_bits; + + best_reuse_bx = rx; + best_reuse_by = ry; + best_reuse_index = reuse_index; + + if (!best_trial_delta2) + break; + } + + } // reuse_index + + if (best_reuse_bx >= 0) + { + pEndpoint_pred_log_blk = &coded_blocks(best_reuse_bx, best_reuse_by); + + assert(!pEndpoint_pred_log_blk->m_solid_color_flag_ldr); + } + + } // if (endpoint_dpcm_global_enable) + + uint8_t predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { }; + + bool use_dpcm_endpoints = false; + + if (pEndpoint_pred_log_blk) + { 
+ use_dpcm_endpoints = true; + + assert(cur_log_blk.m_num_partitions == tm.m_num_parts); + + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + const bool always_repack_flag = false; + bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false; + + bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems( + pEndpoint_pred_log_blk->m_color_endpoint_modes[0], pEndpoint_pred_log_blk->m_endpoint_ise_range, pEndpoint_pred_log_blk->m_endpoints, + cur_actual_cem, cur_log_blk.m_endpoint_ise_range, predicted_endpoints[part_iter], + always_repack_flag, + endpoints_use_bc[part_iter], false, + blue_contraction_clamped_flag, base_ofs_clamped_flag); + + if (!conv_status) + { + // In practice, should never happen + use_dpcm_endpoints = false; + break; + } + } + } + + // TODO: Decide what is cheaper, endpoint DPCM vs. raw + + if (use_dpcm_endpoints) + { + total_endpoint_bits += enc.encode_and_return_price(1, use_dpcm_endpoints_model); + + total_endpoint_bits += enc.encode_and_return_price(best_reuse_index, endpoint_reuse_delta_model); + + if (astc_helpers::cem_supports_bc(cur_actual_cem)) + { + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + total_endpoint_bits += enc.encode_and_return_price(endpoints_use_bc[part_iter], endpoints_use_bc_models[bc_model_index]); + + } // part_iter + } + + // TODO: Perhaps separate DPCM models by CEM, entry index + auto& dpcm_model = dpcm_endpoint_models[cur_log_blk.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE]; + + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++) + { + int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]]; + int prev_e_rank = endpoint_ise_to_rank[predicted_endpoints[part_iter][val_iter]]; + + int e_val = imod(cur_e_rank - prev_e_rank, num_endpoint_levels); + + total_endpoint_bits 
+= dpcm_model.get_price(e_val); + enc.encode(e_val, dpcm_model); + + } // val_iter + + } // part_iter + + total_used_endpoint_dpcm++; + } + else + { + total_endpoint_bits += enc.encode_and_return_price(0, use_dpcm_endpoints_model); + + for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++) + { + for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++) + { + auto& model = raw_endpoint_models[cur_log_blk.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE]; + uint32_t e_val = cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]; + + total_endpoint_bits += model.get_price(e_val); + enc.encode(e_val, model); + + } // val_iter + + } // part_iter + + total_used_endpoint_raw++; + } + + } // if (full_cfg_endpoint_reuse_index >= 0) + + // ------------------------------------ Send weights + const uint32_t total_planes = cur_log_blk.m_dual_plane ? 2 : 1; + const uint32_t total_weights = cur_log_blk.m_grid_width * cur_log_blk.m_grid_height; + + const int num_weight_levels = astc_helpers::get_ise_levels(cur_log_blk.m_weight_ise_range); + const auto& weight_ise_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(cur_log_blk.m_weight_ise_range).m_ISE_to_rank; + + uint32_t use_dct_model_index = 0; + + if (enc_cfg.m_use_dct) + { + if (pLeft_state) + use_dct_model_index = pLeft_state->m_used_weight_dct; + else + use_dct_model_index = 1; + + if (pUpper_state) + use_dct_model_index |= pUpper_state->m_used_weight_dct ? 
2 : 0; + else + use_dct_model_index |= 2; + } + + if (use_faster_format) + { + bool use_dct = enc_cfg.m_use_dct; + + // TODO - tune this threshold + //const uint32_t SWITCH_TO_DPCM_NUM_COEFF_THRESH = (cur_log_blk.m_grid_width * cur_log_blk.m_grid_height * 102 + 64) >> 7; + const uint32_t SWITCH_TO_DPCM_NUM_COEFF_THRESH = (cur_log_blk.m_grid_width * cur_log_blk.m_grid_height * 45 + 64) >> 7; + + if (use_dct) + { + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + if (syms.m_max_coeff_mag > basist::astc_ldr_t::DCT_MAX_ARITH_COEFF_MAG) + { + use_dct = false; + break; + } + + if (syms.m_coeffs.size() > SWITCH_TO_DPCM_NUM_COEFF_THRESH) + { + use_dct = false; + break; + } + } + } + + if (enc_cfg.m_use_dct) + { + total_weight_bits += use_dct_model[use_dct_model_index].get_price(use_dct); + enc.encode(use_dct, use_dct_model[use_dct_model_index]); + } + + if (use_dct) + { + prev_state.m_used_weight_dct = true; + + total_used_dct++; + + if (total_planes > 1) + { + assert(blk_out.m_packed_dct_plane_data[0].m_num_dc_levels == blk_out.m_packed_dct_plane_data[1].m_num_dc_levels); + } + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + + if (syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1) + mean1_bytes.push_back((uint8_t)syms.m_dc_sym); + else + { + assert(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS0); + mean0_bits.put_bits(syms.m_dc_sym, 4); + } + + for (uint32_t i = 0; i < syms.m_coeffs.size(); i++) + { + if (syms.m_coeffs[i].m_coeff == INT16_MAX) + { + run_bytes.push_back(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX); + } + else + { + run_bytes.push_back((uint8_t)syms.m_coeffs[i].m_num_zeros); + + sign_bits.put_bits(syms.m_coeffs[i].m_coeff < 0, 1); + + assert((syms.m_coeffs[i].m_coeff != 0) && 
(iabs(syms.m_coeffs[i].m_coeff) <= 255)); + + coeff_bytes.push_back((uint8_t)(iabs(syms.m_coeffs[i].m_coeff) - 1)); + } + } + + } // plane_iter + } + else + { + total_used_weight_dpcm++; + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + int prev_w = num_weight_levels / 2; + + for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++) + { + int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes]; + int w = weight_ise_to_rank[ise_w]; + + int w_to_code = w; + w_to_code = imod(w - prev_w, num_weight_levels); + + prev_w = w; + + if (num_weight_levels <= 4) + weight2_bits.put_bits((uint8_t)w_to_code, 2); + else if (num_weight_levels <= 8) + weight3_bits.put_bits((uint8_t)w_to_code, 4); + else if (num_weight_levels <= 16) + weight4_bits.put_bits((uint8_t)w_to_code, 4); + else + weight8_bits.push_back((uint8_t)w_to_code); + + } // weight_iter + + } // plane_iter + } + } + else + { + float total_dpcm_bits = 0.0f, total_dct_bits = 0.0f; + const float FORBID_DCT_BITS = 1e+8f; + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + int prev_w = num_weight_levels / 2; + + for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++) + { + const auto& model = raw_weight_models[cur_log_blk.m_weight_ise_range - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE]; + + int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes]; + int w = weight_ise_to_rank[ise_w]; + + int w_to_code = w; + w_to_code = imod(w - prev_w, num_weight_levels); + + prev_w = w; + + total_dpcm_bits += model.get_price(w_to_code); + + } // weight_iter + + } // plane_iter + + if (enc_cfg.m_use_dct) + { + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + if (syms.m_max_coeff_mag > basist::astc_ldr_t::DCT_MAX_ARITH_COEFF_MAG) + { + total_dct_bits = FORBID_DCT_BITS; + break; + } + } + + if 
(total_dct_bits < FORBID_DCT_BITS) + { + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + + assert((syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS0) || (syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1)); + + total_dct_bits += weight_mean_models[(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1) ? 1 : 0].get_price(syms.m_dc_sym); + + for (uint32_t i = 0; i < syms.m_coeffs.size(); i++) + { + if (syms.m_coeffs[i].m_coeff == INT16_MAX) + { + total_dct_bits += dct_run_len_model.get_price(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX); + } + else + { + assert(syms.m_coeffs[i].m_num_zeros < basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX); + + total_dct_bits += dct_run_len_model.get_price(syms.m_coeffs[i].m_num_zeros); + + total_dct_bits += 1.0f; // sign bit + assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255)); + total_dct_bits += dct_coeff_mag.get_price(iabs(syms.m_coeffs[i].m_coeff) - 1); + } + } // i + } // plane_iter + } + } + + // TODO: Check if any DCT coeff overflows 8-bit mags, switch to DPCM. (In practice, not needed.) 
+ bool use_dct = false; + if ((enc_cfg.m_use_dct) && + (total_dct_bits < FORBID_DCT_BITS) && + ((total_dct_bits + use_dct_model[use_dct_model_index].get_price(1)) <= (total_dpcm_bits + use_dct_model[use_dct_model_index].get_price(0)))) + { + use_dct = true; + } + + if (enc_cfg.m_use_dct) + { + total_weight_bits += use_dct_model[use_dct_model_index].get_price(use_dct); + enc.encode(use_dct, use_dct_model[use_dct_model_index]); + } + + if (use_dct) + { + prev_state.m_used_weight_dct = true; + + total_used_dct++; + + if (total_planes > 1) + { + assert(blk_out.m_packed_dct_plane_data[0].m_num_dc_levels == blk_out.m_packed_dct_plane_data[1].m_num_dc_levels); + } + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter]; + + total_weight_bits += enc.encode_and_return_price(syms.m_dc_sym, weight_mean_models[(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1) ? 1 : 0]); + + for (uint32_t i = 0; i < syms.m_coeffs.size(); i++) + { + if (syms.m_coeffs[i].m_coeff == INT16_MAX) + { + total_weight_bits += enc.encode_and_return_price(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX, dct_run_len_model); + + total_dct_syms++; + } + else + { + total_weight_bits += enc.encode_and_return_price(syms.m_coeffs[i].m_num_zeros, dct_run_len_model); + + total_dct_syms++; + + enc.put_bit(syms.m_coeffs[i].m_coeff < 0); + total_weight_bits += 1.0f; + + assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255)); + total_weight_bits += enc.encode_and_return_price(iabs(syms.m_coeffs[i].m_coeff) - 1, dct_coeff_mag); + + total_dct_syms++; + } + } + + } // plane_iter + } + else + { + total_used_weight_dpcm++; + auto& model = raw_weight_models[cur_log_blk.m_weight_ise_range - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE]; + + for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++) + { + int prev_w = num_weight_levels / 2; + + for (uint32_t 
weight_iter = 0; weight_iter < total_weights; weight_iter++) + { + int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes]; + int w = weight_ise_to_rank[ise_w]; + + int w_to_code = w; + w_to_code = imod(w - prev_w, num_weight_levels); + + prev_w = w; + + total_weight_bits += model.get_price(w_to_code); + enc.encode(w_to_code, model); + + total_dpcm_syms++; + + } // weight_iter + + } // plane_iter + } + + } // use_faster_format + + } // bx + + if (cur_run_len) + { + total_runs++; + total_run_blocks += cur_run_len; + + total_header_bits += enc.encode_and_return_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RUN, mode_model); + total_header_bits += enc.put_gamma_and_return_price(cur_run_len, m_run_len_contexts); + cur_run_len = 0; + } + + } // by + + enc.put_bits(basist::astc_ldr_t::FINAL_SYNC_MARKER, basist::astc_ldr_t::FINAL_SYNC_MARKER_BITS); + + enc.flush(); + + if (global_cfg.m_debug_output) + { + fmt_debug_printf("Encoding time: {} secs\n", itm.get_elapsed_secs()); + } + + if (global_cfg.m_debug_images) + { + save_png(global_cfg.m_debug_file_prefix + "vis_img.png", vis_img); + } + + if ((global_cfg.m_debug_images) || (global_cfg.m_debug_output)) + { + image coded_img(width, height); + + vector2D phys_blocks(num_blocks_x, num_blocks_y); + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + const astc_helpers::log_astc_block& log_blk = coded_blocks(bx, by); + + color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + + bool status = astc_helpers::decode_block(log_blk, block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status) + { + fmt_error_printf("astc_helpers::decode_block() failed\n"); + return false; + } + + // Be positive the logical block can be unpacked correctly as XUASTC LDR. 
+ color_rgba block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; + bool status_alt = astc_helpers::decode_block_xuastc_ldr(log_blk, block_pixels_alt, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8); + if (!status_alt) + { + fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() failed\n"); + return false; + } + + if (memcmp(block_pixels, block_pixels_alt, sizeof(color_rgba) * block_width * block_height) != 0) + { + fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() decode pixel mismatch\n"); + return false; + } + + coded_img.set_block_clipped(block_pixels, bx * block_width, by * block_height, block_width, block_height); + + } // bx + + } //by + + if (global_cfg.m_debug_images) + save_png(global_cfg.m_debug_file_prefix + "coded_img.png", coded_img); + + if (global_cfg.m_debug_output) + { + debug_printf("Orig image vs. coded img:\n"); + print_image_metrics(orig_img, coded_img); + } + } + + const uint64_t comp_data_size = enc.get_data_buf().size(); + if (comp_data_size > UINT32_MAX) + return false; + + uint8_vec suffix_bytes; + + if (use_faster_format) + { + suffix_bytes.reserve(8192); + + mean0_bits.flush(); + sign_bits.flush(); + weight2_bits.flush(); + weight3_bits.flush(); + weight4_bits.flush(); + + const uint32_t zstd_level = 9; + + uint8_vec comp_mean0, comp_mean1, comp_run, comp_coeff, comp_weight2, comp_weight3, comp_weight4, comp_weight8; + + if (!zstd_compress(mean0_bits.get_bytes().data(), mean0_bits.get_bytes().size(), comp_mean0, zstd_level)) + return false; + if (!zstd_compress(mean1_bytes.data(), mean1_bytes.size(), comp_mean1, zstd_level)) + return false; + if (!zstd_compress(run_bytes.data(), run_bytes.size(), comp_run, zstd_level)) + return false; + if (!zstd_compress(coeff_bytes.data(), coeff_bytes.size(), comp_coeff, zstd_level)) + return false; + if (!zstd_compress(weight2_bits.get_bytes().data(), weight2_bits.get_bytes().size(), comp_weight2, 
zstd_level)) + return false; + if (!zstd_compress(weight3_bits.get_bytes().data(), weight3_bits.get_bytes().size(), comp_weight3, zstd_level)) + return false; + if (!zstd_compress(weight4_bits.get_bytes().data(), weight4_bits.get_bytes().size(), comp_weight4, zstd_level)) + return false; + if (!zstd_compress(weight8_bits.data(), weight8_bits.size(), comp_weight8, zstd_level)) + return false; + + hdr.m_flags = (uint8_t)basist::astc_ldr_t::xuastc_ldr_syntax::cHybridArithZStd; + + hdr.m_arith_bytes_len = (uint32_t)comp_data_size; + hdr.m_mean0_bits_len = (uint32_t)comp_mean0.size(); + hdr.m_mean1_bytes_len = (uint32_t)comp_mean1.size(); + hdr.m_run_bytes_len = (uint32_t)comp_run.size(); + hdr.m_coeff_bytes_len = (uint32_t)comp_coeff.size(); + hdr.m_sign_bits_len = (uint32_t)sign_bits.get_bytes().size(); + hdr.m_weight2_bits_len = (uint32_t)comp_weight2.size(); + hdr.m_weight3_bits_len = (uint32_t)comp_weight3.size(); + hdr.m_weight4_bits_len = (uint32_t)comp_weight4.size(); + hdr.m_weight8_bytes_len = (uint32_t)comp_weight8.size(); + + suffix_bytes.append(comp_mean0); + suffix_bytes.append(comp_mean1); + suffix_bytes.append(comp_run); + suffix_bytes.append(comp_coeff); + suffix_bytes.append(sign_bits.get_bytes()); + suffix_bytes.append(comp_weight2); + suffix_bytes.append(comp_weight3); + suffix_bytes.append(comp_weight4); + suffix_bytes.append(comp_weight8); + + if (global_cfg.m_debug_output) + { + fmt_debug_printf("Zstd compressed sizes:\n"); + fmt_debug_printf(" Mean0 bytes: {} comp size: {}\n", (uint64_t)mean0_bits.get_bytes().size(), (uint64_t)comp_mean0.size()); + fmt_debug_printf(" Mean1 bytes: {} comp size: {}\n", (uint64_t)mean1_bytes.size(), (uint64_t)comp_mean1.size()); + fmt_debug_printf(" Run bytes: {} comp size: {}\n", (uint64_t)run_bytes.size(), (uint64_t)comp_run.size()); + fmt_debug_printf(" Coeff bytes: {} comp size: {}\n", (uint64_t)coeff_bytes.size(), (uint64_t)comp_coeff.size()); + fmt_debug_printf(" Sign bytes: {}\n", 
(uint64_t)sign_bits.get_bytes().size()); + fmt_debug_printf(" Weight2 bytes: {} comp size: {}\n", (uint64_t)weight2_bits.get_bytes().size(), (uint64_t)comp_weight2.size()); + fmt_debug_printf(" Weight3 bytes: {} comp size: {}\n", (uint64_t)weight3_bits.get_bytes().size(), (uint64_t)comp_weight3.size()); + fmt_debug_printf(" Weight4 bytes: {} comp size: {}\n", (uint64_t)weight4_bits.get_bytes().size(), (uint64_t)comp_weight4.size()); + fmt_debug_printf(" Weight8 bytes: {} comp size: {}\n", (uint64_t)weight8_bits.size(), (uint64_t)comp_weight8.size()); + } + } + + assert(comp_data.size() == 0); + if (use_faster_format) + { + comp_data.resize(sizeof(hdr)); + memcpy(comp_data.data(), &hdr, sizeof(hdr)); + } + else + { + comp_data.push_back((uint8_t)basist::astc_ldr_t::xuastc_ldr_syntax::cFullArith); + } + + comp_data.append(enc.get_data_buf()); + + comp_data.append(suffix_bytes); + + if (comp_data.size() > UINT32_MAX) + return false; + + if (global_cfg.m_debug_output) + { + fmt_debug_printf("Total blocks: {}\n", total_blocks); + fmt_debug_printf("Total lossy replacements made by supercompression layer: {} {3.2}%\n", total_lossy_replacements, (float)total_lossy_replacements * 100.0f / (float)total_blocks); + fmt_debug_printf("Total runs: {}, total run blocks: {} {3.2}%\n", total_runs, total_run_blocks, (float)total_run_blocks * 100.0f / (float)total_blocks); + fmt_debug_printf("Total blocks coded (not inside runs): {} {3.2}%\n", total_nonrun_blocks, (float)total_nonrun_blocks * 100.0f / (float)total_blocks); + fmt_debug_printf("num_part_hash_probes: {}, num_part_hash_hits: {} {3.2}%\n", num_part_hash_probes, num_part_hash_hits, num_part_hash_probes ? 
((float)num_part_hash_hits * 100.0f / (float)num_part_hash_probes) : 0); + fmt_debug_printf("Total DCT syms: {}, DPCM syms: {}\n", total_dct_syms, total_dpcm_syms); + + const uint32_t total_non_void_extent_blocks = total_blocks - total_solid_blocks; + + fmt_debug_printf("Total blocks using void extent: {} {3.2}%\n", + total_solid_blocks, (float)total_solid_blocks * 100.0f / (float)total_blocks); + + fmt_debug_printf("Total non void-extent blocks: {} {3.2}%\n", + total_non_void_extent_blocks, (float)total_non_void_extent_blocks * 100.0f / (float)total_blocks); + + fmt_debug_printf("Total full cfg+part ID+endpoint reuse commands: {} {3.2}%\n", + total_full_reuse_commands, (float)total_full_reuse_commands * 100.0f / (float)total_blocks); + + fmt_debug_printf("Total raw commands: {} {3.2}%\n", + total_raw_commands, (float)total_raw_commands * 100.0f / (float)total_blocks); + + fmt_debug_printf("Total reuse cfg+part ID emitted: {} {3.2}%, Total full cfg emitted: {} {3.2}%\n", + total_reuse_full_cfg_emitted, (float)total_reuse_full_cfg_emitted * 100.0f / (float)total_blocks, + total_full_cfg_emitted, (float)total_full_cfg_emitted * 100.0f / (float)total_blocks); + + fmt_debug_printf("Total coded endpoints using DPCM: {} {3.2}%\n", + total_used_endpoint_dpcm, (float)total_used_endpoint_dpcm * 100.0f / (float)total_non_void_extent_blocks); + + fmt_debug_printf("Total coded endpoints using RAW: {} {3.2}%\n", + total_used_endpoint_raw, (float)total_used_endpoint_raw * 100.0f / (float)total_non_void_extent_blocks); + + fmt_debug_printf("Total coded blocks using weight DCT: {} {3.2}%, total blocks using weight DPCM: {} {3.2}%\n", + total_used_dct, (float)total_used_dct * 100.0f / total_non_void_extent_blocks, + total_used_weight_dpcm, (float)total_used_weight_dpcm * 100.0f / (float)total_non_void_extent_blocks); + + fmt_debug_printf("Total header bits: {} bytes: {}, bpp: {}, bits per non-void extent block: {}\nTotal endpoint bits: {}, bytes: {}, bpp: {}, bits per non-void 
extent block: {}\nTotal weight bits: {}, bytes: {}, bpp: {}, bits per non-void extent block: {}\nTotal_bits: {} bytes: {}, bpp {}, bits per non-void extent block: {}\n", + total_header_bits, total_header_bits / 8.0f, total_header_bits / (double)total_pixels, total_header_bits / (double)total_non_void_extent_blocks, + total_endpoint_bits, total_endpoint_bits / 8.0f, total_endpoint_bits / (double)total_pixels, total_endpoint_bits / (double)total_non_void_extent_blocks, + total_weight_bits, total_weight_bits / 8.0f, total_weight_bits / (double)total_pixels, total_weight_bits / (double)total_non_void_extent_blocks, + total_header_bits + total_endpoint_bits + total_weight_bits, + (total_header_bits + total_endpoint_bits + total_weight_bits) / 8.0f, + (total_header_bits + total_endpoint_bits + total_weight_bits) / (double)total_pixels, + (total_header_bits + total_endpoint_bits + total_weight_bits) / (double)total_non_void_extent_blocks); + + fmt_debug_printf("Compressed to {} bytes, {3.3}bpp\n\n", comp_data.size_u32(), ((float)comp_data.size() * 8.0f) / (float)total_pixels); + +#if 0 + for (uint32_t i = 0; i < 4; i++) + { + solid_color_dpcm_model[i].print_prices(fmt_string("solid_color_dpcm_model[{}]:\n\n", i).c_str()); + } +#endif + } + + return true; +} + +void encoder_init() +{ + if (g_initialized) + return; + + g_initialized = true; +} + +void deblock_filter(uint32_t filter_block_width, uint32_t filter_block_height, const image& src_img, image& dst_img, bool stronger_filtering, int SKIP_THRESH) +{ + image temp_img(src_img); + + for (int y = 0; y < (int)src_img.get_height(); y++) + { + for (int x = filter_block_width; x < (int)src_img.get_width(); x += filter_block_width) + { + color_rgba ll(src_img.get_clamped(x - 2, y)); + color_rgba l(src_img.get_clamped(x - 1, y)); + color_rgba r(src_img.get_clamped(x, y)); + color_rgba rr(src_img.get_clamped(x + 1, y)); + + if (SKIP_THRESH < 256) + { + bool skip_flag = false; + for (uint32_t c = 0; c < 4; c++) + { + int delta = 
iabs((int)l[c] - (int)r[c]); + if (delta > SKIP_THRESH) + { + skip_flag = true; + break; + } + } + + if (skip_flag) + continue; + } + + color_rgba ml, mr; + for (uint32_t c = 0; c < 4; c++) + { + if (stronger_filtering) + { + ml[c] = (3 * l[c] + 2 * r[c] + ll[c] + 3) / 6; + mr[c] = (3 * r[c] + 2 * l[c] + rr[c] + 3) / 6; + } + else + { + ml[c] = (5 * l[c] + 2 * r[c] + ll[c] + 4) / 8; + mr[c] = (5 * r[c] + 2 * l[c] + rr[c] + 4) / 8; + } + } + + temp_img.set_clipped(x - 1, y, ml); + temp_img.set_clipped(x, y, mr); + + } // x + + } // y + + dst_img = temp_img; + + for (int x = 0; x < (int)temp_img.get_width(); x++) + { + for (int y = filter_block_height; y < (int)temp_img.get_height(); y += filter_block_height) + { + color_rgba uu(temp_img.get_clamped(x, y - 2)); + color_rgba u(temp_img.get_clamped(x, y - 1)); + color_rgba d(temp_img.get_clamped(x, y)); + color_rgba dd(temp_img.get_clamped(x, y + 1)); + + if (SKIP_THRESH < 256) + { + bool skip_flag = false; + for (uint32_t c = 0; c < 4; c++) + { + int delta = iabs((int)u[c] - (int)d[c]); + if (delta > SKIP_THRESH) + { + skip_flag = true; + break; + } + } + + if (skip_flag) + continue; + } + + color_rgba mu, md; + for (uint32_t c = 0; c < 4; c++) + { + if (stronger_filtering) + { + mu[c] = (3 * u[c] + 2 * d[c] + uu[c] + 3) / 6; + md[c] = (3 * d[c] + 2 * u[c] + dd[c] + 3) / 6; + } + else + { + mu[c] = (5 * u[c] + 2 * d[c] + uu[c] + 4) / 8; + md[c] = (5 * d[c] + 2 * u[c] + dd[c] + 4) / 8; + } + } + + dst_img.set_clipped(x, y - 1, mu); + dst_img.set_clipped(x, y, md); + + } // x + + } // y +} + +} // namespace astc_ldr +} // namespace basisu diff --git a/encoder/basisu_astc_ldr_encode.h b/encoder/basisu_astc_ldr_encode.h new file mode 100644 index 0000000..0cbe51b --- /dev/null +++ b/encoder/basisu_astc_ldr_encode.h @@ -0,0 +1,125 @@ +// File: basisu_astc_ldr_encode.cpp +#pragma once +#include "basisu_enc.h" +#include "../transcoder/basisu_astc_helpers.h" + +namespace basisu { +namespace astc_ldr { + + void encoder_init(); 
+
+	const int EFFORT_LEVEL_MIN = 0, EFFORT_LEVEL_MAX = 10, EFFORT_LEVEL_DEF = 3; // same [0,10] range and default (3) as m_effort_level below
+	const int DCT_QUALITY_MIN = 1, DCT_QUALITY_MAX = 100; // same [1,100] range as documented on m_dct_quality below
+
+	struct astc_ldr_encode_config
+	{
+		astc_ldr_encode_config()
+		{
+		}
+
+		void clear() // resets every field to its in-class default by assigning a default-constructed config
+		{
+			*this = astc_ldr_encode_config();
+		}
+
+		// ASTC LDR block dimensions. Must be a valid ASTC block dimension. Any supported from 4x4-12x12, including unequal dimensions.
+		uint32_t m_astc_block_width = 6;
+		uint32_t m_astc_block_height = 6;
+
+		// If true, the encoder assumes all ASTC blocks will be decompressed using sRGB vs. LDR8 mode. This corresponds to astcenc's -cs vs. cl color profiles.
+		// This should match how the texture is later decoded by the GPU for maximum quality. This bit is stored into the output file.
+		bool m_astc_decode_mode_srgb = true;
+
+		// If true, trade off some compression (3-10%) for faster decompression.
+		// If false, favor highest compression, but slower decompression.
+		//bool m_use_faster_format = true;
+
+		basist::astc_ldr_t::xuastc_ldr_syntax m_compressed_syntax = basist::astc_ldr_t::xuastc_ldr_syntax::cFullArith; // printed as an integer by debug_print()
+
+		// Encoder CPU effort vs. quality. [0,10], higher=better.
+		// 0=extremely fast but very brittle (no subsets)
+		// 1=first 2 subset effort level
+		// 10=extremely high CPU requirements.
+		uint32_t m_effort_level = 3;
+
+		// Weight grid DCT quality [1,100] - higher=better quality (JPEG-style).
+		float m_dct_quality = 85;
+
+		// true=use weight grid DCT, false=always use DPCM
+		bool m_use_dct = false;
+
+		// true=use lossy supercompression, false=supercompression stage is always lossless.
+		bool m_lossy_supercompression = false;
+
+		// Channel weights used to compute RGBA colorspace L2 errors. Must be >= 1.
+		uint32_t m_comp_weights[4] = { 1, 1, 1, 1 };
+
+		// Lossy supercompression stage parameters for RGB vs. RGBA image inputs.
+		// (Bounded RDO - explicitly not Lagrangian.)
+		float m_replacement_min_psnr = 35.0f; // if the block's base PSNR is less than this, it cannot be changed
+		float m_psnr_trial_diff_thresh = 1.5f; // reject candidates if their PSNR is lower than m_replacement_min_psnr-m_psnr_trial_diff_thresh
+		float m_psnr_trial_diff_thresh_edge = 1.0f; // edge variant
+
+		// Lossy supercompression settings - alpha texture variants
+		float m_replacement_min_psnr_alpha = 38.0f;
+		float m_psnr_trial_diff_thresh_alpha = .75f;
+		float m_psnr_trial_diff_thresh_edge_alpha = .5f;
+
+		// If true, try encoding blurred blocks, in addition to unblurred, for superpass 1 and 2.
+		// Higher quality, but massively slower and not yet tuned/refined.
+		bool m_block_blurring_p1 = false, m_block_blurring_p2 = false;
+
+		// If true, no matter what effort level subset usage will be disabled.
+		bool m_force_disable_subsets = false;
+
+		// If true, no matter what effort level RGB dual plane usage will be disabled.
+		bool m_force_disable_rgb_dual_plane = false;
+
+		bool m_debug_images = false; // NOTE(review): presumably enables writing debug images - confirm in the encoder
+		bool m_debug_output = false; // NOTE(review): presumably enables debug logging - confirm in the encoder
+
+		std::string m_debug_file_prefix; // NOTE(review): presumably the filename prefix for debug files - confirm in the encoder
+
+		void debug_print() const // dumps the settings via fmt_debug_printf(); m_debug_output and m_debug_file_prefix are not printed
+		{
+			fmt_debug_printf("ASTC block dimensions: {}x{}\n", m_astc_block_width, m_astc_block_height);
+			fmt_debug_printf("ASTC decode profile mode sRGB: {}\n", m_astc_decode_mode_srgb);
+			fmt_debug_printf("Syntax: {}\n", (uint32_t)m_compressed_syntax);
+			fmt_debug_printf("Effort level: {}\n", m_effort_level);
+			fmt_debug_printf("Use DCT: {}\n", m_use_dct);
+			fmt_debug_printf("DCT quality level (1-100): {}\n", m_dct_quality);
+			fmt_debug_printf("Comp weights: {} {} {} {}\n", m_comp_weights[0], m_comp_weights[1], m_comp_weights[2], m_comp_weights[3]);
+			fmt_debug_printf("Block blurring: {} {}\n", m_block_blurring_p1, m_block_blurring_p2);
+			fmt_debug_printf("Force disable subsets: {}\n", m_force_disable_subsets);
+			fmt_debug_printf("Force disable RGB dual plane: {}\n", m_force_disable_rgb_dual_plane);
+
+			fmt_debug_printf("\nLossy supercompression: {}\n", m_lossy_supercompression);
+			fmt_debug_printf("m_replacement_min_psnr: {}\n", m_replacement_min_psnr);
+			fmt_debug_printf("m_psnr_trial_diff_thresh: {}\n", m_psnr_trial_diff_thresh);
+			fmt_debug_printf("m_psnr_trial_diff_thresh_edge: {}\n", m_psnr_trial_diff_thresh_edge);
+			fmt_debug_printf("m_replacement_min_psnr_alpha: {}\n", m_replacement_min_psnr_alpha);
+			fmt_debug_printf("m_psnr_trial_diff_thresh_alpha: {}\n", m_psnr_trial_diff_thresh_alpha);
+			fmt_debug_printf("m_psnr_trial_diff_thresh_edge_alpha: {}\n", m_psnr_trial_diff_thresh_edge_alpha);
+
+			fmt_debug_printf("m_debug_images: {}\n", m_debug_images);
+		}
+	};
+
+	bool compress_image( // declaration only; NOTE(review): vector2D's template argument appears to have been lost in this dump
+		const image& orig_img, uint8_vec &comp_data, vector2D& coded_blocks,
+		const astc_ldr_encode_config& global_cfg,
+		job_pool& job_pool);
+
+	bool decompress_image( // declaration only; results are returned through the reference parameters
+		const uint8_t* pComp_data, size_t comp_data_size,
+		vector2D& coded_blocks, // the actual supercompressed ASTC LDR blocks emitted by the compressor
+		uint32_t& astc_block_width, uint32_t& astc_block_height,
+		uint32_t &actual_width, uint32_t &actual_height, bool &has_alpha, bool& uses_srgb_astc_decode_mode,
+		bool debug_output);
+
+	void deblock_filter(uint32_t filter_block_width, uint32_t filter_block_height, const image& src_img, image& dst_img, bool stronger_filtering = false, int SKIP_THRESH = 24); // block edges where any channel steps by more than SKIP_THRESH are left unfiltered
+
+} // namespace astc_ldr
+} // namespace basisu
+
+
diff --git a/encoder/basisu_wasm_api.cpp b/encoder/basisu_wasm_api.cpp
new file mode 100644
index 0000000..d4db364
--- /dev/null
+++ b/encoder/basisu_wasm_api.cpp
@@ -0,0 +1,319 @@
+// File: basisu_wasm_api.cpp - Simplified compression API for WASM WASI modules and Python native support.
+// Also useable by plain C callers.
+#include "basisu_comp.h"
+#include "basisu_wasm_api.h"
+
+using namespace basisu;
+
+static inline uint64_t wasm_offset(void* p) // native pointer -> 64-bit offset value handed back to the caller
+{
+	return (uint64_t)(uintptr_t)p;
+}
+
+static inline uint8_t* wasm_ptr(uint64_t offset) // 64-bit offset value from the caller -> native byte pointer
+{
+	return (uint8_t*)(uintptr_t)offset;
+}
+
+BU_WASM_EXPORT("bu_get_version")
+uint32_t bu_get_version() // prints a greeting containing the version, then returns BASISU_LIB_VERSION
+{
+	printf("Hello from basisu_wasm_api.cpp version %u\n", BASISU_LIB_VERSION);
+
+	return BASISU_LIB_VERSION;
+}
+
+BU_WASM_EXPORT("bu_enable_debug_printf")
+void bu_enable_debug_printf(uint32_t flag) // any non-zero flag enables debug printing, zero disables it
+{
+	enable_debug_printf(flag != 0);
+}
+
+BU_WASM_EXPORT("bu_init")
+void bu_init() // initializes the encoder library; both basisu_encoder_init() arguments are passed as false
+{
+	basisu_encoder_init(false, false);
+}
+
+// Memory alloc/free — stubs. Thin wrappers over malloc()/free(); bu_alloc() returns 0 when malloc() fails.
+BU_WASM_EXPORT("bu_alloc")
+uint64_t bu_alloc(uint64_t size)
+{
+	void* p = malloc((size_t)size);
+	return wasm_offset(p);
+}
+
+BU_WASM_EXPORT("bu_free")
+void bu_free(uint64_t ofs) // ofs must come from bu_alloc(); 0 is accepted because free(NULL) is a no-op
+{
+	free(wasm_ptr(ofs));
+}
+
+const uint32_t COMP_PARAMS_MAGIC = 0x43504D50; // "CPMP"; lets every export reject offsets that don't point at a comp_params
+
+struct comp_params // per-compression state: input images, the compressed output and its stats
+{
+	uint32_t m_magic = COMP_PARAMS_MAGIC;
+
+	comp_params()
+	{
+		clear();
+	}
+
+	void clear() // drops the compressed output, all input images and the stats
+	{
+		assert(m_magic == COMP_PARAMS_MAGIC);
+
+		m_comp_data.clear();
+		m_images.clear();
+		m_imagesf.clear();
+
+		m_stats.clear();
+	}
+
+	uint8_vec m_comp_data; // output of the last successful bu_compress_texture() call
+
+	basisu::vector<image> m_images; // LDR inputs, filled by bu_comp_params_set_image_rgba32()
+	basisu::vector<imagef> m_imagesf; // float inputs, filled by bu_comp_params_set_image_float_rgba()
+
+	image_stats m_stats; // handed to basis_compress_internal() by bu_compress_texture()
+};
+
+BU_WASM_EXPORT("bu_new_comp_params")
+uint64_t bu_new_comp_params() // heap-allocates a comp_params; release it with bu_delete_comp_params()
+{
+	comp_params* p = new comp_params;
+	return wasm_offset(p);
+}
+
+BU_WASM_EXPORT("bu_delete_comp_params")
+wasm_bool_t bu_delete_comp_params(uint64_t params_ofs) // returns false for a null offset or a bad magic value
+{
+	comp_params* p = (comp_params*)wasm_ptr(params_ofs);
+	if (!p)
+		return false;
+
+	assert(p->m_magic == COMP_PARAMS_MAGIC);
+	if (p->m_magic != COMP_PARAMS_MAGIC)
+		return false;
+
+	delete p;
+
+	return true;
+}
+
+BU_WASM_EXPORT("bu_comp_params_get_comp_data_size")
+uint64_t bu_comp_params_get_comp_data_size(uint64_t params_ofs) // size in bytes of the compressed output; 0 on invalid params
+{
+	comp_params* pParams = (comp_params*)wasm_ptr(params_ofs);
+	if (!pParams)
+		return 0;
+
+	assert(pParams->m_magic == COMP_PARAMS_MAGIC);
+	if (pParams->m_magic != COMP_PARAMS_MAGIC)
+		return 0;
+
+	return pParams->m_comp_data.size();
+}
+
+BU_WASM_EXPORT("bu_comp_params_get_comp_data_ofs")
+uint64_t bu_comp_params_get_comp_data_ofs(uint64_t params_ofs) // offset of the compressed output bytes; 0 on invalid params
+{
+	comp_params* pParams = (comp_params*)wasm_ptr(params_ofs);
+	if (!pParams)
+		return 0;
+
+	assert(pParams->m_magic == COMP_PARAMS_MAGIC);
+	if (pParams->m_magic != COMP_PARAMS_MAGIC)
+		return 0;
+
+	return wasm_offset(pParams->m_comp_data.get_ptr());
+}
+
+BU_WASM_EXPORT("bu_comp_params_clear")
+wasm_bool_t bu_comp_params_clear(uint64_t params_ofs) // resets the object via comp_params::clear(); false on invalid params
+{
+	comp_params* pParams = (comp_params*)wasm_ptr(params_ofs);
+	if (!pParams)
+		return false;
+
+	assert(pParams->m_magic == COMP_PARAMS_MAGIC);
+	if (pParams->m_magic != COMP_PARAMS_MAGIC)
+		return false;
+
+	pParams->clear();
+
+	return true;
+}
+
+// Caller wants to give us a LDR/SDR 32bpp RGBA mipmap level (4 bytes per pixel). The pixels are copied into m_images[image_index]; returns false on any invalid argument.
+BU_WASM_EXPORT("bu_comp_params_set_image_rgba32")
+wasm_bool_t bu_comp_params_set_image_rgba32(
+	uint64_t params_ofs,
+	uint32_t image_index,
+	uint64_t img_data_ofs,
+	uint32_t width, uint32_t height,
+	uint32_t pitch_in_bytes)
+{
+	if ((!width) || (!height) || (!pitch_in_bytes))
+		return false;
+
+	comp_params* pParams = (comp_params*)wasm_ptr(params_ofs);
+	if (!pParams)
+		return false;
+
+	assert(pParams->m_magic == COMP_PARAMS_MAGIC);
+	if (pParams->m_magic != COMP_PARAMS_MAGIC)
+		return false;
+
+	const uint8_t* pImage = wasm_ptr(img_data_ofs);
+	if (!pImage)
+		return false;
+
+	const uint64_t row_bytes = (uint64_t)width * sizeof(color_rgba); // 64-bit math: a 32-bit product can wrap and defeat the pitch check below
+
+	if (pitch_in_bytes < row_bytes)
+		return false;
+
+	if (image_index >= pParams->m_images.size()) // grow the array only when the slot doesn't exist yet
+	{
+		if (!pParams->m_images.try_resize(image_index + 1))
+			return false;
+	}
+
+	basisu::image& dst_img = pParams->m_images[image_index];
+
+	dst_img.resize(width, height);
+
+	if (pitch_in_bytes == row_bytes) // rows are tightly packed, so one bulk copy is enough
+	{
+		memcpy(dst_img.get_ptr(), pImage, (size_t)pitch_in_bytes * height);
+	}
+	else
+	{
+		for (uint32_t y = 0; y < height; y++)
+		{
+			const uint8_t* pSrc_row = pImage + (size_t)y * pitch_in_bytes;
+
+			uint8_t* pDst_row = (uint8_t *)&dst_img(0, y);
+
+			memcpy(pDst_row, pSrc_row, (size_t)row_bytes); // row_bytes <= pitch_in_bytes here, so the cast cannot truncate
+		} // y
+	}
+
+	return true;
+}
+
+// Caller wants to give us a float RGBA mipmap level (4*4=16 bytes per pixel). The pixels are copied into m_imagesf[image_index]; returns false on any invalid argument.
+BU_WASM_EXPORT("bu_comp_params_set_image_float_rgba")
+wasm_bool_t bu_comp_params_set_image_float_rgba(
+	uint64_t params_ofs,
+	uint32_t image_index,
+	uint64_t img_data_ofs,
+	uint32_t width, uint32_t height,
+	uint32_t pitch_in_bytes)
+{
+	if ((!width) || (!height) || (!pitch_in_bytes))
+		return false;
+
+	comp_params* pParams = (comp_params*)wasm_ptr(params_ofs);
+	if (!pParams)
+		return false;
+
+	assert(pParams->m_magic == COMP_PARAMS_MAGIC);
+	if (pParams->m_magic != COMP_PARAMS_MAGIC)
+		return false;
+
+	const uint8_t* pImage = wasm_ptr(img_data_ofs);
+	if (!pImage)
+		return false;
+
+	const uint64_t row_bytes = (uint64_t)width * (sizeof(float) * 4); // 64-bit math: a 32-bit product can wrap and defeat the pitch check below
+
+	if (pitch_in_bytes < row_bytes)
+		return false;
+
+	if (image_index >= pParams->m_imagesf.size()) // must test m_imagesf (the array indexed below); testing m_images could shrink m_imagesf or index past its end
+	{
+		if (!pParams->m_imagesf.try_resize(image_index + 1))
+			return false;
+	}
+
+	basisu::imagef& dst_img = pParams->m_imagesf[image_index];
+
+	dst_img.resize(width, height);
+
+	if (pitch_in_bytes == row_bytes) // rows are tightly packed, so one bulk copy is enough
+	{
+		memcpy((void *)dst_img.get_ptr(), (const void *)pImage, (size_t)pitch_in_bytes * height);
+	}
+	else
+	{
+		for (uint32_t y = 0; y < height; y++)
+		{
+			const uint8_t* pSrc_row = pImage + (size_t)y * pitch_in_bytes;
+
+			uint8_t* pDst_row = (uint8_t*)&dst_img(0, y);
+
+			memcpy(pDst_row, pSrc_row, (size_t)row_bytes); // row_bytes <= pitch_in_bytes here, so the cast cannot truncate
+		} // y
+	}
+
+	return true;
+}
+
+BU_WASM_EXPORT("bu_compress_texture")
+wasm_bool_t bu_compress_texture( // compresses the images stored in params into m_comp_data; returns false on bad arguments or compression failure
+	uint64_t params_ofs,
+	uint32_t desired_basis_tex_format, // basis_tex_format
+	int quality_level, int effort_level,
+	uint64_t flags_and_quality, float low_level_uastc_rdo_or_dct_quality)
+{
+	//enable_debug_printf((flags_and_quality & cFlagDebug) != 0);
+
+	comp_params* pParams = (comp_params*)wasm_ptr(params_ofs);
+	if (!pParams)
+		return false;
+
+	assert(pParams->m_magic == COMP_PARAMS_MAGIC);
+	if (pParams->m_magic != COMP_PARAMS_MAGIC)
+		return false;
+
+	pParams->m_comp_data.clear(); // drop the previous result so a failed call never leaves stale output behind
+
+	if (desired_basis_tex_format >= (uint32_t)basist::basis_tex_format::cTotalFormats)
+		return false;
+
+	if (!pParams->m_images.size() && !pParams->m_imagesf.size()) // no input images were set
+		return false;
+	if (pParams->m_images.size() && pParams->m_imagesf.size()) // LDR and float inputs cannot be mixed in one call
+		return false;
+
+	size_t comp_size = 0;
+
+	void* pComp_data = basis_compress_internal(
+		(basist::basis_tex_format)desired_basis_tex_format,
+		pParams->m_images.size() ? &pParams->m_images : nullptr,
+		pParams->m_imagesf.size() ? &pParams->m_imagesf : nullptr,
+		(uint32_t)flags_and_quality, // only the low 32 bits are forwarded
+		low_level_uastc_rdo_or_dct_quality,
+		&comp_size,
+		&pParams->m_stats,
+		quality_level,
+		effort_level);
+
+	if (!pComp_data)
+		return false;
+
+	if (!pParams->m_comp_data.try_resize(comp_size))
+	{
+		basis_free_data(pComp_data);
+		return false;
+	}
+
+	memcpy(pParams->m_comp_data.get_ptr(), pComp_data, comp_size); // keep our own copy so the compressor's buffer can be released
+
+	basis_free_data(pComp_data);
+
+	return true;
+}
diff --git a/encoder/basisu_wasm_api.h b/encoder/basisu_wasm_api.h
new file mode 100644
index 0000000..92266bc
--- /dev/null
+++ b/encoder/basisu_wasm_api.h
@@ -0,0 +1,58 @@
+// File: basisu_wasm_api.h
+#pragma once
+#include "basisu_wasm_api_common.h"
+
+BU_WASM_EXPORT("bu_get_version")
+uint32_t bu_get_version();
+
+BU_WASM_EXPORT("bu_enable_debug_printf")
+void bu_enable_debug_printf(uint32_t flag);
+
+BU_WASM_EXPORT("bu_init")
+void bu_init();
+
+BU_WASM_EXPORT("bu_alloc")
+uint64_t bu_alloc(uint64_t size);
+
+BU_WASM_EXPORT("bu_free")
+void bu_free(uint64_t ofs);
+
+BU_WASM_EXPORT("bu_new_comp_params")
+uint64_t bu_new_comp_params();
+
+BU_WASM_EXPORT("bu_delete_comp_params")
+wasm_bool_t bu_delete_comp_params(uint64_t params_ofs);
+
+BU_WASM_EXPORT("bu_comp_params_get_comp_data_size")
+uint64_t bu_comp_params_get_comp_data_size(uint64_t params_ofs);
+
+BU_WASM_EXPORT("bu_comp_params_get_comp_data_ofs")
+uint64_t bu_comp_params_get_comp_data_ofs(uint64_t params_ofs);
+
+BU_WASM_EXPORT("bu_comp_params_clear")
+wasm_bool_t bu_comp_params_clear(uint64_t params_ofs);
+
+BU_WASM_EXPORT("bu_comp_params_set_image_rgba32")
+wasm_bool_t bu_comp_params_set_image_rgba32(
+	uint64_t params_ofs,
+	uint32_t image_index,
+	uint64_t img_data_ofs,
+	uint32_t width, uint32_t height,
+	uint32_t pitch_in_bytes);
+
+BU_WASM_EXPORT("bu_comp_params_set_image_float_rgba")
+wasm_bool_t bu_comp_params_set_image_float_rgba(
+	uint64_t params_ofs,
+	uint32_t image_index,
+	uint64_t img_data_ofs,
+	uint32_t width, uint32_t height,
+	uint32_t pitch_in_bytes);
+
+BU_WASM_EXPORT("bu_compress_texture")
+wasm_bool_t bu_compress_texture(
+	uint64_t params_ofs,
+	uint32_t desired_basis_tex_format,
+	int quality_level, int effort_level,
+	uint64_t flags_and_quality,
+	float low_level_uastc_rdo_or_dct_quality);
+
diff --git a/encoder/basisu_wasm_api_common.h b/encoder/basisu_wasm_api_common.h
new file mode 100644
index 0000000..d3fe1ae
--- /dev/null
+++ b/encoder/basisu_wasm_api_common.h
@@ -0,0 +1,156 @@
+// File: basisu_wasm_api_common.h
+#pragma once
+#include "stdint.h"
+
+#if defined(__wasm__)
+	#if defined(__cplusplus)
+		#define BU_WASM_EXPORT(name) __attribute__((export_name(name))) extern "C"
+	#else
+		#define BU_WASM_EXPORT(name) __attribute__((export_name(name)))
+	#endif
+#elif defined(__cplusplus)
+	#define BU_WASM_EXPORT(name) extern "C"
+#else
+	#define BU_WASM_EXPORT(name)
+#endif
+
+// wasm_bool_t is an alias for uint32_t
+typedef uint32_t wasm_bool_t;
+
+// Compression constants. All of these are plain object-like macros so the header stays usable from C as well as C++.
+
+#define BU_QUALITY_MIN 0
+#define BU_QUALITY_MAX 100
+
+#define BU_EFFORT_MIN 0
+#define BU_EFFORT_MAX 10
+#define BU_EFFORT_SUPER_FAST 0 /* named presets inside [BU_EFFORT_MIN, BU_EFFORT_MAX]; no '=' so each expands to a bare integer */
+#define BU_EFFORT_FAST 2
+#define BU_EFFORT_NORMAL 5
+#define BU_EFFORT_DEFAULT 2
+#define BU_EFFORT_SLOW 8
+#define BU_EFFORT_VERY_SLOW 10
+
+#define BU_COMP_FLAGS_NONE (0)
+#define BU_COMP_FLAGS_USE_OPENCL (1 << 8 )
+#define BU_COMP_FLAGS_THREADED (1 << 9 )
+#define BU_COMP_FLAGS_DEBUG_OUTPUT (1 << 10)
+#define BU_COMP_FLAGS_KTX2_OUTPUT (1 << 11)
+#define BU_COMP_FLAGS_KTX2_UASTC_ZSTD (1 << 12)
+#define BU_COMP_FLAGS_SRGB (1 << 13)
+#define BU_COMP_FLAGS_GEN_MIPS_CLAMP (1 << 14)
+#define BU_COMP_FLAGS_GEN_MIPS_WRAP (1 << 15)
+#define BU_COMP_FLAGS_Y_FLIP (1 << 16)
+#define BU_COMP_FLAGS_PRINT_STATS (1 << 18)
+#define BU_COMP_FLAGS_PRINT_STATUS (1 << 19)
+#define BU_COMP_FLAGS_DEBUG_IMAGES (1 << 20)
+#define BU_COMP_FLAGS_REC2020 (1 << 21)
+#define BU_COMP_FLAGS_VALIDATE_OUTPUT (1 << 22)
+
+#define BU_COMP_FLAGS_XUASTC_LDR_FULL_ARITH (0)
+#define BU_COMP_FLAGS_XUASTC_LDR_HYBRID (1 << 23)
+#define BU_COMP_FLAGS_XUASTC_LDR_FULL_ZSTD (2 << 23)
+#define BU_COMP_FLAGS_XUASTC_LDR_SYNTAX_SHIFT (23)
+#define BU_COMP_FLAGS_XUASTC_LDR_SYNTAX_MASK (3) /* unshifted: apply after shifting the flags right by ..._SYNTAX_SHIFT */
+
+#define BU_COMP_FLAGS_TEXTURE_TYPE_2D (0 << 25)
+#define BU_COMP_FLAGS_TEXTURE_TYPE_2D_ARRAY (1 << 25)
+#define BU_COMP_FLAGS_TEXTURE_TYPE_CUBEMAP_ARRAY (2 << 25)
+#define BU_COMP_FLAGS_TEXTURE_TYPE_VIDEO_FRAMES (3 << 25)
+#define BU_COMP_FLAGS_TEXTURE_TYPE_SHIFT (25)
+#define BU_COMP_FLAGS_TEXTURE_TYPE_MASK (3) /* unshifted: apply after shifting the flags right by ..._TYPE_SHIFT */
+
+#define BU_COMP_FLAGS_VERBOSE (BU_COMP_FLAGS_DEBUG_OUTPUT | BU_COMP_FLAGS_PRINT_STATS | BU_COMP_FLAGS_PRINT_STATUS)
+
+// basist::basis_tex_format: the supported .ktx2 (and .basis) file format types
+#define BTF_ETC1S 0
+#define BTF_UASTC_LDR_4X4 1
+#define BTF_UASTC_HDR_4X4 2
+#define BTF_ASTC_HDR_6X6 3
+#define BTF_UASTC_HDR_6X6 4
+#define BTF_XUASTC_LDR_4X4 5
+#define BTF_XUASTC_LDR_5X4 6
+#define BTF_XUASTC_LDR_5X5 7
+#define BTF_XUASTC_LDR_6X5 8
+#define BTF_XUASTC_LDR_6X6 9
+#define BTF_XUASTC_LDR_8X5 10
+#define BTF_XUASTC_LDR_8X6 11
+#define BTF_XUASTC_LDR_10X5 12
+#define BTF_XUASTC_LDR_10X6 13
+#define BTF_XUASTC_LDR_8X8 14
+#define BTF_XUASTC_LDR_10X8 15
+#define BTF_XUASTC_LDR_10X10 16
+#define BTF_XUASTC_LDR_12X10 17 +#define BTF_XUASTC_LDR_12X12 18 +#define BTF_ASTC_LDR_4X4 19 +#define BTF_ASTC_LDR_5X4 20 +#define BTF_ASTC_LDR_5X5 21 +#define BTF_ASTC_LDR_6X5 22 +#define BTF_ASTC_LDR_6X6 23 +#define BTF_ASTC_LDR_8X5 24 +#define BTF_ASTC_LDR_8X6 25 +#define BTF_ASTC_LDR_10X5 26 +#define BTF_ASTC_LDR_10X6 27 +#define BTF_ASTC_LDR_8X8 28 +#define BTF_ASTC_LDR_10X8 29 +#define BTF_ASTC_LDR_10X10 30 +#define BTF_ASTC_LDR_12X10 31 +#define BTF_ASTC_LDR_12X12 32 +#define BTF_TOTAL_FORMATS 33 + +// Transcoding constants + +// basist::transcoder_texture_format: the supported transcode GPU texture formats +#define TF_ETC1_RGB 0 +#define TF_ETC2_RGBA 1 +#define TF_BC1_RGB 2 +#define TF_BC3_RGBA 3 +#define TF_BC4_R 4 +#define TF_BC5_RG 5 +#define TF_BC7_RGBA 6 +#define TF_PVRTC1_4_RGB 8 +#define TF_PVRTC1_4_RGBA 9 +#define TF_ASTC_LDR_4X4_RGBA 10 +#define TF_ATC_RGB 11 +#define TF_ATC_RGBA 12 +#define TF_FXT1_RGB 17 +#define TF_PVRTC2_4_RGB 18 +#define TF_PVRTC2_4_RGBA 19 +#define TF_ETC2_EAC_R11 20 +#define TF_ETC2_EAC_RG11 21 +#define TF_BC6H 22 +#define TF_ASTC_HDR_4X4_RGBA 23 +#define TF_RGBA32 13 +#define TF_RGB565 14 +#define TF_BGR565 15 +#define TF_RGBA4444 16 +#define TF_RGB_HALF 24 +#define TF_RGBA_HALF 25 +#define TF_RGB_9E5 26 +#define TF_ASTC_HDR_6X6_RGBA 27 +#define TF_ASTC_LDR_5X4_RGBA 28 +#define TF_ASTC_LDR_5X5_RGBA 29 +#define TF_ASTC_LDR_6X5_RGBA 30 +#define TF_ASTC_LDR_6X6_RGBA 31 +#define TF_ASTC_LDR_8X5_RGBA 32 +#define TF_ASTC_LDR_8X6_RGBA 33 +#define TF_ASTC_LDR_10X5_RGBA 34 +#define TF_ASTC_LDR_10X6_RGBA 35 +#define TF_ASTC_LDR_8X8_RGBA 36 +#define TF_ASTC_LDR_10X8_RGBA 37 +#define TF_ASTC_LDR_10X10_RGBA 38 +#define TF_ASTC_LDR_12X10_RGBA 39 +#define TF_ASTC_LDR_12X12_RGBA 40 +#define TF_TOTAL_TEXTURE_FORMATS 41 + +// basist::basisu_decode_flags: Transcode decode flags (bt_ktx2_transcode_image_level decode_flags parameter, logically OR'd) +#define DECODE_FLAGS_PVRTC_DECODE_TO_NEXT_POW2 2 +#define 
DECODE_FLAGS_TRANSCODE_ALPHA_DATA_TO_OPAQUE_FORMATS 4 +#define DECODE_FLAGS_BC1_FORBID_THREE_COLOR_BLOCKS 8 +#define DECODE_FLAGS_OUTPUT_HAS_ALPHA_INDICES 16 +#define DECODE_FLAGS_HIGH_QUALITY 32 +#define DECODE_FLAGS_NO_ETC1S_CHROMA_FILTERING 64 +#define DECODE_FLAGS_NO_DEBLOCK_FILTERING 128 +#define DECODE_FLAGS_STRONGER_DEBLOCK_FILTERING 256 +#define DECODE_FLAGS_FORCE_DEBLOCK_FILTERING 512 +#define DECODE_FLAGS_XUASTC_LDR_DISABLE_FAST_BC7_TRANSCODING 1024 diff --git a/encoder/basisu_wasm_transcoder_api.cpp b/encoder/basisu_wasm_transcoder_api.cpp new file mode 100644 index 0000000..ab46525 --- /dev/null +++ b/encoder/basisu_wasm_transcoder_api.cpp @@ -0,0 +1,1071 @@ +// basisu_wasm_transcoder_api.cpp - Transcoding API support for WASM WASI modules and Python native support. +// Also useable by plain C callers. +#include +#include +#include +#include "../transcoder/basisu_transcoder.h" +#include "basisu_wasm_transcoder_api.h" + +using namespace basisu; +using namespace basist; + +static inline uint64_t wasm_offset(void* p) +{ + return (uint64_t)(uintptr_t)p; +} + +static inline uint8_t* wasm_ptr(uint64_t offset) +{ + return (uint8_t*)(uintptr_t)offset; +} + +// High-level functions + +BU_WASM_EXPORT("bt_get_version") +uint32_t bt_get_version() +{ + printf("Hello from basisu_wasm_transcoder_api.cpp version %u\n", BASISD_LIB_VERSION); + + return BASISD_LIB_VERSION; +} + +BU_WASM_EXPORT("bt_enable_debug_printf") +void bt_enable_debug_printf(uint32_t flag) +{ + enable_debug_printf(flag != 0); +} + +BU_WASM_EXPORT("bt_init") +void bt_init() +{ + basisu_transcoder_init(); +} + +// Memory alloc/free — stubs +BU_WASM_EXPORT("bt_alloc") +uint64_t bt_alloc(uint64_t size) +{ + void* p = malloc((size_t)size); + return wasm_offset(p); +} + +BU_WASM_EXPORT("bt_free") +void bt_free(uint64_t mem_ofs) +{ + free(wasm_ptr(mem_ofs)); +} + +// basis_tex_format helpers + +BU_WASM_EXPORT("bt_basis_tex_format_is_xuastc_ldr") +wasm_bool_t bt_basis_tex_format_is_xuastc_ldr(uint32_t 
basis_tex_fmt_u32) +{ + assert(basis_tex_fmt_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format tex_fmt = static_cast(basis_tex_fmt_u32); + + return basis_tex_format_is_xuastc_ldr(tex_fmt); +} + +BU_WASM_EXPORT("bt_basis_tex_format_is_astc_ldr") +wasm_bool_t bt_basis_tex_format_is_astc_ldr(uint32_t basis_tex_fmt_u32) +{ + assert(basis_tex_fmt_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format tex_fmt = static_cast(basis_tex_fmt_u32); + + return basis_tex_format_is_astc_ldr(tex_fmt); +} + +BU_WASM_EXPORT("bt_basis_tex_format_get_block_width") +uint32_t bt_basis_tex_format_get_block_width(uint32_t basis_tex_fmt_u32) +{ + assert(basis_tex_fmt_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format tex_fmt = static_cast(basis_tex_fmt_u32); + + return basis_tex_format_get_block_width(tex_fmt); +} + +BU_WASM_EXPORT("bt_basis_tex_format_get_block_height") +uint32_t bt_basis_tex_format_get_block_height(uint32_t basis_tex_fmt_u32) +{ + assert(basis_tex_fmt_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format tex_fmt = static_cast(basis_tex_fmt_u32); + + return basis_tex_format_get_block_height(tex_fmt); +} + +BU_WASM_EXPORT("bt_basis_tex_format_is_hdr") +wasm_bool_t bt_basis_tex_format_is_hdr(uint32_t basis_tex_fmt_u32) +{ + assert(basis_tex_fmt_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format tex_fmt = static_cast(basis_tex_fmt_u32); + + return basis_tex_format_is_hdr(tex_fmt); +} + +BU_WASM_EXPORT("bt_basis_tex_format_is_ldr") +wasm_bool_t bt_basis_tex_format_is_ldr(uint32_t basis_tex_fmt_u32) +{ + assert(basis_tex_fmt_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format tex_fmt = static_cast(basis_tex_fmt_u32); + + return basis_tex_format_is_ldr(tex_fmt); +} + +// transcoder_texture_format helpers + +BU_WASM_EXPORT("bt_basis_get_bytes_per_block_or_pixel") +uint32_t bt_basis_get_bytes_per_block_or_pixel(uint32_t transcoder_texture_format_u32) +{ + 
assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_get_bytes_per_block_or_pixel(fmt); +} + +BU_WASM_EXPORT("bt_basis_transcoder_format_has_alpha") +wasm_bool_t bt_basis_transcoder_format_has_alpha(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_transcoder_format_has_alpha(fmt); +} + +BU_WASM_EXPORT("bt_basis_transcoder_format_is_hdr") +wasm_bool_t bt_basis_transcoder_format_is_hdr(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_transcoder_format_is_hdr(fmt); +} + +BU_WASM_EXPORT("bt_basis_transcoder_format_is_ldr") +wasm_bool_t bt_basis_transcoder_format_is_ldr(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_transcoder_format_is_ldr(fmt); +} + +BU_WASM_EXPORT("bt_basis_transcoder_texture_format_is_astc") +wasm_bool_t bt_basis_transcoder_texture_format_is_astc(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_is_transcoder_texture_format_astc(fmt); +} + +BU_WASM_EXPORT("bt_basis_transcoder_format_is_uncompressed") +wasm_bool_t bt_basis_transcoder_format_is_uncompressed(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < 
(uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_transcoder_format_is_uncompressed(fmt); +} + +BU_WASM_EXPORT("bt_basis_get_uncompressed_bytes_per_pixel") +uint32_t bt_basis_get_uncompressed_bytes_per_pixel(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_get_uncompressed_bytes_per_pixel(fmt); +} + +BU_WASM_EXPORT("bt_basis_get_block_width") +uint32_t bt_basis_get_block_width(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_get_block_width(fmt); +} + +BU_WASM_EXPORT("bt_basis_get_block_height") +uint32_t bt_basis_get_block_height(uint32_t transcoder_texture_format_u32) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format fmt = static_cast(transcoder_texture_format_u32); + + return basis_get_block_height(fmt); +} + +BU_WASM_EXPORT("bt_basis_get_transcoder_texture_format_from_basis_tex_format") +uint32_t bt_basis_get_transcoder_texture_format_from_basis_tex_format(uint32_t basis_tex_format_u32) +{ + assert(basis_tex_format_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + basis_tex_format fmt = static_cast(basis_tex_format_u32); + + return (uint32_t)basis_get_transcoder_texture_format_from_xuastc_or_astc_ldr_basis_tex_format(fmt); +} + +BU_WASM_EXPORT("bt_basis_is_format_supported") +wasm_bool_t bt_basis_is_format_supported(uint32_t transcoder_texture_format_u32, uint32_t basis_tex_format_u32) +{ + assert(transcoder_texture_format_u32 < 
(uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + assert(basis_tex_format_u32 < (uint32_t)basis_tex_format::cTotalFormats); + + transcoder_texture_format transcoder_tex_fmt = static_cast(transcoder_texture_format_u32); + basis_tex_format basis_tex_fmt = static_cast(basis_tex_format_u32); + + return basis_is_format_supported(transcoder_tex_fmt, basis_tex_fmt); +} + +BU_WASM_EXPORT("bt_basis_compute_transcoded_image_size_in_bytes") +uint32_t bt_basis_compute_transcoded_image_size_in_bytes(uint32_t transcoder_texture_format_u32, uint32_t orig_width, uint32_t orig_height) +{ + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + + transcoder_texture_format transcoder_tex_fmt = static_cast(transcoder_texture_format_u32); + + return basis_compute_transcoded_image_size_in_bytes(transcoder_tex_fmt, orig_width, orig_height); +} + +// KTX2 inspection and transcoding helpers + +const uint32_t KTX2_HANDLE_MAGIC = 0xAB21EF20; + +struct ktx2_handle_t +{ + uint32_t m_magic = KTX2_HANDLE_MAGIC; + ktx2_transcoder m_transcoder; +}; + +BU_WASM_EXPORT("bt_ktx2_open") +uint64_t bt_ktx2_open(uint64_t data_mem_ofs, uint32_t data_len) +{ + if (!data_mem_ofs || (data_len < 4)) + return 0; + + ktx2_handle_t* pHandle = new ktx2_handle_t(); + + if (!pHandle->m_transcoder.init(wasm_ptr(data_mem_ofs), data_len)) + { + delete pHandle; + return 0; + } + + return wasm_offset(pHandle); +} + +BU_WASM_EXPORT("bt_ktx2_close") +void bt_ktx2_close(uint64_t handle) +{ + if (!handle) + return; + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return; + + delete pHandle; +} + +BU_WASM_EXPORT("bt_ktx2_get_width") +uint32_t bt_ktx2_get_width(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if 
(pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_width(); +} + +BU_WASM_EXPORT("bt_ktx2_get_height") +uint32_t bt_ktx2_get_height(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_height(); +} + +BU_WASM_EXPORT("bt_ktx2_get_levels") +uint32_t bt_ktx2_get_levels(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_levels(); +} + +BU_WASM_EXPORT("bt_ktx2_get_faces") +uint32_t bt_ktx2_get_faces(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_faces(); +} + +BU_WASM_EXPORT("bt_ktx2_get_layers") +uint32_t bt_ktx2_get_layers(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_layers(); +} + +BU_WASM_EXPORT("bt_ktx2_get_basis_tex_format") +uint32_t bt_ktx2_get_basis_tex_format(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return (uint32_t)pHandle->m_transcoder.get_basis_tex_format(); +} + +BU_WASM_EXPORT("bt_ktx2_is_etc1s") +wasm_bool_t 
bt_ktx2_is_etc1s(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_etc1s(); +} + +BU_WASM_EXPORT("bt_ktx2_is_uastc_ldr_4x4") +wasm_bool_t bt_ktx2_is_uastc_ldr_4x4(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_uastc(); +} + +BU_WASM_EXPORT("bt_ktx2_is_hdr") +wasm_bool_t bt_ktx2_is_hdr(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_hdr(); +} + +BU_WASM_EXPORT("bt_ktx2_is_hdr_4x4") +wasm_bool_t bt_ktx2_is_hdr_4x4(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_hdr_4x4(); +} + +BU_WASM_EXPORT("bt_ktx2_is_hdr_6x6") +wasm_bool_t bt_ktx2_is_hdr_6x6(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_hdr_6x6(); +} + +BU_WASM_EXPORT("bt_ktx2_is_ldr") +wasm_bool_t bt_ktx2_is_ldr(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + 
+ assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_ldr(); +} + +BU_WASM_EXPORT("bt_ktx2_is_astc_ldr") +wasm_bool_t bt_ktx2_is_astc_ldr(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_astc_ldr(); +} + +BU_WASM_EXPORT("bt_ktx2_is_xuastc_ldr") +wasm_bool_t bt_ktx2_is_xuastc_ldr(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_xuastc_ldr(); +} + +BU_WASM_EXPORT("bt_ktx2_get_block_width") +uint32_t bt_ktx2_get_block_width(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_block_width(); +} + +BU_WASM_EXPORT("bt_ktx2_get_block_height") +uint32_t bt_ktx2_get_block_height(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_block_height(); +} + +BU_WASM_EXPORT("bt_ktx2_has_alpha") +wasm_bool_t bt_ktx2_has_alpha(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return 
pHandle->m_transcoder.get_has_alpha(); +} + +BU_WASM_EXPORT("bt_ktx2_get_dfd_color_model") +uint32_t bt_ktx2_get_dfd_color_model(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_color_model(); +} + +BU_WASM_EXPORT("bt_ktx2_get_dfd_color_primaries") +uint32_t bt_ktx2_get_dfd_color_primaries(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_color_primaries(); +} + +BU_WASM_EXPORT("bt_ktx2_get_dfd_transfer_func") +uint32_t bt_ktx2_get_dfd_transfer_func(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_transfer_func(); +} + +BU_WASM_EXPORT("bt_ktx2_is_srgb") +wasm_bool_t bt_ktx2_is_srgb(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_srgb(); +} + +BU_WASM_EXPORT("bt_ktx2_get_dfd_flags") +uint32_t bt_ktx2_get_dfd_flags(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_flags(); +} + 
+BU_WASM_EXPORT("bt_ktx2_get_dfd_total_samples") +uint32_t bt_ktx2_get_dfd_total_samples(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_total_samples(); +} + +BU_WASM_EXPORT("bt_ktx2_get_dfd_channel_id0") +uint32_t bt_ktx2_get_dfd_channel_id0(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_channel_id0(); +} + +BU_WASM_EXPORT("bt_ktx2_get_dfd_channel_id1") +uint32_t bt_ktx2_get_dfd_channel_id1(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + return pHandle->m_transcoder.get_dfd_channel_id1(); +} + +BU_WASM_EXPORT("bt_ktx2_is_video") +wasm_bool_t bt_ktx2_is_video(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.is_video(); +} + +BU_WASM_EXPORT("bt_ktx2_get_ldr_hdr_upconversion_nit_multiplier") +float bt_ktx2_get_ldr_hdr_upconversion_nit_multiplier(uint64_t handle) +{ + if (!handle) + { + assert(0); + return 0.0f; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0.0f; + + return pHandle->m_transcoder.get_ldr_hdr_upconversion_nit_multiplier(); +} + 
+BU_WASM_EXPORT("bt_ktx2_get_level_orig_width") +uint32_t bt_ktx2_get_level_orig_width(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. + ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_orig_width; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_orig_height") +uint32_t bt_ktx2_get_level_orig_height(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. + ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_orig_height; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_actual_width") +uint32_t bt_ktx2_get_level_actual_width(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. 
+ ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_width; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_actual_height") +uint32_t bt_ktx2_get_level_actual_height(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. + ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_height; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_num_blocks_x") +uint32_t bt_ktx2_get_level_num_blocks_x(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. + ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_num_blocks_x; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_num_blocks_y") +uint32_t bt_ktx2_get_level_num_blocks_y(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. 
+ ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_num_blocks_y; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_total_blocks") +uint32_t bt_ktx2_get_level_total_blocks(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return 0; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return 0; + + // FIXME slow - most info is thrown away. + ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return 0; + + return level_info.m_total_blocks; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_alpha_flag") +wasm_bool_t bt_ktx2_get_level_alpha_flag(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + // FIXME slow - most info is thrown away. + ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return false; + + return level_info.m_alpha_flag; +} + +BU_WASM_EXPORT("bt_ktx2_get_level_iframe_flag") +wasm_bool_t bt_ktx2_get_level_iframe_flag(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + // FIXME slow - most info is thrown away. 
+ ktx2_image_level_info level_info; + if (!pHandle->m_transcoder.get_image_level_info(level_info, level_index, layer_index, face_index)) + return false; + + return level_info.m_iframe_flag; +} + +BU_WASM_EXPORT("bt_ktx2_start_transcoding") +wasm_bool_t bt_ktx2_start_transcoding(uint64_t handle) +{ + if (!handle) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != KTX2_HANDLE_MAGIC) + return false; + + return pHandle->m_transcoder.start_transcoding(); +} + +const uint32_t KTX2_TRANSCODE_STATE_MAGIC = 0x2B21CF21; + +struct ktx2_transcode_state_t +{ + uint32_t m_magic = KTX2_TRANSCODE_STATE_MAGIC; + + ktx2_transcoder_state m_state; +}; + +BU_WASM_EXPORT("bt_ktx2_create_transcode_state") +uint64_t bt_ktx2_create_transcode_state() +{ + return wasm_offset(new ktx2_transcode_state_t()); +} + +BU_WASM_EXPORT("bt_ktx2_destroy_transcode_state") +void bt_ktx2_destroy_transcode_state(uint64_t handle) +{ + if (!handle) + return; + + ktx2_transcode_state_t* pState = reinterpret_cast(wasm_ptr(handle)); + + assert(pState->m_magic == KTX2_TRANSCODE_STATE_MAGIC); + if (pState->m_magic != KTX2_TRANSCODE_STATE_MAGIC) + return; + + delete pState; +} + +BU_WASM_EXPORT("bt_ktx2_transcode_image_level") +wasm_bool_t bt_ktx2_transcode_image_level( + uint64_t ktx2_handle, + uint32_t level_index, uint32_t layer_index, uint32_t face_index, + uint64_t output_block_mem_ofs, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + uint32_t transcoder_texture_format_u32, + uint32_t decode_flags, + uint32_t output_row_pitch_in_blocks_or_pixels, + uint32_t output_rows_in_pixels, + int channel0, int channel1, + uint64_t state_handle) +{ + if ((!ktx2_handle) || (!output_block_mem_ofs)) + { + assert(0); + return false; + } + + ktx2_handle_t* pHandle = reinterpret_cast(wasm_ptr(ktx2_handle)); + + assert(pHandle->m_magic == KTX2_HANDLE_MAGIC); + if (pHandle->m_magic != 
KTX2_HANDLE_MAGIC) + return false; + + assert(transcoder_texture_format_u32 < (uint32_t)transcoder_texture_format::cTFTotalTextureFormats); + transcoder_texture_format tex_fmt = static_cast(transcoder_texture_format_u32); + + ktx2_transcode_state_t* pTranscode_state = nullptr; + + if (state_handle) + { + pTranscode_state = reinterpret_cast(wasm_ptr(state_handle)); + + assert(pTranscode_state->m_magic == KTX2_TRANSCODE_STATE_MAGIC); + if (pTranscode_state->m_magic != KTX2_TRANSCODE_STATE_MAGIC) + return false; + } + + return pHandle->m_transcoder.transcode_image_level( + level_index, layer_index, face_index, + wasm_ptr(output_block_mem_ofs), output_blocks_buf_size_in_blocks_or_pixels, + tex_fmt, + decode_flags, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, channel0, channel1, + pTranscode_state ? &pTranscode_state->m_state : nullptr); +} diff --git a/encoder/basisu_wasm_transcoder_api.h b/encoder/basisu_wasm_transcoder_api.h new file mode 100644 index 0000000..a7389ac --- /dev/null +++ b/encoder/basisu_wasm_transcoder_api.h @@ -0,0 +1,216 @@ +// File: basisu_wasm_transcoder_api.h - Transcoding API support for WASM WASI modules and Python native support. 
+#pragma once +#include "basisu_wasm_api_common.h" + +// High-level functions + +BU_WASM_EXPORT("bt_get_version") +uint32_t bt_get_version(); + +BU_WASM_EXPORT("bt_enable_debug_printf") +void bt_enable_debug_printf(uint32_t flag); + +BU_WASM_EXPORT("bt_init") +void bt_init(); + +BU_WASM_EXPORT("bt_alloc") +uint64_t bt_alloc(uint64_t size); + +BU_WASM_EXPORT("bt_free") +void bt_free(uint64_t ofs); + +// basis_tex_format helpers + +BU_WASM_EXPORT("bt_basis_tex_format_is_xuastc_ldr") +wasm_bool_t bt_basis_tex_format_is_xuastc_ldr(uint32_t basis_tex_fmt_u32); + +BU_WASM_EXPORT("bt_basis_tex_format_is_astc_ldr") +wasm_bool_t bt_basis_tex_format_is_astc_ldr(uint32_t basis_tex_fmt_u32); + +BU_WASM_EXPORT("bt_basis_tex_format_get_block_width") +uint32_t bt_basis_tex_format_get_block_width(uint32_t basis_tex_fmt_u32); + +BU_WASM_EXPORT("bt_basis_tex_format_get_block_height") +uint32_t bt_basis_tex_format_get_block_height(uint32_t basis_tex_fmt_u32); + +BU_WASM_EXPORT("bt_basis_tex_format_is_hdr") +wasm_bool_t bt_basis_tex_format_is_hdr(uint32_t basis_tex_format_u32); + +BU_WASM_EXPORT("bt_basis_tex_format_is_ldr") +wasm_bool_t bt_basis_tex_format_is_ldr(uint32_t basis_tex_format_u32); + +// transcoder_texture_format helpers + +BU_WASM_EXPORT("bt_basis_get_bytes_per_block_or_pixel") +uint32_t bt_basis_get_bytes_per_block_or_pixel(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_transcoder_format_has_alpha") +wasm_bool_t bt_basis_transcoder_format_has_alpha(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_transcoder_format_is_hdr") +wasm_bool_t bt_basis_transcoder_format_is_hdr(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_transcoder_format_is_ldr") +wasm_bool_t bt_basis_transcoder_format_is_ldr(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_transcoder_texture_format_is_astc") +wasm_bool_t bt_basis_transcoder_texture_format_is_astc(uint32_t transcoder_texture_format_u32); + 
+BU_WASM_EXPORT("bt_basis_transcoder_format_is_uncompressed") +wasm_bool_t bt_basis_transcoder_format_is_uncompressed(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_get_uncompressed_bytes_per_pixel") +uint32_t bt_basis_get_uncompressed_bytes_per_pixel(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_get_block_width") +uint32_t bt_basis_get_block_width(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_get_block_height") +uint32_t bt_basis_get_block_height(uint32_t transcoder_texture_format_u32); + +BU_WASM_EXPORT("bt_basis_get_transcoder_texture_format_from_basis_tex_format") +uint32_t bt_basis_get_transcoder_texture_format_from_basis_tex_format(uint32_t basis_tex_format_u32); + +BU_WASM_EXPORT("bt_basis_is_format_supported") +wasm_bool_t bt_basis_is_format_supported(uint32_t transcoder_texture_format_u32, uint32_t basis_tex_format_u32); + +BU_WASM_EXPORT("bt_basis_compute_transcoded_image_size_in_bytes") +uint32_t bt_basis_compute_transcoded_image_size_in_bytes(uint32_t transcoder_texture_format_u32, uint32_t orig_width, uint32_t orig_height); + +// Transcoding +BU_WASM_EXPORT("bt_ktx2_open") +uint64_t bt_ktx2_open(uint64_t data_mem_ofs, uint32_t data_len); + +BU_WASM_EXPORT("bt_ktx2_close") +void bt_ktx2_close(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_width") +uint32_t bt_ktx2_get_width(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_height") +uint32_t bt_ktx2_get_height(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_levels") +uint32_t bt_ktx2_get_levels(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_faces") +uint32_t bt_ktx2_get_faces(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_layers") +uint32_t bt_ktx2_get_layers(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_basis_tex_format") +uint32_t bt_ktx2_get_basis_tex_format(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_etc1s") +wasm_bool_t bt_ktx2_is_etc1s(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_uastc_ldr_4x4") +wasm_bool_t 
bt_ktx2_is_uastc_ldr_4x4(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_hdr") +wasm_bool_t bt_ktx2_is_hdr(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_hdr_4x4") +wasm_bool_t bt_ktx2_is_hdr_4x4(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_hdr_6x6") +wasm_bool_t bt_ktx2_is_hdr_6x6(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_ldr") +wasm_bool_t bt_ktx2_is_ldr(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_astc_ldr") +wasm_bool_t bt_ktx2_is_astc_ldr(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_xuastc_ldr") +wasm_bool_t bt_ktx2_is_xuastc_ldr(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_block_width") +uint32_t bt_ktx2_get_block_width(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_block_height") +uint32_t bt_ktx2_get_block_height(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_has_alpha") +wasm_bool_t bt_ktx2_has_alpha(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_color_model") +uint32_t bt_ktx2_get_dfd_color_model(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_color_primaries") +uint32_t bt_ktx2_get_dfd_color_primaries(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_transfer_func") +uint32_t bt_ktx2_get_dfd_transfer_func(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_srgb") +wasm_bool_t bt_ktx2_is_srgb(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_flags") +uint32_t bt_ktx2_get_dfd_flags(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_total_samples") +uint32_t bt_ktx2_get_dfd_total_samples(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_channel_id0") +uint32_t bt_ktx2_get_dfd_channel_id0(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_dfd_channel_id1") +uint32_t bt_ktx2_get_dfd_channel_id1(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_is_video") +wasm_bool_t bt_ktx2_is_video(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_ldr_hdr_upconversion_nit_multiplier") +float bt_ktx2_get_ldr_hdr_upconversion_nit_multiplier(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_get_level_orig_width") +uint32_t 
bt_ktx2_get_level_orig_width(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_orig_height") +uint32_t bt_ktx2_get_level_orig_height(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_actual_width") +uint32_t bt_ktx2_get_level_actual_width(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_actual_height") +uint32_t bt_ktx2_get_level_actual_height(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_num_blocks_x") +uint32_t bt_ktx2_get_level_num_blocks_x(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_num_blocks_y") +uint32_t bt_ktx2_get_level_num_blocks_y(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_total_blocks") +uint32_t bt_ktx2_get_level_total_blocks(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_alpha_flag") +wasm_bool_t bt_ktx2_get_level_alpha_flag(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_get_level_iframe_flag") +wasm_bool_t bt_ktx2_get_level_iframe_flag(uint64_t handle, uint32_t level_index, uint32_t layer_index, uint32_t face_index); + +BU_WASM_EXPORT("bt_ktx2_start_transcoding") +wasm_bool_t bt_ktx2_start_transcoding(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_create_transcode_state") +uint64_t bt_ktx2_create_transcode_state(); + +BU_WASM_EXPORT("bt_ktx2_destroy_transcode_state") +void bt_ktx2_destroy_transcode_state(uint64_t handle); + +BU_WASM_EXPORT("bt_ktx2_transcode_image_level") +wasm_bool_t bt_ktx2_transcode_image_level( + uint64_t ktx2_handle, // handle to KTX2 file, see bt_ktx2_open() + 
uint32_t level_index, uint32_t layer_index, uint32_t face_index, // KTX2 level/layer/face to transcode + uint64_t output_block_mem_ofs, // allocate using bt_alloc() + uint32_t output_blocks_buf_size_in_blocks_or_pixels, + uint32_t transcoder_texture_format_u32, // target format, TF_ETC1_RGB etc. + uint32_t decode_flags, // DECODE_FLAGS_ + uint32_t output_row_pitch_in_blocks_or_pixels, // can be 0 + uint32_t output_rows_in_pixels, // can be 0 + int channel0, int channel1, // both default to -1 + uint64_t state_handle); // thread local state: can be 0, or bt_ktx2_create_transcode_state() + diff --git a/example_capi/example_capi.c b/example_capi/example_capi.c new file mode 100644 index 0000000..f2c2439 --- /dev/null +++ b/example_capi/example_capi.c @@ -0,0 +1,707 @@ +// example_capi.c - Plain C API examples +// Compresses a procedurally generated 32bpp 512x512 test image to a XUASTC LDR 8x5 .ktx2 file with mipmaps and writes a .ktx2 file. +// The .ktx2 file is then opened by the transcoder module, examined and unpacked to RGBA 32bpp and ASTC textures which are saved to disk as .tga and .astc files. +// The .tga image files can be viewed by many common image editors/viewers. 
// Writes 24/32bpp uncompressed true-color .TGA image files.
//   pFilename   - output path, opened with fopen(..., "wb")
//   w, h        - image dimensions in pixels; each must be in [1, 65535] because the TGA
//                 header stores them as 16-bit values
//   has_alpha   - nonzero: source is 4 bytes/pixel (RGBA) and a 32bpp file is written;
//                 zero: source is 3 bytes/pixel (RGB) and a 24bpp file is written
//   pPixelsRGBA - tightly packed source rows; the last source row is written first because
//                 the file is stored bottom-to-top
// Returns 0 on success, otherwise: -1 invalid argument, -2 cannot open file, -3 header write
// failed, -4 pixel write failed, -5 close failed, -6 image byte size does not fit in size_t,
// -7 out of memory.
int write_tga_image(const char* pFilename, int w, int h, int has_alpha, const uint8_t* pPixelsRGBA)
{
    assert(pFilename != NULL);
    assert(pPixelsRGBA != NULL);
    assert(w > 0);
    assert(h > 0);
    assert((has_alpha == 0) || (has_alpha == 1));

    /* Runtime argument validation */
    if ((!pFilename) || (!pPixelsRGBA) || (w <= 0) || (h <= 0))
        return -1; // invalid argument

    /* The header has only 16 bits per dimension; larger values would be silently truncated */
    if ((w > 0xFFFF) || (h > 0xFFFF))
        return -1; // invalid argument

    const uint64_t bytes_per_pixel = has_alpha ? 4ULL : 3ULL;
    const uint64_t pixel_bytes_u64 = (uint64_t)w * (uint64_t)h * bytes_per_pixel;

    /* Checked before the file is opened, so this early return has nothing to release */
    if ((uint64_t)(size_t)pixel_bytes_u64 != pixel_bytes_u64)
        return -6; // overflow bogus dimensions

    FILE* pFile = fopen(pFilename, "wb");
    if (!pFile)
        return -2; // cannot open file

    uint8_t header[18] = { 0 };
    header[2] = 2; // uncompressed true-color
    header[12] = (uint8_t)(w & 0xFF);
    header[13] = (uint8_t)((w >> 8) & 0xFF);
    header[14] = (uint8_t)(h & 0xFF);
    header[15] = (uint8_t)((h >> 8) & 0xFF);
    header[16] = has_alpha ? 32 : 24;

    /* Classic TGA: bottom-left origin */
    header[17] = has_alpha ? 8 : 0;

    if (fwrite(header, 1, 18, pFile) != 18)
    {
        fclose(pFile);
        return -3; // header write failed
    }

    /* allocate one scanline for BGRA/BGR output */
    const size_t row_bytes = (size_t)w * (size_t)bytes_per_pixel;
    uint8_t* pRow = (uint8_t*)malloc(row_bytes);
    if (!pRow)
    {
        fclose(pFile);
        return -7; // out of memory
    }

    /* TGA expects rows in bottom-to-top order */
    for (int y = 0; y < h; y++)
    {
        const uint8_t* pSrcRow = pPixelsRGBA + (size_t)(h - 1 - y) * row_bytes;

        /* Convert RGBA->BGRA or RGB->BGR for this row */
        if (has_alpha)
        {
            /* 4 bytes per pixel */
            for (int x = 0; x < w; x++)
            {
                const uint8_t* s = &pSrcRow[x * 4];
                uint8_t* d = &pRow[x * 4];

                d[0] = s[2]; // B
                d[1] = s[1]; // G
                d[2] = s[0]; // R
                d[3] = s[3]; // A
            }
        }
        else
        {
            /* 3 bytes per pixel */
            for (int x = 0; x < w; x++)
            {
                const uint8_t* s = &pSrcRow[x * 3];
                uint8_t* d = &pRow[x * 3];

                d[0] = s[2]; // B
                d[1] = s[1]; // G
                d[2] = s[0]; // R
            }
        }

        if (fwrite(pRow, 1, row_bytes, pFile) != row_bytes)
        {
            free(pRow);
            fclose(pFile);
            return -4; // pixel write failed
        }
    }

    free(pRow);

    if (fclose(pFile) != 0)
        return -5; // close failed

    return 0; // success
}
bytes with error check */ +#define PUTB(v) do { if (fputc((int)(v), f) == EOF) { fclose(f); return 0; } } while (0) + + /* Magic */ + PUTB(0x13); + PUTB(0xAB); + PUTB(0xA1); + PUTB(0x5C); + + /* Block dimensions: x, y, z = 1 */ + PUTB((uint8_t)block_width); + PUTB((uint8_t)block_height); + PUTB(1); /* block depth */ + + /* dim_x (24-bit little endian) */ + PUTB((uint8_t)(dim_x & 0xFF)); + PUTB((uint8_t)((dim_x >> 8) & 0xFF)); + PUTB((uint8_t)((dim_x >> 16) & 0xFF)); + + /* dim_y (24-bit little endian) */ + PUTB((uint8_t)(dim_y & 0xFF)); + PUTB((uint8_t)((dim_y >> 8) & 0xFF)); + PUTB((uint8_t)((dim_y >> 16) & 0xFF)); + + /* dim_z = 1 (24-bit LE) */ + PUTB(1); + PUTB(0); + PUTB(0); + + /* Compute block count and total bytes */ + uint32_t num_blocks_x = (dim_x + block_width - 1) / block_width; + uint32_t num_blocks_y = (dim_y + block_height - 1) / block_height; + + uint64_t total_bytes_u64 = + (uint64_t)num_blocks_x * (uint64_t)num_blocks_y * 16ULL; + + size_t total_bytes = (size_t)total_bytes_u64; + + if ((uint64_t)total_bytes != total_bytes_u64) + { + fclose(f); + return 0; /* overflow → fail */ + } + + /* Write block data directly */ + size_t written = fwrite(pBlocks, 1, total_bytes, f); + if (written != total_bytes) + { + fclose(f); /* still close even if error */ + return 0; + } + + if (fclose(f) != 0) + return 0; + + return 1; /* success */ + +#undef PUTB +} + +// Procedurally create a simple test image in memory +uint8_t* create_pretty_rgba_pattern(int w, int h, float q) +{ + if (w <= 0 || h <= 0) + return NULL; + + uint8_t* pImage = (uint8_t*)malloc((size_t)w * h * 4); + if (!pImage) + return NULL; + + for (int y = 0; y < h; y++) + { + for (int x = 0; x < w; x++) + { + /* normalized coordinates 0..1 */ + float fx = (float)x / (float)w; + float fy = (float)y / (float)h; + + /* --- Extra coordinate warping when q != 0 --- */ + if (q != 0.0f) { + float warp = sinf((fx + fy) * 10.0f * q); + fx += 0.15f * q * warp; + fy += 0.15f * q * sinf((fx - fy) * 8.0f * q); + 
} + + /* Original plasma formula */ + float v = sinf(fx * 12.0f + fy * 4.0f); + v += sinf(fy * 9.0f - fx * 6.0f); + v += sinf((fx + fy) * 7.0f); + + /* Extra variation term — contributes only when q != 0 */ + if (q != 0.0f) + { + v += q * 0.7f * sinf((fx * fx + fy) * 20.0f); + v += q * 0.4f * cosf((fx - fy) * 18.0f); + } + + /* scale to 0..1 */ + v = v * 0.25f + 0.5f; + + float L = 1.5f; + + /* Convert to RGB colors */ + int r = (int)roundf(255.0f * sinf(v * 6.28f) * L); + int g = (int)roundf(255.0f * (1.0f - v) * L); + int b = (int)roundf(255.0f * v * L); + + /* clamp */ + if (r < 0) r = 0; else if (r > 255) r = 255; + if (g < 0) g = 0; else if (g > 255) g = 255; + if (b < 0) b = 0; else if (b > 255) b = 255; + + /* write RGBA */ + uint8_t* p = &pImage[(y * w + x) * 4]; + p[0] = (uint8_t)r; + p[1] = (uint8_t)g; + p[2] = (uint8_t)b; + p[3] = 255; + } + } + + return pImage; +} + +// Takes a KTX2 file in memory and displays info about it, then transcodes it to RGBA32 and ASTC, writing .tga/.astc files to disk +int transcode_ktx2_file(const void* pKTX2_data, size_t ktx2_data_size, const char *pDesc) +{ + printf("------ transcode_ktx2_file(): ktx2 size: %zu, desc: %s\n", ktx2_data_size, pDesc); + + if (!pKTX2_data || !ktx2_data_size) + return FALSE; + + if ((uint32_t)ktx2_data_size != ktx2_data_size) + return FALSE; + + uint64_t ktx2_data_ofs = bt_alloc(ktx2_data_size); + if (!ktx2_data_ofs) + return FALSE; + + memcpy((void*)ktx2_data_ofs, pKTX2_data, ktx2_data_size); + + uint64_t ktx2_handle = bt_ktx2_open(ktx2_data_ofs, (uint32_t)ktx2_data_size); + if (!ktx2_handle) + { + bt_free(ktx2_data_ofs); + return FALSE; + } + + // Just testing LDR here for now + if (!bt_ktx2_is_ldr(ktx2_handle)) + { + bt_ktx2_close(ktx2_handle); + bt_free(ktx2_data_ofs); + return FALSE; + } + + if (!bt_ktx2_start_transcoding(ktx2_handle)) + { + bt_ktx2_close(ktx2_handle); + bt_free(ktx2_data_ofs); + return FALSE; + } + + uint32_t width = bt_ktx2_get_width(ktx2_handle), height = 
bt_ktx2_get_height(ktx2_handle); + uint32_t levels = bt_ktx2_get_levels(ktx2_handle); // number of mipmap levels, must be >= 1 + uint32_t faces = bt_ktx2_get_faces(ktx2_handle); // 1 or 6 + uint32_t layers = bt_ktx2_get_layers(ktx2_handle); // 0 or array size + + uint32_t basis_tex_format = bt_ktx2_get_basis_tex_format(ktx2_handle); + uint32_t block_width = bt_ktx2_get_block_width(ktx2_handle); + uint32_t block_height = bt_ktx2_get_block_height(ktx2_handle); + uint32_t is_srgb = bt_ktx2_is_srgb(ktx2_handle); + uint32_t is_video = bt_ktx2_is_video(ktx2_handle); // only reliably set after calling bt_ktx2_start_transcoding() + + printf("KTX2 Dimensions: %ux%u, Levels: %u, Faces: %u, Layers: %u\n", width, height, levels, faces, layers); + printf("basis_tex_format: %u\n", basis_tex_format); + printf("Block dimensions: %ux%u\n", block_width, block_height); + printf("is sRGB: %u\n", is_srgb); + printf("is video: %u\n", is_video); + + assert((width >= 1) && (height >= 1)); + assert(levels >= 1); + assert((faces == 6) || (faces == 1)); + + // If layers==0 it's not a texture array + if (layers < 1) + layers = 1; + + // Create our transcoding state handle (which contains thread-local state) + // This is actually optional, and only needed for thread-safe transcoding, but we'll test it here. 
+ uint64_t transcode_state_handle = bt_ktx2_create_transcode_state(); + + for (uint32_t level_index = 0; level_index < levels; level_index++) + { + for (uint32_t layer_index = 0; layer_index < layers; layer_index++) + { + for (uint32_t face_index = 0; face_index < faces; face_index++) + { + printf("- Level: %u, layer: %u, face: %u\n", level_index, layer_index, face_index); + + uint32_t orig_width = bt_ktx2_get_level_orig_width(ktx2_handle, level_index, layer_index, face_index); + uint32_t orig_height = bt_ktx2_get_level_orig_height(ktx2_handle, level_index, layer_index, face_index); + + printf(" Orig dimensions: %ux%u, actual: %ux%u\n", + orig_width, orig_height, + bt_ktx2_get_level_actual_width(ktx2_handle, level_index, layer_index, face_index), bt_ktx2_get_level_actual_height(ktx2_handle, level_index, layer_index, face_index)); + + printf(" Block dimensions: %ux%u, total blocks: %u\n", + bt_ktx2_get_level_num_blocks_x(ktx2_handle, level_index, layer_index, face_index), + bt_ktx2_get_level_num_blocks_y(ktx2_handle, level_index, layer_index, face_index), + bt_ktx2_get_level_total_blocks(ktx2_handle, level_index, layer_index, face_index)); + + printf(" Alpha flag: %u, iframe flag: %u\n", + bt_ktx2_get_level_alpha_flag(ktx2_handle, level_index, layer_index, face_index), + bt_ktx2_get_level_iframe_flag(ktx2_handle, level_index, layer_index, face_index)); + + // First transcode level to uncompressed RGBA32 and write a .tga file + { + char tga_filename[256]; + snprintf(tga_filename, sizeof(tga_filename), "transcoded_%s_L%u_Y%u_F%u.tga", pDesc, level_index, layer_index, face_index); + + uint32_t transcode_buf_size = bt_basis_compute_transcoded_image_size_in_bytes(TF_RGBA32, orig_width, orig_height); + assert(transcode_buf_size); + + uint64_t transcode_buf_ofs = bt_alloc(transcode_buf_size); + + uint32_t decode_flags = 0; + + if (!bt_ktx2_transcode_image_level(ktx2_handle, level_index, layer_index, face_index, + transcode_buf_ofs, transcode_buf_size, + TF_RGBA32, + 
decode_flags, + 0, 0, -1, -1, transcode_state_handle)) + { + bt_free(transcode_buf_ofs); + bt_ktx2_destroy_transcode_state(transcode_state_handle); + bt_ktx2_close(ktx2_handle); + bt_free(ktx2_data_ofs); + return FALSE; + } + + write_tga_image(tga_filename, orig_width, orig_height, TRUE, (uint8_t*)transcode_buf_ofs); + printf("Wrote file %s\n", tga_filename); + + bt_free(transcode_buf_ofs); + transcode_buf_ofs = 0; + } + + // Now transcode to ASTC and write a .astc file + { + char astc_filename[256]; + snprintf(astc_filename, sizeof(astc_filename), "transcoded_%s_L%u_Y%u_F%u.astc", pDesc, level_index, layer_index, face_index); + + // Determine the correct ASTC transcode texture format from the ktx2 format + uint32_t target_transcode_fmt = bt_basis_get_transcoder_texture_format_from_basis_tex_format(basis_tex_format); + + uint32_t transcode_buf_size = bt_basis_compute_transcoded_image_size_in_bytes(target_transcode_fmt, orig_width, orig_height); + assert(transcode_buf_size); + + uint64_t transcode_buf_ofs = bt_alloc(transcode_buf_size); + + uint32_t decode_flags = 0; + + if (!bt_ktx2_transcode_image_level(ktx2_handle, level_index, layer_index, face_index, + transcode_buf_ofs, transcode_buf_size, + target_transcode_fmt, + decode_flags, + 0, 0, -1, -1, transcode_state_handle)) + { + bt_free(transcode_buf_ofs); + bt_ktx2_destroy_transcode_state(transcode_state_handle); + bt_ktx2_close(ktx2_handle); + bt_free(ktx2_data_ofs); + return FALSE; + } + + write_astc_file(astc_filename, (void*)transcode_buf_ofs, block_width, block_height, orig_width, orig_height); + printf("Wrote .astc file %s\n", astc_filename); + + bt_free(transcode_buf_ofs); + transcode_buf_ofs = 0; + } + + } // face_index + + } // layer_index + + } // level_index + + bt_ktx2_destroy_transcode_state(transcode_state_handle); + transcode_state_handle = 0; + + bt_ktx2_close(ktx2_handle); + ktx2_handle = 0; + + bt_free(ktx2_data_ofs); + ktx2_data_ofs = 0; + + return TRUE; +} + +// Simple 2D test +int test_2D() 
+{ + printf("------ test_2D():\n"); + + // Generate a test image + int W = 512, H = 512; + + uint8_t* pSrc_image = create_pretty_rgba_pattern(W, H, 0.0f); + + // Save the test image to a .tga file + write_tga_image("test_image.tga", W, H, TRUE, pSrc_image); + printf("Wrote file test_image.tga\n"); + + // Compress it to .ktx2 + uint64_t comp_params = bu_new_comp_params(); + + // Allocate memory + uint64_t img_ofs = bu_alloc(W * H * 4); + if (!img_ofs) + { + fprintf(stderr, "bu_alloc() failed\n"); + return EXIT_FAILURE; + } + + // Copy the test image into the allocated memory + memcpy((void*)img_ofs, pSrc_image, W * H * 4); + + // Supply the image to the compressor - it'll immediately make a copy of the data + if (!bu_comp_params_set_image_rgba32(comp_params, 0, img_ofs, W, H, W * 4)) + { + fprintf(stderr, "bu_comp_params_set_image_rgba32() failed\n"); + return EXIT_FAILURE; + } + + bu_free(img_ofs); + img_ofs = 0; + + // Now compress it to XUASTC LDR 8x5 with weight grid DCT + uint32_t basis_tex_format = BTF_XUASTC_LDR_8X5; + //uint32_t basis_tex_format = BTF_ASTC_LDR_8X5; + //uint32_t basis_tex_format = BTF_ETC1S; + //uint32_t basis_tex_format = BTF_UASTC_LDR_4X4; + + uint32_t quality_level = 85; + uint32_t effort_level = 2; + + uint32_t flags = BU_COMP_FLAGS_KTX2_OUTPUT | BU_COMP_FLAGS_SRGB | + BU_COMP_FLAGS_THREADED | BU_COMP_FLAGS_GEN_MIPS_CLAMP | + BU_COMP_FLAGS_PRINT_STATS | BU_COMP_FLAGS_PRINT_STATUS; + + if (!bu_compress_texture(comp_params, basis_tex_format, quality_level, effort_level, flags, 0.0f)) + { + fprintf(stderr, "bu_compress_texture() failed\n"); + return EXIT_FAILURE; + } + + // Retrieve the compressed .KTX2 file data + uint64_t comp_size = bu_comp_params_get_comp_data_size(comp_params); + if (!comp_size) + { + fprintf(stderr, "bu_comp_params_get_comp_data_size() failed\n"); + return EXIT_FAILURE; + } + + void* pComp_data = (void*)bu_comp_params_get_comp_data_ofs(comp_params); + if (!pComp_data) + { + fprintf(stderr, 
"bu_comp_params_get_comp_data_ofs() failed\n"); + return EXIT_FAILURE; + } + + // Write the data to disk + write_blob_to_file("test.ktx2", pComp_data, comp_size); + printf("Wrote file test.ktx2\n"); + + // Now inspect and transcode the .KTX2 data to png/astc files + if (!transcode_ktx2_file(pComp_data, comp_size, "2D")) + { + fprintf(stderr, "transcode_ktx2_file() failed\n"); + return EXIT_FAILURE; + } + + bu_delete_comp_params(comp_params); + + free(pSrc_image); + return EXIT_SUCCESS; +} + +// 2D array/texture video test +int test_2D_array(BOOL tex_video_flag, int L, BOOL mipmap_flag) +{ + printf("------ test_2D_array() %i %i %i:\n", tex_video_flag, L, mipmap_flag); + + // Generate a test image + int W = 256, H = 256; + + // Compress it to .ktx2 + uint64_t comp_params = bu_new_comp_params(); + + const char* pDesc = tex_video_flag ? "video" : "array"; + + char filename_buf[256]; + + for (int layer = 0; layer < L; layer++) + { + uint8_t* pSrc_image = create_pretty_rgba_pattern(W, H, (float)layer * .05f); + + // Save the test image to a .tga file + snprintf(filename_buf, sizeof(filename_buf), "test_%s_layer_%u.tga", pDesc, layer); + + write_tga_image(filename_buf, W, H, TRUE, pSrc_image); + printf("Wrote file %s\n", filename_buf); + + // Allocate memory + uint64_t img_ofs = bu_alloc(W * H * 4); + if (!img_ofs) + { + fprintf(stderr, "bu_alloc() failed\n"); + return EXIT_FAILURE; + } + + // Copy the test image into the allocated memory + memcpy((void*)img_ofs, pSrc_image, W * H * 4); + + // Supply the image to the compressor - it'll immediately make a copy of the data + if (!bu_comp_params_set_image_rgba32(comp_params, layer, img_ofs, W, H, W * 4)) + { + fprintf(stderr, "bu_comp_params_set_image_rgba32() failed\n"); + return EXIT_FAILURE; + } + + bu_free(img_ofs); + img_ofs = 0; + + free(pSrc_image); + + } // layer + + // ETC1S has special optimizations for texture video (basic p-frames with skip blocks). + uint32_t basis_tex_format = tex_video_flag ? 
BTF_ETC1S : BTF_XUASTC_LDR_4X4; + + uint32_t quality_level = 100; + uint32_t effort_level = 4; + + uint32_t flags = BU_COMP_FLAGS_KTX2_OUTPUT | BU_COMP_FLAGS_SRGB | + BU_COMP_FLAGS_THREADED | + BU_COMP_FLAGS_PRINT_STATS | BU_COMP_FLAGS_PRINT_STATUS; + + if (tex_video_flag) + flags |= BU_COMP_FLAGS_TEXTURE_TYPE_VIDEO_FRAMES; + else + flags |= BU_COMP_FLAGS_TEXTURE_TYPE_2D_ARRAY; + + if (mipmap_flag) + flags |= BU_COMP_FLAGS_GEN_MIPS_CLAMP; + + if (!bu_compress_texture(comp_params, basis_tex_format, quality_level, effort_level, flags, 0.0f)) + { + fprintf(stderr, "bu_compress_texture() failed\n"); + return EXIT_FAILURE; + } + + // Retrieve the compressed .KTX2 file data + uint64_t comp_size = bu_comp_params_get_comp_data_size(comp_params); + if (!comp_size) + { + fprintf(stderr, "bu_comp_params_get_comp_data_size() failed\n"); + return EXIT_FAILURE; + } + + void* pComp_data = (void*)bu_comp_params_get_comp_data_ofs(comp_params); + if (!pComp_data) + { + fprintf(stderr, "bu_comp_params_get_comp_data_ofs() failed\n"); + return EXIT_FAILURE; + } + + // Write the data to disk + snprintf(filename_buf, sizeof(filename_buf), "test_%s.ktx2", pDesc); + write_blob_to_file(filename_buf, pComp_data, comp_size); + printf("Wrote file %s\n", filename_buf); + + // Now inspect and transcode the .KTX2 data to png/astc files + if (!transcode_ktx2_file(pComp_data, comp_size, pDesc)) + { + fprintf(stderr, "transcode_ktx2_file() failed\n"); + return EXIT_FAILURE; + } + + bu_delete_comp_params(comp_params); + + return EXIT_SUCCESS; +} + +int main(int argc, char **argv) +{ + (void)argc; + (void)argv; + printf("example_capi.c:\n"); + + // Initialize the encoder (which initializers the transcoder for us) + printf("bu_init:\n"); + bu_init(); + + // bu_init() already does this for us, but it's harmless to call again. 
+ printf("bt_init:\n"); + bt_init(); + + // Control debug output from the compressor + bu_enable_debug_printf(FALSE); + + // simple 2D + if (test_2D() != EXIT_SUCCESS) + { + fprintf(stderr, "test_2D() failed!\n"); + return EXIT_FAILURE; + } + + // 2D array + if (test_2D_array(FALSE, 8, FALSE) != EXIT_SUCCESS) + { + fprintf(stderr, "test_2D_array() (array mode) failed!\n"); + return EXIT_FAILURE; + } + + // texture video + if (test_2D_array(TRUE, 8, TRUE) != EXIT_SUCCESS) + { + fprintf(stderr, "test_2D_array() (texture video mode) failed!\n"); + return EXIT_FAILURE; + } + + printf("Success\n"); + + return EXIT_SUCCESS; +} + + diff --git a/example_capi/example_capi.vcxproj b/example_capi/example_capi.vcxproj new file mode 100644 index 0000000..67b4a24 --- /dev/null +++ b/example_capi/example_capi.vcxproj @@ -0,0 +1,238 @@ + + + + + Debug + ARM64EC + + + Debug + Win32 + + + Release + ARM64EC + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 18.0 + Win32Proj + {be889347-e4fd-47dd-bbf4-81f98faa8ba9} + examplecapi + 10.0 + + + + Application + true + v145 + Unicode + + + Application + false + v145 + true + Unicode + + + Application + true + v145 + Unicode + + + Application + true + v145 + Unicode + + + Application + false + v145 + true + Unicode + + + Application + false + v145 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + $(SolutionDir)\bin\ + + + $(SolutionDir)\bin\ + + + $(SolutionDir)\bin\ + + + $(SolutionDir)\bin\ + + + $(SolutionDir)\bin\ + + + $(SolutionDir)\bin\ + + + + Level3 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp20 + ..\OpenCL + + + Console + true + ..\OpenCL\lib + opencl64.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) + + + + + Level3 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp20 + ..\OpenCL + + + Console + true + ..\OpenCL\lib + opencl64.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) + + + + + Level3 + true + 
_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp20 + ..\OpenCL + + + Console + true + ..\OpenCL\lib + opencl64.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) + + + + + Level3 + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp20 + ..\OpenCL + + + Console + true + ..\OpenCL\lib + opencl64.lib;softintrin.lib;%(AdditionalDependencies) + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp20 + ..\OpenCL + + + Console + true + ..\OpenCL\lib + opencl64.lib;$(CoreLibraryDependencies);%(AdditionalDependencies) + + + + + Level3 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp20 + ..\OpenCL + + + Console + true + ..\OpenCL\lib + opencl64.lib;softintrin.lib;%(AdditionalDependencies) + + + + + + + + + + + + + + + {97c34996-f458-4030-a402-b32c581872f1} + + + + + + \ No newline at end of file diff --git a/example_capi/example_capi.vcxproj.filters b/example_capi/example_capi.vcxproj.filters new file mode 100644 index 0000000..aa9303e --- /dev/null +++ b/example_capi/example_capi.vcxproj.filters @@ -0,0 +1,39 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + + + Source Files + + + Source Files + + + Source Files + + + + + Source Files + + + Source Files + + + Source Files + + + \ No newline at end of file diff --git a/example_transcoding/dds_defs.h b/example_transcoding/dds_defs.h new file mode 100644 index 0000000..cbca0a5 --- /dev/null +++ b/example_transcoding/dds_defs.h @@ -0,0 +1,286 @@ +// File: dds_defs.h +// DX9/10 .DDS file header definitions. 
+#pragma once + +#define PIXEL_FMT_FOURCC(a, b, c, d) ((a) | ((b) << 8U) | ((c) << 16U) | ((d) << 24U)) + +enum pixel_format +{ + PIXEL_FMT_INVALID = 0, + + PIXEL_FMT_DXT1 = PIXEL_FMT_FOURCC('D', 'X', 'T', '1'), + PIXEL_FMT_DXT2 = PIXEL_FMT_FOURCC('D', 'X', 'T', '2'), + PIXEL_FMT_DXT3 = PIXEL_FMT_FOURCC('D', 'X', 'T', '3'), + PIXEL_FMT_DXT4 = PIXEL_FMT_FOURCC('D', 'X', 'T', '4'), + PIXEL_FMT_DXT5 = PIXEL_FMT_FOURCC('D', 'X', 'T', '5'), + PIXEL_FMT_3DC = PIXEL_FMT_FOURCC('A', 'T', 'I', '2'), // DXN_YX + PIXEL_FMT_DXN = PIXEL_FMT_FOURCC('A', '2', 'X', 'Y'), // DXN_XY + PIXEL_FMT_DXT5A = PIXEL_FMT_FOURCC('A', 'T', 'I', '1'), // ATI1N, http://developer.amd.com/media/gpu_assets/Radeon_X1x00_Programming_Guide.pdf + + // Non-standard formats (some of these are supported by ATI's Compressonator) + PIXEL_FMT_DXT5_CCxY = PIXEL_FMT_FOURCC('C', 'C', 'x', 'Y'), + PIXEL_FMT_DXT5_xGxR = PIXEL_FMT_FOURCC('x', 'G', 'x', 'R'), + PIXEL_FMT_DXT5_xGBR = PIXEL_FMT_FOURCC('x', 'G', 'B', 'R'), + PIXEL_FMT_DXT5_AGBR = PIXEL_FMT_FOURCC('A', 'G', 'B', 'R'), + + PIXEL_FMT_DXT1A = PIXEL_FMT_FOURCC('D', 'X', '1', 'A'), + PIXEL_FMT_ETC1 = PIXEL_FMT_FOURCC('E', 'T', 'C', '1'), + + PIXEL_FMT_R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'x'), + PIXEL_FMT_L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'x'), + PIXEL_FMT_A8 = PIXEL_FMT_FOURCC('x', 'x', 'x', 'A'), + PIXEL_FMT_A8L8 = PIXEL_FMT_FOURCC('L', 'x', 'x', 'A'), + PIXEL_FMT_A8R8G8B8 = PIXEL_FMT_FOURCC('R', 'G', 'B', 'A') +}; + +const uint32_t cDDSMaxImageDimensions = 8192U; + +// Total size of header is sizeof(uint32)+cDDSSizeofDDSurfaceDesc2; +const uint32_t cDDSSizeofDDSurfaceDesc2 = 124; + +// "DDS " +const uint32_t cDDSFileSignature = 0x20534444; + +struct DDCOLORKEY +{ + uint32_t dwUnused0; + uint32_t dwUnused1; +}; + +struct DDPIXELFORMAT +{ + uint32_t dwSize; + uint32_t dwFlags; + uint32_t dwFourCC; + uint32_t dwRGBBitCount; // ATI compressonator will place a FOURCC code here for swizzled/cooked DXTn formats + uint32_t dwRBitMask; + uint32_t 
dwGBitMask; + uint32_t dwBBitMask; + uint32_t dwRGBAlphaBitMask; +}; + +struct DDSCAPS2 +{ + uint32_t dwCaps; + uint32_t dwCaps2; + uint32_t dwCaps3; + uint32_t dwCaps4; +}; + +struct DDSURFACEDESC2 +{ + uint32_t dwSize; + uint32_t dwFlags; + uint32_t dwHeight; + uint32_t dwWidth; + union + { + int32_t lPitch; + uint32_t dwLinearSize; + }; + uint32_t dwBackBufferCount; + uint32_t dwMipMapCount; + uint32_t dwAlphaBitDepth; + uint32_t dwUnused0; + uint32_t lpSurface; + DDCOLORKEY unused0; + DDCOLORKEY unused1; + DDCOLORKEY unused2; + DDCOLORKEY unused3; + DDPIXELFORMAT ddpfPixelFormat; + DDSCAPS2 ddsCaps; + uint32_t dwUnused1; +}; + +const uint32_t DDSD_CAPS = 0x00000001; +const uint32_t DDSD_HEIGHT = 0x00000002; +const uint32_t DDSD_WIDTH = 0x00000004; +const uint32_t DDSD_PITCH = 0x00000008; + +const uint32_t DDSD_BACKBUFFERCOUNT = 0x00000020; +const uint32_t DDSD_ZBUFFERBITDEPTH = 0x00000040; +const uint32_t DDSD_ALPHABITDEPTH = 0x00000080; + +const uint32_t DDSD_LPSURFACE = 0x00000800; + +const uint32_t DDSD_PIXELFORMAT = 0x00001000; +const uint32_t DDSD_CKDESTOVERLAY = 0x00002000; +const uint32_t DDSD_CKDESTBLT = 0x00004000; +const uint32_t DDSD_CKSRCOVERLAY = 0x00008000; + +const uint32_t DDSD_CKSRCBLT = 0x00010000; +const uint32_t DDSD_MIPMAPCOUNT = 0x00020000; +const uint32_t DDSD_REFRESHRATE = 0x00040000; +const uint32_t DDSD_LINEARSIZE = 0x00080000; + +const uint32_t DDSD_TEXTURESTAGE = 0x00100000; +const uint32_t DDSD_FVF = 0x00200000; +const uint32_t DDSD_SRCVBHANDLE = 0x00400000; +const uint32_t DDSD_DEPTH = 0x00800000; + +const uint32_t DDSD_ALL = 0x00fff9ee; + +const uint32_t DDPF_ALPHAPIXELS = 0x00000001; +const uint32_t DDPF_ALPHA = 0x00000002; +const uint32_t DDPF_FOURCC = 0x00000004; +const uint32_t DDPF_PALETTEINDEXED8 = 0x00000020; +const uint32_t DDPF_RGB = 0x00000040; +const uint32_t DDPF_LUMINANCE = 0x00020000; + +const uint32_t DDSCAPS_COMPLEX = 0x00000008; +const uint32_t DDSCAPS_TEXTURE = 0x00001000; +const uint32_t DDSCAPS_MIPMAP = 
0x00400000; + +const uint32_t DDSCAPS2_CUBEMAP = 0x00000200; +const uint32_t DDSCAPS2_CUBEMAP_POSITIVEX = 0x00000400; +const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEX = 0x00000800; + +const uint32_t DDSCAPS2_CUBEMAP_POSITIVEY = 0x00001000; +const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEY = 0x00002000; +const uint32_t DDSCAPS2_CUBEMAP_POSITIVEZ = 0x00004000; +const uint32_t DDSCAPS2_CUBEMAP_NEGATIVEZ = 0x00008000; + +const uint32_t DDSCAPS2_VOLUME = 0x00200000; + +typedef enum DXGI_FORMAT +{ + DXGI_FORMAT_UNKNOWN = 0, + DXGI_FORMAT_R32G32B32A32_TYPELESS = 1, + DXGI_FORMAT_R32G32B32A32_FLOAT = 2, + DXGI_FORMAT_R32G32B32A32_UINT = 3, + DXGI_FORMAT_R32G32B32A32_SINT = 4, + DXGI_FORMAT_R32G32B32_TYPELESS = 5, + DXGI_FORMAT_R32G32B32_FLOAT = 6, + DXGI_FORMAT_R32G32B32_UINT = 7, + DXGI_FORMAT_R32G32B32_SINT = 8, + DXGI_FORMAT_R16G16B16A16_TYPELESS = 9, + DXGI_FORMAT_R16G16B16A16_FLOAT = 10, + DXGI_FORMAT_R16G16B16A16_UNORM = 11, + DXGI_FORMAT_R16G16B16A16_UINT = 12, + DXGI_FORMAT_R16G16B16A16_SNORM = 13, + DXGI_FORMAT_R16G16B16A16_SINT = 14, + DXGI_FORMAT_R32G32_TYPELESS = 15, + DXGI_FORMAT_R32G32_FLOAT = 16, + DXGI_FORMAT_R32G32_UINT = 17, + DXGI_FORMAT_R32G32_SINT = 18, + DXGI_FORMAT_R32G8X24_TYPELESS = 19, + DXGI_FORMAT_D32_FLOAT_S8X24_UINT = 20, + DXGI_FORMAT_R32_FLOAT_X8X24_TYPELESS = 21, + DXGI_FORMAT_X32_TYPELESS_G8X24_UINT = 22, + DXGI_FORMAT_R10G10B10A2_TYPELESS = 23, + DXGI_FORMAT_R10G10B10A2_UNORM = 24, + DXGI_FORMAT_R10G10B10A2_UINT = 25, + DXGI_FORMAT_R11G11B10_FLOAT = 26, + DXGI_FORMAT_R8G8B8A8_TYPELESS = 27, + DXGI_FORMAT_R8G8B8A8_UNORM = 28, + DXGI_FORMAT_R8G8B8A8_UNORM_SRGB = 29, + DXGI_FORMAT_R8G8B8A8_UINT = 30, + DXGI_FORMAT_R8G8B8A8_SNORM = 31, + DXGI_FORMAT_R8G8B8A8_SINT = 32, + DXGI_FORMAT_R16G16_TYPELESS = 33, + DXGI_FORMAT_R16G16_FLOAT = 34, + DXGI_FORMAT_R16G16_UNORM = 35, + DXGI_FORMAT_R16G16_UINT = 36, + DXGI_FORMAT_R16G16_SNORM = 37, + DXGI_FORMAT_R16G16_SINT = 38, + DXGI_FORMAT_R32_TYPELESS = 39, + DXGI_FORMAT_D32_FLOAT = 40, + DXGI_FORMAT_R32_FLOAT = 41, 
+ DXGI_FORMAT_R32_UINT = 42, + DXGI_FORMAT_R32_SINT = 43, + DXGI_FORMAT_R24G8_TYPELESS = 44, + DXGI_FORMAT_D24_UNORM_S8_UINT = 45, + DXGI_FORMAT_R24_UNORM_X8_TYPELESS = 46, + DXGI_FORMAT_X24_TYPELESS_G8_UINT = 47, + DXGI_FORMAT_R8G8_TYPELESS = 48, + DXGI_FORMAT_R8G8_UNORM = 49, + DXGI_FORMAT_R8G8_UINT = 50, + DXGI_FORMAT_R8G8_SNORM = 51, + DXGI_FORMAT_R8G8_SINT = 52, + DXGI_FORMAT_R16_TYPELESS = 53, + DXGI_FORMAT_R16_FLOAT = 54, + DXGI_FORMAT_D16_UNORM = 55, + DXGI_FORMAT_R16_UNORM = 56, + DXGI_FORMAT_R16_UINT = 57, + DXGI_FORMAT_R16_SNORM = 58, + DXGI_FORMAT_R16_SINT = 59, + DXGI_FORMAT_R8_TYPELESS = 60, + DXGI_FORMAT_R8_UNORM = 61, + DXGI_FORMAT_R8_UINT = 62, + DXGI_FORMAT_R8_SNORM = 63, + DXGI_FORMAT_R8_SINT = 64, + DXGI_FORMAT_A8_UNORM = 65, + DXGI_FORMAT_R1_UNORM = 66, + DXGI_FORMAT_R9G9B9E5_SHAREDEXP = 67, + DXGI_FORMAT_R8G8_B8G8_UNORM = 68, + DXGI_FORMAT_G8R8_G8B8_UNORM = 69, + DXGI_FORMAT_BC1_TYPELESS = 70, + DXGI_FORMAT_BC1_UNORM = 71, + DXGI_FORMAT_BC1_UNORM_SRGB = 72, + DXGI_FORMAT_BC2_TYPELESS = 73, + DXGI_FORMAT_BC2_UNORM = 74, + DXGI_FORMAT_BC2_UNORM_SRGB = 75, + DXGI_FORMAT_BC3_TYPELESS = 76, + DXGI_FORMAT_BC3_UNORM = 77, + DXGI_FORMAT_BC3_UNORM_SRGB = 78, + DXGI_FORMAT_BC4_TYPELESS = 79, + DXGI_FORMAT_BC4_UNORM = 80, + DXGI_FORMAT_BC4_SNORM = 81, + DXGI_FORMAT_BC5_TYPELESS = 82, + DXGI_FORMAT_BC5_UNORM = 83, + DXGI_FORMAT_BC5_SNORM = 84, + DXGI_FORMAT_B5G6R5_UNORM = 85, + DXGI_FORMAT_B5G5R5A1_UNORM = 86, + DXGI_FORMAT_B8G8R8A8_UNORM = 87, + DXGI_FORMAT_B8G8R8X8_UNORM = 88, + DXGI_FORMAT_R10G10B10_XR_BIAS_A2_UNORM = 89, + DXGI_FORMAT_B8G8R8A8_TYPELESS = 90, + DXGI_FORMAT_B8G8R8A8_UNORM_SRGB = 91, + DXGI_FORMAT_B8G8R8X8_TYPELESS = 92, + DXGI_FORMAT_B8G8R8X8_UNORM_SRGB = 93, + DXGI_FORMAT_BC6H_TYPELESS = 94, + DXGI_FORMAT_BC6H_UF16 = 95, + DXGI_FORMAT_BC6H_SF16 = 96, + DXGI_FORMAT_BC7_TYPELESS = 97, + DXGI_FORMAT_BC7_UNORM = 98, + DXGI_FORMAT_BC7_UNORM_SRGB = 99, + DXGI_FORMAT_AYUV = 100, + DXGI_FORMAT_Y410 = 101, + DXGI_FORMAT_Y416 = 102, + 
DXGI_FORMAT_NV12 = 103, + DXGI_FORMAT_P010 = 104, + DXGI_FORMAT_P016 = 105, + DXGI_FORMAT_420_OPAQUE = 106, + DXGI_FORMAT_YUY2 = 107, + DXGI_FORMAT_Y210 = 108, + DXGI_FORMAT_Y216 = 109, + DXGI_FORMAT_NV11 = 110, + DXGI_FORMAT_AI44 = 111, + DXGI_FORMAT_IA44 = 112, + DXGI_FORMAT_P8 = 113, + DXGI_FORMAT_A8P8 = 114, + DXGI_FORMAT_B4G4R4A4_UNORM = 115, + DXGI_FORMAT_P208 = 130, + DXGI_FORMAT_V208 = 131, + DXGI_FORMAT_V408 = 132, + DXGI_FORMAT_FORCE_UINT = 0xffffffff +} DXGI_FORMAT; + +enum D3D10_RESOURCE_DIMENSION +{ + D3D10_RESOURCE_DIMENSION_UNKNOWN = 0, + D3D10_RESOURCE_DIMENSION_BUFFER = 1, + D3D10_RESOURCE_DIMENSION_TEXTURE1D = 2, + D3D10_RESOURCE_DIMENSION_TEXTURE2D = 3, + D3D10_RESOURCE_DIMENSION_TEXTURE3D = 4 +}; + +struct DDS_HEADER_DXT10 +{ + DXGI_FORMAT dxgiFormat; + D3D10_RESOURCE_DIMENSION resourceDimension; + uint32_t miscFlag; + uint32_t arraySize; + uint32_t miscFlags2; +}; + diff --git a/example_transcoding/example_transcoding.cpp b/example_transcoding/example_transcoding.cpp new file mode 100644 index 0000000..32335e4 --- /dev/null +++ b/example_transcoding/example_transcoding.cpp @@ -0,0 +1,100 @@ +// example_transcoding.cpp: Very simple transcoding-only example. Does not depend on the basisu encoder library at all, just basisu_transcoder.cpp. +// You can use AMD Compressonator or Microsoft's DirectXTex tools on github to view the written DX10 .DDS file. +#include +#include + +// for testing +//#define BASISD_SUPPORT_XUASTC (0) +//#define BASISD_SUPPORT_KTX2_ZSTD (0) + +#include "../transcoder/basisu_transcoder.h" +#include "utils.h" + +int main() +{ + basist::basisu_transcoder_init(); + + // Read the .KTX2 file's data into memory. 
+ utils::uint8_vec ktx2_file_data; + if (!utils::read_file("../test_files/base_xuastc_arith.ktx2", ktx2_file_data)) + { + if (!utils::read_file("base_xuastc_arith.ktx2", ktx2_file_data)) + { + fprintf(stderr, "Can't read file ../test_files/base_xuastc_arith.ktx2 or base_xuastc_arith.ktx2\n"); + return EXIT_FAILURE; + } + } + + printf("Read file base_xuastc_arith.ktx2\n"); + + if (ktx2_file_data.size() > UINT32_MAX) + { + fprintf(stderr, "KTX2 file too large\n"); + return EXIT_FAILURE; + } + + basist::ktx2_transcoder transcoder; + + // Initialize the transcoder. + if (!transcoder.init(ktx2_file_data.data(), (uint32_t)ktx2_file_data.size())) + return EXIT_FAILURE; + + const uint32_t width = transcoder.get_width(); + const uint32_t height = transcoder.get_height(); + const uint32_t num_levels = transcoder.get_levels(); + const bool is_srgb = transcoder.is_srgb(); + + printf("KTX2 dimensions: %ux%u, num mip levels: %u, sRGB: %u\n", width, height, num_levels, is_srgb); + + // Can't transcode HDR to LDR formats. + if (transcoder.is_hdr()) + { + fprintf(stderr, "Expected LDR KTX2 file\n"); + return EXIT_FAILURE; + } + + // Ensure BC7 support was enabled at compilation time (it will be enabled by default). + const basist::transcoder_texture_format tex_fmt = basist::transcoder_texture_format::cTFBC7_RGBA; + if (!basist::basis_is_format_supported(tex_fmt, transcoder.get_basis_tex_format())) + { + printf("BC7 was disabled in the transcoder at compilation\n"); + return EXIT_FAILURE; + } + + // Begin transcoding (this will be a no-op with UASTC HDR textures, but you still need to do it. For ETC1S it'll unpack the global codebooks). + transcoder.start_transcoding(); + + // Transcode to BC7 and write a BC7 .DDS file. 
+ + // Bytes per block (8 or 16 for BC1-7) + const uint32_t bytes_per_block = basist::basis_get_bytes_per_block_or_pixel(tex_fmt); + // Compute total bytes needed to transcode the slice + const uint32_t total_bytes = basist::basis_compute_transcoded_image_size_in_bytes(tex_fmt, width, height); + // Derive the total number of blocks the output buffer can hold. The transcoder will use this to verify the buffer is large enough. + const uint32_t total_blocks = total_bytes / bytes_per_block; + + // Allocate the buffer to hold the blocks + utils::uint8_vec tex_buffer(total_bytes); + + // Transcode the level + bool status = transcoder.transcode_image_level(0, 0, 0, + tex_buffer.data(), total_blocks, + tex_fmt, 0); + + if (!status) + { + fprintf(stderr, "transcoder.transcode_image_level() failed\n"); + return EXIT_FAILURE; + } + + // Write an sRGB DX10-style .DDS file. + if (!utils::save_dds("out.dds", width, height, tex_buffer.data(), 8, DXGI_FORMAT_BC7_UNORM_SRGB, true, true)) + { + fprintf(stderr, "save_dds() failed\n"); + return EXIT_FAILURE; + } + + printf("Wrote out.dds\n"); + + return EXIT_SUCCESS; +} diff --git a/example_transcoding/example_transcoding.manifest b/example_transcoding/example_transcoding.manifest new file mode 100644 index 0000000..b4baf6b --- /dev/null +++ b/example_transcoding/example_transcoding.manifest @@ -0,0 +1,10 @@ + + + + + + UTF-8 + + + + diff --git a/example_transcoding/example_transcoding.vcxproj b/example_transcoding/example_transcoding.vcxproj new file mode 100644 index 0000000..2cd44e1 --- /dev/null +++ b/example_transcoding/example_transcoding.vcxproj @@ -0,0 +1,202 @@ + + + + + Debug + ARM64EC + + + Debug + Win32 + + + Release + ARM64EC + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + 17.0 + Win32Proj + {13333092-fcfe-4d74-8e76-f10c6037593c} + exampletranscoding + 10.0 + + + + Application + true + Unicode + v145 + + + Application + false + true + Unicode + v145 + + + Application + true + v145 + Unicode + + + 
Application + true + v145 + Unicode + + + Application + false + v145 + true + Unicode + + + Application + false + v145 + true + Unicode + + + + + + + + + + + + + + + + + + + + + + + + + + + + Level4 + true + WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + AdvancedVectorExtensions + + + Console + true + + + + + Level4 + true + true + true + WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + AdvancedVectorExtensions + + + Console + true + + + + + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + Level4 + + + Console + true + + + + + true + _DEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + Level4 + + + Console + true + + + + + Level4 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + + + Console + true + + + + + Level4 + true + true + true + NDEBUG;_CONSOLE;%(PreprocessorDefinitions) + true + stdcpp17 + + + Console + true + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/example_transcoding/example_transcoding.vcxproj.filters b/example_transcoding/example_transcoding.vcxproj.filters new file mode 100644 index 0000000..563e6b9 --- /dev/null +++ b/example_transcoding/example_transcoding.vcxproj.filters @@ -0,0 +1,47 @@ + + + + + {4FC737F1-C7A5-4376-A066-2A32D752A2FF} + cpp;c;cc;cxx;c++;cppm;ixx;def;odl;idl;hpj;bat;asm;asmx + + + {93995380-89BD-4b04-88EB-625FBE52EBFB} + h;hh;hpp;hxx;h++;hm;inl;inc;ipp;xsd + + + {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} + rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms + + + {db43163f-6d1b-46cf-90ad-24650d502e6a} + + + + + Source Files + + + Source Files + + + Source Files\utils + + + Source Files\utils + + + + + Source Files + + + + + Source Files\utils + + + Source Files\utils + + + \ No newline at end of file diff --git a/example_transcoding/utils.cpp b/example_transcoding/utils.cpp new file mode 100644 index 0000000..e42e71f --- /dev/null +++ 
b/example_transcoding/utils.cpp @@ -0,0 +1,948 @@ +// File: utils.cpp +#include "utils.h" +//#include "lodepng.h" +//#include "miniz.h" + +namespace utils +{ + +#define FLOOD_PUSH(y, xl, xr, dy) if (((y + (dy)) >= 0) && ((y + (dy)) < (int)m_height)) { stack.push_back(fill_segment(y, xl, xr, dy)); } + +// See http://www.realtimerendering.com/resources/GraphicsGems/gems/SeedFill.c +uint32_t image_u8::flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector* pSet_pixels) +{ + uint32_t total_set = 0; + + if (!flood_fill_is_inside(x, y, b)) + return 0; + + std::vector stack; + stack.reserve(64); + + FLOOD_PUSH(y, x, x, 1); + FLOOD_PUSH(y + 1, x, x, -1); + + while (stack.size()) + { + fill_segment s = stack.back(); + stack.pop_back(); + + int x1 = s.m_xl, x2 = s.m_xr, dy = s.m_dy; + y = s.m_y + s.m_dy; + + for (x = x1; (x >= 0) && flood_fill_is_inside(x, y, b); x--) + { + (*this)(x, y) = c; + total_set++; + if (pSet_pixels) + pSet_pixels->push_back(pixel_coord(x, y)); + } + + int l; + + if (x >= x1) + goto skip; + + l = x + 1; + if (l < x1) + FLOOD_PUSH(y, l, x1 - 1, -dy); + + x = x1 + 1; + + do + { + for (; x <= ((int)m_width - 1) && flood_fill_is_inside(x, y, b); x++) + { + (*this)(x, y) = c; + total_set++; + if (pSet_pixels) + pSet_pixels->push_back(pixel_coord(x, y)); + } + FLOOD_PUSH(y, l, x - 1, dy); + + if (x > (x2 + 1)) + FLOOD_PUSH(y, x2 + 1, x - 1, -dy); + + skip: + for (x++; x <= x2 && !flood_fill_is_inside(x, y, b); x++) + ; + + l = x; + } while (x <= x2); + } + + return total_set; +} + +void image_u8::draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color) +{ + if (xs > xe) + { + std::swap(xs, xe); + std::swap(ys, ye); + } + + int dx = xe - xs, dy = ye - ys; + if (!dx) + { + if (ys > ye) + std::swap(ys, ye); + for (int i = ys; i <= ye; i++) + set_pixel_clipped(xs, i, color); + } + else if (!dy) + { + for (int i = xs; i < xe; i++) + set_pixel_clipped(i, ys, color); + } + else if (dy > 0) + { + if (dy <= dx) + { + int 
e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx); + rasterize_line(xs, ys, xe, ye, 0, 1, e, e_inc, e_no_inc, color); + } + else + { + int e = 2 * dx - dy, e_no_inc = 2 * dx, e_inc = 2 * (dx - dy); + rasterize_line(xs, ys, xe, ye, 1, 1, e, e_inc, e_no_inc, color); + } + } + else + { + dy = -dy; + if (dy <= dx) + { + int e = 2 * dy - dx, e_no_inc = 2 * dy, e_inc = 2 * (dy - dx); + rasterize_line(xs, ys, xe, ye, 0, -1, e, e_inc, e_no_inc, color); + } + else + { + int e = 2 * dx - dy, e_no_inc = (2 * dx), e_inc = 2 * (dx - dy); + rasterize_line(xe, ye, xs, ys, 1, -1, e, e_inc, e_no_inc, color); + } + } +} + +void image_u8::rasterize_line(int xs, int ys, int xe, int ye, int pred, int inc_dec, int e, int e_inc, int e_no_inc, const color_quad_u8& color) +{ + int start, end, var; + + if (pred) + { + start = ys; + end = ye; + var = xs; + for (int i = start; i <= end; i++) + { + set_pixel_clipped(var, i, color); + if (e < 0) + e += e_no_inc; + else + { + var += inc_dec; + e += e_inc; + } + } + } + else + { + start = xs; + end = xe; + var = ys; + for (int i = start; i <= end; i++) + { + set_pixel_clipped(i, var, color); + if (e < 0) + e += e_no_inc; + else + { + var += inc_dec; + e += e_inc; + } + } + } +} + +#if 0 +bool load_png(const char* pFilename, image_u8& img) +{ + img.clear(); + + std::vector pixels; + unsigned int w = 0, h = 0; + unsigned int e = lodepng::decode(pixels, w, h, pFilename); + if (e != 0) + { + fprintf(stderr, "Failed loading PNG file %s\n", pFilename); + return false; + } + + img.init(w, h); + memcpy(&img.get_pixels()[0], &pixels[0], w * h * sizeof(uint32_t)); + + return true; +} + +bool save_png(const char* pFilename, const image_u8& img, bool save_alpha) +{ + const uint32_t w = img.width(); + const uint32_t h = img.height(); + + std::vector pixels; + if (save_alpha) + { + pixels.resize(w * h * sizeof(color_quad_u8)); + memcpy(&pixels[0], &img.get_pixels()[0], w * h * sizeof(color_quad_u8)); + } + else + { + pixels.resize(w * h * 3); + 
unsigned char* pDst = &pixels[0]; + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++, pDst += 3) + pDst[0] = img(x, y)[0], pDst[1] = img(x, y)[1], pDst[2] = img(x, y)[2]; + } + + return lodepng::encode(pFilename, pixels, w, h, save_alpha ? LCT_RGBA : LCT_RGB) == 0; +} +#endif + +static float gauss(int x, int y, float sigma_sqr) +{ + float pow = expf(-((x * x + y * y) / (2.0f * sigma_sqr))); + float g = (1.0f / (sqrtf((float)(2.0f * M_PI * sigma_sqr)))) * pow; + return g; +} + +// size_x/y should be odd +void compute_gaussian_kernel(float* pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags) +{ + assert(size_x & size_y & 1); + + if (!(size_x | size_y)) + return; + + int mid_x = size_x / 2; + int mid_y = size_y / 2; + + double sum = 0; + for (int x = 0; x < size_x; x++) + { + for (int y = 0; y < size_y; y++) + { + float g; + if ((x > mid_x) && (y < mid_y)) + g = pDst[(size_x - x - 1) + y * size_x]; + else if ((x < mid_x) && (y > mid_y)) + g = pDst[x + (size_y - y - 1) * size_x]; + else if ((x > mid_x) && (y > mid_y)) + g = pDst[(size_x - x - 1) + (size_y - y - 1) * size_x]; + else + g = gauss(x - mid_x, y - mid_y, sigma_sqr); + + pDst[x + y * size_x] = g; + sum += g; + } + } + + if (flags & cComputeGaussianFlagNormalizeCenterToOne) + { + sum = pDst[mid_x + mid_y * size_x]; + } + + if (flags & (cComputeGaussianFlagNormalizeCenterToOne | cComputeGaussianFlagNormalize)) + { + double one_over_sum = 1.0f / sum; + for (int i = 0; i < size_x * size_y; i++) + pDst[i] = static_cast(pDst[i] * one_over_sum); + + if (flags & cComputeGaussianFlagNormalizeCenterToOne) + pDst[mid_x + mid_y * size_x] = 1.0f; + } + + if (flags & cComputeGaussianFlagPrint) + { + printf("{\n"); + for (int y = 0; y < size_y; y++) + { + printf(" "); + for (int x = 0; x < size_x; x++) + { + printf("%f, ", pDst[x + y * size_x]); + } + printf("\n"); + } + printf("}"); + } +} + +void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_width, float sigma_sqr, 
bool wrapping, uint32_t width_divisor, uint32_t height_divisor) +{ + assert(odd_filter_width && (odd_filter_width & 1)); + odd_filter_width |= 1; + + std::vector kernel(odd_filter_width * odd_filter_width); + compute_gaussian_kernel(&kernel[0], odd_filter_width, odd_filter_width, sigma_sqr, cComputeGaussianFlagNormalize); + + const int dst_width = orig_img.get_width() / width_divisor; + const int dst_height = orig_img.get_height() / height_divisor; + + const int H = odd_filter_width / 2; + const int L = -H; + + dst.crop(dst_width, dst_height); + +//#pragma omp parallel for + for (int oy = 0; oy < dst_height; oy++) + { + for (int ox = 0; ox < dst_width; ox++) + { + vec4F c(0.0f); + + for (int yd = L; yd <= H; yd++) + { + int y = oy * height_divisor + (height_divisor >> 1) + yd; + + for (int xd = L; xd <= H; xd++) + { + int x = ox * width_divisor + (width_divisor >> 1) + xd; + + const vec4F& p = orig_img.get_clamped_or_wrapped(x, y, wrapping, wrapping); + + float w = kernel[(xd + H) + (yd + H) * odd_filter_width]; + c[0] += p[0] * w; + c[1] += p[1] * w; + c[2] += p[2] * w; + c[3] += p[3] * w; + } + } + + dst(ox, oy).set(c[0], c[1], c[2], c[3]); + } + } +} + +static void pow_image(const imagef& src, imagef& dst, const vec4F& power) +{ + dst.resize(src); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + + if ((power[0] == 2.0f) && (power[1] == 2.0f) && (power[2] == 2.0f) && (power[3] == 2.0f)) + dst(x, y).set(p[0] * p[0], p[1] * p[1], p[2] * p[2], p[3] * p[3]); + else + dst(x, y).set(powf(p[0], power[0]), powf(p[1], power[1]), powf(p[2], power[2]), powf(p[3], power[3])); + } + } +} + +#if 0 +static void mul_image(const imagef& src, imagef& dst, const vec4F& mul) +{ + dst.resize(src); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + 
dst(x, y).set(p[0] * mul[0], p[1] * mul[1], p[2] * mul[2], p[3] * mul[3]); + } + } +} +#endif + +static void scale_image(const imagef& src, imagef& dst, const vec4F& scale, const vec4F& shift) +{ + dst.resize(src); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + + vec4F d; + + for (uint32_t c = 0; c < 4; c++) + d[c] = scale[c] * p[c] + shift[c]; + + dst(x, y).set(d[0], d[1], d[2], d[3]); + } + } +} + +static void add_weighted_image(const imagef& src1, const vec4F& alpha, const imagef& src2, const vec4F& beta, const vec4F& gamma, imagef& dst) +{ + dst.resize(src1); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + dst(x, y).set( + s1[0] * alpha[0] + s2[0] * beta[0] + gamma[0], + s1[1] * alpha[1] + s2[1] * beta[1] + gamma[1], + s1[2] * alpha[2] + s2[2] * beta[2] + gamma[2], + s1[3] * alpha[3] + s2[3] * beta[3] + gamma[3]); + } + } +} + +static void add_image(const imagef& src1, const imagef& src2, imagef& dst) +{ + dst.resize(src1); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + dst(x, y).set(s1[0] + s2[0], s1[1] + s2[1], s1[2] + s2[2], s1[3] + s2[3]); + } + } +} + +static void adds_image(const imagef& src, const vec4F& value, imagef& dst) +{ + dst.resize(src); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& p = src(x, y); + + dst(x, y).set(p[0] + value[0], p[1] + value[1], p[2] + value[2], p[3] + value[3]); + } + } +} + +static void mul_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale) +{ + dst.resize(src1); + 
+//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + vec4F d; + + for (uint32_t c = 0; c < 4; c++) + { + float v1 = s1[c]; + float v2 = s2[c]; + d[c] = v1 * v2 * scale[c]; + } + + dst(x, y) = d; + } + } +} + +static void div_image(const imagef& src1, const imagef& src2, imagef& dst, const vec4F& scale) +{ + dst.resize(src1); + +//#pragma omp parallel for + for (int y = 0; y < (int)dst.get_height(); y++) + { + for (uint32_t x = 0; x < dst.get_width(); x++) + { + const vec4F& s1 = src1(x, y); + const vec4F& s2 = src2(x, y); + + vec4F d; + + for (uint32_t c = 0; c < 4; c++) + { + float v = s2[c]; + if (v == 0.0f) + d[c] = 0.0f; + else + d[c] = (s1[c] * scale[c]) / v; + } + + dst(x, y) = d; + } + } +} + +static vec4F avg_image(const imagef& src) +{ + vec4F avg(0.0f); + + for (uint32_t y = 0; y < src.get_height(); y++) + { + for (uint32_t x = 0; x < src.get_width(); x++) + { + const vec4F& s = src(x, y); + + avg += vec4F(s[0], s[1], s[2], s[3]); + } + } + + avg /= static_cast(src.get_total_pixels()); + + return avg; +} + +// Reference: https://ece.uwaterloo.ca/~z70wang/research/ssim/index.html +vec4F compute_ssim(const imagef& a, const imagef& b) +{ + imagef axb, a_sq, b_sq, mu1, mu2, mu1_sq, mu2_sq, mu1_mu2, s1_sq, s2_sq, s12, smap, t1, t2, t3; + + const float C1 = 6.50250f, C2 = 58.52250f; + + pow_image(a, a_sq, vec4F(2)); + pow_image(b, b_sq, vec4F(2)); + mul_image(a, b, axb, vec4F(1.0f)); + + gaussian_filter(mu1, a, 11, 1.5f * 1.5f); + gaussian_filter(mu2, b, 11, 1.5f * 1.5f); + + pow_image(mu1, mu1_sq, vec4F(2)); + pow_image(mu2, mu2_sq, vec4F(2)); + mul_image(mu1, mu2, mu1_mu2, vec4F(1.0f)); + + gaussian_filter(s1_sq, a_sq, 11, 1.5f * 1.5f); + add_weighted_image(s1_sq, vec4F(1), mu1_sq, vec4F(-1), vec4F(0), s1_sq); + + gaussian_filter(s2_sq, b_sq, 11, 1.5f * 1.5f); + add_weighted_image(s2_sq, vec4F(1), mu2_sq, 
vec4F(-1), vec4F(0), s2_sq); + + gaussian_filter(s12, axb, 11, 1.5f * 1.5f); + add_weighted_image(s12, vec4F(1), mu1_mu2, vec4F(-1), vec4F(0), s12); + + scale_image(mu1_mu2, t1, vec4F(2), vec4F(0)); + adds_image(t1, vec4F(C1), t1); + + scale_image(s12, t2, vec4F(2), vec4F(0)); + adds_image(t2, vec4F(C2), t2); + + mul_image(t1, t2, t3, vec4F(1)); + + add_image(mu1_sq, mu2_sq, t1); + adds_image(t1, vec4F(C1), t1); + + add_image(s1_sq, s2_sq, t2); + adds_image(t2, vec4F(C2), t2); + + mul_image(t1, t2, t1, vec4F(1)); + + div_image(t3, t1, smap, vec4F(1)); + + return avg_image(smap); +} + +vec4F compute_ssim(const image_u8& a, const image_u8& b, bool luma) +{ + image_u8 ta(a), tb(b); + + if ((ta.width() != tb.width()) || (ta.height() != tb.height())) + { + fprintf(stderr, "compute_ssim: Cropping input images to equal dimensions\n"); + + const uint32_t w = std::min(a.width(), b.width()); + const uint32_t h = std::min(a.height(), b.height()); + ta.crop(w, h); + tb.crop(w, h); + } + + if (!ta.width() || !ta.height()) + { + assert(0); + return vec4F(0); + } + + if (luma) + { + for (uint32_t y = 0; y < ta.height(); y++) + { + for (uint32_t x = 0; x < ta.width(); x++) + { + ta(x, y).set((uint8_t)ta(x, y).get_luma(), ta(x, y).a); + tb(x, y).set((uint8_t)tb(x, y).get_luma(), tb(x, y).a); + } + } + } + + imagef fta, ftb; + + fta.set(ta); + ftb.set(tb); + + return compute_ssim(fta, ftb); +} + +bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header) +{ + (void)srgb; + + FILE* pFile = NULL; +#ifdef _MSC_VER + fopen_s(&pFile, pFilename, "wb"); +#else + pFile = fopen(pFilename, "wb"); +#endif + if (!pFile) + { + fprintf(stderr, "Failed creating file %s!\n", pFilename); + return false; + } + + fwrite("DDS ", 4, 1, pFile); + + DDSURFACEDESC2 desc; + memset(&desc, 0, sizeof(desc)); + + desc.dwSize = sizeof(desc); + desc.dwFlags = DDSD_WIDTH | DDSD_HEIGHT | 
DDSD_PIXELFORMAT | DDSD_CAPS; + + desc.dwWidth = width; + desc.dwHeight = height; + + desc.ddsCaps.dwCaps = DDSCAPS_TEXTURE; + desc.ddpfPixelFormat.dwSize = sizeof(desc.ddpfPixelFormat); + + desc.ddpfPixelFormat.dwFlags |= DDPF_FOURCC; + + desc.lPitch = (((desc.dwWidth + 3) & ~3) * ((desc.dwHeight + 3) & ~3) * pixel_format_bpp) >> 3; + desc.dwFlags |= DDSD_LINEARSIZE; + + desc.ddpfPixelFormat.dwRGBBitCount = 0; + + if ((!force_dx10_header) && + ((dxgi_format == DXGI_FORMAT_BC1_UNORM) || + (dxgi_format == DXGI_FORMAT_BC3_UNORM) || + (dxgi_format == DXGI_FORMAT_BC4_UNORM) || + (dxgi_format == DXGI_FORMAT_BC5_UNORM))) + { + if (dxgi_format == DXGI_FORMAT_BC1_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '1'); + else if (dxgi_format == DXGI_FORMAT_BC3_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', 'T', '5'); + else if (dxgi_format == DXGI_FORMAT_BC4_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '1'); + else if (dxgi_format == DXGI_FORMAT_BC5_UNORM) + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('A', 'T', 'I', '2'); + + fwrite(&desc, sizeof(desc), 1, pFile); + } + else + { + desc.ddpfPixelFormat.dwFourCC = (uint32_t)PIXEL_FMT_FOURCC('D', 'X', '1', '0'); + + fwrite(&desc, sizeof(desc), 1, pFile); + + DDS_HEADER_DXT10 hdr10; + memset(&hdr10, 0, sizeof(hdr10)); + + // Not all tools support DXGI_FORMAT_BC7_UNORM_SRGB (like NVTT), but ddsview in DirectXTex pays attention to it. So not sure what to do here. + // For best compatibility just write DXGI_FORMAT_BC7_UNORM. + //hdr10.dxgiFormat = srgb ? 
DXGI_FORMAT_BC7_UNORM_SRGB : DXGI_FORMAT_BC7_UNORM; + hdr10.dxgiFormat = dxgi_format; // DXGI_FORMAT_BC7_UNORM; + hdr10.resourceDimension = D3D10_RESOURCE_DIMENSION_TEXTURE2D; + hdr10.arraySize = 1; + + fwrite(&hdr10, sizeof(hdr10), 1, pFile); + } + + fwrite(pBlocks, desc.lPitch, 1, pFile); + + if (fclose(pFile) == EOF) + { + fprintf(stderr, "Failed writing to DDS file %s!\n", pFilename); + return false; + } + + return true; +} + +void strip_extension(std::string& s) +{ + for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--) + { + if (s[i] == '.') + { + s.resize(i); + break; + } + } +} + +void strip_path(std::string& s) +{ + for (int32_t i = (int32_t)s.size() - 1; i >= 0; i--) + { + if ((s[i] == '/') || (s[i] == ':') || (s[i] == '\\')) + { + s.erase(0, i + 1); + break; + } + } +} + +uint32_t hash_hsieh(const uint8_t* pBuf, size_t len) +{ + if (!pBuf || !len) + return 0; + + uint32_t h = static_cast(len); + + const uint32_t bytes_left = len & 3; + len >>= 2; + + while (len--) + { + const uint16_t* pWords = reinterpret_cast(pBuf); + + h += pWords[0]; + + const uint32_t t = (pWords[1] << 11) ^ h; + h = (h << 16) ^ t; + + pBuf += sizeof(uint32_t); + + h += h >> 11; + } + + switch (bytes_left) + { + case 1: + h += *reinterpret_cast(pBuf); + h ^= h << 10; + h += h >> 1; + break; + case 2: + h += *reinterpret_cast(pBuf); + h ^= h << 11; + h += h >> 17; + break; + case 3: + h += *reinterpret_cast(pBuf); + h ^= h << 16; + h ^= (static_cast(pBuf[sizeof(uint16_t)])) << 18; + h += h >> 11; + break; + default: + break; + } + + h ^= h << 3; + h += h >> 5; + h ^= h << 4; + h += h >> 17; + h ^= h << 25; + h += h >> 6; + + return h; +} + +float compute_block_max_std_dev(const color_quad_u8* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps) +{ + tracked_stat comp_stats[4]; + + for (uint32_t y = 0; y < block_height; y++) + { + for (uint32_t x = 0; x < block_width; x++) + { + const color_quad_u8* pPixel = pPixels + x + y * block_width; + + for (uint32_t c = 0; c 
< num_comps; c++) + comp_stats[c].update(pPixel->m_c[c]); + } + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < num_comps; i++) + max_std_dev = std::max(max_std_dev, comp_stats[i].get_std_dev()); + return max_std_dev; +} + +const uint32_t ASTC_SIG = 0x5CA1AB13; + +#pragma pack(push, 1) +struct astc_header +{ + uint32_t m_sig; + uint8_t m_block_x; + uint8_t m_block_y; + uint8_t m_block_z; + uint8_t m_width[3]; + uint8_t m_height[3]; + uint8_t m_depth[3]; +}; +#pragma pack(pop) + +bool save_astc_file(const char* pFilename, block16_vec& blocks, uint32_t width, uint32_t height, uint32_t block_width, uint32_t block_height) +{ + FILE* pFile = nullptr; + +#ifdef _MSC_VER + fopen_s(&pFile, pFilename, "wb"); +#else + pFile = fopen(pFilename, "wb"); +#endif + + if (!pFile) + return false; + + astc_header hdr; + memset(&hdr, 0, sizeof(hdr)); + + hdr.m_sig = ASTC_SIG; + hdr.m_block_x = (uint8_t)block_width; + hdr.m_block_y = (uint8_t)block_height; + hdr.m_block_z = 1; + hdr.m_width[0] = (uint8_t)(width); + hdr.m_width[1] = (uint8_t)(width >> 8); + hdr.m_width[2] = (uint8_t)(width >> 16); + hdr.m_height[0] = (uint8_t)(height); + hdr.m_height[1] = (uint8_t)(height >> 8); + hdr.m_height[2] = (uint8_t)(height >> 16); + hdr.m_depth[0] = 1; + fwrite(&hdr, sizeof(hdr), 1, pFile); + + fwrite(blocks.data(), 16, blocks.size(), pFile); + if (fclose(pFile) == EOF) + return false; + + return true; +} + +bool load_astc_file(const char* pFilename, block16_vec& blocks, uint32_t& width, uint32_t& height, uint32_t& block_width, uint32_t& block_height) +{ + FILE* pFile = nullptr; + +#ifdef _MSC_VER + fopen_s(&pFile, pFilename, "rb"); +#else + pFile = fopen(pFilename, "rb"); +#endif + + if (!pFile) + return false; + + astc_header hdr; + if (fread(&hdr, sizeof(hdr), 1, pFile) != 1) + { + fclose(pFile); + return false; + } + + if (hdr.m_sig != ASTC_SIG) + { + fclose(pFile); + return false; + } + + width = hdr.m_width[0] + (hdr.m_width[1] << 8) + (hdr.m_width[2] << 16); + height = 
hdr.m_height[0] + (hdr.m_height[1] << 8) + (hdr.m_height[2] << 16); + uint32_t depth = hdr.m_depth[0] + (hdr.m_depth[1] << 8) + (hdr.m_depth[2] << 16); + + if ((width < 1) || (width > 32768) || (height < 1) || (height > 32768)) + return false; + if ((hdr.m_block_z != 1) || (depth != 1)) + return false; + + block_width = hdr.m_block_x; + block_height = hdr.m_block_y; + + if ((block_width < 4) || (block_width > 12) || (block_height < 4) || (block_height > 12)) + return false; + + uint32_t blocks_x = (width + block_width - 1) / block_width; + uint32_t blocks_y = (height + block_height - 1) / block_height; + uint32_t total_blocks = blocks_x * blocks_y; + + blocks.resize(total_blocks); + + if (fread(blocks.data(), 16, total_blocks, pFile) != total_blocks) + { + fclose(pFile); + return false; + } + + fclose(pFile); + return true; +} + +#if 0 +uint32_t get_deflate_size(const void* pData, size_t data_size) +{ + size_t comp_size = 0; + void* pPre_RDO_Comp_data = tdefl_compress_mem_to_heap(pData, data_size, &comp_size, TDEFL_MAX_PROBES_MASK);// TDEFL_DEFAULT_MAX_PROBES); + mz_free(pPre_RDO_Comp_data); + + if (comp_size > UINT32_MAX) + return UINT32_MAX; + + return (uint32_t)comp_size; +} +#endif + +bool read_file(const char* pFilename, uint8_vec& buf) +{ + buf.resize(0); + + FILE* pFile = nullptr; +#if _MSC_VER + fopen_s(&pFile, pFilename, "rb"); +#else + pFile = fopen(pFilename, "rb"); +#endif + if (!pFile) + return false; + + fseek(pFile, 0, SEEK_END); + + long file_end_ofs = ftell(pFile); + if (file_end_ofs <= 0) + { + fclose(pFile); + return false; + } + + size_t sz = static_cast(file_end_ofs); + if (sz != (unsigned long)file_end_ofs) + { + fclose(pFile); + return false; + } + + fseek(pFile, 0, SEEK_SET); + + buf.resize(sz); + + if (fread(buf.data(), sizeof(uint8_t), sz, pFile) != sz) + { + fclose(pFile); + return false; + } + + fclose(pFile); + return true; +} + +} // namespace utils diff --git a/example_transcoding/utils.h b/example_transcoding/utils.h new file mode 
100644 index 0000000..d161e5f --- /dev/null +++ b/example_transcoding/utils.h @@ -0,0 +1,2621 @@ +// File: utils.h +#pragma once +#ifdef _MSC_VER +#pragma warning (push) +#pragma warning (disable:4127) // conditional expression is constant +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dds_defs.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + +#define ASSUME(c) static_assert(c, #c) +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) + +#define VECTOR_TEXT_LINE_SIZE (30.0f) +#define VECTOR_TEXT_CORE_LINE_SIZE (21.0f) + +#define UNUSED(x) (void)x + +namespace utils +{ +typedef std::vector uint8_vec; + +extern const uint32_t g_pretty_colors[]; +extern const uint32_t g_num_pretty_colors; + +const float cDegToRad = 0.01745329252f; +const float cRadToDeg = 57.29577951f; + +enum eClear { cClear }; +enum eZero { cZero }; +enum eInitExpand { cInitExpand }; + +inline int iabs(int i) { if (i < 0) i = -i; return i; } +inline uint8_t clamp255(int32_t i) { return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); } +template inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); } +template inline F lerp(F a, F b, F s) { return a + (b - a) * s; } +template inline F square(F a) { return a * a; } + +template +inline T prev_wrap(T i, T n) +{ + T temp = i - 1; + if (temp < 0) + temp = n - 1; + return temp; +} + +template +inline T next_wrap(T i, T n) +{ + T temp = i + 1; + if (temp >= n) + temp = 0; + return temp; +} + +inline int posmod(int x, int y) +{ + if (x >= 0) + return (x < y) ? x : (x % y); + int m = (-x) % y; + return (m != 0) ? 
(y - m) : m; +} + +inline float deg_to_rad(float f) +{ + return f * cDegToRad; +}; + +inline float rad_to_deg(float f) +{ + return f * cRadToDeg; +}; + +template +struct rel_ops +{ + friend bool operator!=(const T& x, const T& y) + { + return (!(x == y)); + } + friend bool operator>(const T& x, const T& y) + { + return (y < x); + } + friend bool operator<=(const T& x, const T& y) + { + return (!(y < x)); + } + friend bool operator>=(const T& x, const T& y) + { + return (!(x < y)); + } +}; + +template +class vec : public rel_ops > +{ +public: + typedef T scalar_type; + enum + { + num_elements = N + }; + + inline vec() + { + } + + inline vec(eClear) + { + clear(); + } + + inline vec(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = other.m_s[i]; + } + + template + inline vec(const vec& other) + { + set(other); + } + + template + inline vec(const vec& other, T w) + { + *this = other; + m_s[N - 1] = w; + } + + explicit inline vec(T val) + { + set(val); + } + + inline vec(T val0, T val1) + { + set(val0, val1); + } + + inline vec(T val0, T val1, T val2) + { + set(val0, val1, val2); + } + + inline vec(T val0, T val1, T val2, T val3) + { + set(val0, val1, val2, val3); + } + + inline vec(T val0, T val1, T val2, T val3, T val4, T val5) + { + set(val0, val1, val2, val3, val4, val5); + } + + inline vec( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15) + { + set(val0, val1, val2, val3, + val4, val5, val6, val7, + val8, val9, val10, val11, + val12, val13, val14, val15); + } + + inline vec( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19) + { + set(val0, val1, val2, val3, + val4, val5, val6, val7, + val8, val9, val10, val11, + val12, val13, val14, val15, + val16, val17, val18, val19); + } + + inline vec( + T val0, T val1, T val2, T val3, + T 
val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19, + T val20, T val21, T val22, T val23, + T val24) + { + set(val0, val1, val2, val3, + val4, val5, val6, val7, + val8, val9, val10, val11, + val12, val13, val14, val15, + val16, val17, val18, val19, + val20, val21, val22, val23, + val24); + } + + inline void clear() + { + if (N > 4) + memset(m_s, 0, sizeof(m_s)); + else + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = 0; + } + } + + template + inline vec& set(const vec& other) + { + if ((void*)this == (void*)&other) + return *this; + const uint32_t m = std::min(N, ON); + uint32_t i; + for (i = 0; i < m; i++) + m_s[i] = static_cast(other[i]); + for (; i < N; i++) + m_s[i] = 0; + return *this; + } + + inline vec& set_component(uint32_t index, T val) + { + assert(index < N); + m_s[index] = val; + return *this; + } + + inline vec& set(T val) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = val; + return *this; + } + + inline vec& set(T val0, T val1) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + for (uint32_t i = 2; i < N; i++) + m_s[i] = 0; + } + return *this; + } + + inline vec& set(T val0, T val1, T val2) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + if (N >= 3) + { + m_s[2] = val2; + + for (uint32_t i = 3; i < N; i++) + m_s[i] = 0; + } + } + return *this; + } + + inline vec& set(T val0, T val1, T val2, T val3) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + if (N >= 3) + { + m_s[2] = val2; + + if (N >= 4) + { + m_s[3] = val3; + + for (uint32_t i = 4; i < N; i++) + m_s[i] = 0; + } + } + } + return *this; + } + + inline vec& set(T val0, T val1, T val2, T val3, T val4, T val5) + { + m_s[0] = val0; + if (N >= 2) + { + m_s[1] = val1; + + if (N >= 3) + { + m_s[2] = val2; + + if (N >= 4) + { + m_s[3] = val3; + + if (N >= 5) + { + m_s[4] = val4; + + if (N >= 6) + { + m_s[5] = val5; + + for (uint32_t i = 6; i < N; i++) + m_s[i] = 0; + } + } + } + 
} + } + return *this; + } + + inline vec& set( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15) + { + m_s[0] = val0; + if (N >= 2) + m_s[1] = val1; + if (N >= 3) + m_s[2] = val2; + if (N >= 4) + m_s[3] = val3; + + if (N >= 5) + m_s[4] = val4; + if (N >= 6) + m_s[5] = val5; + if (N >= 7) + m_s[6] = val6; + if (N >= 8) + m_s[7] = val7; + + if (N >= 9) + m_s[8] = val8; + if (N >= 10) + m_s[9] = val9; + if (N >= 11) + m_s[10] = val10; + if (N >= 12) + m_s[11] = val11; + + if (N >= 13) + m_s[12] = val12; + if (N >= 14) + m_s[13] = val13; + if (N >= 15) + m_s[14] = val14; + if (N >= 16) + m_s[15] = val15; + + for (uint32_t i = 16; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline vec& set( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19) + { + m_s[0] = val0; + if (N >= 2) + m_s[1] = val1; + if (N >= 3) + m_s[2] = val2; + if (N >= 4) + m_s[3] = val3; + + if (N >= 5) + m_s[4] = val4; + if (N >= 6) + m_s[5] = val5; + if (N >= 7) + m_s[6] = val6; + if (N >= 8) + m_s[7] = val7; + + if (N >= 9) + m_s[8] = val8; + if (N >= 10) + m_s[9] = val9; + if (N >= 11) + m_s[10] = val10; + if (N >= 12) + m_s[11] = val11; + + if (N >= 13) + m_s[12] = val12; + if (N >= 14) + m_s[13] = val13; + if (N >= 15) + m_s[14] = val14; + if (N >= 16) + m_s[15] = val15; + + if (N >= 17) + m_s[16] = val16; + if (N >= 18) + m_s[17] = val17; + if (N >= 19) + m_s[18] = val18; + if (N >= 20) + m_s[19] = val19; + + for (uint32_t i = 20; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline vec& set( + T val0, T val1, T val2, T val3, + T val4, T val5, T val6, T val7, + T val8, T val9, T val10, T val11, + T val12, T val13, T val14, T val15, + T val16, T val17, T val18, T val19, + T val20, T val21, T val22, T val23, + T val24) + { + m_s[0] = val0; + if (N >= 2) + m_s[1] = val1; 
+ if (N >= 3) + m_s[2] = val2; + if (N >= 4) + m_s[3] = val3; + + if (N >= 5) + m_s[4] = val4; + if (N >= 6) + m_s[5] = val5; + if (N >= 7) + m_s[6] = val6; + if (N >= 8) + m_s[7] = val7; + + if (N >= 9) + m_s[8] = val8; + if (N >= 10) + m_s[9] = val9; + if (N >= 11) + m_s[10] = val10; + if (N >= 12) + m_s[11] = val11; + + if (N >= 13) + m_s[12] = val12; + if (N >= 14) + m_s[13] = val13; + if (N >= 15) + m_s[14] = val14; + if (N >= 16) + m_s[15] = val15; + + if (N >= 17) + m_s[16] = val16; + if (N >= 18) + m_s[17] = val17; + if (N >= 19) + m_s[18] = val18; + if (N >= 20) + m_s[19] = val19; + + if (N >= 21) + m_s[20] = val20; + if (N >= 22) + m_s[21] = val21; + if (N >= 23) + m_s[22] = val22; + if (N >= 24) + m_s[23] = val23; + + if (N >= 25) + m_s[24] = val24; + + for (uint32_t i = 25; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline vec& set(const T* pValues) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = pValues[i]; + return *this; + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i) + { + return set(static_cast(other[i])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j) + { + return set(static_cast(other[i]), static_cast(other[j])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j, uint32_t k) + { + return set(static_cast(other[i]), static_cast(other[j]), static_cast(other[k])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j, uint32_t k, uint32_t l) + { + return set(static_cast(other[i]), static_cast(other[j]), static_cast(other[k]), static_cast(other[l])); + } + + inline vec& operator=(const vec& rhs) + { + if (this != &rhs) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = rhs.m_s[i]; + } + return *this; + } + + template + inline vec& operator=(const vec& other) + { + if ((void*)this == (void*)&other) + return *this; + + uint32_t s = std::min(N, O); + + uint32_t i; + for (i = 0; i < s; i++) + m_s[i] = 
static_cast(other[i]); + + for (; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline bool operator==(const vec& rhs) const + { + for (uint32_t i = 0; i < N; i++) + if (!(m_s[i] == rhs.m_s[i])) + return false; + return true; + } + + inline bool operator<(const vec& rhs) const + { + for (uint32_t i = 0; i < N; i++) + { + if (m_s[i] < rhs.m_s[i]) + return true; + else if (!(m_s[i] == rhs.m_s[i])) + return false; + } + + return false; + } + + inline T operator[](uint32_t i) const + { + assert(i < N); + return m_s[i]; + } + + inline T& operator[](uint32_t i) + { + assert(i < N); + return m_s[i]; + } + + template + inline uint64_t get_component_as_uint() const + { + ASSUME(index < N); + if (sizeof(T) == sizeof(float)) + return *reinterpret_cast(&m_s[index]); + else + return *reinterpret_cast(&m_s[index]); + } + + inline T get_x(void) const + { + return m_s[0]; + } + inline T get_y(void) const + { + ASSUME(N >= 2); + return m_s[1]; + } + inline T get_z(void) const + { + ASSUME(N >= 3); + return m_s[2]; + } + inline T get_w(void) const + { + ASSUME(N >= 4); + return m_s[3]; + } + + inline vec get_x_vector() const + { + return broadcast<0>(); + } + inline vec get_y_vector() const + { + return broadcast<1>(); + } + inline vec get_z_vector() const + { + return broadcast<2>(); + } + inline vec get_w_vector() const + { + return broadcast<3>(); + } + + inline T get_component(uint32_t i) const + { + return (*this)[i]; + } + + inline vec& set_x(T v) + { + m_s[0] = v; + return *this; + } + inline vec& set_y(T v) + { + ASSUME(N >= 2); + m_s[1] = v; + return *this; + } + inline vec& set_z(T v) + { + ASSUME(N >= 3); + m_s[2] = v; + return *this; + } + inline vec& set_w(T v) + { + ASSUME(N >= 4); + m_s[3] = v; + return *this; + } + + inline const T* get_ptr() const + { + return reinterpret_cast(&m_s[0]); + } + inline T* get_ptr() + { + return reinterpret_cast(&m_s[0]); + } + + inline vec as_point() const + { + vec result(*this); + result[N - 1] = 1; + return result; + } + + 
inline vec as_dir() const + { + vec result(*this); + result[N - 1] = 0; + return result; + } + + inline vec<2, T> select2(uint32_t i, uint32_t j) const + { + assert((i < N) && (j < N)); + return vec<2, T>(m_s[i], m_s[j]); + } + + inline vec<3, T> select3(uint32_t i, uint32_t j, uint32_t k) const + { + assert((i < N) && (j < N) && (k < N)); + return vec<3, T>(m_s[i], m_s[j], m_s[k]); + } + + inline vec<4, T> select4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const + { + assert((i < N) && (j < N) && (k < N) && (l < N)); + return vec<4, T>(m_s[i], m_s[j], m_s[k], m_s[l]); + } + + inline bool is_dir() const + { + return m_s[N - 1] == 0; + } + inline bool is_vector() const + { + return is_dir(); + } + inline bool is_point() const + { + return m_s[N - 1] == 1; + } + + inline vec project() const + { + vec result(*this); + if (result[N - 1]) + result /= result[N - 1]; + return result; + } + + inline vec broadcast(unsigned i) const + { + return vec((*this)[i]); + } + + template + inline vec broadcast() const + { + return vec((*this)[i]); + } + + inline vec swizzle(uint32_t i, uint32_t j) const + { + return vec((*this)[i], (*this)[j]); + } + + inline vec swizzle(uint32_t i, uint32_t j, uint32_t k) const + { + return vec((*this)[i], (*this)[j], (*this)[k]); + } + + inline vec swizzle(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const + { + return vec((*this)[i], (*this)[j], (*this)[k], (*this)[l]); + } + + inline vec operator-() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = -m_s[i]; + return result; + } + + inline vec operator+() const + { + return *this; + } + + inline vec& operator+=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] += other.m_s[i]; + return *this; + } + + inline vec& operator-=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] -= other.m_s[i]; + return *this; + } + + inline vec& operator*=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] *= other.m_s[i]; + return *this; 
+ } + + inline vec& operator/=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] /= other.m_s[i]; + return *this; + } + + inline vec& operator*=(T s) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] *= s; + return *this; + } + + inline vec& operator/=(T s) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] /= s; + return *this; + } + + // component-wise multiply (not a dot product like in previous versions) + // just remarking it out because it's too ambiguous, use dot() or mul_components() instead +#if 0 + friend inline vec operator*(const vec& lhs, const vec& rhs) + { + return vec::mul_components(lhs, rhs); + } +#endif + + friend inline vec operator*(const vec& lhs, T val) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] * val; + return result; + } + + friend inline vec operator*(T val, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = val * rhs.m_s[i]; + return result; + } + + friend inline vec operator/(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] / rhs.m_s[i]; + return result; + } + + friend inline vec operator/(const vec& lhs, T val) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] / val; + return result; + } + + friend inline vec operator+(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] + rhs.m_s[i]; + return result; + } + + friend inline vec operator-(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] - rhs.m_s[i]; + return result; + } + + static inline vec<3, T> cross2(const vec& a, const vec& b) + { + ASSUME(N >= 2); + return vec<3, T>(0, 0, a[0] * b[1] - a[1] * b[0]); + } + + inline vec<3, T> cross2(const vec& b) const + { + return cross2(*this, b); + } + + static inline vec<3, T> cross3(const vec& a, const vec& b) + { + ASSUME(N >= 3); + return 
vec<3, T>(a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]); + } + + inline vec<3, T> cross3(const vec& b) const + { + return cross3(*this, b); + } + + static inline vec<3, T> cross(const vec& a, const vec& b) + { + ASSUME(N >= 2); + + if (N == 2) + return cross2(a, b); + else + return cross3(a, b); + } + + inline vec<3, T> cross(const vec& b) const + { + ASSUME(N >= 2); + return cross(*this, b); + } + + inline T dot(const vec& rhs) const + { + return dot(*this, rhs); + } + + inline vec dot_vector(const vec& rhs) const + { + return vec(dot(*this, rhs)); + } + + static inline T dot(const vec& lhs, const vec& rhs) + { + T result = lhs.m_s[0] * rhs.m_s[0]; + for (uint32_t i = 1; i < N; i++) + result += lhs.m_s[i] * rhs.m_s[i]; + return result; + } + + inline T dot2(const vec& rhs) const + { + ASSUME(N >= 2); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1]; + } + + inline T dot3(const vec& rhs) const + { + ASSUME(N >= 3); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2]; + } + + inline T dot4(const vec& rhs) const + { + ASSUME(N >= 4); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2] + m_s[3] * rhs.m_s[3]; + } + + inline T norm(void) const + { + T sum = m_s[0] * m_s[0]; + for (uint32_t i = 1; i < N; i++) + sum += m_s[i] * m_s[i]; + return sum; + } + + inline T length(void) const + { + return sqrt(norm()); + } + + inline T squared_distance(const vec& rhs) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + } + return dist2; + } + + inline T squared_distance(const vec& rhs, T early_out) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + if (dist2 > early_out) + break; + } + return dist2; + } + + inline T distance(const vec& rhs) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + } + return sqrt(dist2); + } + 
+ inline vec inverse() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = m_s[i] ? (1.0f / m_s[i]) : 0; + return result; + } + + // returns squared length (norm) + inline double normalize(const vec* pDefaultVec = NULL) + { + double n = m_s[0] * m_s[0]; + for (uint32_t i = 1; i < N; i++) + n += m_s[i] * m_s[i]; + + if (n != 0) + *this *= static_cast(1.0f / sqrt(n)); + else if (pDefaultVec) + *this = *pDefaultVec; + return n; + } + + inline double normalize3(const vec* pDefaultVec = NULL) + { + ASSUME(N >= 3); + + double n = m_s[0] * m_s[0] + m_s[1] * m_s[1] + m_s[2] * m_s[2]; + + if (n != 0) + *this *= static_cast((1.0f / sqrt(n))); + else if (pDefaultVec) + *this = *pDefaultVec; + return n; + } + + inline vec& normalize_in_place(const vec* pDefaultVec = NULL) + { + normalize(pDefaultVec); + return *this; + } + + inline vec& normalize3_in_place(const vec* pDefaultVec = NULL) + { + normalize3(pDefaultVec); + return *this; + } + + inline vec get_normalized(const vec* pDefaultVec = NULL) const + { + vec result(*this); + result.normalize(pDefaultVec); + return result; + } + + inline vec get_normalized3(const vec* pDefaultVec = NULL) const + { + vec result(*this); + result.normalize3(pDefaultVec); + return result; + } + + inline vec& clamp(T l, T h) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = static_cast(clamp(m_s[i], l, h)); + return *this; + } + + inline vec& saturate() + { + return clamp(0.0f, 1.0f); + } + + inline vec& clamp(const vec& l, const vec& h) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = static_cast(clamp(m_s[i], l[i], h[i])); + return *this; + } + + inline bool is_within_bounds(const vec& l, const vec& h) const + { + for (uint32_t i = 0; i < N; i++) + if ((m_s[i] < l[i]) || (m_s[i] > h[i])) + return false; + + return true; + } + + inline bool is_within_bounds(T l, T h) const + { + for (uint32_t i = 0; i < N; i++) + if ((m_s[i] < l) || (m_s[i] > h)) + return false; + + return true; + } + + inline uint32_t get_major_axis(void) 
const + { + T m = fabs(m_s[0]); + uint32_t r = 0; + for (uint32_t i = 1; i < N; i++) + { + const T c = fabs(m_s[i]); + if (c > m) + { + m = c; + r = i; + } + } + return r; + } + + inline uint32_t get_minor_axis(void) const + { + T m = fabs(m_s[0]); + uint32_t r = 0; + for (uint32_t i = 1; i < N; i++) + { + const T c = fabs(m_s[i]); + if (c < m) + { + m = c; + r = i; + } + } + return r; + } + + inline void get_projection_axes(uint32_t& u, uint32_t& v) const + { + const int axis = get_major_axis(); + if (m_s[axis] < 0.0f) + { + v = next_wrap(axis, N); + u = next_wrap(v, N); + } + else + { + u = next_wrap(axis, N); + v = next_wrap(u, N); + } + } + + inline T get_absolute_minimum(void) const + { + T result = fabs(m_s[0]); + for (uint32_t i = 1; i < N; i++) + result = std::min(result, fabs(m_s[i])); + return result; + } + + inline T get_absolute_maximum(void) const + { + T result = fabs(m_s[0]); + for (uint32_t i = 1; i < N; i++) + result = std::max(result, fabs(m_s[i])); + return result; + } + + inline T get_minimum(void) const + { + T result = m_s[0]; + for (uint32_t i = 1; i < N; i++) + result = std::min(result, m_s[i]); + return result; + } + + inline T get_maximum(void) const + { + T result = m_s[0]; + for (uint32_t i = 1; i < N; i++) + result = std::max(result, m_s[i]); + return result; + } + + inline vec& remove_unit_direction(const vec& dir) + { + *this -= (dot(dir) * dir); + return *this; + } + + inline vec get_remove_unit_direction(const vec& dir) const + { + return *this - (dot(dir) * dir); + } + + inline bool all_less(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] >= b.m_s[i]) + return false; + return true; + } + + inline bool all_less_equal(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] > b.m_s[i]) + return false; + return true; + } + + inline bool all_greater(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] <= b.m_s[i]) + return false; + return true; + } + + inline bool 
all_greater_equal(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] < b.m_s[i]) + return false; + return true; + } + + inline vec negate_xyz() const + { + vec ret; + + ret[0] = -m_s[0]; + if (N >= 2) + ret[1] = -m_s[1]; + if (N >= 3) + ret[2] = -m_s[2]; + + for (uint32_t i = 3; i < N; i++) + ret[i] = m_s[i]; + + return ret; + } + + inline vec& invert() + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] != 0.0f) + m_s[i] = 1.0f / m_s[i]; + return *this; + } + + inline scalar_type perp_dot(const vec& b) const + { + ASSUME(N == 2); + return m_s[0] * b.m_s[1] - m_s[1] * b.m_s[0]; + } + + inline vec perp() const + { + ASSUME(N == 2); + return vec(-m_s[1], m_s[0]); + } + + inline vec get_floor() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = floor(m_s[i]); + return result; + } + + inline vec get_ceil() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = ceil(m_s[i]); + return result; + } + + // static helper methods + + static inline vec mul_components(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = lhs.m_s[i] * rhs.m_s[i]; + return result; + } + + static inline vec mul_add_components(const vec& a, const vec& b, const vec& c) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = a.m_s[i] * b.m_s[i] + c.m_s[i]; + return result; + } + + static inline vec make_axis(uint32_t i) + { + vec result; + result.clear(); + result[i] = 1; + return result; + } + + static inline vec equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] == b[i]); + return ret; + } + + static inline vec not_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] != b[i]); + return ret; + } + + static inline vec less_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] < b[i]); + return ret; + } + + static inline vec 
less_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] <= b[i]); + return ret; + } + + static inline vec greater_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] >= b[i]); + return ret; + } + + static inline vec greater_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] > b[i]); + return ret; + } + + static inline vec component_max(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = std::max(a.m_s[i], b.m_s[i]); + return ret; + } + + static inline vec component_min(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = std::min(a.m_s[i], b.m_s[i]); + return ret; + } + + static inline vec lerp(const vec& a, const vec& b, float t) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = a.m_s[i] + (b.m_s[i] - a.m_s[i]) * t; + return ret; + } + + static inline bool equal_tol(const vec& a, const vec& b, float t) + { + for (uint32_t i = 0; i < N; i++) + if (!equal_tol(a.m_s[i], b.m_s[i], t)) + return false; + return true; + } + + inline bool equal_tol(const vec& b, float t) const + { + return equal_tol(*this, b, t); + } + +protected: + T m_s[N]; +}; + +typedef vec<1, double> vec1D; +typedef vec<2, double> vec2D; +typedef vec<3, double> vec3D; +typedef vec<4, double> vec4D; + +typedef vec<1, float> vec1F; + +typedef vec<2, float> vec2F; +typedef std::vector vec2F_array; + +typedef vec<3, float> vec3F; +typedef std::vector vec3F_array; + +typedef vec<4, float> vec4F; +typedef std::vector vec4F_array; + +typedef vec<2, uint32_t> vec2U; +typedef vec<3, uint32_t> vec3U; +typedef vec<2, int> vec2I; +typedef vec<3, int> vec3I; +typedef vec<4, int> vec4I; + +typedef vec<2, int16_t> vec2I16; +typedef vec<3, int16_t> vec3I16; + +inline vec2F rotate_point(const vec2F& p, float rad) +{ + float c = cos(rad); + float s = sin(rad); + + 
float x = p[0]; + float y = p[1]; + + return vec2F(x * c - y * s, x * s + y * c); +} + +class rect +{ +public: + inline rect() + { + } + + inline rect(eClear) + { + clear(); + } + + inline rect(eInitExpand) + { + init_expand(); + } + + // up to, but not including right/bottom + inline rect(int left, int top, int right, int bottom) + { + set(left, top, right, bottom); + } + + inline rect(const vec2I& lo, const vec2I& hi) + { + m_corner[0] = lo; + m_corner[1] = hi; + } + + inline rect(const vec2I& point) + { + m_corner[0] = point; + m_corner[1].set(point[0] + 1, point[1] + 1); + } + + inline bool operator==(const rect& r) const + { + return (m_corner[0] == r.m_corner[0]) && (m_corner[1] == r.m_corner[1]); + } + + inline bool operator<(const rect& r) const + { + for (uint32_t i = 0; i < 2; i++) + { + if (m_corner[i] < r.m_corner[i]) + return true; + else if (!(m_corner[i] == r.m_corner[i])) + return false; + } + + return false; + } + + inline void clear() + { + m_corner[0].clear(); + m_corner[1].clear(); + } + + inline void set(int left, int top, int right, int bottom) + { + m_corner[0].set(left, top); + m_corner[1].set(right, bottom); + } + + inline void set(const vec2I& lo, const vec2I& hi) + { + m_corner[0] = lo; + m_corner[1] = hi; + } + + inline void set(const vec2I& point) + { + m_corner[0] = point; + m_corner[1].set(point[0] + 1, point[1] + 1); + } + + inline uint32_t get_width() const + { + return m_corner[1][0] - m_corner[0][0]; + } + inline uint32_t get_height() const + { + return m_corner[1][1] - m_corner[0][1]; + } + + inline int get_left() const + { + return m_corner[0][0]; + } + inline int get_top() const + { + return m_corner[0][1]; + } + inline int get_right() const + { + return m_corner[1][0]; + } + inline int get_bottom() const + { + return m_corner[1][1]; + } + + inline bool is_empty() const + { + return (m_corner[1][0] <= m_corner[0][0]) || (m_corner[1][1] <= m_corner[0][1]); + } + + inline uint32_t get_dimension(uint32_t axis) const + { + return 
m_corner[1][axis] - m_corner[0][axis]; + } + inline uint32_t get_area() const + { + return get_dimension(0) * get_dimension(1); + } + + inline const vec2I& operator[](uint32_t i) const + { + assert(i < 2); + return m_corner[i]; + } + inline vec2I& operator[](uint32_t i) + { + assert(i < 2); + return m_corner[i]; + } + + inline rect& translate(int x_ofs, int y_ofs) + { + m_corner[0][0] += x_ofs; + m_corner[0][1] += y_ofs; + m_corner[1][0] += x_ofs; + m_corner[1][1] += y_ofs; + return *this; + } + + inline rect& init_expand() + { + m_corner[0].set(INT_MAX); + m_corner[1].set(INT_MIN); + return *this; + } + + inline rect& expand(int x, int y) + { + m_corner[0][0] = std::min(m_corner[0][0], x); + m_corner[0][1] = std::min(m_corner[0][1], y); + m_corner[1][0] = std::max(m_corner[1][0], x + 1); + m_corner[1][1] = std::max(m_corner[1][1], y + 1); + return *this; + } + + inline rect& expand(const rect& r) + { + m_corner[0][0] = std::min(m_corner[0][0], r[0][0]); + m_corner[0][1] = std::min(m_corner[0][1], r[0][1]); + m_corner[1][0] = std::max(m_corner[1][0], r[1][0]); + m_corner[1][1] = std::max(m_corner[1][1], r[1][1]); + return *this; + } + + inline bool touches(const rect& r) const + { + for (uint32_t i = 0; i < 2; i++) + { + if (r[1][i] <= m_corner[0][i]) + return false; + else if (r[0][i] >= m_corner[1][i]) + return false; + } + + return true; + } + + inline bool fully_within(const rect& r) const + { + for (uint32_t i = 0; i < 2; i++) + { + if (m_corner[0][i] < r[0][i]) + return false; + else if (m_corner[1][i] > r[1][i]) + return false; + } + + return true; + } + + inline bool intersect(const rect& r) + { + if (!touches(r)) + { + clear(); + return false; + } + + for (uint32_t i = 0; i < 2; i++) + { + m_corner[0][i] = std::max(m_corner[0][i], r[0][i]); + m_corner[1][i] = std::min(m_corner[1][i], r[1][i]); + } + + return true; + } + + inline bool contains(int x, int y) const + { + return (x >= m_corner[0][0]) && (x < m_corner[1][0]) && + (y >= m_corner[0][1]) && (y < 
m_corner[1][1]); + } + + inline bool contains(const vec2I& p) const + { + return contains(p[0], p[1]); + } + +private: + vec2I m_corner[2]; +}; + +inline rect make_rect(uint32_t width, uint32_t height) +{ + return rect(0, 0, width, height); +} + +struct color_quad_u8 +{ +#ifdef _MSC_VER +#pragma warning(push) +#pragma warning(disable:4201) +#endif + union + { + uint8_t m_c[4]; + struct + { + uint8_t r; + uint8_t g; + uint8_t b; + uint8_t a; + }; + }; +#ifdef _MSC_VER +#pragma warning(pop) +#endif + + inline color_quad_u8(eClear) : color_quad_u8(0, 0, 0, 0) { } + + inline color_quad_u8(uint8_t cr, uint8_t cg, uint8_t cb, uint8_t ca) + { + set(cr, cg, cb, ca); + } + + inline color_quad_u8(uint8_t cy = 0, uint8_t ca = 255) + { + set(cy, ca); + } + + inline void clear() + { + set(0, 0, 0, 0); + } + + inline color_quad_u8& set(uint8_t cy, uint8_t ca = 255) + { + m_c[0] = cy; + m_c[1] = cy; + m_c[2] = cy; + m_c[3] = ca; + return *this; + } + + inline color_quad_u8& set(uint8_t cr, uint8_t cg, uint8_t cb, uint8_t ca) + { + m_c[0] = cr; + m_c[1] = cg; + m_c[2] = cb; + m_c[3] = ca; + return *this; + } + + inline color_quad_u8& set_clamped(int cr, int cg, int cb, int ca) + { + m_c[0] = (uint8_t)clamp(cr, 0, 255); + m_c[1] = (uint8_t)clamp(cg, 0, 255); + m_c[2] = (uint8_t)clamp(cb, 0, 255); + m_c[3] = (uint8_t)clamp(ca, 0, 255); + return *this; + } + + color_quad_u8& set_alpha(int ca) { a = (uint8_t)clamp(ca, 0, 255); return *this; } + + inline uint8_t& operator[] (uint32_t i) { assert(i < 4); return m_c[i]; } + inline uint8_t operator[] (uint32_t i) const { assert(i < 4); return m_c[i]; } + + inline int get_luma() const { return (13938U * m_c[0] + 46869U * m_c[1] + 4729U * m_c[2] + 32768U) >> 16U; } // REC709 weightings + + inline bool operator== (const color_quad_u8& other) const + { + return (m_c[0] == other.m_c[0]) && (m_c[1] == other.m_c[1]) && (m_c[2] == other.m_c[2]) && (m_c[3] == other.m_c[3]); + } + + inline bool operator!= (const color_quad_u8& other) const + { + 
return !(*this == other); + } + + inline uint32_t squared_distance(const color_quad_u8& c, bool alpha = true) const + { + return square(r - c.r) + square(g - c.g) + square(b - c.b) + (alpha ? square(a - c.a) : 0); + } + + inline bool rgb_equals(const color_quad_u8& rhs) const + { + return (r == rhs.r) && (g == rhs.g) && (b == rhs.b); + } +}; +typedef std::vector color_quad_u8_vec; + +inline uint32_t color_distance(bool perceptual, const color_quad_u8& e1, const color_quad_u8& e2, bool alpha) +{ + if (perceptual) + { + const float l1 = e1.r * .2126f + e1.g * .715f + e1.b * .0722f; + const float cr1 = e1.r - l1; + const float cb1 = e1.b - l1; + + const float l2 = e2.r * .2126f + e2.g * .715f + e2.b * .0722f; + const float cr2 = e2.r - l2; + const float cb2 = e2.b - l2; + + const float dl = l1 - l2; + const float dcr = cr1 - cr2; + const float dcb = cb1 - cb2; + + uint32_t d = static_cast( + 32.0f * 4.0f * dl * dl + + 32.0f * 2.0f * (.5f / (1.0f - .2126f)) * (.5f / (1.0f - .2126f)) * dcr * dcr + + 32.0f * .25f * (.5f / (1.0f - .0722f)) * (.5f / (1.0f - .0722f)) * dcb * dcb); + + if (alpha) + { + int da = (int)e1.a - (int)e2.a; + + d += static_cast(128.0f * da * da); + } + + return d; + } + else + return e1.squared_distance(e2, alpha); +} + +extern color_quad_u8 g_white_color_u8, g_black_color_u8, g_red_color_u8, g_green_color_u8, g_blue_color_u8, g_yellow_color_u8, g_purple_color_u8, g_magenta_color_u8, g_cyan_color_u8; + +class image_u8 +{ +public: + image_u8() : + m_width(0), m_height(0), + m_clip_rect(cClear) + { + } + + image_u8(uint32_t width, uint32_t height) : + m_width(width), m_height(height), + m_clip_rect(0, 0, width, height) + { + m_pixels.resize(width * height); + } + + inline const color_quad_u8_vec& get_pixels() const { return m_pixels; } + inline color_quad_u8_vec& get_pixels() { return m_pixels; } + + inline uint32_t width() const { return m_width; } + inline uint32_t height() const { return m_height; } + inline uint32_t total_pixels() const { return 
m_width * m_height; } + + inline const rect& get_clip_rect() const { return m_clip_rect; } + + inline void set_clip_rect(const rect& r) + { + assert((r.get_left() >= 0) && (r.get_top() >= 0) && (r.get_right() <= (int)m_width) && (r.get_bottom() <= (int)m_height)); + + m_clip_rect = r; + } + + inline void clear_clip_rect() { m_clip_rect.set(0, 0, m_width, m_height); } + + inline bool is_clipped(int x, int y) const { return !m_clip_rect.contains(x, y); } + + inline rect get_bounds() const { return rect(0, 0, m_width, m_height); } + + inline color_quad_u8& operator()(uint32_t x, uint32_t y) { assert((x < m_width) && (y < m_height)); return m_pixels[x + m_width * y]; } + inline const color_quad_u8& operator()(uint32_t x, uint32_t y) const { assert((x < m_width) && (y < m_height)); return m_pixels[x + m_width * y]; } + + image_u8& clear() + { + m_width = m_height = 0; + m_clip_rect.clear(); + m_pixels.clear(); + return *this; + } + + image_u8& init(uint32_t width, uint32_t height) + { + clear(); + + m_width = width; + m_height = height; + m_clip_rect.set(0, 0, width, height); + m_pixels.resize(width * height); + return *this; + } + + image_u8& set_all(const color_quad_u8& p) + { + for (uint32_t i = 0; i < m_pixels.size(); i++) + m_pixels[i] = p; + return *this; + } + + inline const color_quad_u8& get_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + inline color_quad_u8& get_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + + inline image_u8& set_pixel_clipped(int x, int y, const color_quad_u8& c) + { + if (!is_clipped(x, y)) + (*this)(x, y) = c; + return *this; + } + + inline image_u8& fill_box(int x, int y, int w, int h, const color_quad_u8& c) + { + for (int y_ofs = 0; y_ofs < h; y_ofs++) + for (int x_ofs = 0; x_ofs < w; x_ofs++) + set_pixel_clipped(x + x_ofs, y + y_ofs, c); + return *this; + } + + void invert_box(int inX, int inY, int inW, int inH) + { + for (int y = 
0; y < inH; y++) + { + const uint32_t yy = inY + y; + + for (int x = 0; x < inW; x++) + { + const uint32_t xx = inX + x; + + if (is_clipped(xx, yy)) + continue; + + color_quad_u8 c((*this)(xx, yy)); + + c.r = 255 - c.r; + c.g = 255 - c.g; + c.b = 255 - c.b; + + set_pixel_clipped(xx, yy, c); + } + } + } + + image_u8& crop_dup_borders(uint32_t w, uint32_t h) + { + const uint32_t orig_w = m_width, orig_h = m_height; + + crop(w, h); + + if (orig_w && orig_h) + { + if (m_width > orig_w) + { + for (uint32_t x = orig_w; x < m_width; x++) + for (uint32_t y = 0; y < m_height; y++) + set_pixel_clipped(x, y, get_clamped(std::min(x, orig_w - 1U), std::min(y, orig_h - 1U))); + } + + if (m_height > orig_h) + { + for (uint32_t y = orig_h; y < m_height; y++) + for (uint32_t x = 0; x < m_width; x++) + set_pixel_clipped(x, y, get_clamped(std::min(x, orig_w - 1U), std::min(y, orig_h - 1U))); + } + } + return *this; + } + + image_u8& crop(uint32_t new_width, uint32_t new_height) + { + if ((m_width == new_width) && (m_height == new_height)) + return *this; + + image_u8 new_image(new_width, new_height); + + const uint32_t w = std::min(m_width, new_width); + const uint32_t h = std::min(m_height, new_height); + + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + new_image(x, y) = (*this)(x, y); + + return swap(new_image); + } + + image_u8& swap(image_u8& other) + { + std::swap(m_width, other.m_width); + std::swap(m_height, other.m_height); + std::swap(m_pixels, other.m_pixels); + std::swap(m_clip_rect, other.m_clip_rect); + return *this; + } + + // No clipping + inline void get_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, color_quad_u8* pPixels) const + { + assert((bx * width + width) <= m_width); + assert((by * height + height) <= m_height); + + for (uint32_t y = 0; y < height; y++) + memcpy(pPixels + y * width, &(*this)(bx * width, by * height + y), width * sizeof(color_quad_u8)); + } + + inline void get_block_clamped(uint32_t bx, uint32_t by, 
uint32_t width, uint32_t height, color_quad_u8* pPixels) const + { + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) + pPixels[x + y * width] = get_clamped(bx * width + x, by * height + y); + } + + // No clipping + inline void set_block(uint32_t bx, uint32_t by, uint32_t width, uint32_t height, const color_quad_u8* pPixels) + { + assert((bx * width + width) <= m_width); + assert((by * height + height) <= m_height); + + for (uint32_t y = 0; y < height; y++) + memcpy(&(*this)(bx * width, by * height + y), pPixels + y * width, width * sizeof(color_quad_u8)); + } + + image_u8& swizzle(uint32_t r, uint32_t g, uint32_t b, uint32_t a) + { + assert((r | g | b | a) <= 3); + for (uint32_t y = 0; y < m_height; y++) + { + for (uint32_t x = 0; x < m_width; x++) + { + color_quad_u8 tmp((*this)(x, y)); + (*this)(x, y).set(tmp[r], tmp[g], tmp[b], tmp[a]); + } + } + + return *this; + } + + struct pixel_coord + { + uint16_t m_x, m_y; + pixel_coord() { } + pixel_coord(uint32_t x, uint32_t y) : m_x((uint16_t)x), m_y((uint16_t)y) { } + }; + + uint32_t flood_fill(int x, int y, const color_quad_u8& c, const color_quad_u8& b, std::vector* pSet_pixels = nullptr); + + void draw_line(int xs, int ys, int xe, int ye, const color_quad_u8& color); + + inline void set_pixel_clipped_alphablend(int x, int y, const color_quad_u8& c) + { + if (is_clipped(x, y)) + return; + + color_quad_u8 ct(m_pixels[x + y * m_width]); + + ct.r = static_cast(ct.r + ((c.r - ct.r) * c.a) / 255); + ct.g = static_cast(ct.g + ((c.g - ct.g) * c.a) / 255); + ct.b = static_cast(ct.b + ((c.b - ct.b) * c.a) / 255); + + m_pixels[x + y * m_width] = ct; + } + +private: + color_quad_u8_vec m_pixels; + uint32_t m_width, m_height; + rect m_clip_rect; + + struct fill_segment + { + int16_t m_y, m_xl, m_xr, m_dy; + + fill_segment(int y, int xl, int xr, int dy) : + m_y((int16_t)y), m_xl((int16_t)xl), m_xr((int16_t)xr), m_dy((int16_t)dy) + { + } + }; + + inline bool flood_fill_is_inside(int x, int y, const 
color_quad_u8& b) const + { + if (is_clipped(x, y)) + return false; + + return (*this)(x, y) == b; + } + + void rasterize_line(int xs, int ys, int xe, int ye, int pred, int inc_dec, int e, int e_inc, int e_no_inc, const color_quad_u8& color); + + void draw_aaline_pixel(int x, int y, int a, color_quad_u8 color) + { + color.a = static_cast(255 - a); + set_pixel_clipped_alphablend(x, y, color); + } +}; + +//bool load_png(const char* pFilename, image_u8& img); + +//bool save_png(const char* pFilename, const image_u8& img, bool save_alpha); + +class image_metrics +{ +public: + double m_max, m_mean, m_mean_squared, m_root_mean_squared, m_peak_snr; + + image_metrics() + { + clear(); + } + + void clear() + { + memset(this, 0, sizeof(*this)); + } + + void compute(const image_u8& a, const image_u8& b, uint32_t first_channel, uint32_t num_channels) + { + const bool average_component_error = true; + + const uint32_t width = std::min(a.width(), b.width()); + const uint32_t height = std::min(a.height(), b.height()); + + assert((first_channel < 4U) && (first_channel + num_channels <= 4U)); + + // Histogram approach originally due to Charles Bloom. 
+ double hist[256]; + memset(hist, 0, sizeof(hist)); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_quad_u8& ca = a(x, y); + const color_quad_u8& cb = b(x, y); + + if (!num_channels) + hist[iabs(ca.get_luma() - cb.get_luma())]++; + else + { + for (uint32_t c = 0; c < num_channels; c++) + hist[iabs(ca[first_channel + c] - cb[first_channel + c])]++; + } + } + } + + m_max = 0; + double sum = 0.0f, sum2 = 0.0f; + for (uint32_t i = 0; i < 256; i++) + { + if (!hist[i]) + continue; + + m_max = std::max(m_max, i); + + double x = i * hist[i]; + + sum += x; + sum2 += i * x; + } + + // See http://richg42.blogspot.com/2016/09/how-to-compute-psnr-from-old-berkeley.html + double total_values = width * height; + + if (average_component_error) + total_values *= clamp(num_channels, 1, 4); + + m_mean = clamp(sum / total_values, 0.0f, 255.0f); + m_mean_squared = clamp(sum2 / total_values, 0.0f, 255.0f * 255.0f); + + m_root_mean_squared = sqrt(m_mean_squared); + + if (!m_root_mean_squared) + m_peak_snr = 100.0f; + else + m_peak_snr = clamp(log10(255.0f / m_root_mean_squared) * 20.0f, 0.0f, 100.0f); + } +}; + +class imagef +{ +public: + imagef() : + m_width(0), m_height(0), m_pitch(0) + { + } + + imagef(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX) : + m_width(0), m_height(0), m_pitch(0) + { + resize(w, h, p); + } + + imagef(const imagef& other) : + m_width(0), m_height(0), m_pitch(0) + { + *this = other; + } + + imagef& swap(imagef& other) + { + std::swap(m_width, other.m_width); + std::swap(m_height, other.m_height); + std::swap(m_pitch, other.m_pitch); + m_pixels.swap(other.m_pixels); + return *this; + } + + imagef& operator= (const imagef& rhs) + { + if (this != &rhs) + { + m_width = rhs.m_width; + m_height = rhs.m_height; + m_pitch = rhs.m_pitch; + m_pixels = rhs.m_pixels; + } + return *this; + } + + imagef& clear() + { + m_width = 0; + m_height = 0; + m_pitch = 0; + m_pixels.resize(0); + return *this; + } + + imagef& 
set(const image_u8& src, const vec4F& scale = vec4F(1), const vec4F& bias = vec4F(0)) + { + const uint32_t width = src.width(); + const uint32_t height = src.height(); + + resize(width, height); + + for (int y = 0; y < (int)height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const color_quad_u8& src_pixel = src(x, y); + (*this)(x, y).set((float)src_pixel.r * scale[0] + bias[0], (float)src_pixel.g * scale[1] + bias[1], (float)src_pixel.b * scale[2] + bias[2], (float)src_pixel.a * scale[3] + bias[3]); + } + } + + return *this; + } + + imagef& resize(const imagef& other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0, 0, 0, 1)) + { + return resize(other.get_width(), other.get_height(), p, background); + } + + imagef& resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0, 0, 0, 1)) + { + return crop(w, h, p, background); + } + + imagef& set_all(const vec4F& c) + { + for (uint32_t i = 0; i < m_pixels.size(); i++) + m_pixels[i] = c; + return *this; + } + + imagef& fill_box(uint32_t x, uint32_t y, uint32_t w, uint32_t h, const vec4F& c) + { + for (uint32_t iy = 0; iy < h; iy++) + for (uint32_t ix = 0; ix < w; ix++) + set_pixel_clipped(x + ix, y + iy, c); + return *this; + } + + imagef& crop(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0, 0, 0, 1)) + { + if (p == UINT32_MAX) + p = w; + + if ((w == m_width) && (m_height == h) && (m_pitch == p)) + return *this; + + if ((!w) || (!h) || (!p)) + { + clear(); + return *this; + } + + vec4F_array cur_state; + cur_state.swap(m_pixels); + + m_pixels.resize(p * h); + + for (uint32_t y = 0; y < h; y++) + { + for (uint32_t x = 0; x < w; x++) + { + if ((x < m_width) && (y < m_height)) + m_pixels[x + y * p] = cur_state[x + y * m_pitch]; + else + m_pixels[x + y * p] = background; + } + } + + m_width = w; + m_height = h; + m_pitch = p; + + return *this; + } + + inline const vec4F& operator() (uint32_t x, uint32_t y) const { assert(x < m_width&& y 
< m_height); return m_pixels[x + y * m_pitch]; } + inline vec4F& operator() (uint32_t x, uint32_t y) { assert(x < m_width&& y < m_height); return m_pixels[x + y * m_pitch]; } + + inline const vec4F& get_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + inline vec4F& get_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + + inline const vec4F& get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) const + { + x = wrap_u ? posmod(x, m_width) : clamp(x, 0, m_width - 1); + y = wrap_v ? posmod(y, m_height) : clamp(y, 0, m_height - 1); + return m_pixels[x + y * m_pitch]; + } + + inline vec4F& get_clamped_or_wrapped(int x, int y, bool wrap_u, bool wrap_v) + { + x = wrap_u ? posmod(x, m_width) : clamp(x, 0, m_width - 1); + y = wrap_v ? posmod(y, m_height) : clamp(y, 0, m_height - 1); + return m_pixels[x + y * m_pitch]; + } + + inline imagef& set_pixel_clipped(int x, int y, const vec4F& c) + { + if ((static_cast(x) < m_width) && (static_cast(y) < m_height)) + (*this)(x, y) = c; + return *this; + } + + // Very straightforward blit with full clipping. Not fast, but it works. 
+ imagef& blit(const imagef& src, int src_x, int src_y, int src_w, int src_h, int dst_x, int dst_y) + { + for (int y = 0; y < src_h; y++) + { + const int sy = src_y + y; + if (sy < 0) + continue; + else if (sy >= (int)src.get_height()) + break; + + for (int x = 0; x < src_w; x++) + { + const int sx = src_x + x; + if (sx < 0) + continue; + else if (sx >= (int)src.get_height()) + break; + + set_pixel_clipped(dst_x + x, dst_y + y, src(sx, sy)); + } + } + + return *this; + } + + const imagef& extract_block_clamped(vec4F* pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const + { + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + *pDst++ = get_clamped(src_x + x, src_y + y); + return *this; + } + + imagef& set_block_clipped(const vec4F* pSrc, uint32_t dst_x, uint32_t dst_y, uint32_t w, uint32_t h) + { + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + set_pixel_clipped(dst_x + x, dst_y + y, *pSrc++); + return *this; + } + + inline uint32_t get_width() const { return m_width; } + inline uint32_t get_height() const { return m_height; } + inline uint32_t get_pitch() const { return m_pitch; } + inline uint32_t get_total_pixels() const { return m_width * m_height; } + + inline uint32_t get_block_width(uint32_t w) const { return (m_width + (w - 1)) / w; } + inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; } + inline uint32_t get_total_blocks(uint32_t w, uint32_t h) const { return get_block_width(w) * get_block_height(h); } + + inline const vec4F_array& get_pixels() const { return m_pixels; } + inline vec4F_array& get_pixels() { return m_pixels; } + + inline const vec4F* get_ptr() const { return &m_pixels[0]; } + inline vec4F* get_ptr() { return &m_pixels[0]; } + +private: + uint32_t m_width, m_height, m_pitch; // all in pixels + vec4F_array m_pixels; +}; + +enum +{ + cComputeGaussianFlagNormalize = 1, + cComputeGaussianFlagPrint = 2, + cComputeGaussianFlagNormalizeCenterToOne = 4 +}; + 
+// size_x/y should be odd +void compute_gaussian_kernel(float* pDst, int size_x, int size_y, float sigma_sqr, uint32_t flags); + +void gaussian_filter(imagef& dst, const imagef& orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping = false, uint32_t width_divisor = 1, uint32_t height_divisor = 1); + +vec4F compute_ssim(const imagef& a, const imagef& b); + +vec4F compute_ssim(const image_u8& a, const image_u8& b, bool luma); + +struct block8 +{ + uint64_t m_vals[1]; +}; + +typedef std::vector block8_vec; + +struct block16 +{ + uint64_t m_vals[2]; +}; + +typedef std::vector block16_vec; + +bool save_dds(const char* pFilename, uint32_t width, uint32_t height, const void* pBlocks, uint32_t pixel_format_bpp, DXGI_FORMAT dxgi_format, bool srgb, bool force_dx10_header); + +void strip_extension(std::string& s); +void strip_path(std::string& s); + +uint32_t hash_hsieh(const uint8_t* pBuf, size_t len); + +// https://www.johndcook.com/blog/standard_deviation/ +// This class is for small numbers of integers, so precision shouldn't be an issue. +class tracked_stat +{ +public: + tracked_stat() { clear(); } + + void clear() { m_num = 0; m_total = 0; m_total2 = 0; } + + void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; } + + tracked_stat& operator += (uint32_t val) { update(val); return *this; } + + uint32_t get_number_of_values() const { return m_num; } + uint64_t get_total() const { return m_total; } + uint64_t get_total2() const { return m_total2; } + + float get_mean() const { return m_num ? (float)m_total / m_num : 0.0f; }; + + float get_variance() const { return m_num ? ((float)(m_num * m_total2 - m_total * m_total)) / (m_num * m_num) : 0.0f; } + float get_std_dev() const { return m_num ? sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } + + float get_sample_variance() const { return (m_num > 1) ? 
((float)(m_num * m_total2 - m_total * m_total)) / (m_num * (m_num - 1)) : 0.0f; } + float get_sample_std_dev() const { return (m_num > 1) ? sqrtf(get_sample_variance()) : 0.0f; } + +private: + uint32_t m_num; + uint64_t m_total; + uint64_t m_total2; +}; + +inline float compute_covariance(const float* pA, const float* pB, const tracked_stat& a, const tracked_stat& b, bool sample) +{ + const uint32_t n = a.get_number_of_values(); + assert(n == b.get_number_of_values()); + + if (!n) + { + assert(0); + return 0.0f; + } + if ((sample) && (n == 1)) + { + assert(0); + return 0; + } + + const float mean_a = a.get_mean(); + const float mean_b = b.get_mean(); + + float total = 0.0f; + for (uint32_t i = 0; i < n; i++) + total += (pA[i] - mean_a) * (pB[i] - mean_b); + + return total / (sample ? (n - 1) : n); +} + +inline float compute_correlation_coefficient(const float* pA, const float* pB, const tracked_stat& a, const tracked_stat& b, float c, bool sample) +{ + if (!a.get_number_of_values()) + return 1.0f; + + float covar = compute_covariance(pA, pB, a, b, sample); + float std_dev_a = sample ? a.get_sample_std_dev() : a.get_std_dev(); + float std_dev_b = sample ? 
b.get_sample_std_dev() : b.get_std_dev(); + float denom = std_dev_a * std_dev_b + c; + + if (denom < .0000125f) + return 1.0f; + + float result = (covar + c) / denom; + + return clamp(result, -1.0f, 1.0f); +} + +float compute_block_max_std_dev(const color_quad_u8* pPixels, uint32_t block_width, uint32_t block_height, uint32_t num_comps); + +class rand +{ + std::mt19937 m_mt; + +public: + rand() { } + + rand(uint32_t s) { seed(s); } + void seed(uint32_t s) { m_mt.seed(s); } + + // between [l,h] + int irand(int l, int h) { std::uniform_int_distribution d(l, h); return d(m_mt); } + + uint32_t urand32() { return static_cast(irand(INT32_MIN, INT32_MAX)); } + + bool bit() { return irand(0, 1) == 1; } + + uint8_t byte() { return static_cast(urand32()); } + + // between [l,h) + float frand(float l, float h) { std::uniform_real_distribution d(l, h); return d(m_mt); } + + float gaussian(float mean, float stddev) { std::normal_distribution d(mean, stddev); return d(m_mt); } +}; + +bool save_astc_file(const char* pFilename, block16_vec& blocks, uint32_t width, uint32_t height, uint32_t block_width, uint32_t block_height); +bool load_astc_file(const char* pFilename, block16_vec& blocks, uint32_t& width, uint32_t& height, uint32_t& block_width, uint32_t& block_height); + +class value_stats +{ +public: + value_stats() + { + clear(); + } + + void clear() + { + m_sum = 0; + m_sum2 = 0; + m_num = 0; + m_min = 1e+39; + m_max = -1e+39; + m_vals.clear(); + } + + void add(double val) + { + m_sum += val; + m_sum2 += val * val; + + m_num++; + + m_min = std::min(m_min, val); + m_max = std::max(m_max, val); + + m_vals.push_back(val); + } + + void add(int val) + { + add(static_cast(val)); + } + + void add(uint32_t val) + { + add(static_cast(val)); + } + + void add(int64_t val) + { + add(static_cast(val)); + } + + void add(uint64_t val) + { + add(static_cast(val)); + } + + void print(const char* pPrefix = "") + { + if (!m_vals.size()) + printf("%s: Empty\n", pPrefix); + else + printf("%s: 
Samples: %llu, Total: %f, Avg: %f, Std Dev: %f, Min: %f, Max: %f, Mean: %f\n", + pPrefix, (unsigned long long)get_num(), get_total(), get_average(), get_std_dev(), get_min(), get_max(), get_mean()); + } + + double get_total() const + { + return m_sum; + } + + double get_average() const + { + return m_num ? (m_sum / m_num) : 0.0f; + } + + double get_min() const + { + return m_min; + } + + double get_max() const + { + return m_max; + } + + uint64_t get_num() const + { + return m_num; + } + + double get_val(uint32_t index) const + { + return m_vals[index]; + } + + // Returns population standard deviation + double get_std_dev() const + { + if (!m_num) + return 0.0f; + + // TODO: FP precision + return sqrt((m_sum2 - ((m_sum * m_sum) / m_num)) / m_num); + } + + double get_mean() const + { + if (!m_num) + return 0.0f; + + std::vector sorted_vals(m_vals); + std::sort(sorted_vals.begin(), sorted_vals.end()); + + return sorted_vals[sorted_vals.size() / 2]; + } + +private: + double m_sum; + double m_sum2; + + uint64_t m_num; + + double m_min; + double m_max; + + mutable std::vector m_vals; +}; + +//uint32_t get_deflate_size(const void* pData, size_t data_size); + +bool read_file(const char* pFilename, uint8_vec& buf); + +} // namespace utils + +#ifdef _MSC_VER +#pragma warning (pop) +#endif \ No newline at end of file diff --git a/python/README.md b/python/README.md new file mode 100644 index 0000000..e97e269 --- /dev/null +++ b/python/README.md @@ -0,0 +1,59 @@ +Python support is still new and coming online, but is entirely functional. +The library's pure C (WASM friendly) API's are completely exposed to Python. + +The Python integration first tries to use native .so's in the basisu_py +directory. If they don't exist, it tries the slower and single threaded WASM +fallbacks under basisu_py/wasm, which requires wasmtime for Python to be +installed. Some tests require an input.ktx2 or test.ktx2 to be in the current +directory. 
+ +Building: + +From the repo's root directory, build the native shared libraries (.so files): + +``` +mkdir build_python +cd build_python +cmake -DBASISU_BUILD_PYTHON=ON .. +make +``` + +Build the WASM modules (see README_WASI.md file for instructions on how to +install the WASI SDK, which is required): + +``` +mkdir build_wasm_st +cd build_wasm_st +cmake .. -DCMAKE_TOOLCHAIN_FILE=$WASI_SDK_PATH/share/cmake/wasi-sdk.cmake -DCMAKE_BUILD_TYPE=Release -DBASISU_WASM_THREADING=OFF +make +``` + +Running tests: + +The tests assume the current directory is "python". Run them like this: + +Higher-level tests: + +python3 -m tests.test_backend_loading +python3 -m tests.test_basic_wasm_selection +python3 -m tests.test_basic_backend_selection +python3 -m tests.test_basic_decode +python3 -m tests.test_basic_transcode +python3 -m tests.test_compress_swirl +python3 -m tests.test_compress_swirl_hdr +python3 -m tests.test_transcoder_astc +python3 -m tests.test_transcoder_backend_loading +python3 -m tests.test_transcoder_end_to_end +python3 -m tests.test_transcoder_end_to_end_hdr +python3 -m tests.test_transcoder_helpers + +Low-level tests (used while bringing up the codec): + +python3 -m lowlevel_test_native.basic_test +python3 -m lowlevel_test_native.test_transcoder_basic +python3 -m lowlevel_test_native.example_capi_python + +python3 -m lowlevel_test_wasm.basic_test +python3 -m lowlevel_test_wasm.compress_test +python3 -m lowlevel_test_wasm.compress_test_float + diff --git a/python/README_win.md b/python/README_win.md new file mode 100644 index 0000000..74a312b --- /dev/null +++ b/python/README_win.md @@ -0,0 +1,85 @@ +Windows Native Python Build Instructions +======================================== + +This project uses pybind11 to build Python .pyd extension modules on Windows. +Because a Windows machine can have multiple Python versions installed, and pybind11 currently only +supports up to Python 3.12, you must follow these steps exactly. 
+ +Requirements +------------ +- Visual Studio Developer Command Prompt (VS C++ Build Tools installed) +- Python 3.12 (pybind11 does NOT support 3.13+ at the time of writing) +- pybind11 installed into Python 3.12 + +Check installed Python versions: + py -0 + +If Python 3.12 is missing: + winget install Python.Python.3.12 + +Install pybind11 for Python 3.12: + py -3.12 -m pip install pybind11 + +IMPORTANT: +You must build AND run with the same Python interpreter version (3.12). + +Building the .pyd Modules +------------------------- +Open the "Developer Command Prompt for Visual Studio". + +From the project root: + + mkdir build_python_win + cd build_python_win + +Run CMake using the exact path to python.exe for Python 3.12: + + cmake -G "Visual Studio 17 2022" -A x64 -DBASISU_BUILD_PYTHON=ON -DBASISU_BUILD_WASM=OFF -DPYTHON_EXECUTABLE="C:\Users\<username>\AppData\Local\Programs\Python\Python312\python.exe" .. + +Build: + + cmake --build . --config Release + +Output files will be created in: + + python/basisu_py/basisu_python.pyd + python/basisu_py/basisu_transcoder_python.pyd + +Running the Modules +------------------- +Always run using Python 3.12: + + py -3.12 + +Inside Python: + + import basisu_py + print("Modules loaded OK.") + +While in the "python" directory: + + py -3.12 -m tests.test_backend_loading + +WASM Backend (Optional) +----------------------- +Install wasmtime: + + py -3.12 -m pip install wasmtime + +Ensure these files exist: + + python/basisu_py/wasm/*.wasm + +Common Problems +--------------- +1. "pybind11 not found" + -> Installed into wrong Python version. Use: + py -3.12 -m pip install pybind11 + +2. "Python config failure" + -> You are using Python 3.13 or 3.14. Must use Python 3.12. + +3. 
Modules not loading + -> You must run them with the same interpreter used to build them: + py -3.12 + diff --git a/python/astc_writer.py b/python/astc_writer.py new file mode 100644 index 0000000..5752620 --- /dev/null +++ b/python/astc_writer.py @@ -0,0 +1,83 @@ +# astc_writer.py +# +# Minimal ASTC writer that mirrors the C/C++ write_astc_file() logic from example_capi.c. +# Writes a valid single-slice 2D ASTC texture file (no array slices, no 3D, no mips). +# +# Usage: +# from astc_writer import write_astc_file +# write_astc_file("output.astc", blocks, block_width, block_height, width, height) +# +# "blocks" must be a bytes-like object containing the full ASTC block data +# using 16 bytes per block (standard ASTC block size). + + +def write_astc_file( + filename: str, + blocks: bytes, + block_width: int, + block_height: int, + width: int, + height: int +) -> None: + """ + Write an ASTC file to disk. + + Parameters: + filename : Output filename ("something.astc") + blocks : Bytes-like object containing ASTC blocks (16 bytes per block) + block_width : ASTC block width (e.g. 4-12) + block_height : ASTC block height (e.g. 4-12) + width : Original image width in pixels + height : Original image height in pixels + + Notes: + - ASTC files use 2D blocks; depth is always 1. + - Block layout goes row-major: (num_blocks_y num_blocks_x) blocks. + - No mipmaps are stored in this format. 
+ """ + + # Validate block dimensions + if block_width < 4 or block_width > 12: + raise ValueError(f"ASTC block_width {block_width} out of range (412)") + if block_height < 4 or block_height > 12: + raise ValueError(f"ASTC block_height {block_height} out of range (412)") + + # Compute block grid + num_blocks_x = (width + block_width - 1) // block_width + num_blocks_y = (height + block_height - 1) // block_height + total_blocks = num_blocks_x * num_blocks_y + expected_size = total_blocks * 16 # 16 bytes per ASTC block (always) + + if len(blocks) != expected_size: + raise ValueError( + f"ASTC block buffer incorrect size: expected {expected_size}, got {len(blocks)}" + ) + + # Write file + with open(filename, "wb") as f: + # ASTC magic number (0x13AB A15C) + f.write(bytes([0x13, 0xAB, 0xA1, 0x5C])) + + # Block dims: x, y, z (z=1) + f.write(bytes([ + block_width & 0xFF, + block_height & 0xFF, + 1 + ])) + + # ASTC stores width/height/depth as 24-bit LE + def write_24bit_le(v: int): + f.write(bytes([ + v & 0xFF, + (v >> 8) & 0xFF, + (v >> 16) & 0xFF + ])) + + write_24bit_le(width) + write_24bit_le(height) + write_24bit_le(1) # depth + + # Write actual block payload + f.write(blocks) + + print(f"[ASTC Writer] Wrote: {filename} ({width}x{height}, {block_width}x{block_height} blocks)") diff --git a/python/basisu_encoder_pybind11.cpp b/python/basisu_encoder_pybind11.cpp new file mode 100644 index 0000000..c6a6988 --- /dev/null +++ b/python/basisu_encoder_pybind11.cpp @@ -0,0 +1,109 @@ +// File: basisu_encoder_pybind11.cpp +// pybind11 native bindings for the compressor's pure C API basisu_wasm_api.h +#include +#include +#include + +// include the basisu compression plain C API +#include "../encoder/basisu_wasm_api.h" + +namespace py = pybind11; + +// Convert wasm_bool_t (uint32_t) ? 
Python bool +static inline bool to_bool(uint32_t v) { return v != 0; } + +PYBIND11_MODULE(basisu_python, m) { + m.doc() = "Native Basis Universal encoder (pybind11 binding over basisu_wasm_api)"; + + // + // Initialization / Version + // + m.def("init", &bu_init, "Initialize the BasisU codec library"); + m.def("get_version", &bu_get_version, "Return BASISU_LIB_VERSION"); + + // + // Memory allocation helpers + // + m.def("alloc", &bu_alloc, + "Allocate memory inside native heap and return pointer as uint64"); + m.def("free", &bu_free, + "Free previously allocated pointer"); + + // + // Compression params handles + // + m.def("new_params", &bu_new_comp_params, + "Create a new comp_params struct inside native heap"); + m.def("delete_params", + [](uint64_t h) { return to_bool(bu_delete_comp_params(h)); }, + "Destroy a comp_params struct"); + + m.def("params_clear", + [](uint64_t h) { return to_bool(bu_comp_params_clear(h)); }, + "Clear comp_params struct"); + + // + // Image upload API + // + m.def("set_image_rgba32", + [](uint64_t params, uint32_t index, + uint64_t img_ptr, uint32_t w, uint32_t h, uint32_t pitch) { + return to_bool(bu_comp_params_set_image_rgba32( + params, index, img_ptr, w, h, pitch)); + }, + "Set 8-bit RGBA32 image into parameters"); + + m.def("set_image_float_rgba", + [](uint64_t params, uint32_t index, + uint64_t img_ptr, uint32_t w, uint32_t h, uint32_t pitch) { + return to_bool(bu_comp_params_set_image_float_rgba( + params, index, img_ptr, w, h, pitch)); + }, + "Set float32 RGBA image into parameters"); + + // + // Compression + // + m.def("compress", + [](uint64_t params, + int tex_format, + int quality, + int effort, + uint64_t flags, + float rdo_quality) + { + return to_bool(bu_compress_texture( + params, tex_format, quality, effort, flags, rdo_quality)); + }, + py::arg("params"), + py::arg("tex_format"), + py::arg("quality"), + py::arg("effort"), + py::arg("flags"), + py::arg("rdo_quality") = 0.0f + ); + + // + // Output blob access + // + 
m.def("get_comp_data_size", + &bu_comp_params_get_comp_data_size, + "Return size (bytes) of compressed output"); + m.def("get_comp_data_ofs", + &bu_comp_params_get_comp_data_ofs, + "Return pointer (uint64) to compressed output buffer"); + + // Memory read/write + m.def("read_memory", + [](uint64_t ptr, uint32_t size) { + return py::bytes((const char*)ptr, size); + }, + "Read `size` bytes starting at native memory address `ptr`"); + + m.def("write_memory", + [](uint64_t dest_ptr, py::buffer src) { + py::buffer_info info = src.request(); + memcpy((void*)dest_ptr, info.ptr, info.size * info.itemsize); + }, + "Write bytes/buffer-like object into native memory at address `ptr`"); +} diff --git a/python/basisu_py/MANIFEST.in b/python/basisu_py/MANIFEST.in new file mode 100644 index 0000000..993a897 --- /dev/null +++ b/python/basisu_py/MANIFEST.in @@ -0,0 +1,2 @@ +recursive-include basisu_py *.py *.so *.wasm +include README.md diff --git a/python/basisu_py/READMD.md b/python/basisu_py/READMD.md new file mode 100644 index 0000000..ca5adf8 --- /dev/null +++ b/python/basisu_py/READMD.md @@ -0,0 +1,5 @@ +This is the Python support directory for the Basis Universal KTX2 compressor +and transcoder modules. + +License: Apache 2.0 + diff --git a/python/basisu_py/__init__.py b/python/basisu_py/__init__.py new file mode 100644 index 0000000..2186700 --- /dev/null +++ b/python/basisu_py/__init__.py @@ -0,0 +1,35 @@ +""" +basisu_py +========= +Python bindings for the Basis Universal encoder and transcoder, with +automatic fallback between native C++ extensions and WASM modules. 
+ +Main entry points: + - Transcoder : basisu_py.transcoder.Transcoder + - Encoder : basisu_py.codec.Encoder + - constants : basisu_py.constants +""" + +from .codec import Encoder +from .transcoder import Transcoder, KTX2Handle +from .constants import ( + BasisTexFormat, + BasisQuality, + BasisEffort, + BasisFlags, + TranscoderTextureFormat, + TranscodeDecodeFlags, +) + +# What the package publicly exposes +__all__ = [ + "Encoder", + "Transcoder", + "KTX2Handle", + "BasisTexFormat", + "BasisQuality", + "BasisEffort", + "BasisFlags", + "TranscoderTextureFormat", + "TranscodeDecodeFlags", +] diff --git a/python/basisu_py/basisu_python.cpython-312-x86_64-linux-gnu.so b/python/basisu_py/basisu_python.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000..efe98fb Binary files /dev/null and b/python/basisu_py/basisu_python.cpython-312-x86_64-linux-gnu.so differ diff --git a/python/basisu_py/basisu_python.pyd b/python/basisu_py/basisu_python.pyd new file mode 100644 index 0000000..581eca9 Binary files /dev/null and b/python/basisu_py/basisu_python.pyd differ diff --git a/python/basisu_py/basisu_transcoder_python.cpython-312-x86_64-linux-gnu.so b/python/basisu_py/basisu_transcoder_python.cpython-312-x86_64-linux-gnu.so new file mode 100644 index 0000000..0a695e9 Binary files /dev/null and b/python/basisu_py/basisu_transcoder_python.cpython-312-x86_64-linux-gnu.so differ diff --git a/python/basisu_py/basisu_transcoder_python.pyd b/python/basisu_py/basisu_transcoder_python.pyd new file mode 100644 index 0000000..2dcd214 Binary files /dev/null and b/python/basisu_py/basisu_transcoder_python.pyd differ diff --git a/python/basisu_py/codec.py b/python/basisu_py/codec.py new file mode 100644 index 0000000..e12b22c --- /dev/null +++ b/python/basisu_py/codec.py @@ -0,0 +1,222 @@ +# basisu_py/codec.py + +import importlib +import numpy as np +from PIL import Image +import ctypes + +from .constants import BasisTexFormat, BasisQuality, BasisEffort, BasisFlags +from pathlib 
import Path + +class EncoderBackend: + NATIVE = "native" + WASM = "wasm" + AUTO = "auto" + +class Encoder: + + def __init__(self, backend=EncoderBackend.AUTO): + self.backend = backend + self._native = None + self._wasm = None + self.backend_name = None + + # ------------------------------------------------------------------ + # Try native first (AUTO or NATIVE modes) + # ------------------------------------------------------------------ + if backend in (EncoderBackend.AUTO, EncoderBackend.NATIVE): + try: + import basisu_py.basisu_python as native_encoder + native_encoder.init() + + self._native = native_encoder + self._wasm = None + self.backend_name = "NATIVE" + + print("[Encoder] Using native backend") + return + + except Exception as e: + if backend == EncoderBackend.NATIVE: + raise RuntimeError( + f"[Encoder] Native backend requested but unavailable: {e}" + ) + print("[Encoder] Native unavailable; falling back to WASM:", e) + + # ------------------------------------------------------------------ + # Fallback to WASM (AUTO or explicitly WASM) + # ------------------------------------------------------------------ + try: + from basisu_py.wasm.wasm_encoder import BasisuWasmEncoder + except Exception as e: + raise RuntimeError( + f"[Encoder] WASM backend cannot be imported: {e}\n" + "Make sure wasmtime is installed and basisu_py/wasm/*.wasm exist." 
+ ) + + wasm_path = Path(__file__).parent / "wasm" / "basisu_module_st.wasm" + self._wasm = BasisuWasmEncoder(str(wasm_path)) + self._wasm.load() + self._native = None + self.backend_name = "WASM" + + print("[Encoder] Using WASM backend") + + + # ------------------------------------------------------ + # Public API + # ------------------------------------------------------ + def compress(self, + image, + format=-1, + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT | BasisFlags.SRGB | BasisFlags.THREADED | BasisFlags.XUASTC_LDR_FULL_ZSTD): + + rgba_bytes, w, h, is_hdr = self._convert_input_to_rgba_bytes(image) + + # Auto-select format if user passed -1 + if format == -1: + if is_hdr: + format = BasisTexFormat.cUASTC_HDR_6x6 + else: + format = BasisTexFormat.cXUASTC_LDR_6x6 + + if self._native: + return self._compress_native(rgba_bytes, w, h, format, quality, effort, flags, is_hdr) + else: + return self._compress_wasm(rgba_bytes, w, h, format, quality, effort, flags, is_hdr) + + def compress_float32(self, arr, **kwargs): + if not isinstance(arr, np.ndarray) or arr.dtype != np.float32: + raise ValueError("compress_float32 requires float32 NumPy HxWx4 array") + + return self.compress(arr, **kwargs) + + # ------------------------------------------------------ + # Native backend + # ------------------------------------------------------ + def _compress_native(self, bytes_data, w, h, fmt, quality, effort, flags, is_hdr=False): + enc = self._native + + params = enc.new_params() + + try: + buf_ptr = enc.alloc(len(bytes_data)) + + # Write raw bytes (uint8 or float32) + ctypes.memmove(buf_ptr, bytes_data, len(bytes_data)) + + if is_hdr: + ok = enc.set_image_float_rgba(params, 0, buf_ptr, w, h, w * 16) # 4 floats = 16 bytes per pixel + else: + ok = enc.set_image_rgba32(params, 0, buf_ptr, w, h, w * 4) + + if not ok: + raise RuntimeError("Native encoder: set_image failed (HDR or LDR)") + + ok = enc.compress(params, fmt, quality, effort, 
flags, 0.0) + if not ok: + raise RuntimeError("Native encoder: compress() failed") + + size = enc.get_comp_data_size(params) + ofs = enc.get_comp_data_ofs(params) + blob = enc.read_memory(ofs, size) + return blob + + finally: + enc.delete_params(params) + if buf_ptr: + enc.free(buf_ptr) + + # ------------------------------------------------------ + # WASM backend + # ------------------------------------------------------ + def _compress_wasm(self, bytes_data, w, h, fmt, quality, effort, flags, is_hdr=False): + enc = self._wasm + + params = enc.new_params() + + try: + buf_ptr = enc.alloc(len(bytes_data)) + enc.write_bytes(buf_ptr, bytes_data) + + if is_hdr: + ok = enc.set_image_float_rgba(params, 0, buf_ptr, w, h, w * 16) + else: + ok = enc.set_image_rgba32(params, 0, buf_ptr, w, h, w * 4) + + if not ok: + raise RuntimeError("WASM encoder: set_image failed (HDR or LDR)") + + ok = enc.compress(params, fmt, quality, effort, flags, 0.0) + if not ok: + raise RuntimeError("WASM encoder: compress() failed") + + size = enc.get_comp_data_size(params) + ofs = enc.get_comp_data_ofs(params) + blob = enc.read_bytes(ofs, size) + return blob + + finally: + enc.delete_params(params) + if buf_ptr: + enc.free(buf_ptr) + + # ------------------------------------------------------ + # Image conversion + # ------------------------------------------------------ + def _convert_input_to_rgba_bytes(self, image): + """ + Accept: + - Pillow Image (LDR) -> returns uint8 bytes + - NumPy uint8 LDR -> returns uint8 bytes + - NumPy float32 HDR -> returns float32 bytes + Returns (bytes, width, height, is_hdr) + """ + + # Pillow image -> LDR + if isinstance(image, Image.Image): + image = image.convert("RGBA") + arr = np.array(image, dtype=np.uint8) + h, w = arr.shape[:2] + return arr.tobytes(), w, h, False + + # NumPy array + elif isinstance(image, np.ndarray): + + # HDR float32 image + if image.dtype == np.float32: + if image.ndim != 3 or image.shape[2] not in (3,4): + raise ValueError("HDR NumPy 
image must be HxWx3 or HxWx4 float32") + + h, w, c = image.shape + + # Expand RGB -> RGBA if needed + if c == 3: + alpha = np.ones((h, w, 1), dtype=np.float32) + arr = np.concatenate([image, alpha], axis=2) + else: + arr = image + + return arr.tobytes(), w, h, True + + # LDR uint8 image + if image.dtype == np.uint8: + if image.ndim != 3 or image.shape[2] not in (3,4): + raise ValueError("LDR NumPy image must be HxWx3 or HxWx4 uint8") + + h, w, c = image.shape + + if c == 3: + alpha = np.full((h, w, 1), 255, dtype=np.uint8) + arr = np.concatenate([image, alpha], axis=2) + else: + arr = image + + return arr.tobytes(), w, h, False + + raise ValueError("NumPy image must be uint8 (LDR) or float32 (HDR)") + + else: + raise TypeError("compress() expects Pillow Image or NumPy array") diff --git a/python/basisu_py/constants.py b/python/basisu_py/constants.py new file mode 100644 index 0000000..04be0b0 --- /dev/null +++ b/python/basisu_py/constants.py @@ -0,0 +1,183 @@ +# basisu_constants.py + +# ============================================================ +# .KTX2/.basis file types +# basist::basis_tex_format +# ============================================================ +class BasisTexFormat: + # Original LDR formats + cETC1S = 0 + cUASTC_LDR_4x4 = 1 + + # HDR + cUASTC_HDR_4x4 = 2 + cASTC_HDR_6x6 = 3 + cUASTC_HDR_6x6 = 4 + + # XUASTC supercompressed LDR formats + cXUASTC_LDR_4x4 = 5 + cXUASTC_LDR_5x4 = 6 + cXUASTC_LDR_5x5 = 7 + cXUASTC_LDR_6x5 = 8 + cXUASTC_LDR_6x6 = 9 + cXUASTC_LDR_8x5 = 10 + cXUASTC_LDR_8x6 = 11 + cXUASTC_LDR_10x5 = 12 + cXUASTC_LDR_10x6 = 13 + cXUASTC_LDR_8x8 = 14 + cXUASTC_LDR_10x8 = 15 + cXUASTC_LDR_10x10= 16 + cXUASTC_LDR_12x10= 17 + cXUASTC_LDR_12x12= 18 + + # Standard ASTC LDR + cASTC_LDR_4x4 = 19 + cASTC_LDR_5x4 = 20 + cASTC_LDR_5x5 = 21 + cASTC_LDR_6x5 = 22 + cASTC_LDR_6x6 = 23 + cASTC_LDR_8x5 = 24 + cASTC_LDR_8x6 = 25 + cASTC_LDR_10x5 = 26 + cASTC_LDR_10x6 = 27 + cASTC_LDR_8x8 = 28 + cASTC_LDR_10x8 = 29 + cASTC_LDR_10x10= 30 + cASTC_LDR_12x10= 
31 + cASTC_LDR_12x12= 32 + +# ============================================================ +# Unified quality level: 1-100 (higher=better quality, 100 disables some codec options) +# ============================================================ +class BasisQuality: + MIN = 1 + MAX = 100 + +# ============================================================ +# Unified effort level: 0-10 (0=fastest, 10=very slow, higher=slower but higher potential quality/more features utilized) +# ============================================================ +class BasisEffort: + MIN = 0 + MAX = 10 + + SUPER_FAST = 0 + FAST = 2 + NORMAL = 5 + DEFAULT = 2 + SLOW = 8 + VERY_SLOW = 10 + +# ============================================================ +# C-style API flags +# ============================================================ +class BasisFlags: + NONE = 0 + USE_OPENCL = 1 << 8 + THREADED = 1 << 9 + DEBUG_OUTPUT = 1 << 10 + + KTX2_OUTPUT = 1 << 11 + KTX2_UASTC_ZSTD = 1 << 12 + + SRGB = 1 << 13 + GEN_MIPS_CLAMP = 1 << 14 + GEN_MIPS_WRAP = 1 << 15 + + Y_FLIP = 1 << 16 + + PRINT_STATS = 1 << 18 + PRINT_STATUS = 1 << 19 + + DEBUG_IMAGES = 1 << 20 + + REC2020 = 1 << 21 + VALIDATE_OUTPUT = 1 << 22 + + XUASTC_LDR_FULL_ARITH = 0 + XUASTC_LDR_HYBRID = 1 << 23 + XUASTC_LDR_FULL_ZSTD = 2 << 23 + XUASTC_LDR_SYNTAX_SHIFT = 23 + XUASTC_LDR_SYNTAX_MASK = 3 + + TEXTURE_TYPE_2D = 0 << 25 + TEXTURE_TYPE_2D_ARRAY = 1 << 25 + TEXTURE_TYPE_CUBEMAP_ARRAY = 2 << 25 + TEXTURE_TYPE_VIDEO_FRAMES = 3 << 25 + TEXTURE_TYPE_SHIFT = 25 + TEXTURE_TYPE_MASK = 3 + + VERBOSE = PRINT_STATS | PRINT_STATUS + MIPMAP_CLAMP = GEN_MIPS_CLAMP + MIPMAP_WRAP = GEN_MIPS_WRAP + +# ============================================================ +# Transcoder Texture Formats (GPU block formats) +# basist::transcoder_texture_format +# ============================================================ +class TranscoderTextureFormat: + TF_ETC1_RGB = 0 + TF_ETC2_RGBA = 1 + TF_BC1_RGB = 2 + TF_BC3_RGBA = 3 + TF_BC4_R = 4 + TF_BC5_RG = 5 + 
TF_BC7_RGBA = 6 + + TF_PVRTC1_4_RGB = 8 + TF_PVRTC1_4_RGBA = 9 + + TF_ASTC_LDR_4X4_RGBA = 10 + TF_ATC_RGB = 11 + TF_ATC_RGBA = 12 + + # Uncompressed + TF_RGBA32 = 13 + TF_RGB565 = 14 + TF_BGR565 = 15 + TF_RGBA4444 = 16 + + TF_FXT1_RGB = 17 + TF_PVRTC2_4_RGB = 18 + TF_PVRTC2_4_RGBA = 19 + + TF_ETC2_EAC_R11 = 20 + TF_ETC2_EAC_RG11 = 21 + TF_BC6H = 22 + + TF_ASTC_HDR_4X4_RGBA = 23 + + TF_RGB_HALF = 24 + TF_RGBA_HALF = 25 + TF_RGB_9E5 = 26 + TF_ASTC_HDR_6X6_RGBA = 27 + + TF_ASTC_LDR_5X4_RGBA = 28 + TF_ASTC_LDR_5X5_RGBA = 29 + TF_ASTC_LDR_6X5_RGBA = 30 + TF_ASTC_LDR_6X6_RGBA = 31 + TF_ASTC_LDR_8X5_RGBA = 32 + TF_ASTC_LDR_8X6_RGBA = 33 + TF_ASTC_LDR_10X5_RGBA = 34 + TF_ASTC_LDR_10X6_RGBA = 35 + TF_ASTC_LDR_8X8_RGBA = 36 + TF_ASTC_LDR_10X8_RGBA = 37 + TF_ASTC_LDR_10X10_RGBA= 38 + TF_ASTC_LDR_12X10_RGBA= 39 + TF_ASTC_LDR_12X12_RGBA= 40 + + TOTAL = 41 + +# ============================================================ +# Transcoder Decode Flags +# ============================================================ +class TranscodeDecodeFlags: + PVRTC_DECODE_TO_NEXT_POW2 = 2 + TRANSCODE_ALPHA_TO_OPAQUE = 4 + BC1_FORBID_THREE_COLOR_BLOCKS = 8 + OUTPUT_HAS_ALPHA_INDICES = 16 + HIGH_QUALITY = 32 + NO_ETC1S_CHROMA_FILTERING = 64 + NO_DEBLOCK_FILTERING = 128 + STRONGER_DEBLOCK_FILTERING = 256 + FORCE_DEBLOCK_FILTERING = 512 + XUASTC_LDR_DISABLE_FAST_BC7_TRANSCODING = 1024 diff --git a/python/basisu_py/transcoder.py b/python/basisu_py/transcoder.py new file mode 100644 index 0000000..868e69e --- /dev/null +++ b/python/basisu_py/transcoder.py @@ -0,0 +1,735 @@ +# basisu_py/transcoder.py +import numpy as np +from dataclasses import dataclass +from pathlib import Path + +from basisu_py.constants import ( + TranscoderTextureFormat, +) + +import importlib +import ctypes + + +# --------------------------------------------------------------------------- +# Enum to select backend +# --------------------------------------------------------------------------- +class TranscoderBackend: + NATIVE = 
"native" + WASM = "wasm" + AUTO = "auto" + + +# --------------------------------------------------------------------------- +# Wrapper class storing pointer+handle +# --------------------------------------------------------------------------- +@dataclass +class KTX2Handle: + ptr: int + handle: int + + +# --------------------------------------------------------------------------- +# Main Transcoder class +# --------------------------------------------------------------------------- +class Transcoder: + def __init__(self, backend=TranscoderBackend.AUTO): + self._native = None + self._wasm = None + self.backend_name = None + self.backend = None + + use_native = False + + # ------------------------------------------------------------------ + # Try native backend first if AUTO or NATIVE + # ------------------------------------------------------------------ + if backend in (TranscoderBackend.AUTO, TranscoderBackend.NATIVE): + try: + native_mod = importlib.import_module("basisu_py.basisu_transcoder_python") + native_mod.init() + self._native = native_mod + self.backend = native_mod + self.backend_name = "NATIVE" + use_native = True + print("[Transcoder] Using native backend") + except Exception as e: + if backend == TranscoderBackend.NATIVE: + # Caller explicitly requested native - fail hard + raise RuntimeError(f"Native transcoder backend failed: {e}") + print("[Transcoder] Native backend unavailable, reason:", e) + self._native = None + + # ------------------------------------------------------------------ + # Fallback to WASM if native is not being used + # ------------------------------------------------------------------ + if not use_native: + try: + from basisu_py.wasm.wasm_transcoder import BasisuWasmTranscoder + except Exception as e: + raise RuntimeError( + f"WASM backend cannot be imported: {e}\n" + "Ensure that:\n" + " - 'wasmtime' is installed\n" + " - basisu_py/wasm/*.wasm files are present in the install\n" + ) + + wasm_path = Path(__file__).parent / "wasm" 
/ "basisu_transcoder_module_st.wasm" + self._wasm = BasisuWasmTranscoder(str(wasm_path)) + self._wasm.load() + self.backend = self._wasm + self.backend_name = "WASM" + print("[Transcoder] Using WASM backend") + + # Finally, bind the unified API to whichever backend we chose + self._bind_backend(self.backend) + + # ----------------------------------------------------------------------- + # Unified backend binding (native or wasm) + # ----------------------------------------------------------------------- + def _bind_backend(self, b): + self.backend = b + + # ------------------ memory operations ------------------ + memory_mapping = [ + ("_alloc", "alloc"), + ("_free", "free"), + ("_write", "write_memory"), + ("_read", "read_memory"), + ] + + # ------------------ KTX2 core ------------------ + basis_mapping = [ + # basis_tex_format helpers + ("basis_tex_format_is_xuastc_ldr", "basis_tex_format_is_xuastc_ldr"), + ("basis_tex_format_is_astc_ldr", "basis_tex_format_is_astc_ldr"), + ("basis_tex_format_get_block_width", "basis_tex_format_get_block_width"), + ("basis_tex_format_get_block_height", "basis_tex_format_get_block_height"), + ("basis_tex_format_is_hdr", "basis_tex_format_is_hdr"), + ("basis_tex_format_is_ldr", "basis_tex_format_is_ldr"), + + # transcoder_texture_format helpers + ("basis_get_bytes_per_block_or_pixel", "basis_get_bytes_per_block_or_pixel"), + ("basis_transcoder_format_has_alpha", "basis_transcoder_format_has_alpha"), + ("basis_transcoder_format_is_hdr", "basis_transcoder_format_is_hdr"), + ("basis_transcoder_format_is_ldr", "basis_transcoder_format_is_ldr"), + ("basis_transcoder_texture_format_is_astc", "basis_transcoder_texture_format_is_astc"), + ("basis_transcoder_format_is_uncompressed", "basis_transcoder_format_is_uncompressed"), + ("basis_get_uncompressed_bytes_per_pixel", "basis_get_uncompressed_bytes_per_pixel"), + ("basis_get_block_width", "basis_get_block_width"), + ("basis_get_block_height", "basis_get_block_height"), + 
("basis_get_transcoder_texture_format_from_basis_tex_format","basis_get_transcoder_texture_format_from_basis_tex_format"), + ("basis_is_format_supported", "basis_is_format_supported"), + ("basis_compute_transcoded_image_size_in_bytes","basis_compute_transcoded_image_size_in_bytes"), + ] + + ktx2_mapping = [ + + ("ktx2_open", "ktx2_open"), + ("ktx2_close", "ktx2_close"), + + ("ktx2_get_width", "ktx2_get_width"), + ("ktx2_get_height", "ktx2_get_height"), + ("ktx2_get_levels", "ktx2_get_levels"), + ("ktx2_get_faces", "ktx2_get_faces"), + ("ktx2_get_layers", "ktx2_get_layers"), + + ("ktx2_get_basis_tex_format", "ktx2_get_basis_tex_format"), + + ("ktx2_get_block_width", "ktx2_get_block_width"), + ("ktx2_get_block_height", "ktx2_get_block_height"), + + ("ktx2_has_alpha", "ktx2_has_alpha"), + + # flags + ("ktx2_is_hdr", "ktx2_is_hdr"), + ("ktx2_is_hdr_4x4", "ktx2_is_hdr_4x4"), + ("ktx2_is_hdr_6x6", "ktx2_is_hdr_6x6"), + ("ktx2_is_ldr", "ktx2_is_ldr"), + ("ktx2_is_srgb", "ktx2_is_srgb"), + ("ktx2_is_etc1s", "ktx2_is_etc1s"), + ("ktx2_is_uastc_ldr_4x4", "ktx2_is_uastc_ldr_4x4"), + ("ktx2_is_xuastc_ldr", "ktx2_is_xuastc_ldr"), + ("ktx2_is_astc_ldr", "ktx2_is_astc_ldr"), + ("ktx2_is_video", "ktx2_is_video"), + ("ktx2_get_ldr_hdr_upconversion_nit_multiplier", "ktx2_get_ldr_hdr_upconversion_nit_multiplier"), + + # DFD access + ("ktx2_get_dfd_flags", "ktx2_get_dfd_flags"), + ("ktx2_get_dfd_total_samples", "ktx2_get_dfd_total_samples"), + ("ktx2_get_dfd_channel_id0", "ktx2_get_dfd_channel_id0"), + ("ktx2_get_dfd_channel_id1", "ktx2_get_dfd_channel_id1"), + ("ktx2_get_dfd_color_model", "ktx2_get_dfd_color_model"), + ("ktx2_get_dfd_color_primaries", "ktx2_get_dfd_color_primaries"), + ("ktx2_get_dfd_transfer_func", "ktx2_get_dfd_transfer_func"), + + # per-level info + ("ktx2_get_level_orig_width", "ktx2_get_level_orig_width"), + ("ktx2_get_level_orig_height", "ktx2_get_level_orig_height"), + ("ktx2_get_level_actual_width", "ktx2_get_level_actual_width"), + 
("ktx2_get_level_actual_height", "ktx2_get_level_actual_height"), + + ("ktx2_get_level_num_blocks_x", "ktx2_get_level_num_blocks_x"), + ("ktx2_get_level_num_blocks_y", "ktx2_get_level_num_blocks_y"), + ("ktx2_get_level_total_blocks", "ktx2_get_level_total_blocks"), + + ("ktx2_get_level_alpha_flag", "ktx2_get_level_alpha_flag"), + ("ktx2_get_level_iframe_flag", "ktx2_get_level_iframe_flag"), + + # transcoding + ("ktx2_start_transcoding", "ktx2_start_transcoding"), + ("ktx2_transcode_image_level", "ktx2_transcode_image_level"), + + # version + ("get_version_fn", "get_version"), + ] + + # Apply all mappings + for public_name, backend_name in (memory_mapping + ktx2_mapping + basis_mapping): + setattr(self, public_name, getattr(b, backend_name)) + + # ----------------------------------------------------------------------- + # Public version query + # ----------------------------------------------------------------------- + def get_version(self): + return self.get_version_fn() + + # ----------------------------------------------------------------------- + # Enable library debug printing to stdout (also set BASISU_FORCE_DEVEL_MESSAGES to 1 in transcoder/basisu.h) + # ----------------------------------------------------------------------- + def enable_debug_printf(self, flag: bool = True): + return self.backend.enable_debug_printf(flag) + + # ----------------------------------------------------------------------- + # KTX2 Handle API: open/close + all queries + # ----------------------------------------------------------------------- + def open(self, ktx2_bytes: bytes) -> KTX2Handle: + ptr = self._alloc(len(ktx2_bytes)) + self._write(ptr, ktx2_bytes) + handle = self.ktx2_open(ptr, len(ktx2_bytes)) + return KTX2Handle(ptr, handle) + + def close(self, ktx2_handle: KTX2Handle): + self.ktx2_close(ktx2_handle.handle) + self._free(ktx2_handle.ptr) + + # ---- Basic queries ---- + def get_width(self, ktx2_handle: KTX2Handle): + return self.ktx2_get_width(ktx2_handle.handle) + + def 
get_height(self, ktx2_handle: KTX2Handle): + return self.ktx2_get_height(ktx2_handle.handle) + + def get_levels(self, ktx2_handle: KTX2Handle): + return self.ktx2_get_levels(ktx2_handle.handle) + + def get_faces(self, ktx2_handle: KTX2Handle): + return self.ktx2_get_faces(ktx2_handle.handle) + + def get_layers(self, ktx2_handle: KTX2Handle): + return self.ktx2_get_layers(ktx2_handle.handle) + + def get_basis_tex_format(self, ktx2_handle: KTX2Handle): + return self.ktx2_get_basis_tex_format(ktx2_handle.handle) + + def has_alpha(self, ktx2_handle: KTX2Handle) -> bool: + """ + Return true if the KTX2 container has alpha. + """ + return bool(self.ktx2_has_alpha(ktx2_handle.handle)) + + # ---- Format flags ---- + def is_hdr(self, ktx2_handle): return bool(self.ktx2_is_hdr(ktx2_handle.handle)) + def is_hdr_4x4(self, ktx2_handle): return bool(self.ktx2_is_hdr_4x4(ktx2_handle.handle)) + def is_hdr_6x6(self, ktx2_handle): return bool(self.ktx2_is_hdr_6x6(ktx2_handle.handle)) + def is_ldr(self, ktx2_handle): return bool(self.ktx2_is_ldr(ktx2_handle.handle)) + def is_srgb(self, ktx2_handle): return bool(self.ktx2_is_srgb(ktx2_handle.handle)) + def is_video(self, ktx2_handle): return bool(self.ktx2_is_video(ktx2_handle.handle)) + def get_ldr_hdr_upconversion_nit_multiplier(self, ktx2_handle): return self.ktx2_get_ldr_hdr_upconversion_nit_multiplier(ktx2_handle.handle) + def is_etc1s(self, ktx2_handle): return bool(self.ktx2_is_etc1s(ktx2_handle.handle)) + def is_uastc_ldr_4x4(self, ktx2_handle): return bool(self.ktx2_is_uastc_ldr_4x4(ktx2_handle.handle)) + def is_xuastc_ldr(self, ktx2_handle): return bool(self.ktx2_is_xuastc_ldr(ktx2_handle.handle)) + def is_astc_ldr(self, ktx2_handle): return bool(self.ktx2_is_astc_ldr(ktx2_handle.handle)) + + # ---- DFD access + def get_dfd_flags(self, ktx2_handle): return self.ktx2_get_dfd_flags(ktx2_handle.handle) + def get_dfd_total_samples(self, ktx2_handle): return self.ktx2_get_dfd_total_samples(ktx2_handle.handle) + def 
get_dfd_color_model(self, ktx2_handle): return self.ktx2_get_dfd_color_model(ktx2_handle.handle) + def get_dfd_color_primaries(self, ktx2_handle): return self.ktx2_get_dfd_color_primaries(ktx2_handle.handle) + def get_dfd_transfer_func(self, ktx2_handle): return self.ktx2_get_dfd_transfer_func(ktx2_handle.handle) + def get_dfd_channel_id0(self, ktx2_handle): return self.ktx2_get_dfd_channel_id0(ktx2_handle.handle) + def get_dfd_channel_id1(self, ktx2_handle): return self.ktx2_get_dfd_channel_id1(ktx2_handle.handle) + + # ---- Block dimensions ---- + def get_block_width(self, ktx2_handle): return self.ktx2_get_block_width(ktx2_handle.handle) + def get_block_height(self, ktx2_handle): return self.ktx2_get_block_height(ktx2_handle.handle) + + # ----------------------------------------------------------------------- + # Explicit: start transcoding on an already-open KTX2 file + # ----------------------------------------------------------------------- + def start_transcoding(self, ktx2_handle: KTX2Handle): + """ + Must be called before per-level iframe flags become valid. 
+ """ + ok = self.ktx2_start_transcoding(ktx2_handle.handle) + if not ok: + raise RuntimeError("start_transcoding() failed") + return True + + # ---- Level info ---- + def get_level_orig_width(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_orig_width(ktx2_handle.handle, level, layer, face) + + def get_level_orig_height(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_orig_height(ktx2_handle.handle, level, layer, face) + + def get_level_actual_width(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_actual_width(ktx2_handle.handle, level, layer, face) + + def get_level_actual_height(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_actual_height(ktx2_handle.handle, level, layer, face) + + def get_level_num_blocks_x(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_num_blocks_x(ktx2_handle.handle, level, layer, face) + + def get_level_num_blocks_y(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_num_blocks_y(ktx2_handle.handle, level, layer, face) + + def get_level_total_blocks(self, ktx2_handle, level, layer=0, face=0): + return self.ktx2_get_level_total_blocks(ktx2_handle.handle, level, layer, face) + + def get_level_alpha_flag(self, ktx2_handle, level, layer=0, face=0): + return bool(self.ktx2_get_level_alpha_flag(ktx2_handle.handle, level, layer, face)) + + def get_level_iframe_flag(self, ktx2_handle, level, layer=0, face=0): + return bool(self.ktx2_get_level_iframe_flag(ktx2_handle.handle, level, layer, face)) + + # ----------------------------------------------------------------------- + # Low-level: Decode RGBA8 from an already-open KTX2 handle + # ----------------------------------------------------------------------- + def decode_rgba_handle(self, ktx2_handle: KTX2Handle, level=0, layer=0, face=0): + """ + Low-level fast decode. Requires an already-open KTX2Handle. + Returns HxWx4 uint8 NumPy array. 
+ """ + w = self.ktx2_get_level_orig_width(ktx2_handle.handle, level, layer, face) + h = self.ktx2_get_level_orig_height(ktx2_handle.handle, level, layer, face) + + out_size = w * h * 4 + out_ptr = self._alloc(out_size) + + # MUST start transcoding before ANY decode + ok = self.ktx2_start_transcoding(ktx2_handle.handle) + if not ok: + self._free(out_ptr) + raise RuntimeError("start_transcoding failed") + + ok = self.ktx2_transcode_image_level( + ktx2_handle.handle, + level, layer, face, + out_ptr, + out_size, + TranscoderTextureFormat.TF_RGBA32, + 0, 0, 0, -1, -1, 0 + ) + if not ok: + self._free(out_ptr) + raise RuntimeError("transcode_image_level failed") + + raw_bytes = self._read(out_ptr, out_size) + self._free(out_ptr) + + arr = np.frombuffer(raw_bytes, dtype=np.uint8) + return arr.reshape((h, w, 4)) + + # ----------------------------------------------------------------------- + # High-level: Decode RGBA8 directly from KTX2 file data + # ----------------------------------------------------------------------- + def decode_rgba(self, ktx2_bytes: bytes, level=0, layer=0, face=0): + """ + High-level convenience decode. Opens the KTX2 file bytes for you. + """ + ktx2_handle = self.open(ktx2_bytes) + try: + return self.decode_rgba_handle(ktx2_handle, level, layer, face) + finally: + self.close(ktx2_handle) + + # ----------------------------------------------------------------------- + # Low-level: Decode HDR (RGBA float32) from open KTX2 + # ----------------------------------------------------------------------- + def decode_rgba_hdr_handle(self, ktx2_handle: KTX2Handle, level=0, layer=0, face=0): + """ + Low-level HDR decode. Returns HxWx4 float32 NumPy array. 
+ """ + w = self.ktx2_get_level_orig_width(ktx2_handle.handle, level, layer, face) + h = self.ktx2_get_level_orig_height(ktx2_handle.handle, level, layer, face) + + bytes_per_pixel = 8 # 4 * half-float + out_size = w * h * bytes_per_pixel + out_ptr = self._alloc(out_size) + + ok = self.ktx2_start_transcoding(ktx2_handle.handle) + if not ok: + self._free(out_ptr) + raise RuntimeError("start_transcoding failed") + + ok = self.ktx2_transcode_image_level( + ktx2_handle.handle, + level, layer, face, + out_ptr, + out_size, + TranscoderTextureFormat.TF_RGBA_HALF, + 0, 0, 0, -1, -1, 0 + ) + if not ok: + self._free(out_ptr) + raise RuntimeError("transcode_image_level failed") + + raw_bytes = self._read(out_ptr, out_size) + self._free(out_ptr) + + arr = np.frombuffer(raw_bytes, dtype=np.float16).astype(np.float32) + return arr.reshape((h, w, 4)) + + # ----------------------------------------------------------------------- + # High-level: Decode HDR (RGBA float32) from KTX2 file data + # ----------------------------------------------------------------------- + def decode_rgba_hdr(self, ktx2_bytes: bytes, level=0, layer=0, face=0): + """ + High-level convenience HDR decode. Opens the KTX2 file bytes for you. + """ + ktx2_handle = self.open(ktx2_bytes) + try: + return self.decode_rgba_hdr_handle(ktx2_handle, level, layer, face) + finally: + self.close(ktx2_handle) + + # ----------------------------------------------------------------------- + # Low-level: General-purpose transcode using a chosen TranscoderTextureFormat format + # ----------------------------------------------------------------------- + def transcode_tfmt_handle(self, ktx2_handle: KTX2Handle, tfmt: int, + level=0, layer=0, face=0, decode_flags=0, + channel0=-1, channel1=-1): + """ + Low-level direct transcoding from an already-open KTX2 handle. 
+ + Parameters: + ktx2_handle: KTX2Handle -> already-open KTX2 + tfmt: int -> TranscoderTextureFormat to transcode to (for ASTC: block size and LDR/HDR MUST match the KTX2 file, for HDR: must be a HDR texture format) + level/layer/face: int -> which image slice to decode + decode_flags: int -> basist::decode_flags + row_pitch, rows_in_pixels, channel0, channel1 -> advanced options + + Returns: bytes (transcoded GPU texture data or uncompressed image) + """ + + # Determine actual output size in bytes + ow = self.ktx2_get_level_orig_width(ktx2_handle.handle, level, layer, face) + oh = self.ktx2_get_level_orig_height(ktx2_handle.handle, level, layer, face) + + out_size = self.basis_compute_transcoded_image_size_in_bytes(tfmt, ow, oh) + if out_size == 0: + raise RuntimeError("basis_compute_transcoded_image_size_in_bytes returned 0") + + # print(f"*** ow={ow}, oh={oh}, out_size={out_size}") + + out_ptr = self._alloc(out_size) + + # Call transcoder + ok = self.ktx2_start_transcoding(ktx2_handle.handle) + if not ok: + self._free(out_ptr) + raise RuntimeError("start_transcoding failed") + + ok = self.ktx2_transcode_image_level( + ktx2_handle.handle, + level, layer, face, + out_ptr, + out_size, + tfmt, + decode_flags, + 0, + 0, + channel0, channel1, + 0 # no per-thread state object + ) + if not ok: + self._free(out_ptr) + raise RuntimeError("ktx2_transcode_image_level failed") + + # Extract bytes + raw_bytes = self._read(out_ptr, out_size) + + self._free(out_ptr) + return raw_bytes + + # ----------------------------------------------------------------------- + # High-level: General-purpose transcode (opens the KTX2 for you) + # tfmt: the TranscoderTextureFormat to transcode too + # ----------------------------------------------------------------------- + def transcode_tfmt(self, ktx2_bytes: bytes, tfmt: int, + level=0, layer=0, face=0, decode_flags=0, + channel0=-1, channel1=-1): + """ + High-level convenience wrapper for transcode_tfmt_handle(). 
+ Automatically opens/closes the KTX2 file. + """ + ktx2_handle = self.open(ktx2_bytes) + try: + return self.transcode_tfmt_handle( + ktx2_handle, tfmt, + level=level, + layer=layer, + face=face, + decode_flags=decode_flags, + channel0=channel0, + channel1=channel1 + ) + finally: + self.close(ktx2_handle) + + # ----------------------------------------------------------------------- + # Low-level: choose a specific transcoder_texture_format from a family string + # ----------------------------------------------------------------------- + def choose_transcoder_format(self, ktx2_handle: KTX2Handle, family: str) -> int: + """ + Given an already-opened KTX2 and a desired family string, choose a concrete + TranscoderTextureFormat enum. + + family: one of: + "ASTC", "BC1", "BC3", "BC4", "BC5", "BC6H", "BC7", + "PVRTC1", "PVRTC2", + "ETC1", "ETC2", "ETC2_EAC_R11", "ETC2_EAC_RG11", + "ATC", "FXT1", + "RGBA32", "RGB_HALF", "RGBA_HALF", "RGB_FLOAT", "RGBA_FLOAT", + "RGB_9E5" + + Returns: + int: TranscoderTextureFormat value + """ + + s = family.strip().upper().replace(" ", "") + hdr_tex = self.is_hdr(ktx2_handle) + has_alpha = self.has_alpha(ktx2_handle) + basis_fmt = self.get_basis_tex_format(ktx2_handle) + + tfmt = None + + # ------------------------------------------------------------------- + # Uncompressed families + # ------------------------------------------------------------------- + if s in ("RGBA32", "RGBA8", "UNCOMPRESSED"): + tfmt = TranscoderTextureFormat.TF_RGBA32 + + elif s in ("RGBHALF", "RGB16F", "RGB_FLOAT", "RGBFLOAT"): + tfmt = TranscoderTextureFormat.TF_RGB_HALF + + elif s in ("RGBAHALF", "RGBA16F", "RGBA_FLOAT", "RGBAFLOAT"): + tfmt = TranscoderTextureFormat.TF_RGBA_HALF + + elif s in ("RGB9E5", "RGB_9E5"): + tfmt = TranscoderTextureFormat.TF_RGB_9E5 + + # ------------------------------------------------------------------- + # BC families + # ------------------------------------------------------------------- + elif s == "BC1": + tfmt = 
TranscoderTextureFormat.TF_BC1_RGB + elif s == "BC3": + tfmt = TranscoderTextureFormat.TF_BC3_RGBA + elif s == "BC4": + tfmt = TranscoderTextureFormat.TF_BC4_R + elif s == "BC5": + tfmt = TranscoderTextureFormat.TF_BC5_RG + elif s == "BC6H": + tfmt = TranscoderTextureFormat.TF_BC6H + elif s == "BC7": + tfmt = TranscoderTextureFormat.TF_BC7_RGBA + + # ------------------------------------------------------------------- + # PVRTC families + # ------------------------------------------------------------------- + elif s == "PVRTC1": + tfmt = (TranscoderTextureFormat.TF_PVRTC1_4_RGBA + if has_alpha else TranscoderTextureFormat.TF_PVRTC1_4_RGB) + + elif s == "PVRTC2": + tfmt = (TranscoderTextureFormat.TF_PVRTC2_4_RGBA + if has_alpha else TranscoderTextureFormat.TF_PVRTC2_4_RGB) + + # ------------------------------------------------------------------- + # ETC / EAC families + # ------------------------------------------------------------------- + elif s == "ETC1": + tfmt = TranscoderTextureFormat.TF_ETC1_RGB + + elif s == "ETC2": + tfmt = TranscoderTextureFormat.TF_ETC2_RGBA + + elif s in ("ETC2_EAC_R11", "EAC_R11"): + tfmt = TranscoderTextureFormat.TF_ETC2_EAC_R11 + + elif s in ("ETC2_EAC_RG11", "EAC_RG11"): + tfmt = TranscoderTextureFormat.TF_ETC2_EAC_RG11 + + # ------------------------------------------------------------------- + # ATC / FXT + # ------------------------------------------------------------------- + elif s == "ATC": + tfmt = (TranscoderTextureFormat.TF_ATC_RGBA + if has_alpha else TranscoderTextureFormat.TF_ATC_RGB) + + elif s == "FXT1": + tfmt = TranscoderTextureFormat.TF_FXT1_RGB + + # ------------------------------------------------------------------- + # ASTC family + # ------------------------------------------------------------------- + elif s == "ASTC": + # Let BasisU decide correct ASTC format (block size + LDR/HDR) + tfmt = self.basis_get_transcoder_texture_format_from_basis_tex_format(basis_fmt) + + else: + # Unknown family: choose a safe 
uncompressed default + if hdr_tex: + tfmt = TranscoderTextureFormat.TF_RGBA_HALF + else: + tfmt = TranscoderTextureFormat.TF_RGBA32 + + # ------------------------------------------------------------------- + # Validate HDR/LDR compatibility (optional but recommended) + # ------------------------------------------------------------------- + # Use helpers to ensure we don't do HDR->LDR or LDR->HDR accidentally. + is_tfmt_hdr = self.basis_transcoder_format_is_hdr(tfmt) + if hdr_tex and not is_tfmt_hdr: + raise ValueError(f"Requested {family} (LDR transcoder format) for HDR KTX2.") + if not hdr_tex and is_tfmt_hdr: + raise ValueError(f"Requested {family} (HDR transcoder format) for LDR KTX2.") + + return tfmt + + # ----------------------------------------------------------------------- + # Low-level: General-purpose transcode using a family string + # from an already opened ktx2 file. + # Returns: + # (data_bytes, chosen_tfmt, block_width, block_height) + # ----------------------------------------------------------------------- + def transcode_handle( + self, + ktx2_handle: KTX2Handle, + family: str, + level=0, + layer=0, + face=0, + decode_flags=0, + channel0=-1, + channel1=-1 + ): + """ + Low-level direct transcoding from an already-open KTX2 handle, + using a high-level family string such as: + "BC7", "BC3", "BC1", "ETC1", "ETC2", "ASTC", "PVRTC1", + "RGBA32", "RGB_HALF", "RGBA_HALF", "RGB_9E5", etc. + See choose_transcoder_format(). + Returns: + (data_bytes, tfmt, block_width, block_height) + """ + + # Decide the exact transcoder format (BC1/BC7/etc.) 
+ tfmt = self.choose_transcoder_format(ktx2_handle, family) + + # Get original dims of the requested slice + ow = self.get_level_orig_width(ktx2_handle, level, layer, face) + oh = self.get_level_orig_height(ktx2_handle, level, layer, face) + + # Compute correct output size for the chosen format + out_size = self.basis_compute_transcoded_image_size_in_bytes(tfmt, ow, oh) + if out_size == 0: + raise RuntimeError( + f"Computed output size is 0 for tfmt={tfmt}, dims={ow}x{oh}" + ) + + # Allocate output buffer + out_ptr = self._alloc(out_size) + + # Ensure transcoding tables are ready + ok = self.ktx2_start_transcoding(ktx2_handle.handle) + if not ok: + self._free(out_ptr) + raise RuntimeError("start_transcoding failed") + + # Perform the transcode + ok = self.ktx2_transcode_image_level( + ktx2_handle.handle, + level, layer, face, + out_ptr, + out_size, + tfmt, + decode_flags, + 0, # row_pitch_in_blocks_or_pixels + 0, # rows_in_pixels + channel0, + channel1, + 0 # no thread-local state + ) + if not ok: + self._free(out_ptr) + raise RuntimeError("ktx2_transcode_image_level failed") + + # Extract bytes from native/WASM memory + data_bytes = self._read(out_ptr, out_size) + + # Free the output buffer + self._free(out_ptr) + + # Determine block dims for this texture format + if self.basis_transcoder_format_is_uncompressed(tfmt): + bw = None + bh = None + else: + bw = self.basis_get_block_width(tfmt) + bh = self.basis_get_block_height(tfmt) + + return data_bytes, tfmt, bw, bh + + # ----------------------------------------------------------------------- + # High-level: one-shot transcode using a family string + # directly from ktx2 file data. (Slower if you're transcoding multiple + # levels/faces/layers.) + # ----------------------------------------------------------------------- + def transcode( + self, + ktx2_bytes: bytes, + family: str, + level=0, + layer=0, + face=0, + decode_flags=0, + channel0=-1, + channel1=-1 + ): + """ + High-level version of transcode_handle(). 
+ Calls transcode_handle() internally. + + Returns: + (data_bytes, tfmt, block_width, block_height) + """ + ktx2_handle = self.open(ktx2_bytes) + try: + return self.transcode_handle( + ktx2_handle, + family, + level=level, + layer=layer, + face=face, + decode_flags=decode_flags, + channel0=channel0, + channel1=channel1 + ) + finally: + self.close(ktx2_handle) + + def tfmt_name(self, tfmt: int): + return TranscoderTextureFormat(tfmt).name diff --git a/python/basisu_py/wasm/__init__.py b/python/basisu_py/wasm/__init__.py new file mode 100644 index 0000000..76d8f38 --- /dev/null +++ b/python/basisu_py/wasm/__init__.py @@ -0,0 +1 @@ +# Purposely empty diff --git a/python/basisu_py/wasm/basisu_module_mt.wasm b/python/basisu_py/wasm/basisu_module_mt.wasm new file mode 100644 index 0000000..b9045c1 Binary files /dev/null and b/python/basisu_py/wasm/basisu_module_mt.wasm differ diff --git a/python/basisu_py/wasm/basisu_module_st.wasm b/python/basisu_py/wasm/basisu_module_st.wasm new file mode 100644 index 0000000..13b61f7 Binary files /dev/null and b/python/basisu_py/wasm/basisu_module_st.wasm differ diff --git a/python/basisu_py/wasm/basisu_transcoder_module_mt.wasm b/python/basisu_py/wasm/basisu_transcoder_module_mt.wasm new file mode 100644 index 0000000..b9c90be Binary files /dev/null and b/python/basisu_py/wasm/basisu_transcoder_module_mt.wasm differ diff --git a/python/basisu_py/wasm/basisu_transcoder_module_st.wasm b/python/basisu_py/wasm/basisu_transcoder_module_st.wasm new file mode 100644 index 0000000..0b717d4 Binary files /dev/null and b/python/basisu_py/wasm/basisu_transcoder_module_st.wasm differ diff --git a/python/basisu_py/wasm/wasm_encoder.py b/python/basisu_py/wasm/wasm_encoder.py new file mode 100644 index 0000000..e6d3516 --- /dev/null +++ b/python/basisu_py/wasm/wasm_encoder.py @@ -0,0 +1,126 @@ +# basisu_py/wasm/wasm_encoder.py + +import wasmtime +import ctypes + +from ..constants import BasisTexFormat, BasisQuality, BasisEffort, BasisFlags + + 
class BasisuWasmEncoder:
    """
    Thin wrapper around the encoder WASM module, executed through wasmtime.

    Every public method forwards to a single ``bu_*`` export of the module and
    prepends the wasmtime store to the argument list. Call load() once before
    using any other method; until then ``exports`` and ``memory`` are None.
    """

    def __init__(self, wasm_path):
        # Path of the .wasm file that load() will instantiate.
        self.wasm_path = wasm_path
        # All of the following are populated by load() / _init_engine().
        self.engine = None
        self.store = None
        self.memory = None
        self.exports = None

    # ------------------------------------------------------
    # Initialize WASM + WASI
    # ------------------------------------------------------
    def _init_engine(self):
        """Create the wasmtime engine/store and attach a WASI configuration."""
        self.engine = wasmtime.Engine()
        self.store = wasmtime.Store(self.engine)

        wasi = wasmtime.WasiConfig()
        wasi.argv = ["basisu-wasm"]
        # Let the module's stdout/stderr reach the host process streams.
        wasi.inherit_stdout()
        wasi.inherit_stderr()
        self.store.set_wasi(wasi)

    def load(self):
        """
        Instantiate the module at ``self.wasm_path`` and cache its exports
        and linear memory. Runs the optional ``bu_init`` export if present.
        """
        self._init_engine()

        module = wasmtime.Module.from_file(self.engine, self.wasm_path)
        linker = wasmtime.Linker(self.engine)
        linker.define_wasi()

        instance = linker.instantiate(self.store, module)
        self.exports = instance.exports(self.store)
        self.memory = self.exports["memory"]

        # Initialize if present
        if "bu_init" in self.exports:
            self.exports["bu_init"](self.store)

        print("[WASM Encoder] Loaded:", self.wasm_path)

    # ------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------
    def _call(self, name, *args):
        """
        Invoke the export called `name`, passing the store followed by `args`.

        Raises:
            RuntimeError: if load() has not been called yet.
        """
        if self.exports is None:
            raise RuntimeError("WASM module is not loaded; call load() first")
        return self.exports[name](self.store, *args)

    # ------------------------------------------------------
    # Access raw linear memory buffer
    # ------------------------------------------------------
    def _buf(self):
        """
        Return a ctypes ubyte array aliasing the module's linear memory.

        The view is rebuilt on every call from the memory's current base
        pointer and length rather than cached.

        Raises:
            RuntimeError: if load() has not been called yet.
        """
        if self.memory is None:
            raise RuntimeError("WASM module is not loaded; call load() first")
        raw_ptr = self.memory.data_ptr(self.store)
        size = self.memory.data_len(self.store)
        addr = ctypes.addressof(raw_ptr.contents)
        return (ctypes.c_ubyte * size).from_address(addr)

    @staticmethod
    def _checked_span(buf, ptr, size):
        """
        Return slice(ptr, ptr + size) after verifying it lies inside `buf`.

        Plain Python slicing would clamp an out-of-range span and treat a
        negative offset as relative to the end of the buffer, which hides
        bad pointers; this rejects both explicitly.

        Raises:
            ValueError: if the span is negative or exceeds the buffer.
        """
        limit = len(buf)
        if ptr < 0 or size < 0 or ptr + size > limit:
            raise ValueError(
                f"WASM memory access out of range: ptr={ptr}, size={size}, "
                f"memory size={limit}"
            )
        return slice(ptr, ptr + size)

    # ------------------------------------------------------
    # Version
    # ------------------------------------------------------
    def get_version(self):
        """Return the value of the ``bu_get_version`` export."""
        return self._call("bu_get_version")

    # ------------------------------------------------------
    # Memory alloc/free
    # ------------------------------------------------------
    def alloc(self, size):
        """Call ``bu_alloc`` with `size` and return the resulting offset."""
        return self._call("bu_alloc", size)

    def free(self, ptr):
        """Call ``bu_free`` on `ptr`. Returns None."""
        self._call("bu_free", ptr)

    # ------------------------------------------------------
    # Params
    # ------------------------------------------------------
    def new_params(self):
        """Call ``bu_new_comp_params`` and return its result (a params handle)."""
        return self._call("bu_new_comp_params")

    def delete_params(self, params):
        """Call ``bu_delete_comp_params`` on a handle from new_params()."""
        return self._call("bu_delete_comp_params", params)

    # ------------------------------------------------------
    # Image input
    # ------------------------------------------------------
    def set_image_rgba32(self, params, index, ptr, w, h, pitch):
        """
        Forward to ``bu_comp_params_set_image_rgba32``.

        `ptr` is an offset into the module's linear memory; presumably the
        pixel data was placed there with write_bytes() -- confirm against
        the C API for the units of `pitch`.
        """
        return self._call(
            "bu_comp_params_set_image_rgba32", params, index, ptr, w, h, pitch
        )

    def set_image_float_rgba(self, params, index, ptr, w, h, pitch):
        """
        Forward to ``bu_comp_params_set_image_float_rgba``.

        Same calling shape as set_image_rgba32(); the expected pixel layout
        is defined by the C API, not by this wrapper.
        """
        return self._call(
            "bu_comp_params_set_image_float_rgba", params, index, ptr, w, h, pitch
        )

    # ------------------------------------------------------
    # Compression
    # ------------------------------------------------------
    def compress(self, params, fmt, quality, effort, flags, rdo):
        """Run ``bu_compress_texture`` and return its result coerced to bool."""
        return bool(self._call(
            "bu_compress_texture", params, fmt, quality, effort, flags, rdo
        ))

    # ------------------------------------------------------
    # Output blob
    # ------------------------------------------------------
    def get_comp_data_size(self, params):
        """Return the result of ``bu_comp_params_get_comp_data_size``."""
        return self._call("bu_comp_params_get_comp_data_size", params)

    def get_comp_data_ofs(self, params):
        """Return the result of ``bu_comp_params_get_comp_data_ofs``."""
        return self._call("bu_comp_params_get_comp_data_ofs", params)

    # ------------------------------------------------------
    # Raw memory I/O
    # ------------------------------------------------------
    def write_bytes(self, ptr, data):
        """
        Copy `data` into linear memory starting at offset `ptr`.

        Raises:
            ValueError: if [ptr, ptr + len(data)) is not inside linear memory.
        """
        buf = self._buf()
        buf[self._checked_span(buf, ptr, len(data))] = data

    def read_bytes(self, ptr, size):
        """
        Return `size` bytes of linear memory starting at offset `ptr`.

        Raises:
            ValueError: if [ptr, ptr + size) is not inside linear memory
                (previously this returned a silently truncated result).
        """
        buf = self._buf()
        return bytes(buf[self._checked_span(buf, ptr, size)])

    # NEW unified names:
    def write_memory(self, ptr, data):
        """Alias of write_bytes()."""
        self.write_bytes(ptr, data)

    def read_memory(self, ptr, size):
        """Alias of read_bytes()."""
        return self.read_bytes(ptr, size)
new file mode 100644 index 0000000..01e96ee --- /dev/null +++ b/python/basisu_py/wasm/wasm_transcoder.py @@ -0,0 +1,326 @@ +# basisu_py/wasm/wasm_transcoder.py + +import wasmtime +import ctypes + + +class BasisuWasmTranscoder: + """ + Lowest-level WASM transcoder wrapper. + Direct mapping to basisu_wasm_transcoder_api.h/.cpp + + NOTE: + - This layer does NOT interpret formats or block sizes. + - It only wraps the raw C API (bt_* and basis_* exports). + - Higher-level logic (TranscoderCore, Transcoder) will build on top. + """ + + def __init__(self, wasm_path: str): + self.wasm_path = wasm_path + self.engine = None + self.store = None + self.memory = None + self.exports = None + + # ------------------------------------------------------ + # Internal: initialize WASM + WASI + # ------------------------------------------------------ + def _init_engine(self): + self.engine = wasmtime.Engine() + self.store = wasmtime.Store(self.engine) + + wasi = wasmtime.WasiConfig() + wasi.argv = ["basisu-transcoder"] + wasi.inherit_stdout() + wasi.inherit_stderr() + self.store.set_wasi(wasi) + + def load(self): + self._init_engine() + + module = wasmtime.Module.from_file(self.engine, self.wasm_path) + linker = wasmtime.Linker(self.engine) + linker.define_wasi() + + instance = linker.instantiate(self.store, module) + self.exports = instance.exports(self.store) + self.memory = self.exports["memory"] + + # Mandatory transcoder init + if "bt_init" in self.exports: + self.exports["bt_init"](self.store) + + print("[WASM Transcoder] Loaded:", self.wasm_path) + + # ------------------------------------------------------ + # Linear memory access helpers + # ------------------------------------------------------ + def _buf(self): + raw_ptr = self.memory.data_ptr(self.store) + size = self.memory.data_len(self.store) + addr = ctypes.addressof(raw_ptr.contents) + return (ctypes.c_ubyte * size).from_address(addr) + + def write_bytes(self, ptr: int, data: bytes): + buf = self._buf() + buf[ptr:ptr + 
len(data)] = data + + def read_bytes(self, ptr: int, num: int) -> bytes: + buf = self._buf() + return bytes(buf[ptr:ptr + num]) + + # NEW unified names: + def write_memory(self, ptr, data): + self.write_bytes(ptr, data) + + def read_memory(self, ptr, size): + return self.read_bytes(ptr, size) + + # ------------------------------------------------------ + # Memory alloc/free + # ------------------------------------------------------ + def alloc(self, size: int) -> int: + return self.exports["bt_alloc"](self.store, size) + + def free(self, ptr: int): + return self.exports["bt_free"](self.store, ptr) + + # ------------------------------------------------------ + # High-level functions: version, init, debug + # ------------------------------------------------------ + def get_version(self) -> int: + return self.exports["bt_get_version"](self.store) + + def enable_debug_printf(self, flag: bool = True): + return self.exports["bt_enable_debug_printf"](self.store, 1 if flag else 0) + + # ------------------------------------------------------ + # basis_tex_format helpers + # ------------------------------------------------------ + def basis_tex_format_is_xuastc_ldr(self, basis_tex_fmt_u32: int) -> bool: + return bool(self.exports["bt_basis_tex_format_is_xuastc_ldr"](self.store, basis_tex_fmt_u32)) + + def basis_tex_format_is_astc_ldr(self, basis_tex_fmt_u32: int) -> bool: + return bool(self.exports["bt_basis_tex_format_is_astc_ldr"](self.store, basis_tex_fmt_u32)) + + def basis_tex_format_get_block_width(self, basis_tex_fmt_u32: int) -> int: + return self.exports["bt_basis_tex_format_get_block_width"](self.store, basis_tex_fmt_u32) + + def basis_tex_format_get_block_height(self, basis_tex_fmt_u32: int) -> int: + return self.exports["bt_basis_tex_format_get_block_height"](self.store, basis_tex_fmt_u32) + + def basis_tex_format_is_hdr(self, basis_tex_fmt_u32: int) -> bool: + return bool(self.exports["bt_basis_tex_format_is_hdr"](self.store, basis_tex_fmt_u32)) + + def 
basis_tex_format_is_ldr(self, basis_tex_fmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_tex_format_is_ldr"](self.store, basis_tex_fmt_u32))

    # NOTE: every wrapper below follows one pattern: it looks up the named
    # export in self.exports and calls it with self.store followed by the
    # wrapper's own arguments. Wrappers annotated `-> bool` additionally coerce
    # the export's return value with bool(); all others return it unchanged.

    # ------------------------------------------------------
    # transcoder_texture_format helpers
    # ------------------------------------------------------
    def basis_get_bytes_per_block_or_pixel(self, tfmt_u32: int) -> int:
        return self.exports["bt_basis_get_bytes_per_block_or_pixel"](self.store, tfmt_u32)

    def basis_transcoder_format_has_alpha(self, tfmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_transcoder_format_has_alpha"](self.store, tfmt_u32))

    def basis_transcoder_format_is_hdr(self, tfmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_transcoder_format_is_hdr"](self.store, tfmt_u32))

    def basis_transcoder_format_is_ldr(self, tfmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_transcoder_format_is_ldr"](self.store, tfmt_u32))

    def basis_transcoder_texture_format_is_astc(self, tfmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_transcoder_texture_format_is_astc"](self.store, tfmt_u32))

    def basis_transcoder_format_is_uncompressed(self, tfmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_transcoder_format_is_uncompressed"](self.store, tfmt_u32))

    def basis_get_uncompressed_bytes_per_pixel(self, tfmt_u32: int) -> int:
        return self.exports["bt_basis_get_uncompressed_bytes_per_pixel"](self.store, tfmt_u32)

    def basis_get_block_width(self, tfmt_u32: int) -> int:
        return self.exports["bt_basis_get_block_width"](self.store, tfmt_u32)

    def basis_get_block_height(self, tfmt_u32: int) -> int:
        return self.exports["bt_basis_get_block_height"](self.store, tfmt_u32)

    def basis_get_transcoder_texture_format_from_basis_tex_format(self, basis_tex_fmt_u32: int) -> int:
        return self.exports["bt_basis_get_transcoder_texture_format_from_basis_tex_format"](self.store, basis_tex_fmt_u32)

    def basis_is_format_supported(self, tfmt_u32: int, basis_tex_fmt_u32: int) -> bool:
        return bool(self.exports["bt_basis_is_format_supported"](self.store, tfmt_u32, basis_tex_fmt_u32))

    def basis_compute_transcoded_image_size_in_bytes(self, tfmt_u32: int, orig_width: int, orig_height: int) -> int:
        return self.exports["bt_basis_compute_transcoded_image_size_in_bytes"](
            self.store, tfmt_u32, orig_width, orig_height
        )

    # ------------------------------------------------------
    # KTX2 handle management
    # ------------------------------------------------------
    def ktx2_open(self, data_ptr: int, data_len: int) -> int:
        # Forwards (data_ptr, data_len) to bt_ktx2_open and returns its result.
        # NOTE(review): presumably data_ptr is an offset in the wasm module's
        # memory and the result is the handle taken by the ktx2_* methods below
        # -- confirm against the export's definition.
        return self.exports["bt_ktx2_open"](self.store, data_ptr, data_len)

    def ktx2_close(self, handle: int):
        # Returns whatever bt_ktx2_close returns (no return annotation given).
        return self.exports["bt_ktx2_close"](self.store, handle)

    # ------------------------------------------------------
    # Basic KTX2 metadata
    # ------------------------------------------------------
    def ktx2_get_width(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_width"](self.store, handle)

    def ktx2_get_height(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_height"](self.store, handle)

    def ktx2_get_levels(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_levels"](self.store, handle)

    def ktx2_get_faces(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_faces"](self.store, handle)

    def ktx2_get_layers(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_layers"](self.store, handle)

    def ktx2_get_basis_tex_format(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_basis_tex_format"](self.store, handle)

    # KTX2 format checks
    def ktx2_is_etc1s(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_etc1s"](self.store, handle))

    def ktx2_is_uastc_ldr_4x4(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_uastc_ldr_4x4"](self.store, handle))

    def ktx2_is_hdr(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_hdr"](self.store, handle))

    def ktx2_is_hdr_4x4(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_hdr_4x4"](self.store, handle))

    def ktx2_is_hdr_6x6(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_hdr_6x6"](self.store, handle))

    def ktx2_is_ldr(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_ldr"](self.store, handle))

    def ktx2_is_astc_ldr(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_astc_ldr"](self.store, handle))

    def ktx2_is_xuastc_ldr(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_xuastc_ldr"](self.store, handle))

    def ktx2_get_block_width(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_block_width"](self.store, handle)

    def ktx2_get_block_height(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_block_height"](self.store, handle)

    def ktx2_has_alpha(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_has_alpha"](self.store, handle))

    def ktx2_get_dfd_color_model(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_color_model"](self.store, handle)

    def ktx2_get_dfd_color_primaries(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_color_primaries"](self.store, handle)

    def ktx2_get_dfd_transfer_func(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_transfer_func"](self.store, handle)

    def ktx2_is_srgb(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_srgb"](self.store, handle))

    def ktx2_get_dfd_flags(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_flags"](self.store, handle)

    def ktx2_get_dfd_total_samples(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_total_samples"](self.store, handle)

    def ktx2_get_dfd_channel_id0(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_channel_id0"](self.store, handle)

    def ktx2_get_dfd_channel_id1(self, handle: int) -> int:
        return self.exports["bt_ktx2_get_dfd_channel_id1"](self.store, handle)

    def ktx2_is_video(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_is_video"](self.store, handle))

    def ktx2_get_ldr_hdr_upconversion_nit_multiplier(self, handle: int) -> float:
        return self.exports["bt_ktx2_get_ldr_hdr_upconversion_nit_multiplier"](self.store, handle)

    # ------------------------------------------------------
    # Per-level metadata
    # ------------------------------------------------------
    # Each method forwards (h, lvl, layer, face) unchanged to its export.
    # NOTE(review): h is presumably the handle from ktx2_open and the other
    # three are level/layer/face indices -- the parameters are unannotated here.
    def ktx2_get_level_orig_width(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_orig_width"](self.store, h, lvl, layer, face)

    def ktx2_get_level_orig_height(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_orig_height"](self.store, h, lvl, layer, face)

    def ktx2_get_level_actual_width(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_actual_width"](self.store, h, lvl, layer, face)

    def ktx2_get_level_actual_height(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_actual_height"](self.store, h, lvl, layer, face)

    def ktx2_get_level_num_blocks_x(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_num_blocks_x"](self.store, h, lvl, layer, face)

    def ktx2_get_level_num_blocks_y(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_num_blocks_y"](self.store, h, lvl, layer, face)

    def ktx2_get_level_total_blocks(self, h, lvl, layer, face) -> int:
        return self.exports["bt_ktx2_get_level_total_blocks"](self.store, h, lvl, layer, face)

    def ktx2_get_level_alpha_flag(self, h, lvl, layer, face) -> bool:
        return bool(self.exports["bt_ktx2_get_level_alpha_flag"](self.store, h, lvl, layer, face))

    def ktx2_get_level_iframe_flag(self, h, lvl, layer, face) -> bool:
        return bool(self.exports["bt_ktx2_get_level_iframe_flag"](self.store, h, lvl, layer, face))

    # ------------------------------------------------------
    # Transcoding control
    # ------------------------------------------------------
    def ktx2_start_transcoding(self, handle: int) -> bool:
        return bool(self.exports["bt_ktx2_start_transcoding"](self.store, handle))

    def ktx2_create_transcode_state(self) -> int:
        # Takes no arguments besides the store; returns the export's result.
        return self.exports["bt_ktx2_create_transcode_state"](self.store)

    def ktx2_destroy_transcode_state(self, handle: int):
        return self.exports["bt_ktx2_destroy_transcode_state"](self.store, handle)

    # ------------------------------------------------------
    # Actual transcoding call
    # ------------------------------------------------------
    def ktx2_transcode_image_level(
        self,
        ktx2_handle: int,
        level_index: int,
        layer_index: int,
        face_index: int,
        output_block_mem_ofs: int,
        output_blocks_buf_size_in_blocks_or_pixels: int,
        transcoder_texture_format_u32: int,
        decode_flags: int,
        output_row_pitch_in_blocks_or_pixels: int,
        output_rows_in_pixels: int,
        channel0: int,
        channel1: int,
        state_handle: int,
    ) -> bool:
        # All thirteen arguments are required here and are passed through
        # positionally, in declaration order, to bt_ktx2_transcode_image_level;
        # the export's result is coerced to bool.
        return bool(self.exports["bt_ktx2_transcode_image_level"](
            self.store,
            ktx2_handle,
            level_index, layer_index, face_index,
            output_block_mem_ofs,
            output_blocks_buf_size_in_blocks_or_pixels,
            transcoder_texture_format_u32,
            decode_flags,
            output_row_pitch_in_blocks_or_pixels,
            output_rows_in_pixels,
            channel0, channel1,
            state_handle
        ))
diff --git a/python/basisu_transcoder_pybind11.cpp b/python/basisu_transcoder_pybind11.cpp
new file mode 100644
index 0000000..4d4aa0f
--- /dev/null
+++ b/python/basisu_transcoder_pybind11.cpp
@@ -0,0 +1,264 @@
// File: basisu_transcoder_pybind11.cpp
// pybind11 native bindings for the transcoder's pure C API basisu_wasm_transcoder_api.h

// NOTE(review): the header names of the next two #include directives are
// missing in this copy of the file (likely stripped as angle-bracket markup).
// Presumably they are the pybind11 headers, since `pybind11` is aliased
// below -- restore them from the upstream source before building.
#include
#include

#include "../encoder/basisu_wasm_transcoder_api.h"

namespace py = pybind11;

// wasm_bool_t is uint32_t — convert to Python bool
// Any nonzero value maps to true.
static inline bool to_bool(wasm_bool_t v) { return v != 0; }

+PYBIND11_MODULE(basisu_transcoder_python, m) { + m.doc() = "Native Basis Universal transcoder (pybind11 binding over basisu_wasm_transcoder_api)"; + + // ------------------------------------------------------------------------ + // High-level functions + // ------------------------------------------------------------------------ + m.def("get_version", &bt_get_version, + "Get BasisU transcoder version"); + + m.def("enable_debug_printf", + [](bool flag) { bt_enable_debug_printf(flag ? 1u : 0u); }, + "Enable or disable debug printf output"); + + m.def("init", &bt_init, + "Initialize transcoder library"); + + m.def("alloc", &bt_alloc, + "Allocate a buffer, returns uint64 offset/pointer"); + m.def("free", &bt_free, + "Free a buffer allocated by bt_alloc"); + + + // ------------------------------------------------------------------------ + // basis_tex_format helpers + // ------------------------------------------------------------------------ + m.def("basis_tex_format_is_xuastc_ldr", + [](uint32_t fmt) { return to_bool(bt_basis_tex_format_is_xuastc_ldr(fmt)); }); + + m.def("basis_tex_format_is_astc_ldr", + [](uint32_t fmt) { return to_bool(bt_basis_tex_format_is_astc_ldr(fmt)); }); + + m.def("basis_tex_format_get_block_width", + &bt_basis_tex_format_get_block_width); + + m.def("basis_tex_format_get_block_height", + &bt_basis_tex_format_get_block_height); + + m.def("basis_tex_format_is_hdr", + [](uint32_t fmt) { return to_bool(bt_basis_tex_format_is_hdr(fmt)); }); + + m.def("basis_tex_format_is_ldr", + [](uint32_t fmt) { return to_bool(bt_basis_tex_format_is_ldr(fmt)); }); + + + // ------------------------------------------------------------------------ + // transcoder_texture_format helpers + // ------------------------------------------------------------------------ + m.def("basis_get_bytes_per_block_or_pixel", + &bt_basis_get_bytes_per_block_or_pixel); + + m.def("basis_transcoder_format_has_alpha", + [](uint32_t tfmt) { return 
to_bool(bt_basis_transcoder_format_has_alpha(tfmt)); }); + + m.def("basis_transcoder_format_is_hdr", + [](uint32_t tfmt) { return to_bool(bt_basis_transcoder_format_is_hdr(tfmt)); }); + + m.def("basis_transcoder_format_is_ldr", + [](uint32_t tfmt) { return to_bool(bt_basis_transcoder_format_is_ldr(tfmt)); }); + + m.def("basis_transcoder_texture_format_is_astc", + [](uint32_t tfmt) { return to_bool(bt_basis_transcoder_texture_format_is_astc(tfmt)); }); + + m.def("basis_transcoder_format_is_uncompressed", + [](uint32_t tfmt) { return to_bool(bt_basis_transcoder_format_is_uncompressed(tfmt)); }); + + m.def("basis_get_uncompressed_bytes_per_pixel", + &bt_basis_get_uncompressed_bytes_per_pixel); + + m.def("basis_get_block_width", + &bt_basis_get_block_width); + + m.def("basis_get_block_height", + &bt_basis_get_block_height); + + m.def("basis_get_transcoder_texture_format_from_basis_tex_format", + &bt_basis_get_transcoder_texture_format_from_basis_tex_format); + + m.def("basis_is_format_supported", + [](uint32_t tfmt, uint32_t basis_fmt) { + return to_bool(bt_basis_is_format_supported(tfmt, basis_fmt)); + }); + + m.def("basis_compute_transcoded_image_size_in_bytes", + &bt_basis_compute_transcoded_image_size_in_bytes); + + + // ------------------------------------------------------------------------ + // KTX2 open/close & basic info + // ------------------------------------------------------------------------ + m.def("ktx2_open", &bt_ktx2_open, + "Open a KTX2 image from memory; returns handle"); + + m.def("ktx2_close", &bt_ktx2_close, + "Close a previously opened KTX2 handle"); + + m.def("ktx2_get_width", &bt_ktx2_get_width); + m.def("ktx2_get_height", &bt_ktx2_get_height); + m.def("ktx2_get_levels", &bt_ktx2_get_levels); + m.def("ktx2_get_faces", &bt_ktx2_get_faces); + m.def("ktx2_get_layers", &bt_ktx2_get_layers); + + m.def("ktx2_get_basis_tex_format", &bt_ktx2_get_basis_tex_format); + + m.def("ktx2_is_etc1s", + [](uint64_t h) { return to_bool(bt_ktx2_is_etc1s(h)); }); 
+ + m.def("ktx2_is_uastc_ldr_4x4", + [](uint64_t h) { return to_bool(bt_ktx2_is_uastc_ldr_4x4(h)); }); + + m.def("ktx2_is_hdr", + [](uint64_t h) { return to_bool(bt_ktx2_is_hdr(h)); }); + + m.def("ktx2_is_hdr_4x4", + [](uint64_t h) { return to_bool(bt_ktx2_is_hdr_4x4(h)); }); + + m.def("ktx2_is_hdr_6x6", + [](uint64_t h) { return to_bool(bt_ktx2_is_hdr_6x6(h)); }); + + m.def("ktx2_is_ldr", + [](uint64_t h) { return to_bool(bt_ktx2_is_ldr(h)); }); + + m.def("ktx2_is_astc_ldr", + [](uint64_t h) { return to_bool(bt_ktx2_is_astc_ldr(h)); }); + + m.def("ktx2_is_xuastc_ldr", + [](uint64_t h) { return to_bool(bt_ktx2_is_xuastc_ldr(h)); }); + + m.def("ktx2_get_block_width", &bt_ktx2_get_block_width); + + m.def("ktx2_get_block_height", &bt_ktx2_get_block_height); + + m.def("ktx2_has_alpha", + [](uint64_t h) { return to_bool(bt_ktx2_has_alpha(h)); }); + + m.def("ktx2_get_dfd_color_model", &bt_ktx2_get_dfd_color_model); + m.def("ktx2_get_dfd_color_primaries", &bt_ktx2_get_dfd_color_primaries); + m.def("ktx2_get_dfd_transfer_func", &bt_ktx2_get_dfd_transfer_func); + + m.def("ktx2_is_srgb", + [](uint64_t h) { return to_bool(bt_ktx2_is_srgb(h)); }); + + m.def("ktx2_get_dfd_flags", &bt_ktx2_get_dfd_flags); + m.def("ktx2_get_dfd_total_samples", &bt_ktx2_get_dfd_total_samples); + m.def("ktx2_get_dfd_channel_id0", &bt_ktx2_get_dfd_channel_id0); + m.def("ktx2_get_dfd_channel_id1", &bt_ktx2_get_dfd_channel_id1); + + m.def("ktx2_is_video", + [](uint64_t h) { return to_bool(bt_ktx2_is_video(h)); }); + + m.def("ktx2_get_ldr_hdr_upconversion_nit_multiplier", + &bt_ktx2_get_ldr_hdr_upconversion_nit_multiplier); + + + // ------------------------------------------------------------------------ + // KTX2 per-level info + // ------------------------------------------------------------------------ + m.def("ktx2_get_level_orig_width", + &bt_ktx2_get_level_orig_width); + + m.def("ktx2_get_level_orig_height", + &bt_ktx2_get_level_orig_height); + + m.def("ktx2_get_level_actual_width", + 
&bt_ktx2_get_level_actual_width); + + m.def("ktx2_get_level_actual_height", + &bt_ktx2_get_level_actual_height); + + m.def("ktx2_get_level_num_blocks_x", + &bt_ktx2_get_level_num_blocks_x); + + m.def("ktx2_get_level_num_blocks_y", + &bt_ktx2_get_level_num_blocks_y); + + m.def("ktx2_get_level_total_blocks", + &bt_ktx2_get_level_total_blocks); + + m.def("ktx2_get_level_alpha_flag", + [](uint64_t h, uint32_t level, uint32_t layer, uint32_t face) { + return to_bool(bt_ktx2_get_level_alpha_flag(h, level, layer, face)); + }); + + m.def("ktx2_get_level_iframe_flag", + [](uint64_t h, uint32_t level, uint32_t layer, uint32_t face) { + return to_bool(bt_ktx2_get_level_iframe_flag(h, level, layer, face)); + }); + + + // ------------------------------------------------------------------------ + // Transcoding state and operations + // ------------------------------------------------------------------------ + m.def("ktx2_start_transcoding", + [](uint64_t h) { return to_bool(bt_ktx2_start_transcoding(h)); }); + + m.def("ktx2_create_transcode_state", + &bt_ktx2_create_transcode_state); + + m.def("ktx2_destroy_transcode_state", + &bt_ktx2_destroy_transcode_state); + + m.def("ktx2_transcode_image_level", + [](uint64_t ktx2_handle, + uint32_t level_index, uint32_t layer_index, uint32_t face_index, + uint64_t out_mem_ofs, + uint32_t out_blocks_or_pixels, + uint32_t transcoder_texture_format_u32, + uint32_t decode_flags, + uint32_t row_pitch_blocks_or_pixels, + uint32_t rows_in_pixels, + int channel0, int channel1, + uint64_t state_handle) + { + return to_bool(bt_ktx2_transcode_image_level( + ktx2_handle, + level_index, layer_index, face_index, + out_mem_ofs, + out_blocks_or_pixels, + transcoder_texture_format_u32, + decode_flags, + row_pitch_blocks_or_pixels, + rows_in_pixels, + channel0, channel1, + state_handle)); + }, + py::arg("ktx2_handle"), + py::arg("level_index"), + py::arg("layer_index"), + py::arg("face_index"), + py::arg("output_block_mem_ofs"), + 
py::arg("output_blocks_buf_size_in_blocks_or_pixels"), + py::arg("transcoder_texture_format_u32"), + py::arg("decode_flags"), + py::arg("output_row_pitch_in_blocks_or_pixels") = 0, + py::arg("output_rows_in_pixels") = 0, + py::arg("channel0") = -1, + py::arg("channel1") = -1, + py::arg("state_handle") = 0); + + m.def("read_memory", + [](uint64_t ptr, uint32_t size) { + return py::bytes((const char*)ptr, size); + }, + "Read `size` bytes starting at native memory address `ptr`"); + + m.def("write_memory", + [](uint64_t dest_ptr, py::buffer src) { + py::buffer_info info = src.request(); + memcpy((void*)dest_ptr, info.ptr, info.size * info.itemsize); + }, + "Write bytes/buffer-like object into native memory at address `ptr`"); +} diff --git a/python/dds_writer.py b/python/dds_writer.py new file mode 100644 index 0000000..f1c9543 --- /dev/null +++ b/python/dds_writer.py @@ -0,0 +1,332 @@ +# dds_writer.py +# +# Minimal DDS writer that mirrors the C/C++ save_dds() implementation you provided. +# It writes a DX9-style DDS header, and optionally a DX10 extension header, +# followed by the raw compressed blocks. +# +# No mipmaps, no cubes, no 3D volumes – exactly like the original C code. 
+ +import struct +import sys +from typing import Union + + +# --------------------------------------------------------------------------- +# FourCC helper (same as PIXEL_FMT_FOURCC macro) +# --------------------------------------------------------------------------- +def make_fourcc(a: str, b: str, c: str, d: str) -> int: + return (ord(a) | + (ord(b) << 8) | + (ord(c) << 16) | + (ord(d) << 24)) + + +# --------------------------------------------------------------------------- +# DDS-related constants (only the ones we actually use) +# --------------------------------------------------------------------------- + +# DDSD flags +DDSD_CAPS = 0x00000001 +DDSD_HEIGHT = 0x00000002 +DDSD_WIDTH = 0x00000004 +DDSD_PIXELFORMAT= 0x00001000 +DDSD_LINEARSIZE = 0x00080000 + +# DDPF flags +DDPF_FOURCC = 0x00000004 + +# DDSCAPS flags +DDSCAPS_TEXTURE = 0x00001000 + +# DXGI_FORMAT subset (values must match the C enum) +class DXGI_FORMAT: + UNKNOWN = 0 + BC1_UNORM = 71 + BC3_UNORM = 77 + BC4_UNORM = 80 + BC5_UNORM = 83 + # You can add more as needed; for DX10 header we just write the integer value. + +# DX10 resource dimension +class D3D10_RESOURCE_DIMENSION: + UNKNOWN = 0 + BUFFER = 1 + TEXTURE1D = 2 + TEXTURE2D = 3 + TEXTURE3D = 4 + + +# --------------------------------------------------------------------------- +# DDS writer class +# --------------------------------------------------------------------------- +class DDSWriter: + """ + Python port of the C save_dds() function. + + Usage: + writer = DDSWriter() + ok = writer.save_dds( + filename="out.dds", + width=width, + height=height, + blocks=bc_data, # bytes or bytearray + pixel_format_bpp=4, # e.g. 4 for BC1, 8 for BC3/4/5/etc. 
+ dxgi_format=DXGI_FORMAT.BC1_UNORM, + srgb=False, + force_dx10_header=False, + ) + """ + + DDS_MAGIC = b"DDS " # same as fwrite("DDS ", 4, 1, pFile); + + def save_dds( + self, + filename: str, + width: int, + height: int, + blocks: Union[bytes, bytearray, memoryview], + pixel_format_bpp: int, + dxgi_format: int, + srgb: bool = False, + force_dx10_header: bool = False, + ) -> bool: + """ + Port of: + bool save_dds(const char* pFilename, + uint32_t width, uint32_t height, + const void* pBlocks, + uint32_t pixel_format_bpp, + DXGI_FORMAT dxgi_format, + bool srgb, + bool force_dx10_header); + + The 'blocks' buffer is written as-is (up to computed linear size). + """ + + # srgb is intentionally unused in the original C code (commented logic). + _ = srgb + + # Open file like the C code + try: + f = open(filename, "wb") + except OSError: + print(f"Failed creating file {filename}!", file=sys.stderr) + return False + + try: + # Write the "DDS " magic + f.write(self.DDS_MAGIC) + + # ----------------------------------------------------------------- + # Build DDSURFACEDESC2 equivalent + # ----------------------------------------------------------------- + # We'll pack DDSURFACEDESC2 as 31 uint32's (124 bytes) in little-endian: + # struct DDSURFACEDESC2 { + # uint32 dwSize; + # uint32 dwFlags; + # uint32 dwHeight; + # uint32 dwWidth; + # uint32 lPitch_or_dwLinearSize; + # uint32 dwBackBufferCount; + # uint32 dwMipMapCount; + # uint32 dwAlphaBitDepth; + # uint32 dwUnused0; + # uint32 lpSurface; + # DDCOLORKEY unused0; (2 * uint32) + # DDCOLORKEY unused1; (2 * uint32) + # DDCOLORKEY unused2; (2 * uint32) + # DDCOLORKEY unused3; (2 * uint32) + # DDPIXELFORMAT ddpfPixelFormat; (8 * uint32) + # DDSCAPS2 ddsCaps; (4 * uint32) + # uint32 dwUnused1; + # }; + + dwSize = 124 # sizeof(DDSURFACEDESC2) + + dwFlags = ( + DDSD_WIDTH | + DDSD_HEIGHT | + DDSD_PIXELFORMAT | + DDSD_CAPS + ) + + dwWidth = int(width) + dwHeight = int(height) + + # lPitch (actually LinearSize for compressed 
formats), same as: + # (((dwWidth + 3) & ~3) * ((dwHeight + 3) & ~3) * pixel_format_bpp) >> 3; + lPitch = ( + ((dwWidth + 3) & ~3) + * ((dwHeight + 3) & ~3) + * int(pixel_format_bpp) + ) >> 3 + + dwFlags |= DDSD_LINEARSIZE + + dwBackBufferCount = 0 + dwMipMapCount = 0 + dwAlphaBitDepth = 0 + dwUnused0 = 0 + lpSurface = 0 + + # DDCOLORKEY unused0..3, all zero + ddcolorkey_zero = [0, 0] * 4 # 4 DDCOLORKEY structs + + # DDPIXELFORMAT + # struct DDPIXELFORMAT { + # uint32 dwSize; + # uint32 dwFlags; + # uint32 dwFourCC; + # uint32 dwRGBBitCount; + # uint32 dwRBitMask; + # uint32 dwGBitMask; + # uint32 dwBBitMask; + # uint32 dwRGBAlphaBitMask; + # }; + ddpf_dwSize = 32 + ddpf_dwFlags = DDPF_FOURCC + ddpf_dwFourCC = 0 + ddpf_dwRGBBitCount = 0 + ddpf_dwRBitMask = 0 + ddpf_dwGBitMask = 0 + ddpf_dwBBitMask = 0 + ddpf_dwRGBAlphaBitMask = 0 + + # DDSCAPS2 + # struct DDSCAPS2 { + # uint32 dwCaps; + # uint32 dwCaps2; + # uint32 dwCaps3; + # uint32 dwCaps4; + # }; + ddsCaps_dwCaps = DDSCAPS_TEXTURE + ddsCaps_dwCaps2 = 0 + ddsCaps_dwCaps3 = 0 + ddsCaps_dwCaps4 = 0 + + dwUnused1 = 0 + + # Decide whether to use legacy FourCC (DXT1/DXT5/ATI1/ATI2) or DX10 header + use_legacy = ( + not force_dx10_header and + dxgi_format in ( + DXGI_FORMAT.BC1_UNORM, + DXGI_FORMAT.BC3_UNORM, + DXGI_FORMAT.BC4_UNORM, + DXGI_FORMAT.BC5_UNORM, + ) + ) + + if use_legacy: + if dxgi_format == DXGI_FORMAT.BC1_UNORM: + ddpf_dwFourCC = make_fourcc('D', 'X', 'T', '1') + elif dxgi_format == DXGI_FORMAT.BC3_UNORM: + ddpf_dwFourCC = make_fourcc('D', 'X', 'T', '5') + elif dxgi_format == DXGI_FORMAT.BC4_UNORM: + ddpf_dwFourCC = make_fourcc('A', 'T', 'I', '1') + elif dxgi_format == DXGI_FORMAT.BC5_UNORM: + ddpf_dwFourCC = make_fourcc('A', 'T', 'I', '2') + else: + # Write DX10 header, FourCC = "DX10" + ddpf_dwFourCC = make_fourcc('D', 'X', '1', '0') + + # Build the 31 uint32's for DDSURFACEDESC2 + header_values = [ + dwSize, + dwFlags, + dwHeight, + dwWidth, + lPitch, + dwBackBufferCount, + dwMipMapCount, + 
dwAlphaBitDepth, + dwUnused0, + lpSurface, + ] + + header_values.extend(ddcolorkey_zero) # 8 uint32's + + ddpf_values = [ + ddpf_dwSize, + ddpf_dwFlags, + ddpf_dwFourCC, + ddpf_dwRGBBitCount, + ddpf_dwRBitMask, + ddpf_dwGBitMask, + ddpf_dwBBitMask, + ddpf_dwRGBAlphaBitMask, + ] + header_values.extend(ddpf_values) # 8 uint32's + + ddsCaps_values = [ + ddsCaps_dwCaps, + ddsCaps_dwCaps2, + ddsCaps_dwCaps3, + ddsCaps_dwCaps4, + ] + header_values.extend(ddsCaps_values) # 4 uint32's + + header_values.append(dwUnused1) # final uint32 + + if len(header_values) != 31: + raise RuntimeError("Internal error: DDSURFACEDESC2 must contain 31 uint32's") + + # Pack and write DDSURFACEDESC2 + dds_header = struct.pack("<31I", *header_values) + f.write(dds_header) + + # If needed, write the DX10 header (DDS_HEADER_DXT10) + if not use_legacy: + # struct DDS_HEADER_DXT10 { + # DXGI_FORMAT dxgiFormat; + # D3D10_RESOURCE_DIMENSION resourceDimension; + # uint32 miscFlag; + # uint32 arraySize; + # uint32 miscFlags2; + # }; + dxgiFormat = int(dxgi_format) + resourceDimension = D3D10_RESOURCE_DIMENSION.TEXTURE2D + miscFlag = 0 + arraySize = 1 + miscFlags2 = 0 + + dxt10_header = struct.pack( + "<5I", + dxgiFormat, + resourceDimension, + miscFlag, + arraySize, + miscFlags2, + ) + f.write(dxt10_header) + + # ----------------------------------------------------------------- + # Write the actual texture data blocks (pBlocks) + # ----------------------------------------------------------------- + + # C code: fwrite(pBlocks, desc.lPitch, 1, pFile); + # i.e. write exactly lPitch bytes. 
+ data = memoryview(blocks) + if len(data) < lPitch: + raise ValueError( + f"blocks buffer too small: need at least {lPitch} bytes, got {len(data)}" + ) + f.write(data[:lPitch]) + + except Exception as e: + # Mimic the C-style error reporting as much as practical + print(f"Failed writing to DDS file {filename}: {e}", file=sys.stderr) + try: + f.close() + except Exception: + pass + return False + + # Close file + try: + f.close() + except OSError: + print(f"Failed closing DDS file {filename}!", file=sys.stderr) + return False + + return True diff --git a/python/explode_ktx2_file.py b/python/explode_ktx2_file.py new file mode 100644 index 0000000..0c414b0 --- /dev/null +++ b/python/explode_ktx2_file.py @@ -0,0 +1,413 @@ +#!/usr/bin/env python3 +""" +explode_ktx2_file.py +FULL LDR/HDR KTX2 EXPLODER + FULL API INTROSPECTION + ASTC + BC7/BC6H OUTPUT + +Usage: + python3 explode_ktx2_file.py input.ktx2 + python3 explode_ktx2_file.py input.ktx2 --info-only +""" + +# Python Dependencies (beyond basisu_py): +# numpy +# pillow +# imageio (v3+) +# wasmtime +# +# System Dependencies: +# OpenImageIO ("oiiotool") -- required for EXR output +# +# Install Python deps: +# pip install numpy pillow imageio wasmtime +# +# On Ubuntu: +# sudo apt install openimageio-tools +# +# On macOS (Homebrew): +# brew install openimageio + +import sys +import os +import numpy as np +import subprocess +import tempfile +import imageio.v3 as iio +from PIL import Image + +from basisu_py import Transcoder +from basisu_py.constants import TranscoderTextureFormat as TF + +# Writers located in same directory as this script +from astc_writer import write_astc_file +from dds_writer import DDSWriter + + +# ============================================================================ +# File-writing helpers +# ============================================================================ +def save_exr(path, rgba32f): + """ + Save float32 RGBA as EXR if possible. 
+ If oiiotool is not available, save TIFF instead (Windows-safe). + """ + import numpy as np + import imageio.v3 as iio + import subprocess, tempfile, os + + # Write temp TIFF + with tempfile.NamedTemporaryFile(suffix=".tiff", delete=False) as tmp: + temp_path = tmp.name + + iio.imwrite(temp_path, rgba32f.astype(np.float32)) + + # Try EXR via oiiotool + try: + subprocess.run(["oiiotool", temp_path, "-o", path], check=True) + os.remove(temp_path) + print(" Wrote EXR:", path) + return + + except Exception: + # --- FALLBACK: save TIFF --- + fallback_path = path + ".tiff" + + # Windows cannot overwrite files via rename(), so remove first + if os.path.exists(fallback_path): + os.remove(fallback_path) + + # os.replace() always overwrites + os.replace(temp_path, fallback_path) + + print(" [Fallback] Wrote TIFF instead:", fallback_path) + + +def save_png(path, rgba8): + img = Image.fromarray(rgba8, mode="RGBA") + img.save(path) + print(f" PNG saved: {path}") + + +# ============================================================================ +# Pretty header +# ============================================================================ +def print_header(title): + print("\n" + "=" * 90) + print(title) + print("=" * 90) + + +# ============================================================================ +# Full top-level metadata dump (ALL API) +# ============================================================================ +def dump_all_top_level(t, h): + print_header("TOP-LEVEL KTX2 METADATA FULL API") + + print("Backend :", t.backend_name) + print("Version :", t.get_version()) + print("Width :", t.get_width(h)) + print("Height :", t.get_height(h)) + print("Levels :", t.get_levels(h)) + print("Faces :", t.get_faces(h)) + + layers = t.get_layers(h) + eff_layers = layers if layers > 0 else 1 + print("Layers (raw) :", layers) + print("Layers (effective) :", eff_layers) + + fmt = t.get_basis_tex_format(h) + print("\nBasisTexFormat :", fmt) + + print("\nKTX2 Format Flags:") + 
print(" is_etc1s :", t.is_etc1s(h)) + print(" is_uastc_ldr_4x4 :", t.is_uastc_ldr_4x4(h)) + print(" is_xuastc_ldr :", t.is_xuastc_ldr(h)) + print(" is_astc_ldr :", t.is_astc_ldr(h)) + print(" is_hdr :", t.is_hdr(h)) + print(" is_hdr_4x4 :", t.is_hdr_4x4(h)) + print(" is_hdr_6x6 :", t.is_hdr_6x6(h)) + print(" is_ldr :", t.is_ldr(h)) + print(" is_srgb :", t.is_srgb(h)) + print(" is_video :", t.is_video(h)) + print(" has_alpha :", t.has_alpha(h)) + + print("\nBlock Info:") + print(" block_width :", t.get_block_width(h)) + print(" block_height :", t.get_block_height(h)) + + print("\nDFD Info:") + print(" color_model :", t.get_dfd_color_model(h)) + print(" color_primaries :", t.get_dfd_color_primaries(h)) + print(" transfer_func :", t.get_dfd_transfer_func(h)) + print(" flags :", t.get_dfd_flags(h)) + print(" total_samples :", t.get_dfd_total_samples(h)) + print(" channel_id0 :", t.get_dfd_channel_id0(h)) + print(" channel_id1 :", t.get_dfd_channel_id1(h)) + + if t.is_hdr(h): + print(" hdr_nit_multiplier :", t.get_ldr_hdr_upconversion_nit_multiplier(h)) + + +# ============================================================================ +# BasisTexFormat helpers +# ============================================================================ +def dump_basis_tex_format_helpers(t, h): + print_header("BasisTexFormat HELPERS (FULL)") + + fmt = t.get_basis_tex_format(h) + print("basis_tex_format:", fmt) + + print("is_xuastc_ldr :", t.basis_tex_format_is_xuastc_ldr(fmt)) + print("is_astc_ldr :", t.basis_tex_format_is_astc_ldr(fmt)) + print("block width :", t.basis_tex_format_get_block_width(fmt)) + print("block height :", t.basis_tex_format_get_block_height(fmt)) + print("is_hdr :", t.basis_tex_format_is_hdr(fmt)) + print("is_ldr :", t.basis_tex_format_is_ldr(fmt)) + + +# ============================================================================ +# Level / Layer / Face metadata dump +# ============================================================================ +def 
dump_per_level_info(t, h): + print_header("PER-LEVEL / PER-LAYER / PER-FACE METADATA") + + levels = t.get_levels(h) + faces = t.get_faces(h) + layers = t.get_layers(h) + if layers == 0: + layers = 1 + + for level in range(levels): + for layer in range(layers): + for face in range(faces): + print(f"\nLevel={level}, Layer={layer}, Face={face}") + print(" orig_width :", t.get_level_orig_width(h, level, layer, face)) + print(" orig_height :", t.get_level_orig_height(h, level, layer, face)) + print(" actual_width :", t.get_level_actual_width(h, level, layer, face)) + print(" actual_height:", t.get_level_actual_height(h, level, layer, face)) + print(" blocks_x :", t.get_level_num_blocks_x(h, level, layer, face)) + print(" blocks_y :", t.get_level_num_blocks_y(h, level, layer, face)) + print(" total_blocks :", t.get_level_total_blocks(h, level, layer, face)) + print(" alpha_flag :", t.get_level_alpha_flag(h, level, layer, face)) + print(" iframe_flag :", t.get_level_iframe_flag(h, level, layer, face)) + + +# ============================================================================ +# ASTC Selection +# ============================================================================ +def choose_astc_format(t, h): + fmt = t.get_basis_tex_format(h) + tfmt = t.basis_get_transcoder_texture_format_from_basis_tex_format(fmt) + bw = t.basis_get_block_width(tfmt) + bh = t.basis_get_block_height(tfmt) + + print_header("ASTC SELECTION") + print("ASTC TF:", tfmt) + print(f"Block dims: {bw}x{bh}") + return tfmt, bw, bh + + +# ============================================================================ +# BC Format Selection +# ============================================================================ +def choose_bc_format(t, h): + if t.is_hdr(h): + print_header("HDR -> BC6H") + return TF.TF_BC6H, 8, 95 # DXGI_FORMAT_BC6H_UF16 + else: + print_header("LDR -> BC7") + return TF.TF_BC7_RGBA, 8, 98 # DXGI_FORMAT_BC7_UNORM + + +# 
============================================================================ +# Full explode transcoding (using handle API + per-level dims) +# ============================================================================ +def explode_transcode(t, h): + levels = t.get_levels(h) + faces = t.get_faces(h) + layers = t.get_layers(h) + if layers == 0: + layers = 1 + + astc_tfmt, astc_bw, astc_bh = choose_astc_format(t, h) + bc_tfmt, bc_bpp, bc_dxgi = choose_bc_format(t, h) + + ddsw = DDSWriter() + print_header("BEGIN EXPLODE TRANSCODING (handle API)") + + for level in range(levels): + for layer in range(layers): + for face in range(faces): + + print(f"\n- Level={level} Layer={layer} Face={face}") + + ow = t.get_level_orig_width(h, level, layer, face) + oh = t.get_level_orig_height(h, level, layer, face) + print(f" Level orig dims: {ow}x{oh}") + + # ASTC + astc_blocks = t.transcode_tfmt_handle( + h, astc_tfmt, + level=level, layer=layer, face=face, + decode_flags=0, channel0=-1, channel1=-1 + ) + astc_name = f"astc_L{level}_Y{layer}_F{face}.astc" + write_astc_file(astc_name, astc_blocks, astc_bw, astc_bh, ow, oh) + print(" ASTC saved:", astc_name) + + # BC6H / BC7 + bc_blocks = t.transcode_tfmt_handle( + h, bc_tfmt, + level=level, layer=layer, face=face, + decode_flags=0, channel0=-1, channel1=-1 + ) + if t.is_hdr(h): + dds_name = f"bc6h_L{level}_Y{layer}_F{face}.dds" + else: + dds_name = f"bc7_L{level}_Y{layer}_F{face}.dds" + + ddsw.save_dds( + dds_name, + width=ow, height=oh, + blocks=bc_blocks, + pixel_format_bpp=bc_bpp, + dxgi_format=bc_dxgi, + srgb=False, + force_dx10_header=True, + ) + print(" DDS saved :", dds_name) + + print_header("EXPLODE TRANSCODING COMPLETE") + + +# ============================================================================ +# Decode each (Level, Layer, Face) to PNG or EXR +# ============================================================================ +def explode_decode_images(t, h): + print_header("BEGIN EXPLODE IMAGE DECODE (PNG/EXR)") + + 
levels = t.get_levels(h) + faces = t.get_faces(h) + layers = t.get_layers(h) + if layers == 0: + layers = 1 + + hdr = t.is_hdr(h) + + for level in range(levels): + for layer in range(layers): + for face in range(faces): + + print(f"\n- Decode Level={level} Layer={layer} Face={face}") + + ow = t.get_level_orig_width(h, level, layer, face) + oh = t.get_level_orig_height(h, level, layer, face) + + if hdr: + rgba32f = t.decode_rgba_hdr_handle(h, level, layer, face) + outname = f"exr_L{level}_Y{layer}_F{face}.exr" + save_exr(outname, rgba32f) + else: + rgba8 = t.decode_rgba_handle(h, level, layer, face) + outname = f"png_L{level}_Y{layer}_F{face}.png" + save_png(outname, rgba8) + + print_header("IMAGE DECODE COMPLETE") + +def dump_transcoder_texture_format_helpers(t): + print_header("TranscoderTextureFormat HELPERS (FULL)") + + test_formats = [ + # uncompressed + TF.TF_RGBA32, TF.TF_RGB565, TF.TF_BGR565, + TF.TF_RGBA4444, TF.TF_RGB_HALF, TF.TF_RGBA_HALF, TF.TF_RGB_9E5, + + # basic compressed + TF.TF_ETC1_RGB, TF.TF_ETC2_RGBA, + TF.TF_BC1_RGB, TF.TF_BC3_RGBA, + TF.TF_BC4_R, TF.TF_BC5_RG, + TF.TF_BC7_RGBA, TF.TF_BC6H, + TF.TF_ETC2_EAC_R11, TF.TF_ETC2_EAC_RG11, + TF.TF_FXT1_RGB, + TF.TF_PVRTC1_4_RGB, TF.TF_PVRTC1_4_RGBA, + TF.TF_PVRTC2_4_RGB, TF.TF_PVRTC2_4_RGBA, + TF.TF_ATC_RGB, TF.TF_ATC_RGBA, + + # HDR ASTC + TF.TF_ASTC_HDR_4X4_RGBA, + TF.TF_ASTC_HDR_6X6_RGBA, + + # LDR ASTC + TF.TF_ASTC_LDR_4X4_RGBA, + TF.TF_ASTC_LDR_5X4_RGBA, TF.TF_ASTC_LDR_5X5_RGBA, + TF.TF_ASTC_LDR_6X5_RGBA, TF.TF_ASTC_LDR_6X6_RGBA, + TF.TF_ASTC_LDR_8X5_RGBA, TF.TF_ASTC_LDR_8X6_RGBA, + TF.TF_ASTC_LDR_10X5_RGBA, TF.TF_ASTC_LDR_10X6_RGBA, + TF.TF_ASTC_LDR_8X8_RGBA, TF.TF_ASTC_LDR_10X8_RGBA, + TF.TF_ASTC_LDR_10X10_RGBA, TF.TF_ASTC_LDR_12X10_RGBA, + TF.TF_ASTC_LDR_12X12_RGBA, + ] + + for tfmt in test_formats: + print(f"\nTF={tfmt}") + print(" has_alpha :", t.basis_transcoder_format_has_alpha(tfmt)) + print(" is_hdr :", t.basis_transcoder_format_is_hdr(tfmt)) + print(" is_ldr :", 
t.basis_transcoder_format_is_ldr(tfmt)) + print(" is_astc :", t.basis_transcoder_texture_format_is_astc(tfmt)) + print(" is_uncompressed :", t.basis_transcoder_format_is_uncompressed(tfmt)) + print(" bytes/block :", t.basis_get_bytes_per_block_or_pixel(tfmt)) + print(" block_width :", t.basis_get_block_width(tfmt)) + print(" block_height :", t.basis_get_block_height(tfmt)) + + +def main(): + if len(sys.argv) < 2: + print("Usage: python explode_ktx2_file.py input.ktx2 [--info-only] [--print-tf]") + return 1 + + args = sys.argv[1:] + info_only = "--info-only" in args + print_tf = "--print-tf" in args or "--transcoder-formats" in args + + # Determine input filename + input_file = None + for a in args: + if not a.startswith("--"): + input_file = a + break + + if input_file is None: + print("Error: No input file provided.") + return 1 + + ktx_bytes = open(input_file, "rb").read() + + t = Transcoder() + h = t.open(ktx_bytes) + t.start_transcoding(h) + + # Full metadata + dump_all_top_level(t, h) + dump_basis_tex_format_helpers(t, h) + dump_per_level_info(t, h) + + # Optional TF helpers + if print_tf: + dump_transcoder_texture_format_helpers(t) + + if info_only: + print_header("INFO-ONLY MODE NO FILES WRITTEN") + t.close(h) + return 0 + + # Full output + explode_transcode(t, h) + explode_decode_images(t, h) + + t.close(h) + print("Success") + return 0 + +if __name__ == "__main__": + sys.exit(main()) + + diff --git a/python/lowlevel_test_native/__init__.py b/python/lowlevel_test_native/__init__.py new file mode 100644 index 0000000..143f486 --- /dev/null +++ b/python/lowlevel_test_native/__init__.py @@ -0,0 +1 @@ +# __init__.py diff --git a/python/lowlevel_test_native/basic_test.py b/python/lowlevel_test_native/basic_test.py new file mode 100644 index 0000000..44004e2 --- /dev/null +++ b/python/lowlevel_test_native/basic_test.py @@ -0,0 +1,127 @@ +# basic_test.py +import sys +sys.path.append("basisu_py") # make sure Python can load the .so + +import basisu_python as bu 
+from constants import * + +import ctypes +import math + +def generate_swirl_rgba8(width, height): + """ + Generate a smooth colorful swirl procedural RGBA8 test image. + Returns: a ctypes array of type (c_ubyte * (width * height * 4)) + """ + pixel_count = width * height * 4 + img = (ctypes.c_ubyte * pixel_count)() + + for y in range(height): + for x in range(width): + i = (y * width + x) * 4 + + dx = x - width / 2 + dy = y - height / 2 + + dist = math.hypot(dx, dy) + angle = math.atan2(dy, dx) + + # Color swirl pattern + r = int((math.sin(dist * 0.15) * 0.5 + 0.5) * 255) + g = int((math.sin(angle * 3.0) * 0.5 + 0.5) * 255) + b = int((math.cos(dist * 0.10 + angle * 2.0) * 0.5 + 0.5) * 255) + + img[i + 0] = r & 255 + img[i + 1] = g & 255 + img[i + 2] = b & 255 + img[i + 3] = 255 + + return img + +def generate_test_pattern_rgba8(width, height): + """ + Generate a simple deterministic RGBA8 test pattern: + R = x + G = y + B = x^y + A = 255 + """ + import ctypes + + pixel_count = width * height * 4 + img = (ctypes.c_ubyte * pixel_count)() + + for y in range(height): + for x in range(width): + i = (y * width + x) * 4 + + img[i + 0] = x & 0xFF + img[i + 1] = y & 0xFF + img[i + 2] = (x ^ y) & 0xFF + img[i + 3] = 255 + + return img + +# ------------------------------------------------------------ +# BasisU compression test (NATIVE C++) +# ------------------------------------------------------------ + +print("Native BasisU version:", bu.get_version()) +bu.init() + +# Create comp params +params = bu.new_params() +print("Params handle:", params) + +# Create RGBA8 swirl (64 x 64) +W, H = 512, 512 +pixel_count = W * H * 4 + +# Generate swirl image in PYTHON memory + +img = generate_swirl_rgba8(W, H) +#img = generate_test_pattern_rgba8(W, H) + +# Allocate memory inside NATIVE C++ heap +img_ptr = bu.alloc(pixel_count) + +# Copy Python swirl image ? 
C++ heap buffer +ctypes.memmove(img_ptr, img, pixel_count) + +# Set into BasisU +pitch = W * 4 +ok = bu.set_image_rgba32(params, 0, img_ptr, W, H, pitch) +print("Set image:", ok) + +# Compress (UASTC LDR 4x4 = 1) +ok = bu.compress( + params, + BasisTexFormat.cASTC_LDR_4x4, # basis_tex_format + BasisQuality.MAX, # quality + BasisEffort.DEFAULT, # effort + BasisFlags.KTX2_OUTPUT | BasisFlags.SRGB | BasisFlags.THREADED | BasisFlags.DEBUG_OUTPUT | BasisFlags.VERBOSE, # flags + 0.0 # rdo +) +print("Compress:", ok) + +# Retrieve compressed data +size = bu.get_comp_data_size(params) +ofs = bu.get_comp_data_ofs(params) + +print("Output size =", size, "ptr =", ofs) + +# Copy bytes out of native memory +byte_ptr = ctypes.cast(ofs, ctypes.POINTER(ctypes.c_ubyte)) +blob = bytes(byte_ptr[i] for i in range(size)) + +print("First 16 bytes:", blob[:16]) + +# Save to KTX2 +with open("out_native.ktx2", "wb") as f: + f.write(blob) + +print("Saved out_native.ktx2") + +# Cleanup +bu.delete_params(params) +bu.free(img_ptr) diff --git a/python/lowlevel_test_native/example_capi_python.py b/python/lowlevel_test_native/example_capi_python.py new file mode 100644 index 0000000..d24f48e --- /dev/null +++ b/python/lowlevel_test_native/example_capi_python.py @@ -0,0 +1,481 @@ +#!/usr/bin/env python3 +# example_capi_python.py +# +# Simple Python port of example_capi.c using native C++ pybind11 bindings: +# - basisu_python (encoder) +# - basisu_transcoder_python (transcoder) +# +# Requires: +# basisu_py/basisu_python*.so +# basisu_py/basisu_transcoder_python*.so +# basisu_py/constants.py + +import sys +import os +import math +import ctypes + +# Make sure Python can see the native .so's and the shared constants +sys.path.append("basisu_py") + +import basisu_python as bu +import basisu_transcoder_python as bt +from constants import BasisTexFormat, BasisFlags +from constants import TranscoderTextureFormat as TF +from constants import TranscodeDecodeFlags as DF + +TRUE = 1 +FALSE = 0 + +# 
------------------------------------------------------------ +# Utility: write raw bytes to a file +# ------------------------------------------------------------ + +def write_blob_to_file(filename: str, data: bytes) -> int: + print(f"write_blob_to_file: writing {len(data)} bytes to {filename!r}") + if not filename or data is None: + print(" ERROR: invalid filename or data") + return FALSE + + try: + with open(filename, "wb") as f: + f.write(data) + print(" OK") + return TRUE + except OSError as e: + print(" ERROR:", e) + return FALSE + +# ------------------------------------------------------------ +# TGA writer (24/32bpp) - port of write_tga_image() +# ------------------------------------------------------------ + +def write_tga_image(filename: str, w: int, h: int, has_alpha: bool, pixels_rgba_ptr: int) -> int: + """ + filename: path to TGA file + w, h: image dimensions + has_alpha: True for 32bpp, False for 24bpp + pixels_rgba_ptr: C pointer (uint64) to RGBA or RGB data in native heap + """ + print(f"write_tga_image: {filename!r}, {w}x{h}, has_alpha={has_alpha}, ptr=0x{pixels_rgba_ptr:x}") + if not filename or pixels_rgba_ptr == 0 or w <= 0 or h <= 0: + print(" ERROR: invalid args") + return -1 + + bytes_per_pixel = 4 if has_alpha else 3 + row_bytes = w * bytes_per_pixel + total_bytes = row_bytes * h + + # Create a ctypes buffer that views the native memory + SrcArrayType = ctypes.c_ubyte * total_bytes + src = SrcArrayType.from_address(pixels_rgba_ptr) + + try: + with open(filename, "wb") as f: + header = bytearray(18) + header[2] = 2 # uncompressed true-color + header[12] = w & 0xFF + header[13] = (w >> 8) & 0xFF + header[14] = h & 0xFF + header[15] = (h >> 8) & 0xFF + header[16] = 32 if has_alpha else 24 + header[17] = 8 if has_alpha else 0 # bottom-left origin (with or without alpha) + + f.write(header) + + # temp row buffer for BGRA/BGR + row_buf = bytearray(row_bytes) + + # TGA expects rows bottom-to-top + for y in range(h): + src_y = h - 1 - y + row_start 
= src_y * row_bytes + src_row = src[row_start:row_start + row_bytes] + + if has_alpha: + # RGBA -> BGRA + for x in range(w): + si = x*4 + di = x*4 + row_buf[di + 0] = src_row[si + 2] # B + row_buf[di + 1] = src_row[si + 1] # G + row_buf[di + 2] = src_row[si + 0] # R + row_buf[di + 3] = src_row[si + 3] # A + else: + # RGB -> BGR + for x in range(w): + si = x*3 + di = x*3 + row_buf[di + 0] = src_row[si + 2] # B + row_buf[di + 1] = src_row[si + 1] # G + row_buf[di + 2] = src_row[si + 0] # R + + f.write(row_buf) + + print(" Wrote TGA:", filename) + return 0 + except OSError as e: + print(" ERROR writing TGA:", e) + return -2 + +# ------------------------------------------------------------ +# ASTC writer - port of write_astc_file() +# ------------------------------------------------------------ + +def write_astc_file(filename: str, + blocks_ptr: int, + block_width: int, + block_height: int, + dim_x: int, + dim_y: int) -> int: + print(f"write_astc_file: {filename!r}, block={block_width}x{block_height}, dim={dim_x}x{dim_y}, ptr=0x{blocks_ptr:x}") + if not filename or blocks_ptr == 0: + print(" ERROR: invalid filename or pointer") + return 0 + + assert dim_x > 0 and dim_y > 0 + assert 4 <= block_width <= 12 + assert 4 <= block_height <= 12 + + num_blocks_x = (dim_x + block_width - 1) // block_width + num_blocks_y = (dim_y + block_height - 1) // block_height + total_blocks = num_blocks_x * num_blocks_y + total_bytes = total_blocks * 16 # 16 bytes per ASTC block + + print(f" num_blocks_x={num_blocks_x}, num_blocks_y={num_blocks_y}, total_blocks={total_blocks}, total_bytes={total_bytes}") + + # View native memory + BlockArray = ctypes.c_ubyte * total_bytes + src = BlockArray.from_address(blocks_ptr) + + try: + with open(filename, "wb") as f: + # Magic + f.write(bytes([0x13, 0xAB, 0xA1, 0x5C])) + + # Block dimensions x,y,z (=1) + f.write(bytes([block_width & 0xFF, block_height & 0xFF, 1])) + + # dim_x (24-bit LE) + f.write(bytes([dim_x & 0xFF, (dim_x >> 8) & 0xFF, (dim_x >> 
16) & 0xFF])) + + # dim_y (24-bit LE) + f.write(bytes([dim_y & 0xFF, (dim_y >> 8) & 0xFF, (dim_y >> 16) & 0xFF])) + + # dim_z = 1 (24-bit LE) + f.write(bytes([1, 0, 0])) + + # Block data + f.write(bytes(src)) + + print(" Wrote ASTC:", filename) + return 1 + except OSError as e: + print(" ERROR writing ASTC:", e) + return 0 + +# ------------------------------------------------------------ +# Procedural RGBA pattern (ported & fixed version) +# ------------------------------------------------------------ + +def create_pretty_rgba_pattern(w: int, h: int) -> bytes: + print(f"create_pretty_rgba_pattern: {w}x{h}") + if w <= 0 or h <= 0: + return None + + out = bytearray(w * h * 4) + for y in range(h): + for x in range(w): + fx = x / float(w) + fy = y / float(h) + + # Colorful plasma-type formula + v = math.sin(fx * 12.0 + fy * 4.0) + v += math.sin(fy * 9.0 - fx * 6.0) + v += math.sin((fx + fy) * 7.0) + v = v * 0.25 + 0.5 # scale 0..1 + + L = 1.5 + + r = int(round(255.0 * math.sin(v * 6.28) * L)) + g = int(round(255.0 * (1.0 - v) * L)) + b = int(round(255.0 * v * L)) + + if r < 0: r = 0 + elif r > 255: r = 255 + if g < 0: g = 0 + elif g > 255: g = 255 + if b < 0: b = 0 + elif b > 255: b = 255 + + i = (y * w + x) * 4 + out[i+0] = r + out[i+1] = g + out[i+2] = b + out[i+3] = 255 + + return bytes(out) + +# ------------------------------------------------------------ +# Transcode a KTX2 blob (ported from transcode_ktx2_file) +# ------------------------------------------------------------ + +def transcode_ktx2_file(ktx2_data: bytes) -> int: + if not ktx2_data: + print("transcode_ktx2_file: empty data") + return FALSE + + size = len(ktx2_data) + print(f"transcode_ktx2_file: size={size} bytes") + + if size > 0xFFFFFFFF: + print(" ERROR: size too large for 32-bit length") + return FALSE + + # Allocate memory in transcoder heap and copy KTX2 data + ktx2_data_ofs = bt.alloc(size) + if not ktx2_data_ofs: + print(" ERROR: bt.alloc failed") + return FALSE + + print(f" KTX2 data 
allocated at 0x{ktx2_data_ofs:x}") + ctypes.memmove(ktx2_data_ofs, ktx2_data, size) + + # Open KTX2 + ktx2_handle = bt.ktx2_open(ktx2_data_ofs, size) + if not ktx2_handle: + print(" ERROR: bt.ktx2_open failed") + bt.free(ktx2_data_ofs) + return FALSE + + print(f" KTX2 handle = 0x{ktx2_handle:x}") + + if not bt.ktx2_is_ldr(ktx2_handle): + print(" ERROR: This sample only handles LDR KTX2 files") + bt.ktx2_close(ktx2_handle) + bt.free(ktx2_data_ofs) + return FALSE + + if not bt.ktx2_start_transcoding(ktx2_handle): + print(" ERROR: bt.ktx2_start_transcoding failed") + bt.ktx2_close(ktx2_handle) + bt.free(ktx2_data_ofs) + return FALSE + + width = bt.ktx2_get_width(ktx2_handle) + height = bt.ktx2_get_height(ktx2_handle) + levels = bt.ktx2_get_levels(ktx2_handle) + faces = bt.ktx2_get_faces(ktx2_handle) + layers = bt.ktx2_get_layers(ktx2_handle) + + basis_tex_format = bt.ktx2_get_basis_tex_format(ktx2_handle) + block_width = bt.ktx2_get_block_width(ktx2_handle) + block_height = bt.ktx2_get_block_height(ktx2_handle) + is_srgb = bt.ktx2_is_srgb(ktx2_handle) + + print(f"KTX2 Dimensions: {width}x{height}, Levels={levels}, Faces={faces}, Layers={layers}") + print(f"basis_tex_format: {basis_tex_format}") + print(f"Block dimensions: {block_width}x{block_height}") + print(f"is sRGB: {is_srgb}") + + if layers < 1: + layers = 1 + + assert width >= 1 and height >= 1 + assert levels >= 1 + assert faces in (1, 6) + + # Optional: separate transcode state (thread-local) + trans_state = bt.ktx2_create_transcode_state() + print(f"trans_state handle = 0x{trans_state:x}") + + for level_index in range(levels): + for layer_index in range(layers): + for face_index in range(faces): + print(f"- Level {level_index}, layer {layer_index}, face {face_index}") + ow = bt.ktx2_get_level_orig_width(ktx2_handle, level_index, layer_index, face_index) + oh = bt.ktx2_get_level_orig_height(ktx2_handle, level_index, layer_index, face_index) + aw = bt.ktx2_get_level_actual_width(ktx2_handle, level_index, 
layer_index, face_index) + ah = bt.ktx2_get_level_actual_height(ktx2_handle, level_index, layer_index, face_index) + nbx = bt.ktx2_get_level_num_blocks_x(ktx2_handle, level_index, layer_index, face_index) + nby = bt.ktx2_get_level_num_blocks_y(ktx2_handle, level_index, layer_index, face_index) + tblocks = bt.ktx2_get_level_total_blocks(ktx2_handle, level_index, layer_index, face_index) + alpha_flag = bt.ktx2_get_level_alpha_flag(ktx2_handle, level_index, layer_index, face_index) + iframe_flag = bt.ktx2_get_level_iframe_flag(ktx2_handle, level_index, layer_index, face_index) + + print(f" Orig dimensions: {ow}x{oh}, actual: {aw}x{ah}") + print(f" Block dims: {nbx}x{nby}, total blocks: {tblocks}") + print(f" Alpha={alpha_flag}, I-frame={iframe_flag}") + + # 1) Transcode to RGBA32 and write TGA + tga_name = f"transcoded_{level_index}_{layer_index}_{face_index}.tga" + trans_size_rgba = bt.basis_compute_transcoded_image_size_in_bytes(TF.TF_RGBA32, ow, oh) + assert trans_size_rgba > 0 + rgba_ofs = bt.alloc(trans_size_rgba) + print(f" RGBA buf ofs=0x{rgba_ofs:x}, size={trans_size_rgba}") + + decode_flags = 0 + ok = bt.ktx2_transcode_image_level( + ktx2_handle, + level_index, layer_index, face_index, + rgba_ofs, + trans_size_rgba, + TF.TF_RGBA32, + decode_flags, + 0, 0, -1, -1, + trans_state + ) + print(" ktx2_transcode_image_level(RGBA32):", ok) + if not ok: + bt.free(rgba_ofs) + bt.ktx2_destroy_transcode_state(trans_state) + bt.ktx2_close(ktx2_handle) + bt.free(ktx2_data_ofs) + return FALSE + + write_tga_image(tga_name, ow, oh, True, rgba_ofs) + bt.free(rgba_ofs) + + # 2) Transcode to ASTC and write .astc file + astc_name = f"transcoded_{level_index}_{layer_index}_{face_index}.astc" + target_tf = bt.basis_get_transcoder_texture_format_from_basis_tex_format(basis_tex_format) + print(f" Target ASTC TF={target_tf}") + + trans_size_astc = bt.basis_compute_transcoded_image_size_in_bytes(target_tf, ow, oh) + assert trans_size_astc > 0 + astc_ofs = bt.alloc(trans_size_astc) + 
print(f" ASTC buf ofs=0x{astc_ofs:x}, size={trans_size_astc}") + + ok = bt.ktx2_transcode_image_level( + ktx2_handle, + level_index, layer_index, face_index, + astc_ofs, + trans_size_astc, + target_tf, + 0, 0, 0, -1, -1, + trans_state + ) + print(" ktx2_transcode_image_level(ASTC):", ok) + if not ok: + bt.free(astc_ofs) + bt.ktx2_destroy_transcode_state(trans_state) + bt.ktx2_close(ktx2_handle) + bt.free(ktx2_data_ofs) + return FALSE + + write_astc_file(astc_name, astc_ofs, block_width, block_height, ow, oh) + bt.free(astc_ofs) + + bt.ktx2_destroy_transcode_state(trans_state) + bt.ktx2_close(ktx2_handle) + bt.free(ktx2_data_ofs) + + print("transcode_ktx2_file: success") + return TRUE + +# ------------------------------------------------------------ +# main() equivalent +# ------------------------------------------------------------ + +def main(): + print("example_capi_python:") + + # Init encoder (which initializes transcoder) + print("Calling bu.init() ...") + bu.init() + + print("Calling bt.init() ...") + bt.init() + + # Optional debug control if bound + if hasattr(bu, "enable_debug_printf"): + print("Disabling debug printf from encoder") + bu.enable_debug_printf(False) + + # Generate test image + W, H = 512, 512 + src_image = create_pretty_rgba_pattern(W, H) + if src_image is None: + print("ERROR: create_pretty_rgba_pattern failed") + return 1 + + # Save test image for inspection + print("Writing test_image.tga ...") + # use Python-level TGA writer by allocating a temporary native buffer + tmp_ofs = bt.alloc(len(src_image)) + ctypes.memmove(tmp_ofs, src_image, len(src_image)) + write_tga_image("test_image.tga", W, H, True, tmp_ofs) + bt.free(tmp_ofs) + + # Compress to KTX2 + print("Creating comp_params ...") + comp_params = bu.new_params() + print(" comp_params handle:", comp_params) + + img_ofs = bu.alloc(W * H * 4) + print(f"Allocated encoder image buffer at 0x{img_ofs:x}") + ctypes.memmove(img_ofs, src_image, W * H * 4) + + print("Calling 
bu.comp_params_set_image_rgba32(...)") + ok = bu.set_image_rgba32(comp_params, 0, img_ofs, W, H, W * 4) + print(" set_image_rgba32:", ok) + if not ok: + print("ERROR: bu_comp_params_set_image_rgba32 failed") + return 1 + + bu.free(img_ofs) + + print("Compressing to XUASTC LDR 8x5 KTX2 ...") + basis_tex_format = BasisTexFormat.cXUASTC_LDR_8x5 + quality_level = 85 + effort_level = 2 + flags = (BasisFlags.KTX2_OUTPUT | + BasisFlags.SRGB | + BasisFlags.THREADED | + BasisFlags.GEN_MIPS_CLAMP | + BasisFlags.PRINT_STATS | + BasisFlags.PRINT_STATUS) + + ok = bu.compress(comp_params, + tex_format=basis_tex_format, + quality=quality_level, + effort=effort_level, + flags=flags, + rdo_quality=0.0) + print(" bu.compress:", ok) + if not ok: + print("ERROR: bu_compress_texture failed") + return 1 + + comp_size = bu.get_comp_data_size(comp_params) + print("Compressed size:", comp_size) + if comp_size == 0: + print("ERROR: bu_comp_params_get_comp_data_size failed") + return 1 + + comp_ofs = bu.get_comp_data_ofs(comp_params) + print(f"Compressed data ptr=0x{comp_ofs:x}") + + # Copy compressed data into Python bytes + CompArray = ctypes.c_ubyte * comp_size + comp_buf = CompArray.from_address(comp_ofs) + comp_bytes = bytes(comp_buf) + + print("Writing test.ktx2 ...") + if not write_blob_to_file("test.ktx2", comp_bytes): + print("ERROR: write_blob_to_file failed") + return 1 + + # Transcode using the native transcoder API + print("Now transcoding test.ktx2 via C API ...") + if not transcode_ktx2_file(comp_bytes): + print("ERROR: transcode_ktx2_file failed") + return 1 + + bu.delete_params(comp_params) + + print("Success") + return 0 + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/lowlevel_test_native/test_transcoder_basic.py b/python/lowlevel_test_native/test_transcoder_basic.py new file mode 100644 index 0000000..7aa8772 --- /dev/null +++ b/python/lowlevel_test_native/test_transcoder_basic.py @@ -0,0 +1,24 @@ +# test_transcoder_basic.py +import sys +import os + +# 
Make sure Python can find the .so file +sys.path.append("basisu_py") # Adjust if needed + +try: + import basisu_transcoder_python as bt +except ImportError as e: + print("Failed to import basisu_transcoder_python:", e) + raise + +print("Successfully loaded basisu_transcoder_python") + +# Call bt_get_version() via the pybind11 binding +try: + version = bt.get_version() + print("Transcoder version:", version) +except Exception as e: + print("Error calling bt_get_version:", e) + raise + +print("Basic transcoder test complete.") diff --git a/python/lowlevel_test_wasm/__init__.py b/python/lowlevel_test_wasm/__init__.py new file mode 100644 index 0000000..143f486 --- /dev/null +++ b/python/lowlevel_test_wasm/__init__.py @@ -0,0 +1 @@ +# __init__.py diff --git a/python/lowlevel_test_wasm/basic_test.py b/python/lowlevel_test_wasm/basic_test.py new file mode 100644 index 0000000..5fbede0 --- /dev/null +++ b/python/lowlevel_test_wasm/basic_test.py @@ -0,0 +1,58 @@ +import wasmtime +import ctypes + +# --- Engine --- +engine = wasmtime.Engine() + +# --- Store --- +store = wasmtime.Store(engine) + +# --- WASI config --- +wasi = wasmtime.WasiConfig() +wasi.argv = ["basisu_module_st"] +wasi.inherit_stdout() # <-- tell WASI to use the host stdout +wasi.inherit_stderr() +store.set_wasi(wasi) + +# --- Load module --- +module = wasmtime.Module.from_file(engine, "basisu_py/wasm/basisu_module_st.wasm") + +# --- Linker + WASI --- +linker = wasmtime.Linker(engine) +linker.define_wasi() + +# --- Instantiate --- +instance = linker.instantiate(store, module) +print("Single-threaded WASM instantiated OK") + +# --- Exports --- +exports = instance.exports(store) + +get_version = exports["bu_get_version"] +alloc = exports["bu_alloc"] +free = exports["bu_free"] +memory = exports["memory"] + +# --- Version --- +version = get_version(store) +print("Version =", version) + +# --- Alloc --- +ptr = alloc(store, 64) +print("Allocated ptr =", ptr) + +# --- Access WASM memory properly --- +data_len = 
memory.data_len(store) +raw_ptr = memory.data_ptr(store) # ctypes pointer +addr = ctypes.addressof(raw_ptr.contents) # convert to integer pointer + +# Create a byte array view into WASM memory +buf = (ctypes.c_ubyte * data_len).from_address(addr) + +# Write TEST at allocated ptr +buf[ptr : ptr + 4] = b"TEST" +print("Wrote TEST into WASM memory.") + +# --- Free --- +free(store, ptr) +print("Memory free OK.") diff --git a/python/lowlevel_test_wasm/basisu_wasm.py b/python/lowlevel_test_wasm/basisu_wasm.py new file mode 100644 index 0000000..08b3933 --- /dev/null +++ b/python/lowlevel_test_wasm/basisu_wasm.py @@ -0,0 +1,148 @@ +# basisu_wasm.py +import wasmtime +import ctypes +import sys + +sys.path.append("basisu_py") # our shared .py files + +from constants import * + +class BasisuWasm: + def __init__(self, path): + self.path = path + self.engine = None + self.store = None + self.memory = None + self.exports = None + + # ----------------------------------------------- + # Internal helper: build WASI + Wasmtime engine + # ----------------------------------------------- + def _init_engine(self): + self.engine = wasmtime.Engine() + self.store = wasmtime.Store(self.engine) + + wasi = wasmtime.WasiConfig() + wasi.argv = ["basisu"] + wasi.inherit_stdout() + wasi.inherit_stderr() + self.store.set_wasi(wasi) + + return wasi + + # ----------------------------------------------- + # Create linker and instantiate WASM module + # ----------------------------------------------- + def load(self): + self._init_engine() + + module = wasmtime.Module.from_file(self.engine, self.path) + linker = wasmtime.Linker(self.engine) + linker.define_wasi() + + instance = linker.instantiate(self.store, module) + + self.exports = instance.exports(self.store) + self.memory = self.exports["memory"] + + if "bu_init" in self.exports: + self.exports["bu_init"](self.store) + + print("WASM loaded:", self.path) + + # ----------------------------------------------- + # Read/write WASM linear memory via 
ctypes + # ----------------------------------------------- + def _wasm_buf(self): + raw_ptr = self.memory.data_ptr(self.store) + length = self.memory.data_len(self.store) + addr = ctypes.addressof(raw_ptr.contents) + return (ctypes.c_ubyte * length).from_address(addr) + + # ----------------------------------------------- + # Exported API accessors + # ----------------------------------------------- + def init(self): + return self.exports["bu_init"](self.store) + + def version(self): + return self.exports["bu_get_version"](self.store) + + def alloc(self, size): + return self.exports["bu_alloc"](self.store, size) + + def free(self, ptr): + return self.exports["bu_free"](self.store, ptr) + + def new_params(self): + return self.exports["bu_new_comp_params"](self.store) + + def delete_params(self, ptr): + return self.exports["bu_delete_comp_params"](self.store, ptr) + + def set_image_rgba32(self, params, image_index, img_ptr, w, h, pitch): + return self.exports["bu_comp_params_set_image_rgba32"]( + self.store, params, image_index, img_ptr, w, h, pitch + ) + + def set_image_float_rgba(self, params, image_index, img_ptr, w, h, pitch): + return self.exports["bu_comp_params_set_image_float_rgba"]( + self.store, params, image_index, img_ptr, w, h, pitch + ) + + # Normally quality_level controls the quality. + # If quality_level==-1, then rdo_quality (a low-level parameter) directly + # controls each codec's quality setting. Normally set to 0. 
+ + def compress_texture_lowlevel(self, params, + tex_format, + quality_level, + effort_level, + flags_and_quality, + rdo_quality): + + return self.exports["bu_compress_texture"]( + self.store, + params, + tex_format, + quality_level, + effort_level, + flags_and_quality, + rdo_quality + ) + + def compress(self, params, + tex_format=BasisTexFormat.cUASTC_LDR_4x4, + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.NONE, + rdo_quality=0.0): + + return bool(self.compress_texture_lowlevel( + params, + tex_format, + quality, + effort, + flags, + rdo_quality + )) + + def get_comp_data_ofs(self, params): + return self.exports["bu_comp_params_get_comp_data_ofs"](self.store, params) + + def get_comp_data_size(self, params): + return self.exports["bu_comp_params_get_comp_data_size"](self.store, params) + + # ----------------------------------------------- + # Copy bytes into WASM memory + # ----------------------------------------------- + def write_bytes(self, wasm_ptr, data: bytes): + buf = self._wasm_buf() + buf[wasm_ptr:wasm_ptr+len(data)] = data + + # ----------------------------------------------- + # Read bytes from WASM memory + # ----------------------------------------------- + def read_bytes(self, wasm_ptr, size): + buf = self._wasm_buf() + return bytes(buf[wasm_ptr:wasm_ptr+size]) diff --git a/python/lowlevel_test_wasm/compress_test.py b/python/lowlevel_test_wasm/compress_test.py new file mode 100644 index 0000000..285bb10 --- /dev/null +++ b/python/lowlevel_test_wasm/compress_test.py @@ -0,0 +1,63 @@ +# compress_test.py +from .basisu_wasm import * + +# === Load WASM === +codec = BasisuWasm("basisu_py/wasm/basisu_module_st.wasm") +codec.load() + +print("Version =", codec.version()) + +# === Build test image === +W, H = 256, 256 +BYTES_PER_PIXEL = 4 +pitch = W * BYTES_PER_PIXEL + +img = bytearray(W * H * 4) + +for y in range(H): + for x in range(W): + i = (y * W + x) * 4 + img[i + 0] = x & 0xFF # R + img[i + 1] = y & 0xFF # G + img[i + 2] 
= (x ^ y) & 0xFF # B + img[i + 3] = 255 # A + +# === Upload image to WASM memory === +img_ptr = codec.alloc(len(img)) +codec.write_bytes(img_ptr, img) + +# === Create comp_params === +params = codec.new_params() + +# === Set image into comp_params === +ok = codec.set_image_rgba32(params, 0, img_ptr, W, H, pitch) +print("Set image:", ok) + +# === Compress === +ok = codec.compress( + params, + tex_format=BasisTexFormat.cUASTC_LDR_4x4, + quality=100, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT | BasisFlags.SRGB, + rdo_quality=0.0 +) +print("Compress result:", ok) + +# === Retrieve compressed blob === +ofs = codec.get_comp_data_ofs(params) +size = codec.get_comp_data_size(params) +print("Output size =", size) + +comp_data = codec.read_bytes(ofs, size) +print("First 16 bytes:", comp_data[:16]) + +# === Save to KTX2 === +with open("test.ktx2", "wb") as f: + f.write(comp_data) + +print("File written: test.ktx2") + +# === Cleanup === +codec.delete_params(params) +codec.free(img_ptr) diff --git a/python/lowlevel_test_wasm/compress_test_float.py b/python/lowlevel_test_wasm/compress_test_float.py new file mode 100644 index 0000000..3f60cb7 --- /dev/null +++ b/python/lowlevel_test_wasm/compress_test_float.py @@ -0,0 +1,76 @@ +# compress_test_float.py + +from .basisu_wasm import BasisuWasm, BasisTexFormat, BasisEffort, BasisFlags, BasisQuality +import struct # for packing floats + +# === Load WASM === +codec = BasisuWasm("basisu_py/wasm/basisu_module_st.wasm") +codec.load() + +print("Version =", codec.version()) + +# === Build a 256x256 FLOAT RGBA image === +W, H = 256, 256 +BYTES_PER_PIXEL = 16 # float32 * 4 +pitch = W * BYTES_PER_PIXEL + +# Float image stored as bytearray of packed floats +img = bytearray(W * H * BYTES_PER_PIXEL) + +for y in range(H): + for x in range(W): + # Create some float HDR gradient pattern + r = float(x) / W # 0.0 ? 1.0 + g = float(y) / H # 0.0 ? 
1.0 + b = float(x ^ y) / 255.0 # quirky pattern + a = 1.0 + + i = (y * W + x) * 4 + + # pack into img bytearray + struct.pack_into("ffff", img, i*4, r, g, b, a) + +print("Created FLOAT RGBA image.") + +# === Upload to WASM memory === +img_ptr = codec.alloc(len(img)) +codec.write_bytes(img_ptr, img) +print("Copied float image into WASM heap at", img_ptr) + +# === Create params === +params = codec.new_params() + +# === Set FLOAT RGBA image === +ok = codec.set_image_float_rgba(params, 0, img_ptr, W, H, pitch) +print("Set float RGBA:", ok) + +# === Compress using HDR UASTC 4x4 === +ok = codec.compress( + params, + tex_format=BasisTexFormat.cUASTC_HDR_4x4, + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT | BasisFlags.REC2020, # optional: HDR color space + rdo_quality=0.0 +) + +print("Compression result:", ok) + +# === Retrieve compressed HDR KTX2 === +ofs = codec.get_comp_data_ofs(params) +size = codec.get_comp_data_size(params) + +print("Output size =", size) +data = codec.read_bytes(ofs, size) + +print("First 16 bytes:", data[:16]) + +# === Save to test_hdr.ktx2 === +with open("test_hdr.ktx2", "wb") as f: + f.write(data) + +print("Wrote test_hdr.ktx2") + +# === Cleanup === +codec.delete_params(params) +codec.free(img_ptr) diff --git a/python/pyproject.toml b/python/pyproject.toml new file mode 100644 index 0000000..e893c57 --- /dev/null +++ b/python/pyproject.toml @@ -0,0 +1,44 @@ +[build-system] +requires = ["setuptools>=65", "wheel"] +build-backend = "setuptools.build_meta" + +[project] +name = "basisu-py" +version = "0.2.0" +description = "Python bindings for Basis Universal encoder/transcoder v2.x with native + WASM backend" +authors = [ + { name = "Binomial LLC", email = "stephanie@binomial.info" } +] +license = { text = "Apache 2.0" } +readme = "README.md" +requires-python = ">=3.8" + +dependencies = [ + "numpy", + "Pillow", + "imageio>=2.22", + "wasmtime", +] + +classifiers = [ + "Programming Language :: Python :: 3", + 
"Programming Language :: C++", + "Operating System :: OS Independent", + "License :: OSI Approved :: Apache Software License", +] + +[tool.setuptools] +include-package-data = true + +[tool.setuptools.packages.find] +include = ["basisu_py*"] + +[tool.setuptools.package-data] +basisu_py = [ + "*.so", + "*.pyd", + "*.py", + "wasm/*.wasm", + "wasm/*.py", + "README.md", +] diff --git a/python/tests/__init__.py b/python/tests/__init__.py new file mode 100644 index 0000000..2badd27 --- /dev/null +++ b/python/tests/__init__.py @@ -0,0 +1 @@ +# python/tests/__init__.py diff --git a/python/tests/test_backend_loading.py b/python/tests/test_backend_loading.py new file mode 100644 index 0000000..598bae8 --- /dev/null +++ b/python/tests/test_backend_loading.py @@ -0,0 +1,70 @@ +#!/usr/bin/env python3 +import numpy as np +from PIL import Image + +from basisu_py.codec import Encoder, EncoderBackend +from basisu_py.constants import BasisTexFormat + +print("========== BACKEND LOADING TEST ==========\n") + +# -------------------------------------------------------------- +# 1. Test native backend (if available) +# -------------------------------------------------------------- +print("Testing native backend...") + +try: + enc_native = Encoder(backend=EncoderBackend.NATIVE) + print(" [OK] Native backend loaded") +except Exception as e: + print(" [FAIL] Native backend failed to load:", e) + enc_native = None + +# If native loaded, test very basic functionality +if enc_native: + try: + version = enc_native._native.get_version() + print(f" Native get_version() ? {version}") + + ptr = enc_native._native.alloc(16) + print(f" Native alloc() returned ptr = {ptr}") + + enc_native._native.free(ptr) + print(f" Native free() OK") + + print(" [OK] Native basic operations working.\n") + except Exception as e: + print(" [FAIL] Native operations error:", e) +else: + print(" Skipping native basic operations.\n") + +# -------------------------------------------------------------- +# 2. 
Test WASM backend +# -------------------------------------------------------------- +print("\nTesting WASM backend...") + +try: + enc_wasm = Encoder(backend=EncoderBackend.WASM) + print(" [OK] WASM backend loaded") +except Exception as e: + print(" [FAIL] WASM backend failed to load:", e) + enc_wasm = None + +# If WASM loaded, test basic methods +if enc_wasm and enc_wasm._wasm is not None: + try: + version = enc_wasm._wasm.get_version() + print(f" WASM get_version() ? {version}") + + ptr = enc_wasm._wasm.alloc(16) + print(f" WASM alloc() returned ptr = {ptr}") + + enc_wasm._wasm.free(ptr) + print(f" WASM free() OK") + + print(" [OK] WASM basic operations working.\n") + except Exception as e: + print(" [FAIL] WASM operations error:", e) +else: + print(" Skipping WASM basic operations.\n") + +print("\n========== DONE ==========\n") diff --git a/python/tests/test_basic_backend_selection.py b/python/tests/test_basic_backend_selection.py new file mode 100644 index 0000000..6cfb75c --- /dev/null +++ b/python/tests/test_basic_backend_selection.py @@ -0,0 +1,7 @@ +from basisu_py import Encoder + +enc = Encoder() # AUTO mode +print("Encoder backend:", enc.backend) +print("Native loaded:", enc._native is not None) +print("WASM loaded:", enc._wasm is not None) +print("Version:", enc._native.get_version() if enc._native else enc._wasm.get_version()) diff --git a/python/tests/test_basic_decode.py b/python/tests/test_basic_decode.py new file mode 100644 index 0000000..515c31a --- /dev/null +++ b/python/tests/test_basic_decode.py @@ -0,0 +1,19 @@ +from basisu_py import Transcoder +from PIL import Image +import numpy as np + +# Load input file +with open("test.ktx2", "rb") as f: + data = f.read() + +# Decode (AUTO backend) +t = Transcoder() +rgba = t.decode_rgba(data) # returns HxWx4 uint8 NumPy array + +print("Decoded:", rgba.shape, rgba.dtype) + +# Convert to Pillow Image and save +img = Image.fromarray(rgba, mode="RGBA") +img.save("decoded.png") + +print("Wrote decoded.png") \ 
No newline at end of file diff --git a/python/tests/test_basic_transcode.py b/python/tests/test_basic_transcode.py new file mode 100644 index 0000000..fc10a23 --- /dev/null +++ b/python/tests/test_basic_transcode.py @@ -0,0 +1,10 @@ +from basisu_py import Transcoder + +with open("test.ktx2", "rb") as f: + data = f.read() + +t = Transcoder() # AUTO backend +img = t.decode_rgba(data) + +print("Decoded shape:", img.shape) +print("dtype:", img.dtype) diff --git a/python/tests/test_basic_wasm_selection.py b/python/tests/test_basic_wasm_selection.py new file mode 100644 index 0000000..bdad65f --- /dev/null +++ b/python/tests/test_basic_wasm_selection.py @@ -0,0 +1,6 @@ +from basisu_py import Transcoder +from basisu_py.transcoder import TranscoderBackend + +t = Transcoder(backend=TranscoderBackend.WASM) +print("Backend:", t.backend_name) +t.decode_rgba(open("test.ktx2","rb").read()) diff --git a/python/tests/test_compress_swirl.py b/python/tests/test_compress_swirl.py new file mode 100644 index 0000000..a02f7d5 --- /dev/null +++ b/python/tests/test_compress_swirl.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 +import numpy as np +from PIL import Image +from math import sin, cos, atan2, hypot + +from basisu_py.codec import Encoder, EncoderBackend +from basisu_py.constants import BasisTexFormat, BasisQuality, BasisEffort, BasisFlags + + +# -------------------------------------------------------------- +# Procedural swirl pattern (RGBA8) +# -------------------------------------------------------------- +def make_swirl_image(w=256, h=256): + arr = np.zeros((h, w, 4), dtype=np.uint8) + + cx = w / 2.0 + cy = h / 2.0 + + for y in range(h): + for x in range(w): + dx = x - cx + dy = y - cy + + dist = hypot(dx, dy) + angle = atan2(dy, dx) + + r = int((sin(dist * 0.15) * 0.5 + 0.5) * 255) + g = int((sin(angle * 3.0) * 0.5 + 0.5) * 255) + b = int((cos(dist * 0.10 + angle * 2.0) * 0.5 + 0.5) * 255) + + arr[y, x] = (r, g, b, 255) + + return arr + + +# 
-------------------------------------------------------------- +# Test encode using a given backend +# -------------------------------------------------------------- +def compress_swirl(backend, outfile): + print(f"\n========== Testing {backend} backend ==========") + + # Build procedural image + swirl = make_swirl_image(256, 256) + print("Generated swirl image:", swirl.shape) + + # Create encoder + enc = Encoder(backend=backend) + + # Compress + blob = enc.compress( + swirl, + format=BasisTexFormat.cUASTC_LDR_4x4, + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT | BasisFlags.SRGB + ) + + print(f"Compressed blob size: {len(blob)} bytes") + + # Save output + with open(outfile, "wb") as f: + f.write(blob) + + print(f"Wrote: {outfile}") + print("==============================================") + + +# -------------------------------------------------------------- +# Main +# -------------------------------------------------------------- +if __name__ == "__main__": + # Test native backend + try: + compress_swirl(EncoderBackend.NATIVE, "swirl_native.ktx2") + except Exception as e: + print("Native backend ERROR:", e) + + # Test WASM backend + try: + compress_swirl(EncoderBackend.WASM, "swirl_wasm.ktx2") + except Exception as e: + print("WASM backend ERROR:", e) diff --git a/python/tests/test_compress_swirl_hdr.py b/python/tests/test_compress_swirl_hdr.py new file mode 100644 index 0000000..34e0adb --- /dev/null +++ b/python/tests/test_compress_swirl_hdr.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 +import numpy as np +from math import sin, cos, atan2, hypot +from basisu_py.codec import Encoder, EncoderBackend +from basisu_py.constants import BasisTexFormat, BasisQuality, BasisEffort, BasisFlags + + +# -------------------------------------------------------------- +# Procedural HDR swirl pattern (float32 RGBA) +# -------------------------------------------------------------- +def make_hdr_swirl_image(w=256, h=256): + arr = np.zeros((h, 
w, 4), dtype=np.float32) + + cx = w / 2.0 + cy = h / 2.0 + + for y in range(h): + for x in range(w): + dx = x - cx + dy = y - cy + dist = hypot(dx, dy) + angle = atan2(dy, dx) + + r = (sin(dist * 0.15) * 0.5 + 0.5) + g = (sin(angle * 3.0) * 0.5 + 0.5) + b = (cos(dist * 0.10 + angle * 2.0) * 0.5 + 0.5) + + arr[y, x] = (r, g, b, 1.0) # full alpha + + return arr + + +# -------------------------------------------------------------- +# Test encode using a given backend +# -------------------------------------------------------------- +def compress_hdr_swirl(backend, outfile): + print(f"\n========== Testing HDR {backend} backend ==========") + + hdr = make_hdr_swirl_image(256, 256) + print("Generated HDR swirl image:", hdr.shape, hdr.dtype) + + enc = Encoder(backend=backend) + + blob = enc.compress( + hdr, + format=-1, # auto-select HDR (UASTC_HDR_4x4) + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT | BasisFlags.SRGB + ) + + print(f"Compressed blob size: {len(blob)} bytes") + + with open(outfile, "wb") as f: + f.write(blob) + + print(f"Wrote: {outfile}") + print("==============================================") + + +# -------------------------------------------------------------- +# Main +# -------------------------------------------------------------- +if __name__ == "__main__": + # Native backend + try: + compress_hdr_swirl(EncoderBackend.NATIVE, "hdr_swirl_native.ktx2") + except Exception as e: + print("Native HDR backend ERROR:", e) + + # WASM backend + try: + compress_hdr_swirl(EncoderBackend.WASM, "hdr_swirl_wasm.ktx2") + except Exception as e: + print("WASM HDR backend ERROR:", e) diff --git a/python/tests/test_transcoder_astc.py b/python/tests/test_transcoder_astc.py new file mode 100644 index 0000000..306834b --- /dev/null +++ b/python/tests/test_transcoder_astc.py @@ -0,0 +1,18 @@ +from basisu_py import Transcoder +from astc_writer import write_astc_file + +# Load a .ktx2 +data = open("input.ktx2", "rb").read() +t = 
Transcoder() + +# Transcode to ASTC +h = t.open(data) +bw = t.get_block_width(h) # or basis_get_block_width(astc_tfmt) +bh = t.get_block_height(h) +tfmt = t.basis_get_transcoder_texture_format_from_basis_tex_format( + t.get_basis_tex_format(h) +) + +blocks = t.transcode_tfmt(data, tfmt) +write_astc_file("output.astc", blocks, bw, bh, t.get_width(h), t.get_height(h)) +t.close(h) diff --git a/python/tests/test_transcoder_backend_loading.py b/python/tests/test_transcoder_backend_loading.py new file mode 100644 index 0000000..e0d1ed2 --- /dev/null +++ b/python/tests/test_transcoder_backend_loading.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +import sys +from basisu_py.transcoder import Transcoder, TranscoderBackend +from basisu_py.constants import BasisTexFormat + +print("========== TESTING TRANSCODER BACKENDS ==========\n") + +# Load some test data (ensure test.ktx2 exists) +try: + test_data = open("test.ktx2", "rb").read() + print("[INFO] Loaded test.ktx2") +except FileNotFoundError: + print("[ERROR] test.ktx2 not found. Create one first via encoder tests.") + sys.exit(1) + + +# ------------------------------------------------------------------- +# 1. 
Test NATIVE backend +# ------------------------------------------------------------------- +print("\n--- Testing NATIVE transcoder backend ---") + +try: + t_native = Transcoder(TranscoderBackend.NATIVE) + print(" [OK] Native backend loaded") + + version = t_native.get_version() + print(f" Native get_version() = {version}") + + # Open KTX2 + raw = t_native.open(test_data) + print(" [OK] Opened KTX2 (native)") + + # Query some basic properties + print(" Width :", t_native.get_width(raw)) + print(" Height:", t_native.get_height(raw)) + print(" Levels:", t_native.get_levels(raw)) + + # Cleanup + t_native.close(raw) + print(" [OK] Native transcoder basic operations working.") + +except Exception as e: + print(" [FAIL] Native transcoder error:", e) + + +# ------------------------------------------------------------------- +# 2. Test WASM backend +# ------------------------------------------------------------------- +print("\n--- Testing WASM transcoder backend ---") + +try: + t_wasm = Transcoder(TranscoderBackend.WASM) + print(" [OK] WASM backend loaded") + + version = t_wasm.get_version() + print(f" WASM get_version() = {version}") + + raw = t_wasm.open(test_data) + print(" [OK] Opened KTX2 (wasm)") + + print(" Width :", t_wasm.get_width(raw)) + print(" Height:", t_wasm.get_height(raw)) + print(" Levels:", t_wasm.get_levels(raw)) + + t_wasm.close(raw) + print(" [OK] WASM transcoder basic operations working.") + +except Exception as e: + print(" [FAIL] WASM transcoder error:", e) + + +print("\n========== DONE ==========") diff --git a/python/tests/test_transcoder_end_to_end.py b/python/tests/test_transcoder_end_to_end.py new file mode 100644 index 0000000..5e4825a --- /dev/null +++ b/python/tests/test_transcoder_end_to_end.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 +""" +Full end-to-end transcoder test with automatic fallback. 
+ +- Generates a swirl image +- Compresses it using native OR WASM (AUTO mode) +- Writes test.ktx2 +- Decodes it using whichever backends are available: + * AUTO (native if present, otherwise WASM) + * Native (if available) + * WASM (if available) +- Produces PNG outputs for all successful backends +""" + +import numpy as np +from math import sin, cos, atan2, hypot +from PIL import Image +import sys + +from basisu_py.codec import Encoder, EncoderBackend +from basisu_py.transcoder import Transcoder, TranscoderBackend +from basisu_py.constants import ( + BasisTexFormat, + BasisQuality, + BasisEffort, + BasisFlags, +) + + +# ------------------------------------------------------------------- +# Create an RGBA swirl test image +# ------------------------------------------------------------------- +def make_swirl(w=256, h=256): + arr = np.zeros((h, w, 4), dtype=np.uint8) + + cx, cy = w / 2.0, h / 2.0 + + for y in range(h): + for x in range(w): + dx, dy = x - cx, y - cy + dist = hypot(dx, dy) + angle = atan2(dy, dx) + + r = int((sin(dist * 0.15) * 0.5 + 0.5) * 255) + g = int((sin(angle * 3.0) * 0.5 + 0.5) * 255) + b = int((cos(dist * 0.10 + angle * 2.0) * 0.5 + 0.5) * 255) + + arr[y, x] = (r, g, b, 255) + + return arr + + +# ------------------------------------------------------------------- +# Try loading transcoder with a backend, return (success, transcoder) +# ------------------------------------------------------------------- +def try_transcoder(backend): + try: + t = Transcoder(backend) + print(f"[OK] Loaded transcoder backend '{backend}' ({t.backend_name})") + return True, t + except Exception as e: + print(f"[SKIP] Backend '{backend}' unavailable:", e) + return False, None + + +# ------------------------------------------------------------------- +# Try loading encoder with a backend, return blob or None +# ------------------------------------------------------------------- +def try_encoder(backend, img): + try: + enc = Encoder(backend) + print(f"[OK] Loaded 
encoder backend '{backend}' ({enc.backend_name})") + except Exception as e: + print(f"[SKIP] Encoder backend '{backend}' unavailable:", e) + return None + + try: + print(f"[Test] Compressing swirl -> KTX2 using {enc.backend_name}...") + blob = enc.compress( + img, + format=-1, + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT | BasisFlags.SRGB + ) + return blob + except Exception as e: + print(f"[FAIL] Compression failed on backend '{backend}':", e) + return None + + +# ------------------------------------------------------------------- +# Decode blob with a given transcoder +# ------------------------------------------------------------------- +def decode_with_backend(name, t, blob): + try: + rgba = t.decode_rgba(blob) + outname = f"decoded_{name}.png" + Image.fromarray(rgba, mode="RGBA").save(outname) + print(f" --> {name}: decoded successfully, wrote {outname}") + except Exception as e: + print(f" [FAIL] decode_rgba on backend '{name}':", e) + + +# ------------------------------------------------------------------- +# Main test +# ------------------------------------------------------------------- +if __name__ == "__main__": + print("========== BasisU End-to-End Compression & Transcoding Test ==========") + + # ------------------------------------------------------- + # Generate swirl test + # ------------------------------------------------------- + img = make_swirl(256, 256) + print("[Test] Generated swirl:", img.shape) + + # ------------------------------------------------------- + # Try AUTO encoder (native if available, else WASM) + # ------------------------------------------------------- + blob = try_encoder(EncoderBackend.AUTO, img) + if blob is None: + print("[FAIL] Could not encode using AUTO backend; aborting.") + sys.exit(1) + + # Save test.ktx2 + with open("test.ktx2", "wb") as f: + f.write(blob) + print("[Test] Wrote: test.ktx2") + + # ------------------------------------------------------- + # Test 
transcoding using AUTO + # ------------------------------------------------------- + print("\n[Test] Decoding via AUTO backend...") + ok_auto, t_auto = try_transcoder(TranscoderBackend.AUTO) + if ok_auto: + decode_with_backend("auto", t_auto, blob) + + # ------------------------------------------------------- + # Test NATIVE explicitly (if available) + # ------------------------------------------------------- + print("\n[Test] Decoding via NATIVE backend...") + ok_native, t_native = try_transcoder(TranscoderBackend.NATIVE) + if ok_native: + decode_with_backend("native", t_native, blob) + + # ------------------------------------------------------- + # Test WASM explicitly (if available) + # ------------------------------------------------------- + print("\n[Test] Decoding via WASM backend...") + ok_wasm, t_wasm = try_transcoder(TranscoderBackend.WASM) + if ok_wasm: + decode_with_backend("wasm", t_wasm, blob) + + print("\n========== DONE ==========") diff --git a/python/tests/test_transcoder_end_to_end_hdr.py b/python/tests/test_transcoder_end_to_end_hdr.py new file mode 100644 index 0000000..d1fceb1 --- /dev/null +++ b/python/tests/test_transcoder_end_to_end_hdr.py @@ -0,0 +1,175 @@ +#!/usr/bin/env python3 +""" +HDR End-to-End Compression & Transcoding Test +Works on all platforms: + - native if available + - WASM fallback otherwise +""" + +import numpy as np +from math import sin, cos, atan2, hypot +from PIL import Image +import subprocess +import tempfile +import os +import imageio.v3 as iio + +from basisu_py.codec import Encoder, EncoderBackend +from basisu_py.transcoder import Transcoder, TranscoderBackend +from basisu_py.constants import ( + BasisTexFormat, + BasisQuality, + BasisEffort, + BasisFlags +) + + +# ------------------------------------------------------------------- +# Save EXR using TIFF temp + oiiotool (as required) +# ------------------------------------------------------------------- +def save_exr(path, rgba32f): + """ + Save float32 RGBA as EXR 
if possible. + If oiiotool is not available, save TIFF instead (Windows-safe). + """ + import numpy as np + import imageio.v3 as iio + import subprocess, tempfile, os + + # Write temp TIFF + with tempfile.NamedTemporaryFile(suffix=".tiff", delete=False) as tmp: + temp_path = tmp.name + + iio.imwrite(temp_path, rgba32f.astype(np.float32)) + + # Try EXR via oiiotool + try: + subprocess.run(["oiiotool", temp_path, "-o", path], check=True) + os.remove(temp_path) + print(" Wrote EXR:", path) + return + + except Exception: + # --- FALLBACK: save TIFF --- + fallback_path = path + ".tiff" + + # Windows cannot overwrite files via rename(), so remove first + if os.path.exists(fallback_path): + os.remove(fallback_path) + + # os.replace() always overwrites + os.replace(temp_path, fallback_path) + + print(" [Fallback] Wrote TIFF instead:", fallback_path) + +# ------------------------------------------------------------------- +# Generate HDR swirl image (float32) +# ------------------------------------------------------------------- +def make_swirl_hdr(w=256, h=256): + arr = np.zeros((h, w, 4), dtype=np.float32) + cx, cy = w / 2.0, h / 2.0 + + for y in range(h): + for x in range(w): + dx, dy = x - cx, y - cy + dist = hypot(dx, dy) + angle = atan2(dy, dx) + + # HDR values range up to about 4.0 + r = (sin(dist * 0.08) * 0.5 + 0.5) * 4.0 + g = (sin(angle * 2.0) * 0.5 + 0.5) * 4.0 + b = (cos(dist * 0.06 + angle * 1.5) * 0.5 + 0.5) * 4.0 + + arr[y, x] = (r, g, b, 1.0) + + return arr + + +# ------------------------------------------------------------------- +# Try loading a transcoder backend +# ------------------------------------------------------------------- +def try_transcoder(name, backend): + try: + t = Transcoder(backend) + print(f"[OK] Loaded transcoder backend '{name}' ({t.backend_name})") + return t + except Exception as e: + print(f"[SKIP] Backend '{name}' unavailable:", e) + return None + + +# ------------------------------------------------------------------- +# MAIN +# 
------------------------------------------------------------------- +if __name__ == "__main__": + print("========== HDR End-to-End Compression & Transcoding Test ==========") + + # ------------------------------------------------------- + # Create HDR test image + # ------------------------------------------------------- + img_hdr = make_swirl_hdr(256, 256) + print("[HDR] swirl:", img_hdr.shape, img_hdr.dtype) + + # ------------------------------------------------------- + # ENCODE using AUTO backend (native ? or WASM) + # ------------------------------------------------------- + try: + enc = Encoder(EncoderBackend.AUTO) + print(f"[HDR] Encoder backend = {enc.backend_name}") + except Exception as e: + print("[FATAL] Could not create encoder:", e) + exit(1) + + try: + print("[HDR] Compressing HDR swirl -> test_hdr.ktx2...") + ktx2_blob = enc.compress( + img_hdr, + format=-1, # auto-select HDR format + quality=BasisQuality.MAX, + effort=BasisEffort.DEFAULT, + flags=BasisFlags.KTX2_OUTPUT + ) + print(" KTX2 size:", len(ktx2_blob)) + open("test_hdr.ktx2", "wb").write(ktx2_blob) + print(" Wrote test_hdr.ktx2") + except Exception as e: + print("[FATAL] Encoding failed:", e) + exit(1) + + # ------------------------------------------------------- + # DECODE using AUTO (native ? 
or WASM) + # ------------------------------------------------------- + t_auto = try_transcoder("AUTO", TranscoderBackend.AUTO) + if t_auto: + try: + hdr = t_auto.decode_rgba_hdr(ktx2_blob) + print(" AUTO decoded:", hdr.shape, hdr.dtype) + save_exr("decoded_auto_hdr.exr", hdr) + except Exception as e: + print(" [FAIL] AUTO decode failed:", e) + + # ------------------------------------------------------- + # DECODE using NATIVE if available + # ------------------------------------------------------- + t_native = try_transcoder("NATIVE", TranscoderBackend.NATIVE) + if t_native: + try: + hdr_n = t_native.decode_rgba_hdr(ktx2_blob) + print(" Native decoded:", hdr_n.shape, hdr_n.dtype) + save_exr("decoded_native_hdr.exr", hdr_n) + except Exception as e: + print(" [FAIL] Native decode failed:", e) + + # ------------------------------------------------------- + # DECODE using WASM if available + # ------------------------------------------------------- + t_wasm = try_transcoder("WASM", TranscoderBackend.WASM) + if t_wasm: + try: + hdr_w = t_wasm.decode_rgba_hdr(ktx2_blob) + print(" WASM decoded:", hdr_w.shape, hdr_w.dtype) + save_exr("decoded_wasm_hdr.exr", hdr_w) + except Exception as e: + print(" [FAIL] WASM decode failed:", e) + + print("\n========== DONE ==========") diff --git a/python/tests/test_transcoder_helpers.py b/python/tests/test_transcoder_helpers.py new file mode 100644 index 0000000..31245c0 --- /dev/null +++ b/python/tests/test_transcoder_helpers.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +import sys +import numpy as np + +from basisu_py.transcoder import Transcoder, TranscoderBackend +from basisu_py.constants import BasisTexFormat, TranscoderTextureFormat + +print("========== TESTING TRANSCODER HELPERS & METADATA ==========\n") + +# ---------------------------------------------------------------------------- +# Load test KTX2 file +# ---------------------------------------------------------------------------- +try: + ktx2_bytes = open("test.ktx2", 
"rb").read() + print("[INFO] Loaded test.ktx2") +except FileNotFoundError: + print("[ERROR] test.ktx2 not found. Run encoder tests first.") + sys.exit(1) + + +# ---------------------------------------------------------------------------- +# Utility: run helper tests on a given backend +# ---------------------------------------------------------------------------- +def test_backend(name, backend): + print(f"\n=== Testing {name} backend ===") + + try: + t = Transcoder(backend) + except Exception as e: + print(f"[FAIL] Could not initialize {name} backend:", e) + return + + print(f"[OK] {name} backend loaded") + + # Version + try: + ver = t.get_version() + print(f" version = {ver}") + except Exception as e: + print(" [FAIL] get_version() error:", e) + return + + # enable_debug_printf + try: + t.enable_debug_printf(True) + except Exception as e: + print(" [FAIL] enable_debug_printf() failed") + return + + # Open KTX2 + try: + raw = t.open(ktx2_bytes) + print(" [OK] open() success") + except Exception as e: + print(" [FAIL] open() failed:", e) + return + + # ---------------------------------------------------------------------- + # KTX2 top-level metadata + # ---------------------------------------------------------------------- + try: + w = t.get_width(raw) + h = t.get_height(raw) + lv = t.get_levels(raw) + fc = t.get_faces(raw) + la = t.get_layers(raw) + fmt = t.get_basis_tex_format(raw) + + print(f" Width = {w}") + print(f" Height = {h}") + print(f" Levels = {lv}") + print(f" Faces = {fc}") + print(f" Layers = {la}") + print(f" basis_tex_format = {fmt}") + print(f" has_alpha = {t.has_alpha(raw)}") + print(f" is_hdr = {t.is_hdr(raw)}") + print(f" is_ldr = {t.is_ldr(raw)}") + print(f" is_srgb = {t.is_srgb(raw)}") + print(f" is_etc1s = {t.is_etc1s(raw)}") + print(f" is_uastc_ldr_4x4 = {t.is_uastc_ldr_4x4(raw)}") + print(f" is_xuastc_ldr = {t.is_xuastc_ldr(raw)}") + print(f" is_astc_ldr = {t.is_astc_ldr(raw)}") + print(f" block dims = {t.get_block_width(raw)} x 
{t.get_block_height(raw)}") + + except Exception as e: + print(" [FAIL] get_* metadata error:", e) + t.close(raw) + return + + # ---------------------------------------------------------------------- + # Per-level metadata for each mipmap + # ---------------------------------------------------------------------- + print("\n -- Level Metadata --") + for level in range(lv): + try: + ow = t.get_level_orig_width(raw, level) + oh = t.get_level_orig_height(raw, level) + nbx = t.get_level_num_blocks_x(raw, level) + nby = t.get_level_num_blocks_y(raw, level) + tb = t.get_level_total_blocks(raw, level) + af = t.get_level_alpha_flag(raw, level) + ff = t.get_level_iframe_flag(raw, level) + + print(f" Level {level}: orig={ow}x{oh}, blocks={nbx}x{nby}, total={tb}, alpha={af}, iframe={ff}") + except Exception as e: + print(f" [FAIL] Level {level} metadata error:", e) + + # ---------------------------------------------------------------------- + # Test ALL basis_tex_format helpers on the file's format + # ---------------------------------------------------------------------- + print("\n -- basis_tex_format helpers --") + + try: + print(f" is_xuastc_ldr = {t.basis_tex_format_is_xuastc_ldr(fmt)}") + print(f" is_astc_ldr = {t.basis_tex_format_is_astc_ldr(fmt)}") + print(f" block W/H = {t.basis_tex_format_get_block_width(fmt)} x " + f"{t.basis_tex_format_get_block_height(fmt)}") + print(f" is_hdr = {t.basis_tex_format_is_hdr(fmt)}") + print(f" is_ldr = {t.basis_tex_format_is_ldr(fmt)}") + except Exception as e: + print(" [FAIL] basis_tex_format_* error:", e) + + # ---------------------------------------------------------------------- + # Test transcoder_texture_format helpers using a few common formats + # ---------------------------------------------------------------------- + print("\n -- transcoder_texture_format helpers --") + + test_formats = [ + TranscoderTextureFormat.TF_RGBA32, + TranscoderTextureFormat.TF_RGBA_HALF, + TranscoderTextureFormat.TF_BC7_RGBA, + 
TranscoderTextureFormat.TF_ETC1_RGB, + ] + + for tfmt in test_formats: + try: + print(f" Format {tfmt}: hdr={t.basis_transcoder_format_is_hdr(tfmt)}, " + f"ldr={t.basis_transcoder_format_is_ldr(tfmt)}, " + f"has_alpha={t.basis_transcoder_format_has_alpha(tfmt)}, " + f"uncompressed={t.basis_transcoder_format_is_uncompressed(tfmt)}, " + f"bytes/pixel or block={t.basis_get_bytes_per_block_or_pixel(tfmt)}") + except Exception as e: + print(" [FAIL] transcoder_texture_format_* error:", e) + + # ---------------------------------------------------------------------- + # Compute transcode buffer sizes + # ---------------------------------------------------------------------- + print("\n -- compute_transcoded_image_size_in_bytes --") + try: + for tfmt in test_formats: + sz = t.basis_compute_transcoded_image_size_in_bytes(tfmt, w, h) + print(f" Format {tfmt}: size = {sz}") + except Exception as e: + print(" [FAIL] size computation error:", e) + + # ---------------------------------------------------------------------- + # Decode RGBA (LDR) + # ---------------------------------------------------------------------- + print("\n -- decode_rgba --") + try: + img_rgba = t.decode_rgba(ktx2_bytes) + print(f" decode_rgba: shape={img_rgba.shape}, dtype={img_rgba.dtype}") + except Exception as e: + print(" [FAIL] decode_rgba error:", e) + + # ---------------------------------------------------------------------- + # Decode HDR if applicable + # ---------------------------------------------------------------------- + if t.is_hdr(raw): + print("\n -- decode_rgba_hdr --") + try: + img_hdr = t.decode_rgba_hdr(ktx2_bytes) + print(f" decode_rgba_hdr: shape={img_hdr.shape}, dtype={img_hdr.dtype}") + except Exception as e: + print(" [FAIL] decode_rgba_hdr error:", e) + else: + print(" Texture is LDR; skipping decode_rgba_hdr().") + + # Cleanup + t.close(raw) + print(f"\n=== {name} backend OK ===\n") + + +# ---------------------------------------------------------------------------- +# Run 
tests for both backends +# ---------------------------------------------------------------------------- +test_backend("NATIVE", TranscoderBackend.NATIVE) +test_backend("WASM", TranscoderBackend.WASM) + +print("\n========== DONE ==========\n") diff --git a/transcoder/basisu_astc_cfgs.inl b/transcoder/basisu_astc_cfgs.inl new file mode 100644 index 0000000..b9f138f --- /dev/null +++ b/transcoder/basisu_astc_cfgs.inl @@ -0,0 +1,648 @@ +const uint32_t BU_TOTAL_ASTC_CFGS = 10311; +const uint8_t s_astc_cfg_table[BU_TOTAL_ASTC_CFGS*3] = { +176,72,0,208,72,0,240,72,0,16,73,0,48,73,0,80,73,0,112,73,0,176,130,0,208,130,0,240,130,0,16,131,0,48,131,0,80,131,0,112,131,0,176,132,0,208,132,0, +240,132,0,16,133,0,48,133,0,80,133,0,112,133,0,176,134,0,208,134,0,240,134,0,16,135,0,48,135,0,80,135,0,112,135,0,176,194,0,208,194,0,240,194,0,16,195,0, +48,195,0,80,195,0,112,195,0,176,196,0,208,196,0,240,196,0,16,197,0,48,197,0,80,197,0,112,197,0,176,198,0,208,198,0,240,198,0,16,199,0,48,199,0,80,199,0, +112,199,0,176,2,1,208,2,1,240,2,1,16,3,1,48,3,1,80,3,1,112,3,1,176,4,1,208,4,1,240,4,1,16,5,1,48,5,1,80,5,1,112,5,1,176,6,1, +208,6,1,240,6,1,16,7,1,48,7,1,80,7,1,112,7,1,176,8,1,208,8,1,240,8,1,16,9,1,48,9,1,80,9,1,112,9,1,176,66,1,208,66,1,240,66,1, +16,67,1,48,67,1,80,67,1,112,67,1,176,68,1,208,68,1,240,68,1,16,69,1,48,69,1,80,69,1,112,69,1,176,70,1,208,70,1,240,70,1,16,71,1,48,71,1, +80,71,1,112,71,1,176,72,1,208,72,1,240,72,1,16,73,1,48,73,1,80,73,1,112,73,1,16,1,2,48,1,2,80,1,2,112,1,2,16,17,2,48,17,2,80,17,2, +112,17,2,16,33,2,48,33,2,80,33,2,112,33,2,16,65,2,48,65,2,80,65,2,112,65,2,80,72,2,112,72,2,144,72,2,176,72,2,208,72,2,240,72,2,16,73,2, +48,73,2,80,73,2,112,73,2,16,81,2,48,81,2,80,81,2,112,81,2,10,97,2,42,97,2,73,97,2,105,97,2,16,129,2,48,129,2,80,129,2,112,129,2,80,130,2, +112,130,2,144,130,2,176,130,2,208,130,2,240,130,2,16,131,2,48,131,2,80,131,2,112,131,2,80,132,2,112,132,2,144,132,2,176,132,2,208,132,2,240,132,2,16,133,2, 
+48,133,2,80,133,2,112,133,2,80,134,2,112,134,2,144,134,2,176,134,2,208,134,2,240,134,2,16,135,2,48,135,2,80,135,2,112,135,2,16,145,2,48,145,2,80,145,2, +112,145,2,10,161,2,42,161,2,73,161,2,105,161,2,16,193,2,48,193,2,80,193,2,112,193,2,80,194,2,112,194,2,144,194,2,176,194,2,208,194,2,240,194,2,16,195,2, +48,195,2,80,195,2,112,195,2,80,196,2,112,196,2,144,196,2,176,196,2,208,196,2,240,196,2,16,197,2,48,197,2,80,197,2,112,197,2,80,198,2,112,198,2,144,198,2, +176,198,2,208,198,2,240,198,2,16,199,2,48,199,2,80,199,2,112,199,2,10,209,2,42,209,2,73,209,2,105,209,2,4,225,2,36,225,2,67,225,2,99,225,2,16,1,3, +48,1,3,80,1,3,112,1,3,80,2,3,112,2,3,144,2,3,176,2,3,208,2,3,240,2,3,16,3,3,48,3,3,80,3,3,112,3,3,80,4,3,112,4,3,144,4,3, +176,4,3,208,4,3,240,4,3,16,5,3,48,5,3,80,5,3,112,5,3,80,6,3,112,6,3,144,6,3,176,6,3,208,6,3,240,6,3,16,7,3,48,7,3,80,7,3, +112,7,3,80,8,3,112,8,3,144,8,3,176,8,3,208,8,3,240,8,3,16,9,3,48,9,3,80,9,3,112,9,3,10,17,3,42,17,3,73,17,3,105,17,3,4,33,3, +36,33,3,67,33,3,99,33,3,16,65,3,48,65,3,80,65,3,112,65,3,80,66,3,112,66,3,144,66,3,176,66,3,208,66,3,240,66,3,15,67,3,45,67,3,76,67,3, +106,67,3,80,68,3,112,68,3,144,68,3,176,68,3,208,68,3,240,68,3,15,69,3,45,69,3,76,69,3,106,69,3,80,70,3,112,70,3,144,70,3,176,70,3,208,70,3, +240,70,3,15,71,3,45,71,3,76,71,3,106,71,3,80,72,3,112,72,3,144,72,3,176,72,3,208,72,3,240,72,3,15,73,3,45,73,3,76,73,3,106,73,3,6,81,3, +37,81,3,69,81,3,100,81,3,176,0,4,208,0,4,240,0,4,16,1,4,48,1,4,80,1,4,112,1,4,176,16,4,208,16,4,240,16,4,16,17,4,48,17,4,80,17,4, +112,17,4,176,32,4,208,32,4,240,32,4,16,33,4,48,33,4,80,33,4,112,33,4,176,64,4,208,64,4,240,64,4,16,65,4,48,65,4,80,65,4,112,65,4,48,72,4, +80,72,4,112,72,4,144,72,4,176,72,4,208,72,4,240,72,4,16,73,4,48,73,4,80,73,4,109,73,4,176,80,4,208,80,4,240,80,4,16,81,4,48,81,4,79,81,4, +110,81,4,170,96,4,202,96,4,233,96,4,8,97,4,40,97,4,71,97,4,102,97,4,176,128,4,208,128,4,240,128,4,16,129,4,48,129,4,80,129,4,112,129,4,48,130,4, 
+80,130,4,112,130,4,144,130,4,176,130,4,208,130,4,240,130,4,16,131,4,48,131,4,80,131,4,109,131,4,48,132,4,80,132,4,112,132,4,144,132,4,176,132,4,208,132,4, +240,132,4,16,133,4,48,133,4,80,133,4,109,133,4,48,134,4,80,134,4,112,134,4,144,134,4,176,134,4,208,134,4,240,134,4,16,135,4,48,135,4,80,135,4,109,135,4, +176,144,4,208,144,4,240,144,4,16,145,4,48,145,4,79,145,4,110,145,4,170,160,4,202,160,4,233,160,4,8,161,4,40,161,4,71,161,4,102,161,4,176,192,4,208,192,4, +240,192,4,16,193,4,48,193,4,80,193,4,112,193,4,48,194,4,80,194,4,112,194,4,144,194,4,176,194,4,208,194,4,240,194,4,14,195,4,43,195,4,73,195,4,102,195,4, +48,196,4,80,196,4,112,196,4,144,196,4,176,196,4,208,196,4,240,196,4,14,197,4,43,197,4,73,197,4,102,197,4,48,198,4,80,198,4,112,198,4,144,198,4,176,198,4, +208,198,4,240,198,4,14,199,4,43,199,4,73,199,4,102,199,4,170,208,4,202,208,4,233,208,4,8,209,4,40,209,4,71,209,4,102,209,4,164,224,4,196,224,4,227,224,4, +3,225,4,34,225,4,66,225,4,97,225,4,176,0,5,208,0,5,240,0,5,16,1,5,48,1,5,80,1,5,112,1,5,48,2,5,80,2,5,112,2,5,144,2,5,176,2,5, +208,2,5,240,2,5,14,3,5,43,3,5,73,3,5,102,3,5,48,4,5,80,4,5,112,4,5,144,4,5,176,4,5,208,4,5,240,4,5,14,5,5,43,5,5,73,5,5, +102,5,5,48,6,5,80,6,5,112,6,5,144,6,5,176,6,5,208,6,5,240,6,5,14,7,5,43,7,5,73,7,5,102,7,5,48,8,5,80,8,5,112,8,5,144,8,5, +176,8,5,208,8,5,240,8,5,14,9,5,43,9,5,73,9,5,102,9,5,170,16,5,202,16,5,233,16,5,8,17,5,40,17,5,71,17,5,102,17,5,164,32,5,196,32,5, +227,32,5,3,33,5,34,33,5,66,33,5,97,33,5,176,64,5,208,64,5,240,64,5,16,65,5,48,65,5,80,65,5,112,65,5,48,66,5,80,66,5,112,66,5,144,66,5, +175,66,5,204,66,5,235,66,5,9,67,5,38,67,5,69,67,5,99,67,5,48,68,5,80,68,5,112,68,5,144,68,5,175,68,5,204,68,5,235,68,5,9,69,5,38,69,5, +69,69,5,99,69,5,48,70,5,80,70,5,112,70,5,144,70,5,175,70,5,204,70,5,235,70,5,9,71,5,38,71,5,69,71,5,99,71,5,48,72,5,80,72,5,112,72,5, +144,72,5,175,72,5,204,72,5,235,72,5,9,73,5,38,73,5,69,73,5,99,73,5,166,80,5,197,80,5,229,80,5,4,81,5,36,81,5,67,81,5,99,81,5,112,0,6, 
+144,0,6,176,0,6,208,0,6,240,0,6,16,1,6,48,1,6,80,1,6,112,1,6,112,16,6,144,16,6,176,16,6,208,16,6,240,16,6,16,17,6,48,17,6,80,17,6, +112,17,6,112,32,6,144,32,6,176,32,6,208,32,6,240,32,6,16,33,6,48,33,6,80,33,6,112,33,6,112,64,6,144,64,6,176,64,6,208,64,6,240,64,6,16,65,6, +48,65,6,80,65,6,112,65,6,48,72,6,80,72,6,112,72,6,144,72,6,176,72,6,208,72,6,240,72,6,13,73,6,40,73,6,68,73,6,112,80,6,144,80,6,176,80,6, +208,80,6,239,80,6,14,81,6,44,81,6,76,81,6,106,81,6,106,96,6,138,96,6,169,96,6,200,96,6,231,96,6,6,97,6,37,97,6,69,97,6,100,97,6,112,128,6, +144,128,6,176,128,6,208,128,6,240,128,6,16,129,6,48,129,6,80,129,6,112,129,6,48,130,6,80,130,6,112,130,6,144,130,6,176,130,6,208,130,6,240,130,6,13,131,6, +40,131,6,68,131,6,48,132,6,80,132,6,112,132,6,144,132,6,176,132,6,208,132,6,240,132,6,13,133,6,40,133,6,68,133,6,48,134,6,80,134,6,112,134,6,144,134,6, +176,134,6,208,134,6,240,134,6,13,135,6,40,135,6,68,135,6,112,144,6,144,144,6,176,144,6,208,144,6,239,144,6,14,145,6,44,145,6,76,145,6,106,145,6,106,160,6, +138,160,6,169,160,6,200,160,6,231,160,6,6,161,6,37,161,6,69,161,6,100,161,6,112,192,6,144,192,6,176,192,6,208,192,6,240,192,6,16,193,6,48,193,6,80,193,6, +112,193,6,48,194,6,80,194,6,112,194,6,144,194,6,176,194,6,205,194,6,234,194,6,6,195,6,35,195,6,64,195,6,48,196,6,80,196,6,112,196,6,144,196,6,176,196,6, +205,196,6,234,196,6,6,197,6,35,197,6,64,197,6,48,198,6,80,198,6,112,198,6,144,198,6,176,198,6,205,198,6,234,198,6,6,199,6,35,199,6,64,199,6,106,208,6, +138,208,6,169,208,6,200,208,6,231,208,6,6,209,6,37,209,6,69,209,6,100,209,6,100,224,6,132,224,6,163,224,6,195,224,6,226,224,6,1,225,6,33,225,6,64,225,6, +96,225,6,112,0,7,144,0,7,176,0,7,208,0,7,240,0,7,16,1,7,48,1,7,80,1,7,112,1,7,48,2,7,80,2,7,112,2,7,144,2,7,176,2,7,205,2,7, +234,2,7,6,3,7,35,3,7,64,3,7,48,4,7,80,4,7,112,4,7,144,4,7,176,4,7,205,4,7,234,4,7,6,5,7,35,5,7,64,5,7,48,6,7,80,6,7, +112,6,7,144,6,7,176,6,7,205,6,7,234,6,7,6,7,7,35,7,7,64,7,7,48,8,7,80,8,7,112,8,7,144,8,7,176,8,7,205,8,7,234,8,7,6,9,7, 
+35,9,7,64,9,7,106,16,7,138,16,7,169,16,7,200,16,7,231,16,7,6,17,7,37,17,7,69,17,7,100,17,7,100,32,7,132,32,7,163,32,7,195,32,7,226,32,7, +1,33,7,33,33,7,64,33,7,96,33,7,112,64,7,144,64,7,176,64,7,208,64,7,240,64,7,16,65,7,48,65,7,80,65,7,111,65,7,48,66,7,80,66,7,111,66,7, +141,66,7,170,66,7,199,66,7,230,66,7,3,67,7,32,67,7,48,68,7,80,68,7,111,68,7,141,68,7,170,68,7,199,68,7,230,68,7,3,69,7,32,69,7,48,70,7, +80,70,7,111,70,7,141,70,7,170,70,7,199,70,7,230,70,7,3,71,7,32,71,7,48,72,7,80,72,7,111,72,7,141,72,7,170,72,7,199,72,7,230,72,7,3,73,7, +32,73,7,102,80,7,133,80,7,164,80,7,196,80,7,227,80,7,3,81,7,34,81,7,65,81,7,97,81,7,80,0,8,112,0,8,144,0,8,176,0,8,208,0,8,240,0,8, +16,1,8,48,1,8,80,1,8,112,1,8,80,16,8,112,16,8,144,16,8,176,16,8,208,16,8,240,16,8,16,17,8,48,17,8,80,17,8,112,17,8,80,32,8,112,32,8, +144,32,8,176,32,8,208,32,8,240,32,8,16,33,8,47,33,8,77,33,8,107,33,8,80,64,8,112,64,8,144,64,8,176,64,8,208,64,8,240,64,8,16,65,8,48,65,8, +80,65,8,112,65,8,16,72,8,48,72,8,80,72,8,112,72,8,144,72,8,176,72,8,205,72,8,232,72,8,1,73,8,80,80,8,112,80,8,144,80,8,175,80,8,206,80,8, +236,80,8,11,81,8,41,81,8,72,81,8,102,81,8,74,96,8,105,96,8,136,96,8,167,96,8,198,96,8,229,96,8,4,97,8,35,97,8,66,97,8,97,97,8,80,128,8, +112,128,8,144,128,8,176,128,8,208,128,8,240,128,8,16,129,8,48,129,8,80,129,8,112,129,8,16,130,8,48,130,8,80,130,8,112,130,8,144,130,8,176,130,8,205,130,8, +232,130,8,1,131,8,16,132,8,48,132,8,80,132,8,112,132,8,144,132,8,176,132,8,205,132,8,232,132,8,1,133,8,16,134,8,48,134,8,80,134,8,112,134,8,144,134,8, +176,134,8,205,134,8,232,134,8,1,135,8,80,144,8,112,144,8,144,144,8,175,144,8,206,144,8,236,144,8,11,145,8,41,145,8,72,145,8,102,145,8,74,160,8,105,160,8, +136,160,8,167,160,8,198,160,8,229,160,8,4,161,8,35,161,8,66,161,8,97,161,8,80,192,8,112,192,8,144,192,8,176,192,8,208,192,8,240,192,8,16,193,8,48,193,8, 
+80,193,8,112,193,8,16,194,8,48,194,8,80,194,8,112,194,8,143,194,8,170,194,8,198,194,8,227,194,8,16,196,8,48,196,8,80,196,8,112,196,8,143,196,8,170,196,8, +198,196,8,227,196,8,16,198,8,48,198,8,80,198,8,112,198,8,143,198,8,170,198,8,198,198,8,227,198,8,74,208,8,105,208,8,136,208,8,167,208,8,198,208,8,229,208,8, +4,209,8,35,209,8,66,209,8,97,209,8,68,224,8,99,224,8,131,224,8,162,224,8,193,224,8,225,224,8,0,225,8,32,225,8,80,0,9,112,0,9,144,0,9,176,0,9, +208,0,9,240,0,9,16,1,9,48,1,9,80,1,9,112,1,9,16,2,9,48,2,9,80,2,9,112,2,9,143,2,9,170,2,9,198,2,9,227,2,9,16,4,9,48,4,9, +80,4,9,112,4,9,143,4,9,170,4,9,198,4,9,227,4,9,16,6,9,48,6,9,80,6,9,112,6,9,143,6,9,170,6,9,198,6,9,227,6,9,16,8,9,48,8,9, +80,8,9,112,8,9,143,8,9,170,8,9,198,8,9,227,8,9,74,16,9,105,16,9,136,16,9,167,16,9,198,16,9,229,16,9,4,17,9,35,17,9,66,17,9,97,17,9, +68,32,9,99,32,9,131,32,9,162,32,9,193,32,9,225,32,9,0,33,9,32,33,9,80,64,9,112,64,9,144,64,9,176,64,9,208,64,9,240,64,9,15,65,9,46,65,9, +76,65,9,107,65,9,16,66,9,48,66,9,79,66,9,108,66,9,137,66,9,166,66,9,195,66,9,224,66,9,16,68,9,48,68,9,79,68,9,108,68,9,137,68,9,166,68,9, +195,68,9,224,68,9,16,70,9,48,70,9,79,70,9,108,70,9,137,70,9,166,70,9,195,70,9,224,70,9,16,72,9,48,72,9,79,72,9,108,72,9,137,72,9,166,72,9, +195,72,9,224,72,9,70,80,9,101,80,9,132,80,9,163,80,9,195,80,9,226,80,9,1,81,9,32,81,9,64,81,9,80,0,10,112,0,10,144,0,10,176,0,10,208,0,10, +240,0,10,16,1,10,48,1,10,80,1,10,112,1,10,80,16,10,112,16,10,144,16,10,176,16,10,208,16,10,240,16,10,16,17,10,48,17,10,80,17,10,109,17,10,80,32,10, +112,32,10,144,32,10,176,32,10,208,32,10,240,32,10,13,33,10,43,33,10,73,33,10,102,33,10,80,64,10,112,64,10,144,64,10,176,64,10,208,64,10,240,64,10,16,65,10, +48,65,10,80,65,10,112,65,10,16,72,10,48,72,10,80,72,10,112,72,10,144,72,10,170,72,10,195,72,10,80,80,10,112,80,10,143,80,10,173,80,10,203,80,10,234,80,10, 
+8,81,10,38,81,10,68,81,10,99,81,10,73,96,10,104,96,10,135,96,10,166,96,10,197,96,10,228,96,10,2,97,10,33,97,10,64,97,10,80,128,10,112,128,10,144,128,10, +176,128,10,208,128,10,240,128,10,16,129,10,48,129,10,80,129,10,112,129,10,16,130,10,48,130,10,80,130,10,112,130,10,144,130,10,170,130,10,195,130,10,16,132,10,48,132,10, +80,132,10,112,132,10,144,132,10,170,132,10,195,132,10,16,134,10,48,134,10,80,134,10,112,134,10,144,134,10,170,134,10,195,134,10,80,144,10,112,144,10,143,144,10,173,144,10, +203,144,10,234,144,10,8,145,10,38,145,10,68,145,10,99,145,10,73,160,10,104,160,10,135,160,10,166,160,10,197,160,10,228,160,10,2,161,10,33,161,10,64,161,10,80,192,10, +112,192,10,144,192,10,176,192,10,208,192,10,240,192,10,16,193,10,48,193,10,79,193,10,108,193,10,16,194,10,48,194,10,80,194,10,109,194,10,138,194,10,164,194,10,16,196,10, +48,196,10,80,196,10,109,196,10,138,196,10,164,196,10,16,198,10,48,198,10,80,198,10,109,198,10,138,198,10,164,198,10,73,208,10,104,208,10,135,208,10,166,208,10,197,208,10, +228,208,10,2,209,10,33,209,10,64,209,10,67,224,10,99,224,10,130,224,10,161,224,10,192,224,10,224,224,10,80,0,11,112,0,11,144,0,11,176,0,11,208,0,11,240,0,11, +16,1,11,48,1,11,79,1,11,108,1,11,16,2,11,48,2,11,80,2,11,109,2,11,138,2,11,164,2,11,16,4,11,48,4,11,80,4,11,109,4,11,138,4,11,164,4,11, +16,6,11,48,6,11,80,6,11,109,6,11,138,6,11,164,6,11,16,8,11,48,8,11,80,8,11,109,8,11,138,8,11,164,8,11,73,16,11,104,16,11,135,16,11,166,16,11, +197,16,11,228,16,11,2,17,11,33,17,11,64,17,11,67,32,11,99,32,11,130,32,11,161,32,11,192,32,11,224,32,11,80,64,11,112,64,11,144,64,11,176,64,11,208,64,11, +238,64,11,12,65,11,42,65,11,73,65,11,103,65,11,16,66,11,48,66,11,76,66,11,104,66,11,133,66,11,161,66,11,16,68,11,48,68,11,76,68,11,104,68,11,133,68,11, +161,68,11,16,70,11,48,70,11,76,70,11,104,70,11,133,70,11,161,70,11,16,72,11,48,72,11,76,72,11,104,72,11,133,72,11,161,72,11,69,80,11,100,80,11,131,80,11, 
+162,80,11,193,80,11,225,80,11,0,81,11,48,0,12,80,0,12,112,0,12,144,0,12,176,0,12,208,0,12,240,0,12,16,1,12,48,1,12,80,1,12,112,1,12,48,16,12, +80,16,12,112,16,12,144,16,12,176,16,12,208,16,12,240,16,12,16,17,12,45,17,12,74,17,12,102,17,12,48,32,12,80,32,12,112,32,12,144,32,12,176,32,12,206,32,12, +236,32,12,9,33,12,38,33,12,68,33,12,97,33,12,48,64,12,80,64,12,112,64,12,144,64,12,176,64,12,208,64,12,240,64,12,16,65,12,48,65,12,80,65,12,111,65,12, +16,72,12,48,72,12,80,72,12,112,72,12,138,72,12,161,72,12,48,80,12,80,80,12,111,80,12,141,80,12,171,80,12,201,80,12,231,80,12,5,81,12,35,81,12,65,81,12, +42,96,12,72,96,12,103,96,12,134,96,12,164,96,12,195,96,12,226,96,12,0,97,12,48,128,12,80,128,12,112,128,12,144,128,12,176,128,12,208,128,12,240,128,12,16,129,12, +48,129,12,80,129,12,111,129,12,16,130,12,48,130,12,80,130,12,112,130,12,138,130,12,161,130,12,16,132,12,48,132,12,80,132,12,112,132,12,138,132,12,161,132,12,16,134,12, +48,134,12,80,134,12,112,134,12,138,134,12,161,134,12,48,144,12,80,144,12,111,144,12,141,144,12,171,144,12,201,144,12,231,144,12,5,145,12,35,145,12,65,145,12,42,160,12, +72,160,12,103,160,12,134,160,12,164,160,12,195,160,12,226,160,12,0,161,12,48,192,12,80,192,12,112,192,12,144,192,12,176,192,12,208,192,12,240,192,12,15,193,12,44,193,12, +74,193,12,103,193,12,16,194,12,48,194,12,78,194,12,105,194,12,132,194,12,16,196,12,48,196,12,78,196,12,105,196,12,132,196,12,16,198,12,48,198,12,78,198,12,105,198,12, +132,198,12,42,208,12,72,208,12,103,208,12,134,208,12,164,208,12,195,208,12,226,208,12,0,209,12,36,224,12,67,224,12,98,224,12,129,224,12,160,224,12,48,0,13,80,0,13, +112,0,13,144,0,13,176,0,13,208,0,13,240,0,13,15,1,13,44,1,13,74,1,13,103,1,13,16,2,13,48,2,13,78,2,13,105,2,13,132,2,13,16,4,13,48,4,13, +78,4,13,105,4,13,132,4,13,16,6,13,48,6,13,78,6,13,105,6,13,132,6,13,16,8,13,48,8,13,78,8,13,105,8,13,132,8,13,42,16,13,72,16,13,103,16,13, 
+134,16,13,164,16,13,195,16,13,226,16,13,0,17,13,36,32,13,67,32,13,98,32,13,129,32,13,160,32,13,48,64,13,80,64,13,112,64,13,144,64,13,175,64,13,205,64,13, +236,64,13,9,65,13,39,65,13,70,65,13,99,65,13,16,66,13,45,66,13,73,66,13,100,66,13,129,66,13,16,68,13,45,68,13,73,68,13,100,68,13,129,68,13,16,70,13, +45,70,13,73,70,13,100,70,13,129,70,13,16,72,13,45,72,13,73,72,13,100,72,13,129,72,13,37,80,13,68,80,13,99,80,13,130,80,13,161,80,13,192,80,13,48,0,14, +80,0,14,112,0,14,144,0,14,176,0,14,208,0,14,240,0,14,16,1,14,48,1,14,80,1,14,112,1,14,48,16,14,80,16,14,112,16,14,144,16,14,176,16,14,208,16,14, +240,16,14,12,17,14,39,17,14,68,17,14,48,32,14,80,32,14,112,32,14,144,32,14,174,32,14,203,32,14,233,32,14,5,33,14,34,33,14,64,33,14,48,64,14,80,64,14, +112,64,14,144,64,14,176,64,14,208,64,14,240,64,14,16,65,14,48,65,14,77,65,14,103,65,14,16,72,14,48,72,14,80,72,14,106,72,14,131,72,14,48,80,14,79,80,14, +109,80,14,139,80,14,169,80,14,198,80,14,228,80,14,2,81,14,32,81,14,41,96,14,71,96,14,102,96,14,133,96,14,163,96,14,193,96,14,224,96,14,48,128,14,80,128,14, +112,128,14,144,128,14,176,128,14,208,128,14,240,128,14,16,129,14,48,129,14,77,129,14,103,129,14,16,130,14,48,130,14,80,130,14,106,130,14,131,130,14,16,132,14,48,132,14, +80,132,14,106,132,14,131,132,14,16,134,14,48,134,14,80,134,14,106,134,14,131,134,14,48,144,14,79,144,14,109,144,14,139,144,14,169,144,14,198,144,14,228,144,14,2,145,14, +32,145,14,41,160,14,71,160,14,102,160,14,133,160,14,163,160,14,193,160,14,224,160,14,48,192,14,80,192,14,112,192,14,144,192,14,176,192,14,208,192,14,239,192,14,11,193,14, +40,193,14,70,193,14,98,193,14,16,194,14,48,194,14,74,194,14,100,194,14,16,196,14,48,196,14,74,196,14,100,196,14,16,198,14,48,198,14,74,198,14,100,198,14,41,208,14, +71,208,14,102,208,14,133,208,14,163,208,14,193,208,14,224,208,14,35,224,14,66,224,14,97,224,14,128,224,14,48,0,15,80,0,15,112,0,15,144,0,15,176,0,15,208,0,15, 
+239,0,15,11,1,15,40,1,15,70,1,15,98,1,15,16,2,15,48,2,15,74,2,15,100,2,15,16,4,15,48,4,15,74,4,15,100,4,15,16,6,15,48,6,15,74,6,15, +100,6,15,16,8,15,48,8,15,74,8,15,100,8,15,41,16,15,71,16,15,102,16,15,133,16,15,163,16,15,193,16,15,224,16,15,35,32,15,66,32,15,97,32,15,128,32,15, +48,64,15,80,64,15,112,64,15,144,64,15,173,64,15,203,64,15,233,64,15,6,65,15,36,65,15,66,65,15,96,65,15,16,66,15,43,66,15,70,66,15,97,66,15,16,68,15, +43,68,15,70,68,15,97,68,15,16,70,15,43,70,15,70,70,15,97,70,15,16,72,15,43,72,15,70,72,15,97,72,15,37,80,15,67,80,15,98,80,15,129,80,15,160,80,15, +48,0,16,80,0,16,112,0,16,144,0,16,176,0,16,208,0,16,240,0,16,16,1,16,48,1,16,80,1,16,48,16,16,80,16,16,112,16,16,144,16,16,176,16,16,208,16,16, +236,16,16,6,17,16,33,17,16,48,32,16,80,32,16,112,32,16,143,32,16,171,32,16,200,32,16,229,32,16,1,33,16,48,64,16,80,64,16,112,64,16,144,64,16,176,64,16, +208,64,16,240,64,16,15,65,16,42,65,16,70,65,16,16,72,16,48,72,16,77,72,16,99,72,16,48,80,16,78,80,16,107,80,16,137,80,16,166,80,16,196,80,16,226,80,16, +40,96,16,70,96,16,101,96,16,131,96,16,161,96,16,192,96,16,48,128,16,80,128,16,112,128,16,144,128,16,176,128,16,208,128,16,240,128,16,15,129,16,42,129,16,70,129,16, +16,130,16,48,130,16,77,130,16,99,130,16,16,132,16,48,132,16,77,132,16,99,132,16,16,134,16,48,134,16,77,134,16,99,134,16,48,144,16,78,144,16,107,144,16,137,144,16, +166,144,16,196,144,16,226,144,16,40,160,16,70,160,16,101,160,16,131,160,16,161,160,16,192,160,16,48,192,16,80,192,16,112,192,16,144,192,16,176,192,16,206,192,16,235,192,16, +7,193,16,36,193,16,65,193,16,16,194,16,46,194,16,70,194,16,16,196,16,46,196,16,70,196,16,16,198,16,46,198,16,70,198,16,40,208,16,70,208,16,101,208,16,131,208,16, +161,208,16,192,208,16,35,224,16,65,224,16,96,224,16,128,224,16,48,0,17,80,0,17,112,0,17,144,0,17,176,0,17,206,0,17,235,0,17,7,1,17,36,1,17,65,1,17, +16,2,17,46,2,17,70,2,17,16,4,17,46,4,17,70,4,17,16,6,17,46,6,17,70,6,17,16,8,17,46,8,17,70,8,17,40,16,17,70,16,17,101,16,17,131,16,17, 
+161,16,17,192,16,17,35,32,17,65,32,17,96,32,17,128,32,17,48,64,17,80,64,17,112,64,17,142,64,17,171,64,17,200,64,17,230,64,17,3,65,17,33,65,17,16,66,17, +41,66,17,67,66,17,16,68,17,41,68,17,67,68,17,16,70,17,41,70,17,67,70,17,16,72,17,41,72,17,67,72,17,36,80,17,67,80,17,97,80,17,128,80,17,48,0,18, +80,0,18,112,0,18,144,0,18,176,0,18,208,0,18,240,0,18,16,1,18,46,1,18,48,16,18,80,16,18,112,16,18,144,16,18,176,16,18,202,16,18,230,16,18,0,17,18, +48,32,18,80,32,18,111,32,18,140,32,18,168,32,18,196,32,18,225,32,18,48,64,18,80,64,18,112,64,18,144,64,18,176,64,18,208,64,18,239,64,18,9,65,18,35,65,18, +16,72,18,48,72,18,71,72,18,47,80,18,76,80,18,105,80,18,135,80,18,164,80,18,193,80,18,39,96,18,69,96,18,99,96,18,130,96,18,160,96,18,48,128,18,80,128,18, +112,128,18,144,128,18,176,128,18,208,128,18,239,128,18,9,129,18,35,129,18,16,130,18,48,130,18,71,130,18,16,132,18,48,132,18,71,132,18,16,134,18,48,134,18,71,134,18, +47,144,18,76,144,18,105,144,18,135,144,18,164,144,18,193,144,18,39,160,18,69,160,18,99,160,18,130,160,18,160,160,18,48,192,18,80,192,18,112,192,18,144,192,18,174,192,18, +202,192,18,231,192,18,3,193,18,16,194,18,43,194,18,66,194,18,16,196,18,43,196,18,66,196,18,16,198,18,43,198,18,66,198,18,39,208,18,69,208,18,99,208,18,130,208,18, +160,208,18,34,224,18,65,224,18,96,224,18,48,0,19,80,0,19,112,0,19,144,0,19,174,0,19,202,0,19,231,0,19,3,1,19,16,2,19,43,2,19,66,2,19,16,4,19, +43,4,19,66,4,19,16,6,19,43,6,19,66,6,19,16,8,19,43,8,19,66,8,19,39,16,19,69,16,19,99,16,19,130,16,19,160,16,19,34,32,19,65,32,19,96,32,19, +48,64,19,80,64,19,110,64,19,140,64,19,169,64,19,198,64,19,227,64,19,0,65,19,16,66,19,38,66,19,64,66,19,16,68,19,38,68,19,64,68,19,16,70,19,38,70,19, +64,70,19,16,72,19,38,72,19,64,72,19,35,80,19,66,80,19,96,80,19,16,0,20,48,0,20,80,0,20,112,0,20,144,0,20,176,0,20,208,0,20,240,0,20,14,1,20, +16,16,20,48,16,20,80,16,20,112,16,20,144,16,20,172,16,20,198,16,20,225,16,20,16,32,20,48,32,20,80,32,20,109,32,20,138,32,20,165,32,20,193,32,20,16,64,20, 
+48,64,20,80,64,20,112,64,20,144,64,20,176,64,20,207,64,20,234,64,20,3,65,20,16,72,20,48,72,20,65,72,20,16,80,20,46,80,20,75,80,20,104,80,20,133,80,20, +162,80,20,10,96,20,39,96,20,68,96,20,98,96,20,129,96,20,16,128,20,48,128,20,80,128,20,112,128,20,144,128,20,176,128,20,207,128,20,234,128,20,3,129,20,16,130,20, +48,130,20,65,130,20,16,132,20,48,132,20,65,132,20,16,134,20,48,134,20,65,134,20,16,144,20,46,144,20,75,144,20,104,144,20,133,144,20,162,144,20,10,160,20,39,160,20, +68,160,20,98,160,20,129,160,20,16,192,20,48,192,20,80,192,20,112,192,20,144,192,20,171,192,20,199,192,20,228,192,20,16,194,20,40,194,20,16,196,20,40,196,20,16,198,20, +40,198,20,10,208,20,39,208,20,68,208,20,98,208,20,129,208,20,4,224,20,34,224,20,64,224,20,16,0,21,48,0,21,80,0,21,112,0,21,144,0,21,171,0,21,199,0,21, +228,0,21,16,2,21,40,2,21,16,4,21,40,4,21,16,6,21,40,6,21,16,8,21,40,8,21,10,16,21,39,16,21,68,16,21,98,16,21,129,16,21,4,32,21,34,32,21, +64,32,21,16,64,21,48,64,21,79,64,21,108,64,21,138,64,21,166,64,21,195,64,21,225,64,21,15,66,21,36,66,21,15,68,21,36,68,21,15,70,21,36,70,21,15,72,21, +36,72,21,6,80,21,35,80,21,65,80,21,96,80,21,16,1,22,48,1,22,80,1,22,112,1,22,16,17,22,48,17,22,80,17,22,112,17,22,16,33,22,48,33,22,80,33,22, +112,33,22,16,65,22,48,65,22,80,65,22,112,65,22,80,72,22,112,72,22,144,72,22,176,72,22,208,72,22,240,72,22,16,73,22,48,73,22,80,73,22,112,73,22,16,81,22, +48,81,22,80,81,22,112,81,22,10,97,22,42,97,22,73,97,22,105,97,22,16,129,22,48,129,22,80,129,22,112,129,22,80,130,22,112,130,22,144,130,22,176,130,22,208,130,22, +240,130,22,16,131,22,48,131,22,80,131,22,112,131,22,80,132,22,112,132,22,144,132,22,176,132,22,208,132,22,240,132,22,16,133,22,48,133,22,80,133,22,112,133,22,80,134,22, +112,134,22,144,134,22,176,134,22,208,134,22,240,134,22,16,135,22,48,135,22,80,135,22,112,135,22,16,145,22,48,145,22,80,145,22,112,145,22,10,161,22,42,161,22,73,161,22, 
+105,161,22,16,193,22,48,193,22,80,193,22,112,193,22,80,194,22,112,194,22,144,194,22,176,194,22,208,194,22,240,194,22,16,195,22,48,195,22,80,195,22,112,195,22,80,196,22, +112,196,22,144,196,22,176,196,22,208,196,22,240,196,22,16,197,22,48,197,22,80,197,22,112,197,22,80,198,22,112,198,22,144,198,22,176,198,22,208,198,22,240,198,22,16,199,22, +48,199,22,80,199,22,112,199,22,10,209,22,42,209,22,73,209,22,105,209,22,4,225,22,36,225,22,67,225,22,99,225,22,16,1,23,48,1,23,80,1,23,112,1,23,80,2,23, +112,2,23,144,2,23,176,2,23,208,2,23,240,2,23,16,3,23,48,3,23,80,3,23,112,3,23,80,4,23,112,4,23,144,4,23,176,4,23,208,4,23,240,4,23,16,5,23, +48,5,23,80,5,23,112,5,23,80,6,23,112,6,23,144,6,23,176,6,23,208,6,23,240,6,23,16,7,23,48,7,23,80,7,23,112,7,23,80,8,23,112,8,23,144,8,23, +176,8,23,208,8,23,240,8,23,16,9,23,48,9,23,80,9,23,112,9,23,10,17,23,42,17,23,73,17,23,105,17,23,4,33,23,36,33,23,67,33,23,99,33,23,16,65,23, +48,65,23,80,65,23,112,65,23,80,66,23,112,66,23,144,66,23,176,66,23,208,66,23,240,66,23,15,67,23,45,67,23,76,67,23,106,67,23,80,68,23,112,68,23,144,68,23, +176,68,23,208,68,23,240,68,23,15,69,23,45,69,23,76,69,23,106,69,23,80,70,23,112,70,23,144,70,23,176,70,23,208,70,23,240,70,23,15,71,23,45,71,23,76,71,23, +106,71,23,80,72,23,112,72,23,144,72,23,176,72,23,208,72,23,240,72,23,15,73,23,45,73,23,76,73,23,106,73,23,6,81,23,37,81,23,69,81,23,100,81,23,144,0,24, +176,0,24,208,0,24,240,0,24,16,1,24,48,1,24,80,1,24,112,1,24,144,16,24,176,16,24,208,16,24,240,16,24,16,17,24,48,17,24,80,17,24,112,17,24,144,32,24, +176,32,24,208,32,24,240,32,24,16,33,24,48,33,24,80,33,24,112,33,24,144,64,24,176,64,24,208,64,24,240,64,24,16,65,24,48,65,24,80,65,24,112,65,24,48,72,24, +80,72,24,112,72,24,144,72,24,176,72,24,208,72,24,240,72,24,16,73,24,47,73,24,75,73,24,102,73,24,144,80,24,176,80,24,208,80,24,240,80,24,15,81,24,46,81,24, 
+77,81,24,108,81,24,138,96,24,170,96,24,201,96,24,232,96,24,7,97,24,39,97,24,70,97,24,101,97,24,144,128,24,176,128,24,208,128,24,240,128,24,16,129,24,48,129,24, +80,129,24,112,129,24,48,130,24,80,130,24,112,130,24,144,130,24,176,130,24,208,130,24,240,130,24,16,131,24,47,131,24,75,131,24,102,131,24,48,132,24,80,132,24,112,132,24, +144,132,24,176,132,24,208,132,24,240,132,24,16,133,24,47,133,24,75,133,24,102,133,24,48,134,24,80,134,24,112,134,24,144,134,24,176,134,24,208,134,24,240,134,24,16,135,24, +47,135,24,75,135,24,102,135,24,144,144,24,176,144,24,208,144,24,240,144,24,15,145,24,46,145,24,77,145,24,108,145,24,138,160,24,170,160,24,201,160,24,232,160,24,7,161,24, +39,161,24,70,161,24,101,161,24,144,192,24,176,192,24,208,192,24,240,192,24,16,193,24,48,193,24,80,193,24,112,193,24,48,194,24,80,194,24,112,194,24,144,194,24,176,194,24, +208,194,24,238,194,24,10,195,24,39,195,24,69,195,24,97,195,24,48,196,24,80,196,24,112,196,24,144,196,24,176,196,24,208,196,24,238,196,24,10,197,24,39,197,24,69,197,24, +97,197,24,48,198,24,80,198,24,112,198,24,144,198,24,176,198,24,208,198,24,238,198,24,10,199,24,39,199,24,69,199,24,97,199,24,138,208,24,170,208,24,201,208,24,232,208,24, +7,209,24,39,209,24,70,209,24,101,209,24,132,224,24,164,224,24,195,224,24,227,224,24,2,225,24,34,225,24,65,225,24,97,225,24,144,0,25,176,0,25,208,0,25,240,0,25, +16,1,25,48,1,25,80,1,25,112,1,25,48,2,25,80,2,25,112,2,25,144,2,25,176,2,25,208,2,25,238,2,25,10,3,25,39,3,25,69,3,25,97,3,25,48,4,25, +80,4,25,112,4,25,144,4,25,176,4,25,208,4,25,238,4,25,10,5,25,39,5,25,69,5,25,97,5,25,48,6,25,80,6,25,112,6,25,144,6,25,176,6,25,208,6,25, +238,6,25,10,7,25,39,7,25,69,7,25,97,7,25,48,8,25,80,8,25,112,8,25,144,8,25,176,8,25,208,8,25,238,8,25,10,9,25,39,9,25,69,9,25,97,9,25, +138,16,25,170,16,25,201,16,25,232,16,25,7,17,25,39,17,25,70,17,25,101,17,25,132,32,25,164,32,25,195,32,25,227,32,25,2,33,25,34,33,25,65,33,25,97,33,25, 
+144,64,25,176,64,25,208,64,25,240,64,25,16,65,25,48,65,25,80,65,25,112,65,25,48,66,25,80,66,25,112,66,25,143,66,25,172,66,25,202,66,25,232,66,25,6,67,25, +35,67,25,65,67,25,48,68,25,80,68,25,112,68,25,143,68,25,172,68,25,202,68,25,232,68,25,6,69,25,35,69,25,65,69,25,48,70,25,80,70,25,112,70,25,143,70,25, +172,70,25,202,70,25,232,70,25,6,71,25,35,71,25,65,71,25,48,72,25,80,72,25,112,72,25,143,72,25,172,72,25,202,72,25,232,72,25,6,73,25,35,73,25,65,73,25, +134,80,25,165,80,25,196,80,25,228,80,25,3,81,25,35,81,25,66,81,25,98,81,25,80,0,26,112,0,26,144,0,26,176,0,26,208,0,26,240,0,26,16,1,26,48,1,26, +80,1,26,112,1,26,80,16,26,112,16,26,144,16,26,176,16,26,208,16,26,240,16,26,16,17,26,48,17,26,80,17,26,112,17,26,80,32,26,112,32,26,144,32,26,176,32,26, +208,32,26,240,32,26,16,33,26,47,33,26,77,33,26,107,33,26,80,64,26,112,64,26,144,64,26,176,64,26,208,64,26,240,64,26,16,65,26,48,65,26,80,65,26,112,65,26, +16,72,26,48,72,26,80,72,26,112,72,26,144,72,26,176,72,26,205,72,26,232,72,26,1,73,26,80,80,26,112,80,26,144,80,26,175,80,26,206,80,26,236,80,26,11,81,26, +41,81,26,72,81,26,102,81,26,74,96,26,105,96,26,136,96,26,167,96,26,198,96,26,229,96,26,4,97,26,35,97,26,66,97,26,97,97,26,80,128,26,112,128,26,144,128,26, +176,128,26,208,128,26,240,128,26,16,129,26,48,129,26,80,129,26,112,129,26,16,130,26,48,130,26,80,130,26,112,130,26,144,130,26,176,130,26,205,130,26,232,130,26,1,131,26, +16,132,26,48,132,26,80,132,26,112,132,26,144,132,26,176,132,26,205,132,26,232,132,26,1,133,26,16,134,26,48,134,26,80,134,26,112,134,26,144,134,26,176,134,26,205,134,26, +232,134,26,1,135,26,80,144,26,112,144,26,144,144,26,175,144,26,206,144,26,236,144,26,11,145,26,41,145,26,72,145,26,102,145,26,74,160,26,105,160,26,136,160,26,167,160,26, +198,160,26,229,160,26,4,161,26,35,161,26,66,161,26,97,161,26,80,192,26,112,192,26,144,192,26,176,192,26,208,192,26,240,192,26,16,193,26,48,193,26,80,193,26,112,193,26, 
+16,194,26,48,194,26,80,194,26,112,194,26,143,194,26,170,194,26,198,194,26,227,194,26,16,196,26,48,196,26,80,196,26,112,196,26,143,196,26,170,196,26,198,196,26,227,196,26, +16,198,26,48,198,26,80,198,26,112,198,26,143,198,26,170,198,26,198,198,26,227,198,26,74,208,26,105,208,26,136,208,26,167,208,26,198,208,26,229,208,26,4,209,26,35,209,26, +66,209,26,97,209,26,68,224,26,99,224,26,131,224,26,162,224,26,193,224,26,225,224,26,0,225,26,32,225,26,80,0,27,112,0,27,144,0,27,176,0,27,208,0,27,240,0,27, +16,1,27,48,1,27,80,1,27,112,1,27,16,2,27,48,2,27,80,2,27,112,2,27,143,2,27,170,2,27,198,2,27,227,2,27,16,4,27,48,4,27,80,4,27,112,4,27, +143,4,27,170,4,27,198,4,27,227,4,27,16,6,27,48,6,27,80,6,27,112,6,27,143,6,27,170,6,27,198,6,27,227,6,27,16,8,27,48,8,27,80,8,27,112,8,27, +143,8,27,170,8,27,198,8,27,227,8,27,74,16,27,105,16,27,136,16,27,167,16,27,198,16,27,229,16,27,4,17,27,35,17,27,66,17,27,97,17,27,68,32,27,99,32,27, +131,32,27,162,32,27,193,32,27,225,32,27,0,33,27,32,33,27,80,64,27,112,64,27,144,64,27,176,64,27,208,64,27,240,64,27,15,65,27,46,65,27,76,65,27,107,65,27, +16,66,27,48,66,27,79,66,27,108,66,27,137,66,27,166,66,27,195,66,27,224,66,27,16,68,27,48,68,27,79,68,27,108,68,27,137,68,27,166,68,27,195,68,27,224,68,27, +16,70,27,48,70,27,79,70,27,108,70,27,137,70,27,166,70,27,195,70,27,224,70,27,16,72,27,48,72,27,79,72,27,108,72,27,137,72,27,166,72,27,195,72,27,224,72,27, +70,80,27,101,80,27,132,80,27,163,80,27,195,80,27,226,80,27,1,81,27,32,81,27,64,81,27,48,0,28,80,0,28,112,0,28,144,0,28,176,0,28,208,0,28,240,0,28, +16,1,28,48,1,28,80,1,28,112,1,28,48,16,28,80,16,28,112,16,28,144,16,28,176,16,28,208,16,28,240,16,28,16,17,28,48,17,28,78,17,28,106,17,28,48,32,28, +80,32,28,112,32,28,144,32,28,176,32,28,208,32,28,238,32,28,11,33,28,41,33,28,71,33,28,100,33,28,48,64,28,80,64,28,112,64,28,144,64,28,176,64,28,208,64,28, 
+240,64,28,16,65,28,48,65,28,80,65,28,112,65,28,16,72,28,48,72,28,80,72,28,112,72,28,143,72,28,166,72,28,48,80,28,80,80,28,112,80,28,142,80,28,172,80,28, +202,80,28,233,80,28,6,81,28,36,81,28,67,81,28,97,81,28,42,96,28,73,96,28,104,96,28,135,96,28,165,96,28,196,96,28,227,96,28,1,97,28,32,97,28,48,128,28, +80,128,28,112,128,28,144,128,28,176,128,28,208,128,28,240,128,28,16,129,28,48,129,28,80,129,28,112,129,28,16,130,28,48,130,28,80,130,28,112,130,28,143,130,28,166,130,28, +16,132,28,48,132,28,80,132,28,112,132,28,143,132,28,166,132,28,16,134,28,48,134,28,80,134,28,112,134,28,143,134,28,166,134,28,48,144,28,80,144,28,112,144,28,142,144,28, +172,144,28,202,144,28,233,144,28,6,145,28,36,145,28,67,145,28,97,145,28,42,160,28,73,160,28,104,160,28,135,160,28,165,160,28,196,160,28,227,160,28,1,161,28,32,161,28, +48,192,28,80,192,28,112,192,28,144,192,28,176,192,28,208,192,28,240,192,28,16,193,28,47,193,28,77,193,28,106,193,28,16,194,28,48,194,28,80,194,28,107,194,28,135,194,28, +161,194,28,16,196,28,48,196,28,80,196,28,107,196,28,135,196,28,161,196,28,16,198,28,48,198,28,80,198,28,107,198,28,135,198,28,161,198,28,42,208,28,73,208,28,104,208,28, +135,208,28,165,208,28,196,208,28,227,208,28,1,209,28,32,209,28,36,224,28,67,224,28,98,224,28,130,224,28,161,224,28,192,224,28,48,0,29,80,0,29,112,0,29,144,0,29, +176,0,29,208,0,29,240,0,29,16,1,29,47,1,29,77,1,29,106,1,29,16,2,29,48,2,29,80,2,29,107,2,29,135,2,29,161,2,29,16,4,29,48,4,29,80,4,29, +107,4,29,135,4,29,161,4,29,16,6,29,48,6,29,80,6,29,107,6,29,135,6,29,161,6,29,16,8,29,48,8,29,80,8,29,107,8,29,135,8,29,161,8,29,42,16,29, +73,16,29,104,16,29,135,16,29,165,16,29,196,16,29,227,16,29,1,17,29,32,17,29,36,32,29,67,32,29,98,32,29,130,32,29,161,32,29,192,32,29,48,64,29,80,64,29, +112,64,29,144,64,29,176,64,29,207,64,29,237,64,29,11,65,29,41,65,29,71,65,29,101,65,29,16,66,29,47,66,29,74,66,29,102,66,29,131,66,29,16,68,29,47,68,29, 
+74,68,29,102,68,29,131,68,29,16,70,29,47,70,29,74,70,29,102,70,29,131,70,29,16,72,29,47,72,29,74,72,29,102,72,29,131,72,29,38,80,29,68,80,29,100,80,29, +131,80,29,162,80,29,193,80,29,224,80,29,48,0,30,80,0,30,112,0,30,144,0,30,176,0,30,208,0,30,240,0,30,16,1,30,48,1,30,80,1,30,112,1,30,48,16,30, +80,16,30,112,16,30,144,16,30,176,16,30,208,16,30,240,16,30,12,17,30,39,17,30,68,17,30,48,32,30,80,32,30,112,32,30,144,32,30,174,32,30,203,32,30,233,32,30, +5,33,30,34,33,30,64,33,30,48,64,30,80,64,30,112,64,30,144,64,30,176,64,30,208,64,30,240,64,30,16,65,30,48,65,30,77,65,30,103,65,30,16,72,30,48,72,30, +80,72,30,106,72,30,131,72,30,48,80,30,79,80,30,109,80,30,139,80,30,169,80,30,198,80,30,228,80,30,2,81,30,32,81,30,41,96,30,71,96,30,102,96,30,133,96,30, +163,96,30,193,96,30,224,96,30,48,128,30,80,128,30,112,128,30,144,128,30,176,128,30,208,128,30,240,128,30,16,129,30,48,129,30,77,129,30,103,129,30,16,130,30,48,130,30, +80,130,30,106,130,30,131,130,30,16,132,30,48,132,30,80,132,30,106,132,30,131,132,30,16,134,30,48,134,30,80,134,30,106,134,30,131,134,30,48,144,30,79,144,30,109,144,30, +139,144,30,169,144,30,198,144,30,228,144,30,2,145,30,32,145,30,41,160,30,71,160,30,102,160,30,133,160,30,163,160,30,193,160,30,224,160,30,48,192,30,80,192,30,112,192,30, +144,192,30,176,192,30,208,192,30,239,192,30,11,193,30,40,193,30,70,193,30,98,193,30,16,194,30,48,194,30,74,194,30,100,194,30,16,196,30,48,196,30,74,196,30,100,196,30, +16,198,30,48,198,30,74,198,30,100,198,30,41,208,30,71,208,30,102,208,30,133,208,30,163,208,30,193,208,30,224,208,30,35,224,30,66,224,30,97,224,30,128,224,30,48,0,31, +80,0,31,112,0,31,144,0,31,176,0,31,208,0,31,239,0,31,11,1,31,40,1,31,70,1,31,98,1,31,16,2,31,48,2,31,74,2,31,100,2,31,16,4,31,48,4,31, +74,4,31,100,4,31,16,6,31,48,6,31,74,6,31,100,6,31,16,8,31,48,8,31,74,8,31,100,8,31,41,16,31,71,16,31,102,16,31,133,16,31,163,16,31,193,16,31, 
+224,16,31,35,32,31,66,32,31,97,32,31,128,32,31,48,64,31,80,64,31,112,64,31,144,64,31,173,64,31,203,64,31,233,64,31,6,65,31,36,65,31,66,65,31,96,65,31, +16,66,31,43,66,31,70,66,31,97,66,31,16,68,31,43,68,31,70,68,31,97,68,31,16,70,31,43,70,31,70,70,31,97,70,31,16,72,31,43,72,31,70,72,31,97,72,31, +37,80,31,67,80,31,98,80,31,129,80,31,160,80,31,48,0,32,80,0,32,112,0,32,144,0,32,176,0,32,208,0,32,240,0,32,16,1,32,48,1,32,48,16,32,80,16,32, +112,16,32,144,16,32,176,16,32,205,16,32,233,16,32,3,17,32,48,32,32,80,32,32,112,32,32,142,32,32,170,32,32,198,32,32,227,32,32,48,64,32,80,64,32,112,64,32, +144,64,32,176,64,32,208,64,32,240,64,32,12,65,32,39,65,32,16,72,32,48,72,32,74,72,32,48,80,32,77,80,32,106,80,32,136,80,32,165,80,32,195,80,32,224,80,32, +40,96,32,70,96,32,100,96,32,131,96,32,161,96,32,48,128,32,80,128,32,112,128,32,144,128,32,176,128,32,208,128,32,240,128,32,12,129,32,39,129,32,16,130,32,48,130,32, +74,130,32,16,132,32,48,132,32,74,132,32,16,134,32,48,134,32,74,134,32,48,144,32,77,144,32,106,144,32,136,144,32,165,144,32,195,144,32,224,144,32,40,160,32,70,160,32, +100,160,32,131,160,32,161,160,32,48,192,32,80,192,32,112,192,32,144,192,32,176,192,32,204,192,32,233,192,32,5,193,32,34,193,32,16,194,32,44,194,32,68,194,32,16,196,32, +44,196,32,68,196,32,16,198,32,44,198,32,68,198,32,40,208,32,70,208,32,100,208,32,131,208,32,161,208,32,35,224,32,65,224,32,96,224,32,48,0,33,80,0,33,112,0,33, +144,0,33,176,0,33,204,0,33,233,0,33,5,1,33,34,1,33,16,2,33,44,2,33,68,2,33,16,4,33,44,4,33,68,4,33,16,6,33,44,6,33,68,6,33,16,8,33, +44,8,33,68,8,33,40,16,33,70,16,33,100,16,33,131,16,33,161,16,33,35,32,33,65,32,33,96,32,33,48,64,33,80,64,33,111,64,33,141,64,33,170,64,33,199,64,33, +229,64,33,2,65,33,16,66,33,39,66,33,65,66,33,16,68,33,39,68,33,65,68,33,16,70,33,39,70,33,65,70,33,16,72,33,39,72,33,65,72,33,36,80,33,66,80,33, +97,80,33,128,80,33,16,0,34,48,0,34,80,0,34,112,0,34,144,0,34,176,0,34,208,0,34,240,0,34,14,1,34,16,16,34,48,16,34,80,16,34,112,16,34,144,16,34, 
+172,16,34,198,16,34,225,16,34,16,32,34,48,32,34,80,32,34,109,32,34,138,32,34,165,32,34,193,32,34,16,64,34,48,64,34,80,64,34,112,64,34,144,64,34,176,64,34, +207,64,34,234,64,34,3,65,34,16,72,34,48,72,34,65,72,34,16,80,34,46,80,34,75,80,34,104,80,34,133,80,34,162,80,34,10,96,34,39,96,34,68,96,34,98,96,34, +129,96,34,16,128,34,48,128,34,80,128,34,112,128,34,144,128,34,176,128,34,207,128,34,234,128,34,3,129,34,16,130,34,48,130,34,65,130,34,16,132,34,48,132,34,65,132,34, +16,134,34,48,134,34,65,134,34,16,144,34,46,144,34,75,144,34,104,144,34,133,144,34,162,144,34,10,160,34,39,160,34,68,160,34,98,160,34,129,160,34,16,192,34,48,192,34, +80,192,34,112,192,34,144,192,34,171,192,34,199,192,34,228,192,34,16,194,34,40,194,34,16,196,34,40,196,34,16,198,34,40,198,34,10,208,34,39,208,34,68,208,34,98,208,34, +129,208,34,4,224,34,34,224,34,64,224,34,16,0,35,48,0,35,80,0,35,112,0,35,144,0,35,171,0,35,199,0,35,228,0,35,16,2,35,40,2,35,16,4,35,40,4,35, +16,6,35,40,6,35,16,8,35,40,8,35,10,16,35,39,16,35,68,16,35,98,16,35,129,16,35,4,32,35,34,32,35,64,32,35,16,64,35,48,64,35,79,64,35,108,64,35, +138,64,35,166,64,35,195,64,35,225,64,35,15,66,35,36,66,35,15,68,35,36,68,35,15,70,35,36,70,35,15,72,35,36,72,35,6,80,35,35,80,35,65,80,35,96,80,35, +16,0,36,48,0,36,80,0,36,112,0,36,144,0,36,176,0,36,208,0,36,16,16,36,48,16,36,80,16,36,112,16,36,141,16,36,165,16,36,16,32,36,48,32,36,78,32,36, +106,32,36,134,32,36,161,32,36,16,64,36,48,64,36,80,64,36,112,64,36,144,64,36,174,64,36,199,64,36,16,72,36,40,72,36,16,80,36,44,80,36,73,80,36,101,80,36, +130,80,36,10,96,36,37,96,36,67,96,36,97,96,36,16,128,36,48,128,36,80,128,36,112,128,36,144,128,36,174,128,36,199,128,36,16,130,36,40,130,36,16,132,36,40,132,36, +16,134,36,40,134,36,16,144,36,44,144,36,73,144,36,101,144,36,130,144,36,10,160,36,37,160,36,67,160,36,97,160,36,16,192,36,48,192,36,80,192,36,112,192,36,140,192,36, 
+167,192,36,194,192,36,16,194,36,35,194,36,16,196,36,35,196,36,16,198,36,35,198,36,10,208,36,37,208,36,67,208,36,97,208,36,4,224,36,33,224,36,16,0,37,48,0,37, +80,0,37,112,0,37,140,0,37,167,0,37,194,0,37,16,2,37,35,2,37,16,4,37,35,4,37,16,6,37,35,6,37,16,8,37,35,8,37,10,16,37,37,16,37,67,16,37, +97,16,37,4,32,37,33,32,37,16,64,37,48,64,37,77,64,37,106,64,37,135,64,37,163,64,37,192,64,37,12,66,37,32,66,37,12,68,37,32,68,37,12,70,37,32,70,37, +12,72,37,32,72,37,5,80,37,34,80,37,64,80,37,16,0,38,48,0,38,80,0,38,112,0,38,144,0,38,176,0,38,16,16,38,48,16,38,80,16,38,109,16,38,135,16,38, +16,32,38,48,32,38,75,32,38,102,32,38,130,32,38,16,64,38,48,64,38,80,64,38,112,64,38,144,64,38,167,64,38,16,72,38,33,72,38,16,80,38,43,80,38,70,80,38, +99,80,38,128,80,38,9,96,38,36,96,38,65,96,38,16,128,38,48,128,38,80,128,38,112,128,38,144,128,38,167,128,38,16,130,38,33,130,38,16,132,38,33,132,38,16,134,38, +33,134,38,16,144,38,43,144,38,70,144,38,99,144,38,128,144,38,9,160,38,36,160,38,65,160,38,16,192,38,48,192,38,80,192,38,108,192,38,136,192,38,162,192,38,16,194,38, +16,196,38,16,198,38,9,208,38,36,208,38,65,208,38,3,224,38,32,224,38,16,0,39,48,0,39,80,0,39,108,0,39,136,0,39,162,0,39,16,2,39,16,4,39,16,6,39, +16,8,39,9,16,39,36,16,39,65,16,39,3,32,39,32,32,39,16,64,39,47,64,39,75,64,39,103,64,39,132,64,39,160,64,39,10,66,39,10,68,39,10,70,39,10,72,39, +4,80,39,33,80,39,16,0,40,48,0,40,80,0,40,112,0,40,144,0,40,16,16,40,48,16,40,80,16,40,104,16,40,129,16,40,16,32,40,47,32,40,72,32,40,99,32,40, +16,64,40,48,64,40,80,64,40,112,64,40,138,64,40,16,80,40,41,80,40,68,80,40,96,80,40,8,96,40,35,96,40,64,96,40,16,128,40,48,128,40,80,128,40,112,128,40, +138,128,40,16,144,40,41,144,40,68,144,40,96,144,40,8,160,40,35,160,40,64,160,40,16,192,40,48,192,40,78,192,40,105,192,40,132,192,40,8,208,40,35,208,40,64,208,40, +3,224,40,16,0,41,48,0,41,78,0,41,105,0,41,132,0,41,8,16,41,35,16,41,64,16,41,3,32,41,16,64,41,45,64,41,73,64,41,100,64,41,129,64,41,4,80,41, 
+32,80,41,16,0,42,48,0,42,80,0,42,112,0,42,144,0,42,16,16,42,48,16,42,76,16,42,99,16,42,16,32,42,44,32,42,69,32,42,16,64,42,48,64,42,80,64,42, +108,64,42,132,64,42,15,80,42,39,80,42,66,80,42,7,96,42,34,96,42,16,128,42,48,128,42,80,128,42,108,128,42,132,128,42,15,144,42,39,144,42,66,144,42,7,160,42, +34,160,42,16,192,42,48,192,42,75,192,42,101,192,42,128,192,42,7,208,42,34,208,42,2,224,42,16,0,43,48,0,43,75,0,43,101,0,43,128,0,43,7,16,43,34,16,43, +2,32,43,16,64,43,44,64,43,70,64,43,98,64,43,3,80,43,176,0,44,208,0,44,240,0,44,16,1,44,48,1,44,80,1,44,112,1,44,176,16,44,208,16,44,240,16,44, +16,17,44,48,17,44,80,17,44,112,17,44,176,32,44,208,32,44,240,32,44,16,33,44,48,33,44,80,33,44,112,33,44,176,64,44,208,64,44,240,64,44,16,65,44,48,65,44, +80,65,44,112,65,44,48,72,44,80,72,44,112,72,44,144,72,44,176,72,44,208,72,44,240,72,44,16,73,44,48,73,44,80,73,44,109,73,44,176,80,44,208,80,44,240,80,44, +16,81,44,48,81,44,79,81,44,110,81,44,170,96,44,202,96,44,233,96,44,8,97,44,40,97,44,71,97,44,102,97,44,176,128,44,208,128,44,240,128,44,16,129,44,48,129,44, +80,129,44,112,129,44,48,130,44,80,130,44,112,130,44,144,130,44,176,130,44,208,130,44,240,130,44,16,131,44,48,131,44,80,131,44,109,131,44,48,132,44,80,132,44,112,132,44, +144,132,44,176,132,44,208,132,44,240,132,44,16,133,44,48,133,44,80,133,44,109,133,44,48,134,44,80,134,44,112,134,44,144,134,44,176,134,44,208,134,44,240,134,44,16,135,44, +48,135,44,80,135,44,109,135,44,176,144,44,208,144,44,240,144,44,16,145,44,48,145,44,79,145,44,110,145,44,170,160,44,202,160,44,233,160,44,8,161,44,40,161,44,71,161,44, +102,161,44,176,192,44,208,192,44,240,192,44,16,193,44,48,193,44,80,193,44,112,193,44,48,194,44,80,194,44,112,194,44,144,194,44,176,194,44,208,194,44,240,194,44,14,195,44, +43,195,44,73,195,44,102,195,44,48,196,44,80,196,44,112,196,44,144,196,44,176,196,44,208,196,44,240,196,44,14,197,44,43,197,44,73,197,44,102,197,44,48,198,44,80,198,44, 
+112,198,44,144,198,44,176,198,44,208,198,44,240,198,44,14,199,44,43,199,44,73,199,44,102,199,44,170,208,44,202,208,44,233,208,44,8,209,44,40,209,44,71,209,44,102,209,44, +164,224,44,196,224,44,227,224,44,3,225,44,34,225,44,66,225,44,97,225,44,176,0,45,208,0,45,240,0,45,16,1,45,48,1,45,80,1,45,112,1,45,48,2,45,80,2,45, +112,2,45,144,2,45,176,2,45,208,2,45,240,2,45,14,3,45,43,3,45,73,3,45,102,3,45,48,4,45,80,4,45,112,4,45,144,4,45,176,4,45,208,4,45,240,4,45, +14,5,45,43,5,45,73,5,45,102,5,45,48,6,45,80,6,45,112,6,45,144,6,45,176,6,45,208,6,45,240,6,45,14,7,45,43,7,45,73,7,45,102,7,45,48,8,45, +80,8,45,112,8,45,144,8,45,176,8,45,208,8,45,240,8,45,14,9,45,43,9,45,73,9,45,102,9,45,170,16,45,202,16,45,233,16,45,8,17,45,40,17,45,71,17,45, +102,17,45,164,32,45,196,32,45,227,32,45,3,33,45,34,33,45,66,33,45,97,33,45,176,64,45,208,64,45,240,64,45,16,65,45,48,65,45,80,65,45,112,65,45,48,66,45, +80,66,45,112,66,45,144,66,45,175,66,45,204,66,45,235,66,45,9,67,45,38,67,45,69,67,45,99,67,45,48,68,45,80,68,45,112,68,45,144,68,45,175,68,45,204,68,45, +235,68,45,9,69,45,38,69,45,69,69,45,99,69,45,48,70,45,80,70,45,112,70,45,144,70,45,175,70,45,204,70,45,235,70,45,9,71,45,38,71,45,69,71,45,99,71,45, +48,72,45,80,72,45,112,72,45,144,72,45,175,72,45,204,72,45,235,72,45,9,73,45,38,73,45,69,73,45,99,73,45,166,80,45,197,80,45,229,80,45,4,81,45,36,81,45, +67,81,45,99,81,45,80,0,46,112,0,46,144,0,46,176,0,46,208,0,46,240,0,46,16,1,46,48,1,46,80,1,46,112,1,46,80,16,46,112,16,46,144,16,46,176,16,46, +208,16,46,240,16,46,16,17,46,48,17,46,80,17,46,112,17,46,80,32,46,112,32,46,144,32,46,176,32,46,208,32,46,240,32,46,16,33,46,47,33,46,77,33,46,107,33,46, +80,64,46,112,64,46,144,64,46,176,64,46,208,64,46,240,64,46,16,65,46,48,65,46,80,65,46,112,65,46,16,72,46,48,72,46,80,72,46,112,72,46,144,72,46,176,72,46, +205,72,46,232,72,46,1,73,46,80,80,46,112,80,46,144,80,46,175,80,46,206,80,46,236,80,46,11,81,46,41,81,46,72,81,46,102,81,46,74,96,46,105,96,46,136,96,46, 
+167,96,46,198,96,46,229,96,46,4,97,46,35,97,46,66,97,46,97,97,46,80,128,46,112,128,46,144,128,46,176,128,46,208,128,46,240,128,46,16,129,46,48,129,46,80,129,46, +112,129,46,16,130,46,48,130,46,80,130,46,112,130,46,144,130,46,176,130,46,205,130,46,232,130,46,1,131,46,16,132,46,48,132,46,80,132,46,112,132,46,144,132,46,176,132,46, +205,132,46,232,132,46,1,133,46,16,134,46,48,134,46,80,134,46,112,134,46,144,134,46,176,134,46,205,134,46,232,134,46,1,135,46,80,144,46,112,144,46,144,144,46,175,144,46, +206,144,46,236,144,46,11,145,46,41,145,46,72,145,46,102,145,46,74,160,46,105,160,46,136,160,46,167,160,46,198,160,46,229,160,46,4,161,46,35,161,46,66,161,46,97,161,46, +80,192,46,112,192,46,144,192,46,176,192,46,208,192,46,240,192,46,16,193,46,48,193,46,80,193,46,112,193,46,16,194,46,48,194,46,80,194,46,112,194,46,143,194,46,170,194,46, +198,194,46,227,194,46,16,196,46,48,196,46,80,196,46,112,196,46,143,196,46,170,196,46,198,196,46,227,196,46,16,198,46,48,198,46,80,198,46,112,198,46,143,198,46,170,198,46, +198,198,46,227,198,46,74,208,46,105,208,46,136,208,46,167,208,46,198,208,46,229,208,46,4,209,46,35,209,46,66,209,46,97,209,46,68,224,46,99,224,46,131,224,46,162,224,46, +193,224,46,225,224,46,0,225,46,32,225,46,80,0,47,112,0,47,144,0,47,176,0,47,208,0,47,240,0,47,16,1,47,48,1,47,80,1,47,112,1,47,16,2,47,48,2,47, +80,2,47,112,2,47,143,2,47,170,2,47,198,2,47,227,2,47,16,4,47,48,4,47,80,4,47,112,4,47,143,4,47,170,4,47,198,4,47,227,4,47,16,6,47,48,6,47, +80,6,47,112,6,47,143,6,47,170,6,47,198,6,47,227,6,47,16,8,47,48,8,47,80,8,47,112,8,47,143,8,47,170,8,47,198,8,47,227,8,47,74,16,47,105,16,47, +136,16,47,167,16,47,198,16,47,229,16,47,4,17,47,35,17,47,66,17,47,97,17,47,68,32,47,99,32,47,131,32,47,162,32,47,193,32,47,225,32,47,0,33,47,32,33,47, +80,64,47,112,64,47,144,64,47,176,64,47,208,64,47,240,64,47,15,65,47,46,65,47,76,65,47,107,65,47,16,66,47,48,66,47,79,66,47,108,66,47,137,66,47,166,66,47, 
+195,66,47,224,66,47,16,68,47,48,68,47,79,68,47,108,68,47,137,68,47,166,68,47,195,68,47,224,68,47,16,70,47,48,70,47,79,70,47,108,70,47,137,70,47,166,70,47, +195,70,47,224,70,47,16,72,47,48,72,47,79,72,47,108,72,47,137,72,47,166,72,47,195,72,47,224,72,47,70,80,47,101,80,47,132,80,47,163,80,47,195,80,47,226,80,47, +1,81,47,32,81,47,64,81,47,48,0,48,80,0,48,112,0,48,144,0,48,176,0,48,208,0,48,240,0,48,16,1,48,48,1,48,80,1,48,112,1,48,48,16,48,80,16,48, +112,16,48,144,16,48,176,16,48,208,16,48,240,16,48,16,17,48,45,17,48,74,17,48,102,17,48,48,32,48,80,32,48,112,32,48,144,32,48,176,32,48,206,32,48,236,32,48, +9,33,48,38,33,48,68,33,48,97,33,48,48,64,48,80,64,48,112,64,48,144,64,48,176,64,48,208,64,48,240,64,48,16,65,48,48,65,48,80,65,48,111,65,48,16,72,48, +48,72,48,80,72,48,112,72,48,138,72,48,161,72,48,48,80,48,80,80,48,111,80,48,141,80,48,171,80,48,201,80,48,231,80,48,5,81,48,35,81,48,65,81,48,42,96,48, +72,96,48,103,96,48,134,96,48,164,96,48,195,96,48,226,96,48,0,97,48,48,128,48,80,128,48,112,128,48,144,128,48,176,128,48,208,128,48,240,128,48,16,129,48,48,129,48, +80,129,48,111,129,48,16,130,48,48,130,48,80,130,48,112,130,48,138,130,48,161,130,48,16,132,48,48,132,48,80,132,48,112,132,48,138,132,48,161,132,48,16,134,48,48,134,48, +80,134,48,112,134,48,138,134,48,161,134,48,48,144,48,80,144,48,111,144,48,141,144,48,171,144,48,201,144,48,231,144,48,5,145,48,35,145,48,65,145,48,42,160,48,72,160,48, +103,160,48,134,160,48,164,160,48,195,160,48,226,160,48,0,161,48,48,192,48,80,192,48,112,192,48,144,192,48,176,192,48,208,192,48,240,192,48,15,193,48,44,193,48,74,193,48, +103,193,48,16,194,48,48,194,48,78,194,48,105,194,48,132,194,48,16,196,48,48,196,48,78,196,48,105,196,48,132,196,48,16,198,48,48,198,48,78,198,48,105,198,48,132,198,48, +42,208,48,72,208,48,103,208,48,134,208,48,164,208,48,195,208,48,226,208,48,0,209,48,36,224,48,67,224,48,98,224,48,129,224,48,160,224,48,48,0,49,80,0,49,112,0,49, 
+144,0,49,176,0,49,208,0,49,240,0,49,15,1,49,44,1,49,74,1,49,103,1,49,16,2,49,48,2,49,78,2,49,105,2,49,132,2,49,16,4,49,48,4,49,78,4,49, +105,4,49,132,4,49,16,6,49,48,6,49,78,6,49,105,6,49,132,6,49,16,8,49,48,8,49,78,8,49,105,8,49,132,8,49,42,16,49,72,16,49,103,16,49,134,16,49, +164,16,49,195,16,49,226,16,49,0,17,49,36,32,49,67,32,49,98,32,49,129,32,49,160,32,49,48,64,49,80,64,49,112,64,49,144,64,49,175,64,49,205,64,49,236,64,49, +9,65,49,39,65,49,70,65,49,99,65,49,16,66,49,45,66,49,73,66,49,100,66,49,129,66,49,16,68,49,45,68,49,73,68,49,100,68,49,129,68,49,16,70,49,45,70,49, +73,70,49,100,70,49,129,70,49,16,72,49,45,72,49,73,72,49,100,72,49,129,72,49,37,80,49,68,80,49,99,80,49,130,80,49,161,80,49,192,80,49,48,0,50,80,0,50, +112,0,50,144,0,50,176,0,50,208,0,50,240,0,50,16,1,50,48,1,50,80,1,50,48,16,50,80,16,50,112,16,50,144,16,50,176,16,50,208,16,50,236,16,50,6,17,50, +33,17,50,48,32,50,80,32,50,112,32,50,143,32,50,171,32,50,200,32,50,229,32,50,1,33,50,48,64,50,80,64,50,112,64,50,144,64,50,176,64,50,208,64,50,240,64,50, +15,65,50,42,65,50,70,65,50,16,72,50,48,72,50,77,72,50,99,72,50,48,80,50,78,80,50,107,80,50,137,80,50,166,80,50,196,80,50,226,80,50,40,96,50,70,96,50, +101,96,50,131,96,50,161,96,50,192,96,50,48,128,50,80,128,50,112,128,50,144,128,50,176,128,50,208,128,50,240,128,50,15,129,50,42,129,50,70,129,50,16,130,50,48,130,50, +77,130,50,99,130,50,16,132,50,48,132,50,77,132,50,99,132,50,16,134,50,48,134,50,77,134,50,99,134,50,48,144,50,78,144,50,107,144,50,137,144,50,166,144,50,196,144,50, +226,144,50,40,160,50,70,160,50,101,160,50,131,160,50,161,160,50,192,160,50,48,192,50,80,192,50,112,192,50,144,192,50,176,192,50,206,192,50,235,192,50,7,193,50,36,193,50, +65,193,50,16,194,50,46,194,50,70,194,50,16,196,50,46,196,50,70,196,50,16,198,50,46,198,50,70,198,50,40,208,50,70,208,50,101,208,50,131,208,50,161,208,50,192,208,50, +35,224,50,65,224,50,96,224,50,128,224,50,48,0,51,80,0,51,112,0,51,144,0,51,176,0,51,206,0,51,235,0,51,7,1,51,36,1,51,65,1,51,16,2,51,46,2,51, 
+70,2,51,16,4,51,46,4,51,70,4,51,16,6,51,46,6,51,70,6,51,16,8,51,46,8,51,70,8,51,40,16,51,70,16,51,101,16,51,131,16,51,161,16,51,192,16,51, +35,32,51,65,32,51,96,32,51,128,32,51,48,64,51,80,64,51,112,64,51,142,64,51,171,64,51,200,64,51,230,64,51,3,65,51,33,65,51,16,66,51,41,66,51,67,66,51, +16,68,51,41,68,51,67,68,51,16,70,51,41,70,51,67,70,51,16,72,51,41,72,51,67,72,51,36,80,51,67,80,51,97,80,51,128,80,51,16,0,52,48,0,52,80,0,52, +112,0,52,144,0,52,176,0,52,208,0,52,240,0,52,14,1,52,16,16,52,48,16,52,80,16,52,112,16,52,144,16,52,172,16,52,198,16,52,225,16,52,16,32,52,48,32,52, +80,32,52,109,32,52,138,32,52,165,32,52,193,32,52,16,64,52,48,64,52,80,64,52,112,64,52,144,64,52,176,64,52,207,64,52,234,64,52,3,65,52,16,72,52,48,72,52, +65,72,52,16,80,52,46,80,52,75,80,52,104,80,52,133,80,52,162,80,52,10,96,52,39,96,52,68,96,52,98,96,52,129,96,52,16,128,52,48,128,52,80,128,52,112,128,52, +144,128,52,176,128,52,207,128,52,234,128,52,3,129,52,16,130,52,48,130,52,65,130,52,16,132,52,48,132,52,65,132,52,16,134,52,48,134,52,65,134,52,16,144,52,46,144,52, +75,144,52,104,144,52,133,144,52,162,144,52,10,160,52,39,160,52,68,160,52,98,160,52,129,160,52,16,192,52,48,192,52,80,192,52,112,192,52,144,192,52,171,192,52,199,192,52, +228,192,52,16,194,52,40,194,52,16,196,52,40,196,52,16,198,52,40,198,52,10,208,52,39,208,52,68,208,52,98,208,52,129,208,52,4,224,52,34,224,52,64,224,52,16,0,53, +48,0,53,80,0,53,112,0,53,144,0,53,171,0,53,199,0,53,228,0,53,16,2,53,40,2,53,16,4,53,40,4,53,16,6,53,40,6,53,16,8,53,40,8,53,10,16,53, +39,16,53,68,16,53,98,16,53,129,16,53,4,32,53,34,32,53,64,32,53,16,64,53,48,64,53,79,64,53,108,64,53,138,64,53,166,64,53,195,64,53,225,64,53,15,66,53, +36,66,53,15,68,53,36,68,53,15,70,53,36,70,53,15,72,53,36,72,53,6,80,53,35,80,53,65,80,53,96,80,53,16,0,54,48,0,54,80,0,54,112,0,54,144,0,54, +176,0,54,208,0,54,16,16,54,48,16,54,80,16,54,112,16,54,139,16,54,163,16,54,16,32,54,48,32,54,77,32,54,104,32,54,133,32,54,16,64,54,48,64,54,80,64,54, 
+112,64,54,144,64,54,172,64,54,196,64,54,16,72,54,38,72,54,16,80,54,44,80,54,72,80,54,100,80,54,129,80,54,9,96,54,37,96,54,66,96,54,96,96,54,16,128,54, +48,128,54,80,128,54,112,128,54,144,128,54,172,128,54,196,128,54,16,130,54,38,130,54,16,132,54,38,132,54,16,134,54,38,134,54,16,144,54,44,144,54,72,144,54,100,144,54, +129,144,54,9,160,54,37,160,54,66,160,54,96,160,54,16,192,54,48,192,54,80,192,54,110,192,54,139,192,54,165,192,54,192,192,54,16,194,54,33,194,54,16,196,54,33,196,54, +16,198,54,33,198,54,9,208,54,37,208,54,66,208,54,96,208,54,3,224,54,33,224,54,16,0,55,48,0,55,80,0,55,110,0,55,139,0,55,165,0,55,192,0,55,16,2,55, +33,2,55,16,4,55,33,4,55,16,6,55,33,6,55,16,8,55,33,8,55,9,16,55,37,16,55,66,16,55,96,16,55,3,32,55,33,32,55,16,64,55,48,64,55,76,64,55, +105,64,55,134,64,55,162,64,55,12,66,55,12,68,55,12,70,55,12,72,55,5,80,55,34,80,55,64,80,55,16,0,56,48,0,56,80,0,56,112,0,56,144,0,56,174,0,56, +16,16,56,48,16,56,80,16,56,106,16,56,131,16,56,16,32,56,47,32,56,73,32,56,100,32,56,16,64,56,48,64,56,80,64,56,112,64,56,140,64,56,163,64,56,16,72,56, +16,80,56,41,80,56,69,80,56,97,80,56,8,96,56,35,96,56,64,96,56,16,128,56,48,128,56,80,128,56,112,128,56,140,128,56,163,128,56,16,130,56,16,132,56,16,134,56, +16,144,56,41,144,56,69,144,56,97,144,56,8,160,56,35,160,56,64,160,56,16,192,56,48,192,56,79,192,56,106,192,56,133,192,56,14,194,56,14,196,56,14,198,56,8,208,56, +35,208,56,64,208,56,3,224,56,32,224,56,16,0,57,48,0,57,79,0,57,106,0,57,133,0,57,14,2,57,14,4,57,14,6,57,14,8,57,8,16,57,35,16,57,64,16,57, +3,32,57,32,32,57,16,64,57,46,64,57,73,64,57,101,64,57,130,64,57,9,66,57,9,68,57,9,70,57,9,72,57,4,80,57,32,80,57,16,0,58,48,0,58,80,0,58, +112,0,58,144,0,58,16,16,58,48,16,58,76,16,58,99,16,58,16,32,58,44,32,58,69,32,58,16,64,58,48,64,58,80,64,58,108,64,58,132,64,58,15,80,58,39,80,58, +66,80,58,7,96,58,34,96,58,16,128,58,48,128,58,80,128,58,108,128,58,132,128,58,15,144,58,39,144,58,66,144,58,7,160,58,34,160,58,16,192,58,48,192,58,75,192,58, 
+101,192,58,128,192,58,7,208,58,34,208,58,2,224,58,16,0,59,48,0,59,75,0,59,101,0,59,128,0,59,7,16,59,34,16,59,2,32,59,16,64,59,44,64,59,70,64,59, +98,64,59,3,80,59,16,0,60,48,0,60,80,0,60,112,0,60,16,16,60,48,16,60,70,16,60,16,32,60,41,32,60,65,32,60,16,64,60,48,64,60,79,64,60,100,64,60, +14,80,60,37,80,60,6,96,60,32,96,60,16,128,60,48,128,60,79,128,60,100,128,60,14,144,60,37,144,60,6,160,60,32,160,60,16,192,60,47,192,60,71,192,60,96,192,60, +6,208,60,32,208,60,1,224,60,16,0,61,47,0,61,71,0,61,96,0,61,6,16,61,32,16,61,1,32,61,16,64,61,41,64,61,67,64,61,3,80,61,16,0,62,48,0,62, +80,0,62,16,16,62,45,16,62,64,16,62,16,32,62,38,32,62,16,64,62,48,64,62,73,64,62,12,80,62,34,80,62,5,96,62,16,128,62,48,128,62,73,128,62,12,144,62, +34,144,62,5,160,62,16,192,62,44,192,62,67,192,62,5,208,62,1,224,62,16,0,63,44,0,63,67,0,63,5,16,63,1,32,63,16,64,63,39,64,63,64,64,63,2,80,63, +16,0,64,48,0,64,78,0,64,16,16,64,40,16,64,16,32,64,35,32,64,16,64,64,48,64,64,67,64,64,11,80,64,32,80,64,4,96,64,16,128,64,48,128,64,67,128,64, +11,144,64,32,144,64,4,160,64,16,192,64,41,192,64,4,208,64,0,224,64,16,0,65,41,0,65,4,16,65,0,32,65,15,64,65,36,64,65,1,80,65,112,0,66,144,0,66, +176,0,66,208,0,66,240,0,66,16,1,66,48,1,66,80,1,66,112,1,66,112,16,66,144,16,66,176,16,66,208,16,66,240,16,66,16,17,66,48,17,66,80,17,66,112,17,66, +112,32,66,144,32,66,176,32,66,208,32,66,240,32,66,16,33,66,48,33,66,80,33,66,112,33,66,112,64,66,144,64,66,176,64,66,208,64,66,240,64,66,16,65,66,48,65,66, +80,65,66,112,65,66,48,72,66,80,72,66,112,72,66,144,72,66,176,72,66,208,72,66,240,72,66,13,73,66,40,73,66,68,73,66,112,80,66,144,80,66,176,80,66,208,80,66, +239,80,66,14,81,66,44,81,66,76,81,66,106,81,66,106,96,66,138,96,66,169,96,66,200,96,66,231,96,66,6,97,66,37,97,66,69,97,66,100,97,66,112,128,66,144,128,66, +176,128,66,208,128,66,240,128,66,16,129,66,48,129,66,80,129,66,112,129,66,48,130,66,80,130,66,112,130,66,144,130,66,176,130,66,208,130,66,240,130,66,13,131,66,40,131,66, 
+68,131,66,48,132,66,80,132,66,112,132,66,144,132,66,176,132,66,208,132,66,240,132,66,13,133,66,40,133,66,68,133,66,48,134,66,80,134,66,112,134,66,144,134,66,176,134,66, +208,134,66,240,134,66,13,135,66,40,135,66,68,135,66,112,144,66,144,144,66,176,144,66,208,144,66,239,144,66,14,145,66,44,145,66,76,145,66,106,145,66,106,160,66,138,160,66, +169,160,66,200,160,66,231,160,66,6,161,66,37,161,66,69,161,66,100,161,66,112,192,66,144,192,66,176,192,66,208,192,66,240,192,66,16,193,66,48,193,66,80,193,66,112,193,66, +48,194,66,80,194,66,112,194,66,144,194,66,176,194,66,205,194,66,234,194,66,6,195,66,35,195,66,64,195,66,48,196,66,80,196,66,112,196,66,144,196,66,176,196,66,205,196,66, +234,196,66,6,197,66,35,197,66,64,197,66,48,198,66,80,198,66,112,198,66,144,198,66,176,198,66,205,198,66,234,198,66,6,199,66,35,199,66,64,199,66,106,208,66,138,208,66, +169,208,66,200,208,66,231,208,66,6,209,66,37,209,66,69,209,66,100,209,66,100,224,66,132,224,66,163,224,66,195,224,66,226,224,66,1,225,66,33,225,66,64,225,66,96,225,66, +112,0,67,144,0,67,176,0,67,208,0,67,240,0,67,16,1,67,48,1,67,80,1,67,112,1,67,48,2,67,80,2,67,112,2,67,144,2,67,176,2,67,205,2,67,234,2,67, +6,3,67,35,3,67,64,3,67,48,4,67,80,4,67,112,4,67,144,4,67,176,4,67,205,4,67,234,4,67,6,5,67,35,5,67,64,5,67,48,6,67,80,6,67,112,6,67, +144,6,67,176,6,67,205,6,67,234,6,67,6,7,67,35,7,67,64,7,67,48,8,67,80,8,67,112,8,67,144,8,67,176,8,67,205,8,67,234,8,67,6,9,67,35,9,67, +64,9,67,106,16,67,138,16,67,169,16,67,200,16,67,231,16,67,6,17,67,37,17,67,69,17,67,100,17,67,100,32,67,132,32,67,163,32,67,195,32,67,226,32,67,1,33,67, +33,33,67,64,33,67,96,33,67,112,64,67,144,64,67,176,64,67,208,64,67,240,64,67,16,65,67,48,65,67,80,65,67,111,65,67,48,66,67,80,66,67,111,66,67,141,66,67, +170,66,67,199,66,67,230,66,67,3,67,67,32,67,67,48,68,67,80,68,67,111,68,67,141,68,67,170,68,67,199,68,67,230,68,67,3,69,67,32,69,67,48,70,67,80,70,67, 
+111,70,67,141,70,67,170,70,67,199,70,67,230,70,67,3,71,67,32,71,67,48,72,67,80,72,67,111,72,67,141,72,67,170,72,67,199,72,67,230,72,67,3,73,67,32,73,67, +102,80,67,133,80,67,164,80,67,196,80,67,227,80,67,3,81,67,34,81,67,65,81,67,97,81,67,48,0,68,80,0,68,112,0,68,144,0,68,176,0,68,208,0,68,240,0,68, +16,1,68,48,1,68,80,1,68,112,1,68,48,16,68,80,16,68,112,16,68,144,16,68,176,16,68,208,16,68,240,16,68,16,17,68,48,17,68,78,17,68,106,17,68,48,32,68, +80,32,68,112,32,68,144,32,68,176,32,68,208,32,68,238,32,68,11,33,68,41,33,68,71,33,68,100,33,68,48,64,68,80,64,68,112,64,68,144,64,68,176,64,68,208,64,68, +240,64,68,16,65,68,48,65,68,80,65,68,112,65,68,16,72,68,48,72,68,80,72,68,112,72,68,143,72,68,166,72,68,48,80,68,80,80,68,112,80,68,142,80,68,172,80,68, +202,80,68,233,80,68,6,81,68,36,81,68,67,81,68,97,81,68,42,96,68,73,96,68,104,96,68,135,96,68,165,96,68,196,96,68,227,96,68,1,97,68,32,97,68,48,128,68, +80,128,68,112,128,68,144,128,68,176,128,68,208,128,68,240,128,68,16,129,68,48,129,68,80,129,68,112,129,68,16,130,68,48,130,68,80,130,68,112,130,68,143,130,68,166,130,68, +16,132,68,48,132,68,80,132,68,112,132,68,143,132,68,166,132,68,16,134,68,48,134,68,80,134,68,112,134,68,143,134,68,166,134,68,48,144,68,80,144,68,112,144,68,142,144,68, +172,144,68,202,144,68,233,144,68,6,145,68,36,145,68,67,145,68,97,145,68,42,160,68,73,160,68,104,160,68,135,160,68,165,160,68,196,160,68,227,160,68,1,161,68,32,161,68, +48,192,68,80,192,68,112,192,68,144,192,68,176,192,68,208,192,68,240,192,68,16,193,68,47,193,68,77,193,68,106,193,68,16,194,68,48,194,68,80,194,68,107,194,68,135,194,68, +161,194,68,16,196,68,48,196,68,80,196,68,107,196,68,135,196,68,161,196,68,16,198,68,48,198,68,80,198,68,107,198,68,135,198,68,161,198,68,42,208,68,73,208,68,104,208,68, +135,208,68,165,208,68,196,208,68,227,208,68,1,209,68,32,209,68,36,224,68,67,224,68,98,224,68,130,224,68,161,224,68,192,224,68,48,0,69,80,0,69,112,0,69,144,0,69, 
+176,0,69,208,0,69,240,0,69,16,1,69,47,1,69,77,1,69,106,1,69,16,2,69,48,2,69,80,2,69,107,2,69,135,2,69,161,2,69,16,4,69,48,4,69,80,4,69, +107,4,69,135,4,69,161,4,69,16,6,69,48,6,69,80,6,69,107,6,69,135,6,69,161,6,69,16,8,69,48,8,69,80,8,69,107,8,69,135,8,69,161,8,69,42,16,69, +73,16,69,104,16,69,135,16,69,165,16,69,196,16,69,227,16,69,1,17,69,32,17,69,36,32,69,67,32,69,98,32,69,130,32,69,161,32,69,192,32,69,48,64,69,80,64,69, +112,64,69,144,64,69,176,64,69,207,64,69,237,64,69,11,65,69,41,65,69,71,65,69,101,65,69,16,66,69,47,66,69,74,66,69,102,66,69,131,66,69,16,68,69,47,68,69, +74,68,69,102,68,69,131,68,69,16,70,69,47,70,69,74,70,69,102,70,69,131,70,69,16,72,69,47,72,69,74,72,69,102,72,69,131,72,69,38,80,69,68,80,69,100,80,69, +131,80,69,162,80,69,193,80,69,224,80,69,48,0,70,80,0,70,112,0,70,144,0,70,176,0,70,208,0,70,240,0,70,16,1,70,48,1,70,80,1,70,48,16,70,80,16,70, +112,16,70,144,16,70,176,16,70,208,16,70,236,16,70,6,17,70,33,17,70,48,32,70,80,32,70,112,32,70,143,32,70,171,32,70,200,32,70,229,32,70,1,33,70,48,64,70, +80,64,70,112,64,70,144,64,70,176,64,70,208,64,70,240,64,70,15,65,70,42,65,70,70,65,70,16,72,70,48,72,70,77,72,70,99,72,70,48,80,70,78,80,70,107,80,70, +137,80,70,166,80,70,196,80,70,226,80,70,40,96,70,70,96,70,101,96,70,131,96,70,161,96,70,192,96,70,48,128,70,80,128,70,112,128,70,144,128,70,176,128,70,208,128,70, +240,128,70,15,129,70,42,129,70,70,129,70,16,130,70,48,130,70,77,130,70,99,130,70,16,132,70,48,132,70,77,132,70,99,132,70,16,134,70,48,134,70,77,134,70,99,134,70, +48,144,70,78,144,70,107,144,70,137,144,70,166,144,70,196,144,70,226,144,70,40,160,70,70,160,70,101,160,70,131,160,70,161,160,70,192,160,70,48,192,70,80,192,70,112,192,70, +144,192,70,176,192,70,206,192,70,235,192,70,7,193,70,36,193,70,65,193,70,16,194,70,46,194,70,70,194,70,16,196,70,46,196,70,70,196,70,16,198,70,46,198,70,70,198,70, 
+40,208,70,70,208,70,101,208,70,131,208,70,161,208,70,192,208,70,35,224,70,65,224,70,96,224,70,128,224,70,48,0,71,80,0,71,112,0,71,144,0,71,176,0,71,206,0,71, +235,0,71,7,1,71,36,1,71,65,1,71,16,2,71,46,2,71,70,2,71,16,4,71,46,4,71,70,4,71,16,6,71,46,6,71,70,6,71,16,8,71,46,8,71,70,8,71, +40,16,71,70,16,71,101,16,71,131,16,71,161,16,71,192,16,71,35,32,71,65,32,71,96,32,71,128,32,71,48,64,71,80,64,71,112,64,71,142,64,71,171,64,71,200,64,71, +230,64,71,3,65,71,33,65,71,16,66,71,41,66,71,67,66,71,16,68,71,41,68,71,67,68,71,16,70,71,41,70,71,67,70,71,16,72,71,41,72,71,67,72,71,36,80,71, +67,80,71,97,80,71,128,80,71,16,0,72,48,0,72,80,0,72,112,0,72,144,0,72,176,0,72,208,0,72,240,0,72,16,16,72,48,16,72,80,16,72,112,16,72,144,16,72, +170,16,72,195,16,72,16,32,72,48,32,72,80,32,72,108,32,72,137,32,72,164,32,72,16,64,72,48,64,72,80,64,72,112,64,72,144,64,72,176,64,72,204,64,72,231,64,72, +16,72,72,45,72,72,16,80,72,46,80,72,74,80,72,103,80,72,132,80,72,161,80,72,10,96,72,38,96,72,68,96,72,98,96,72,128,96,72,16,128,72,48,128,72,80,128,72, +112,128,72,144,128,72,176,128,72,204,128,72,231,128,72,16,130,72,45,130,72,16,132,72,45,132,72,16,134,72,45,134,72,16,144,72,46,144,72,74,144,72,103,144,72,132,144,72, +161,144,72,10,160,72,38,160,72,68,160,72,98,160,72,128,160,72,16,192,72,48,192,72,80,192,72,112,192,72,143,192,72,170,192,72,197,192,72,226,192,72,16,194,72,38,194,72, +16,196,72,38,196,72,16,198,72,38,198,72,10,208,72,38,208,72,68,208,72,98,208,72,128,208,72,4,224,72,33,224,72,64,224,72,16,0,73,48,0,73,80,0,73,112,0,73, +143,0,73,170,0,73,197,0,73,226,0,73,16,2,73,38,2,73,16,4,73,38,4,73,16,6,73,38,6,73,16,8,73,38,8,73,10,16,73,38,16,73,68,16,73,98,16,73, +128,16,73,4,32,73,33,32,73,64,32,73,16,64,73,48,64,73,79,64,73,107,64,73,137,64,73,165,64,73,194,64,73,224,64,73,14,66,73,35,66,73,14,68,73,35,68,73, +14,70,73,35,70,73,14,72,73,35,72,73,6,80,73,35,80,73,65,80,73,16,0,74,48,0,74,80,0,74,112,0,74,144,0,74,176,0,74,16,16,74,48,16,74,80,16,74, 
+109,16,74,135,16,74,16,32,74,48,32,74,75,32,74,102,32,74,130,32,74,16,64,74,48,64,74,80,64,74,112,64,74,144,64,74,167,64,74,16,72,74,33,72,74,16,80,74, +43,80,74,70,80,74,99,80,74,128,80,74,9,96,74,36,96,74,65,96,74,16,128,74,48,128,74,80,128,74,112,128,74,144,128,74,167,128,74,16,130,74,33,130,74,16,132,74, +33,132,74,16,134,74,33,134,74,16,144,74,43,144,74,70,144,74,99,144,74,128,144,74,9,160,74,36,160,74,65,160,74,16,192,74,48,192,74,80,192,74,108,192,74,136,192,74, +162,192,74,16,194,74,16,196,74,16,198,74,9,208,74,36,208,74,65,208,74,3,224,74,32,224,74,16,0,75,48,0,75,80,0,75,108,0,75,136,0,75,162,0,75,16,2,75, +16,4,75,16,6,75,16,8,75,9,16,75,36,16,75,65,16,75,3,32,75,32,32,75,16,64,75,47,64,75,75,64,75,103,64,75,132,64,75,160,64,75,10,66,75,10,68,75, +10,70,75,10,72,75,4,80,75,33,80,75,16,0,76,48,0,76,80,0,76,112,0,76,144,0,76,16,16,76,48,16,76,77,16,76,100,16,76,16,32,76,45,32,76,70,32,76, +96,32,76,16,64,76,48,64,76,80,64,76,109,64,76,135,64,76,16,80,76,40,80,76,67,80,76,8,96,76,34,96,76,16,128,76,48,128,76,80,128,76,109,128,76,135,128,76, +16,144,76,40,144,76,67,144,76,8,160,76,34,160,76,16,192,76,48,192,76,76,192,76,102,192,76,130,192,76,8,208,76,34,208,76,2,224,76,16,0,77,48,0,77,76,0,77, +102,0,77,130,0,77,8,16,77,34,16,77,2,32,77,16,64,77,44,64,77,71,64,77,99,64,77,4,80,77,32,80,77,16,0,78,48,0,78,80,0,78,112,0,78,16,16,78, +48,16,78,70,16,78,16,32,78,41,32,78,65,32,78,16,64,78,48,64,78,79,64,78,100,64,78,14,80,78,37,80,78,6,96,78,32,96,78,16,128,78,48,128,78,79,128,78, +100,128,78,14,144,78,37,144,78,6,160,78,32,160,78,16,192,78,47,192,78,71,192,78,96,192,78,6,208,78,32,208,78,1,224,78,16,0,79,47,0,79,71,0,79,96,0,79, +6,16,79,32,16,79,1,32,79,16,64,79,41,64,79,67,64,79,3,80,79,16,0,80,48,0,80,80,0,80,16,16,80,44,16,80,16,32,80,37,32,80,16,64,80,48,64,80, +71,64,80,12,80,80,34,80,80,5,96,80,16,128,80,48,128,80,71,128,80,12,144,80,34,144,80,5,160,80,16,192,80,43,192,80,66,192,80,5,208,80,1,224,80,16,0,81, 
+43,0,81,66,0,81,5,16,81,1,32,81,16,64,81,38,64,81,64,64,81,2,80,81,16,0,82,48,0,82,16,16,82,38,16,82,16,32,82,33,32,82,16,64,82,47,64,82, +10,80,82,4,96,82,16,128,82,47,128,82,10,144,82,4,160,82,16,192,82,39,192,82,4,208,82,0,224,82,16,0,83,39,0,83,4,16,83,0,32,83,15,64,83,35,64,83, +1,80,83,16,0,84,48,0,84,16,16,84,32,16,84,14,32,84,16,64,84,41,64,84,8,80,84,3,96,84,16,128,84,41,128,84,8,144,84,3,160,84,16,192,84,35,192,84, +3,208,84,16,0,85,35,0,85,3,16,85,13,64,85,32,64,85,0,80,85,16,0,86,46,0,86,16,16,86,11,32,86,16,64,86,35,64,86,6,80,86,1,96,86,16,128,86, +35,128,86,6,144,86,1,160,86,16,192,86,1,208,86,16,0,87,1,16,87,11,64,87,80,0,88,112,0,88,144,0,88,176,0,88,208,0,88,240,0,88,16,1,88,48,1,88, +80,1,88,112,1,88,80,16,88,112,16,88,144,16,88,176,16,88,208,16,88,240,16,88,16,17,88,48,17,88,80,17,88,112,17,88,80,32,88,112,32,88,144,32,88,176,32,88, +208,32,88,240,32,88,16,33,88,47,33,88,77,33,88,107,33,88,80,64,88,112,64,88,144,64,88,176,64,88,208,64,88,240,64,88,16,65,88,48,65,88,80,65,88,112,65,88, +16,72,88,48,72,88,80,72,88,112,72,88,144,72,88,176,72,88,205,72,88,232,72,88,1,73,88,80,80,88,112,80,88,144,80,88,175,80,88,206,80,88,236,80,88,11,81,88, +41,81,88,72,81,88,102,81,88,74,96,88,105,96,88,136,96,88,167,96,88,198,96,88,229,96,88,4,97,88,35,97,88,66,97,88,97,97,88,80,128,88,112,128,88,144,128,88, +176,128,88,208,128,88,240,128,88,16,129,88,48,129,88,80,129,88,112,129,88,16,130,88,48,130,88,80,130,88,112,130,88,144,130,88,176,130,88,205,130,88,232,130,88,1,131,88, +16,132,88,48,132,88,80,132,88,112,132,88,144,132,88,176,132,88,205,132,88,232,132,88,1,133,88,16,134,88,48,134,88,80,134,88,112,134,88,144,134,88,176,134,88,205,134,88, +232,134,88,1,135,88,80,144,88,112,144,88,144,144,88,175,144,88,206,144,88,236,144,88,11,145,88,41,145,88,72,145,88,102,145,88,74,160,88,105,160,88,136,160,88,167,160,88, 
+198,160,88,229,160,88,4,161,88,35,161,88,66,161,88,97,161,88,80,192,88,112,192,88,144,192,88,176,192,88,208,192,88,240,192,88,16,193,88,48,193,88,80,193,88,112,193,88, +16,194,88,48,194,88,80,194,88,112,194,88,143,194,88,170,194,88,198,194,88,227,194,88,16,196,88,48,196,88,80,196,88,112,196,88,143,196,88,170,196,88,198,196,88,227,196,88, +16,198,88,48,198,88,80,198,88,112,198,88,143,198,88,170,198,88,198,198,88,227,198,88,74,208,88,105,208,88,136,208,88,167,208,88,198,208,88,229,208,88,4,209,88,35,209,88, +66,209,88,97,209,88,68,224,88,99,224,88,131,224,88,162,224,88,193,224,88,225,224,88,0,225,88,32,225,88,80,0,89,112,0,89,144,0,89,176,0,89,208,0,89,240,0,89, +16,1,89,48,1,89,80,1,89,112,1,89,16,2,89,48,2,89,80,2,89,112,2,89,143,2,89,170,2,89,198,2,89,227,2,89,16,4,89,48,4,89,80,4,89,112,4,89, +143,4,89,170,4,89,198,4,89,227,4,89,16,6,89,48,6,89,80,6,89,112,6,89,143,6,89,170,6,89,198,6,89,227,6,89,16,8,89,48,8,89,80,8,89,112,8,89, +143,8,89,170,8,89,198,8,89,227,8,89,74,16,89,105,16,89,136,16,89,167,16,89,198,16,89,229,16,89,4,17,89,35,17,89,66,17,89,97,17,89,68,32,89,99,32,89, +131,32,89,162,32,89,193,32,89,225,32,89,0,33,89,32,33,89,80,64,89,112,64,89,144,64,89,176,64,89,208,64,89,240,64,89,15,65,89,46,65,89,76,65,89,107,65,89, +16,66,89,48,66,89,79,66,89,108,66,89,137,66,89,166,66,89,195,66,89,224,66,89,16,68,89,48,68,89,79,68,89,108,68,89,137,68,89,166,68,89,195,68,89,224,68,89, +16,70,89,48,70,89,79,70,89,108,70,89,137,70,89,166,70,89,195,70,89,224,70,89,16,72,89,48,72,89,79,72,89,108,72,89,137,72,89,166,72,89,195,72,89,224,72,89, +70,80,89,101,80,89,132,80,89,163,80,89,195,80,89,226,80,89,1,81,89,32,81,89,64,81,89,48,0,90,80,0,90,112,0,90,144,0,90,176,0,90,208,0,90,240,0,90, +16,1,90,48,1,90,80,1,90,112,1,90,48,16,90,80,16,90,112,16,90,144,16,90,176,16,90,208,16,90,240,16,90,12,17,90,39,17,90,68,17,90,48,32,90,80,32,90, 
+112,32,90,144,32,90,174,32,90,203,32,90,233,32,90,5,33,90,34,33,90,64,33,90,48,64,90,80,64,90,112,64,90,144,64,90,176,64,90,208,64,90,240,64,90,16,65,90, +48,65,90,77,65,90,103,65,90,16,72,90,48,72,90,80,72,90,106,72,90,131,72,90,48,80,90,79,80,90,109,80,90,139,80,90,169,80,90,198,80,90,228,80,90,2,81,90, +32,81,90,41,96,90,71,96,90,102,96,90,133,96,90,163,96,90,193,96,90,224,96,90,48,128,90,80,128,90,112,128,90,144,128,90,176,128,90,208,128,90,240,128,90,16,129,90, +48,129,90,77,129,90,103,129,90,16,130,90,48,130,90,80,130,90,106,130,90,131,130,90,16,132,90,48,132,90,80,132,90,106,132,90,131,132,90,16,134,90,48,134,90,80,134,90, +106,134,90,131,134,90,48,144,90,79,144,90,109,144,90,139,144,90,169,144,90,198,144,90,228,144,90,2,145,90,32,145,90,41,160,90,71,160,90,102,160,90,133,160,90,163,160,90, +193,160,90,224,160,90,48,192,90,80,192,90,112,192,90,144,192,90,176,192,90,208,192,90,239,192,90,11,193,90,40,193,90,70,193,90,98,193,90,16,194,90,48,194,90,74,194,90, +100,194,90,16,196,90,48,196,90,74,196,90,100,196,90,16,198,90,48,198,90,74,198,90,100,198,90,41,208,90,71,208,90,102,208,90,133,208,90,163,208,90,193,208,90,224,208,90, +35,224,90,66,224,90,97,224,90,128,224,90,48,0,91,80,0,91,112,0,91,144,0,91,176,0,91,208,0,91,239,0,91,11,1,91,40,1,91,70,1,91,98,1,91,16,2,91, +48,2,91,74,2,91,100,2,91,16,4,91,48,4,91,74,4,91,100,4,91,16,6,91,48,6,91,74,6,91,100,6,91,16,8,91,48,8,91,74,8,91,100,8,91,41,16,91, +71,16,91,102,16,91,133,16,91,163,16,91,193,16,91,224,16,91,35,32,91,66,32,91,97,32,91,128,32,91,48,64,91,80,64,91,112,64,91,144,64,91,173,64,91,203,64,91, +233,64,91,6,65,91,36,65,91,66,65,91,96,65,91,16,66,91,43,66,91,70,66,91,97,66,91,16,68,91,43,68,91,70,68,91,97,68,91,16,70,91,43,70,91,70,70,91, +97,70,91,16,72,91,43,72,91,70,72,91,97,72,91,37,80,91,67,80,91,98,80,91,129,80,91,160,80,91,16,0,92,48,0,92,80,0,92,112,0,92,144,0,92,176,0,92, 
+208,0,92,240,0,92,14,1,92,16,16,92,48,16,92,80,16,92,112,16,92,144,16,92,172,16,92,198,16,92,225,16,92,16,32,92,48,32,92,80,32,92,109,32,92,138,32,92, +165,32,92,193,32,92,16,64,92,48,64,92,80,64,92,112,64,92,144,64,92,176,64,92,207,64,92,234,64,92,3,65,92,16,72,92,48,72,92,65,72,92,16,80,92,46,80,92, +75,80,92,104,80,92,133,80,92,162,80,92,10,96,92,39,96,92,68,96,92,98,96,92,129,96,92,16,128,92,48,128,92,80,128,92,112,128,92,144,128,92,176,128,92,207,128,92, +234,128,92,3,129,92,16,130,92,48,130,92,65,130,92,16,132,92,48,132,92,65,132,92,16,134,92,48,134,92,65,134,92,16,144,92,46,144,92,75,144,92,104,144,92,133,144,92, +162,144,92,10,160,92,39,160,92,68,160,92,98,160,92,129,160,92,16,192,92,48,192,92,80,192,92,112,192,92,144,192,92,171,192,92,199,192,92,228,192,92,16,194,92,40,194,92, +16,196,92,40,196,92,16,198,92,40,198,92,10,208,92,39,208,92,68,208,92,98,208,92,129,208,92,4,224,92,34,224,92,64,224,92,16,0,93,48,0,93,80,0,93,112,0,93, +144,0,93,171,0,93,199,0,93,228,0,93,16,2,93,40,2,93,16,4,93,40,4,93,16,6,93,40,6,93,16,8,93,40,8,93,10,16,93,39,16,93,68,16,93,98,16,93, +129,16,93,4,32,93,34,32,93,64,32,93,16,64,93,48,64,93,79,64,93,108,64,93,138,64,93,166,64,93,195,64,93,225,64,93,15,66,93,36,66,93,15,68,93,36,68,93, +15,70,93,36,70,93,15,72,93,36,72,93,6,80,93,35,80,93,65,80,93,96,80,93,16,0,94,48,0,94,80,0,94,112,0,94,144,0,94,176,0,94,16,16,94,48,16,94, +80,16,94,109,16,94,135,16,94,16,32,94,48,32,94,75,32,94,102,32,94,130,32,94,16,64,94,48,64,94,80,64,94,112,64,94,144,64,94,167,64,94,16,72,94,33,72,94, +16,80,94,43,80,94,70,80,94,99,80,94,128,80,94,9,96,94,36,96,94,65,96,94,16,128,94,48,128,94,80,128,94,112,128,94,144,128,94,167,128,94,16,130,94,33,130,94, +16,132,94,33,132,94,16,134,94,33,134,94,16,144,94,43,144,94,70,144,94,99,144,94,128,144,94,9,160,94,36,160,94,65,160,94,16,192,94,48,192,94,80,192,94,108,192,94, 
+136,192,94,162,192,94,16,194,94,16,196,94,16,198,94,9,208,94,36,208,94,65,208,94,3,224,94,32,224,94,16,0,95,48,0,95,80,0,95,108,0,95,136,0,95,162,0,95, +16,2,95,16,4,95,16,6,95,16,8,95,9,16,95,36,16,95,65,16,95,3,32,95,32,32,95,16,64,95,47,64,95,75,64,95,103,64,95,132,64,95,160,64,95,10,66,95, +10,68,95,10,70,95,10,72,95,4,80,95,33,80,95,16,0,96,48,0,96,80,0,96,112,0,96,144,0,96,16,16,96,48,16,96,76,16,96,99,16,96,16,32,96,44,32,96, +69,32,96,16,64,96,48,64,96,80,64,96,108,64,96,132,64,96,15,80,96,39,80,96,66,80,96,7,96,96,34,96,96,16,128,96,48,128,96,80,128,96,108,128,96,132,128,96, +15,144,96,39,144,96,66,144,96,7,160,96,34,160,96,16,192,96,48,192,96,75,192,96,101,192,96,128,192,96,7,208,96,34,208,96,2,224,96,16,0,97,48,0,97,75,0,97, +101,0,97,128,0,97,7,16,97,34,16,97,2,32,97,16,64,97,44,64,97,70,64,97,98,64,97,3,80,97,16,0,98,48,0,98,80,0,98,16,16,98,47,16,98,67,16,98, +16,32,98,39,32,98,16,64,98,48,64,98,76,64,98,13,80,98,35,80,98,6,96,98,16,128,98,48,128,98,76,128,98,13,144,98,35,144,98,6,160,98,16,192,98,45,192,98, +69,192,98,6,208,98,1,224,98,16,0,99,45,0,99,69,0,99,6,16,99,1,32,99,16,64,99,40,64,99,66,64,99,2,80,99,16,0,100,48,0,100,78,0,100,16,16,100, +40,16,100,16,32,100,35,32,100,16,64,100,48,64,100,67,64,100,11,80,100,32,80,100,4,96,100,16,128,100,48,128,100,67,128,100,11,144,100,32,144,100,4,160,100,16,192,100, +41,192,100,4,208,100,0,224,100,16,0,101,41,0,101,4,16,101,0,32,101,15,64,101,36,64,101,1,80,101,16,0,102,48,0,102,16,16,102,33,16,102,14,32,102,16,64,102, +42,64,102,9,80,102,3,96,102,16,128,102,42,128,102,9,144,102,3,160,102,16,192,102,36,192,102,3,208,102,16,0,103,36,0,103,3,16,103,13,64,103,33,64,103,0,80,103, +16,0,104,46,0,104,16,16,104,11,32,104,16,64,104,35,64,104,6,80,104,1,96,104,16,128,104,35,128,104,6,144,104,1,160,104,16,192,104,1,208,104,16,0,105,1,16,105, +11,64,105,80,0,110,112,0,110,144,0,110,176,0,110,208,0,110,240,0,110,16,1,110,48,1,110,80,1,110,112,1,110,80,16,110,112,16,110,144,16,110,176,16,110,208,16,110, 
+240,16,110,16,17,110,48,17,110,80,17,110,109,17,110,80,32,110,112,32,110,144,32,110,176,32,110,208,32,110,240,32,110,13,33,110,43,33,110,73,33,110,102,33,110,80,64,110, +112,64,110,144,64,110,176,64,110,208,64,110,240,64,110,16,65,110,48,65,110,80,65,110,112,65,110,16,72,110,48,72,110,80,72,110,112,72,110,144,72,110,170,72,110,195,72,110, +80,80,110,112,80,110,143,80,110,173,80,110,203,80,110,234,80,110,8,81,110,38,81,110,68,81,110,99,81,110,73,96,110,104,96,110,135,96,110,166,96,110,197,96,110,228,96,110, +2,97,110,33,97,110,64,97,110,80,128,110,112,128,110,144,128,110,176,128,110,208,128,110,240,128,110,16,129,110,48,129,110,80,129,110,112,129,110,16,130,110,48,130,110,80,130,110, +112,130,110,144,130,110,170,130,110,195,130,110,16,132,110,48,132,110,80,132,110,112,132,110,144,132,110,170,132,110,195,132,110,16,134,110,48,134,110,80,134,110,112,134,110,144,134,110, +170,134,110,195,134,110,80,144,110,112,144,110,143,144,110,173,144,110,203,144,110,234,144,110,8,145,110,38,145,110,68,145,110,99,145,110,73,160,110,104,160,110,135,160,110,166,160,110, +197,160,110,228,160,110,2,161,110,33,161,110,64,161,110,80,192,110,112,192,110,144,192,110,176,192,110,208,192,110,240,192,110,16,193,110,48,193,110,79,193,110,108,193,110,16,194,110, +48,194,110,80,194,110,109,194,110,138,194,110,164,194,110,16,196,110,48,196,110,80,196,110,109,196,110,138,196,110,164,196,110,16,198,110,48,198,110,80,198,110,109,198,110,138,198,110, +164,198,110,73,208,110,104,208,110,135,208,110,166,208,110,197,208,110,228,208,110,2,209,110,33,209,110,64,209,110,67,224,110,99,224,110,130,224,110,161,224,110,192,224,110,224,224,110, +80,0,111,112,0,111,144,0,111,176,0,111,208,0,111,240,0,111,16,1,111,48,1,111,79,1,111,108,1,111,16,2,111,48,2,111,80,2,111,109,2,111,138,2,111,164,2,111, +16,4,111,48,4,111,80,4,111,109,4,111,138,4,111,164,4,111,16,6,111,48,6,111,80,6,111,109,6,111,138,6,111,164,6,111,16,8,111,48,8,111,80,8,111,109,8,111, 
+138,8,111,164,8,111,73,16,111,104,16,111,135,16,111,166,16,111,197,16,111,228,16,111,2,17,111,33,17,111,64,17,111,67,32,111,99,32,111,130,32,111,161,32,111,192,32,111, +224,32,111,80,64,111,112,64,111,144,64,111,176,64,111,208,64,111,238,64,111,12,65,111,42,65,111,73,65,111,103,65,111,16,66,111,48,66,111,76,66,111,104,66,111,133,66,111, +161,66,111,16,68,111,48,68,111,76,68,111,104,68,111,133,68,111,161,68,111,16,70,111,48,70,111,76,70,111,104,70,111,133,70,111,161,70,111,16,72,111,48,72,111,76,72,111, +104,72,111,133,72,111,161,72,111,69,80,111,100,80,111,131,80,111,162,80,111,193,80,111,225,80,111,0,81,111,48,0,112,80,0,112,112,0,112,144,0,112,176,0,112,208,0,112, +240,0,112,16,1,112,48,1,112,48,16,112,80,16,112,112,16,112,144,16,112,176,16,112,205,16,112,233,16,112,3,17,112,48,32,112,80,32,112,112,32,112,142,32,112,170,32,112, +198,32,112,227,32,112,48,64,112,80,64,112,112,64,112,144,64,112,176,64,112,208,64,112,240,64,112,12,65,112,39,65,112,16,72,112,48,72,112,74,72,112,48,80,112,77,80,112, +106,80,112,136,80,112,165,80,112,195,80,112,224,80,112,40,96,112,70,96,112,100,96,112,131,96,112,161,96,112,48,128,112,80,128,112,112,128,112,144,128,112,176,128,112,208,128,112, +240,128,112,12,129,112,39,129,112,16,130,112,48,130,112,74,130,112,16,132,112,48,132,112,74,132,112,16,134,112,48,134,112,74,134,112,48,144,112,77,144,112,106,144,112,136,144,112, +165,144,112,195,144,112,224,144,112,40,160,112,70,160,112,100,160,112,131,160,112,161,160,112,48,192,112,80,192,112,112,192,112,144,192,112,176,192,112,204,192,112,233,192,112,5,193,112, +34,193,112,16,194,112,44,194,112,68,194,112,16,196,112,44,196,112,68,196,112,16,198,112,44,198,112,68,198,112,40,208,112,70,208,112,100,208,112,131,208,112,161,208,112,35,224,112, +65,224,112,96,224,112,48,0,113,80,0,113,112,0,113,144,0,113,176,0,113,204,0,113,233,0,113,5,1,113,34,1,113,16,2,113,44,2,113,68,2,113,16,4,113,44,4,113, 
+68,4,113,16,6,113,44,6,113,68,6,113,16,8,113,44,8,113,68,8,113,40,16,113,70,16,113,100,16,113,131,16,113,161,16,113,35,32,113,65,32,113,96,32,113,48,64,113, +80,64,113,111,64,113,141,64,113,170,64,113,199,64,113,229,64,113,2,65,113,16,66,113,39,66,113,65,66,113,16,68,113,39,68,113,65,68,113,16,70,113,39,70,113,65,70,113, +16,72,113,39,72,113,65,72,113,36,80,113,66,80,113,97,80,113,128,80,113,16,0,114,48,0,114,80,0,114,112,0,114,144,0,114,176,0,114,208,0,114,16,16,114,48,16,114, +80,16,114,112,16,114,139,16,114,163,16,114,16,32,114,48,32,114,77,32,114,104,32,114,133,32,114,16,64,114,48,64,114,80,64,114,112,64,114,144,64,114,172,64,114,196,64,114, +16,72,114,38,72,114,16,80,114,44,80,114,72,80,114,100,80,114,129,80,114,9,96,114,37,96,114,66,96,114,96,96,114,16,128,114,48,128,114,80,128,114,112,128,114,144,128,114, +172,128,114,196,128,114,16,130,114,38,130,114,16,132,114,38,132,114,16,134,114,38,134,114,16,144,114,44,144,114,72,144,114,100,144,114,129,144,114,9,160,114,37,160,114,66,160,114, +96,160,114,16,192,114,48,192,114,80,192,114,110,192,114,139,192,114,165,192,114,192,192,114,16,194,114,33,194,114,16,196,114,33,196,114,16,198,114,33,198,114,9,208,114,37,208,114, +66,208,114,96,208,114,3,224,114,33,224,114,16,0,115,48,0,115,80,0,115,110,0,115,139,0,115,165,0,115,192,0,115,16,2,115,33,2,115,16,4,115,33,4,115,16,6,115, +33,6,115,16,8,115,33,8,115,9,16,115,37,16,115,66,16,115,96,16,115,3,32,115,33,32,115,16,64,115,48,64,115,76,64,115,105,64,115,134,64,115,162,64,115,12,66,115, +12,68,115,12,70,115,12,72,115,5,80,115,34,80,115,64,80,115,16,0,116,48,0,116,80,0,116,112,0,116,144,0,116,16,16,116,48,16,116,77,16,116,100,16,116,16,32,116, +45,32,116,70,32,116,96,32,116,16,64,116,48,64,116,80,64,116,109,64,116,135,64,116,16,80,116,40,80,116,67,80,116,8,96,116,34,96,116,16,128,116,48,128,116,80,128,116, 
+109,128,116,135,128,116,16,144,116,40,144,116,67,144,116,8,160,116,34,160,116,16,192,116,48,192,116,76,192,116,102,192,116,130,192,116,8,208,116,34,208,116,2,224,116,16,0,117, +48,0,117,76,0,117,102,0,117,130,0,117,8,16,117,34,16,117,2,32,117,16,64,117,44,64,117,71,64,117,99,64,117,4,80,117,32,80,117,16,0,118,48,0,118,80,0,118, +16,16,118,47,16,118,67,16,118,16,32,118,39,32,118,16,64,118,48,64,118,76,64,118,13,80,118,35,80,118,6,96,118,16,128,118,48,128,118,76,128,118,13,144,118,35,144,118, +6,160,118,16,192,118,45,192,118,69,192,118,6,208,118,1,224,118,16,0,119,45,0,119,69,0,119,6,16,119,1,32,119,16,64,119,40,64,119,66,64,119,2,80,119,16,0,120, +48,0,120,16,16,120,39,16,120,16,32,120,34,32,120,16,64,120,48,64,120,10,80,120,4,96,120,16,128,120,48,128,120,10,144,120,4,160,120,16,192,120,40,192,120,4,208,120, +0,224,120,16,0,121,40,0,121,4,16,121,0,32,121,15,64,121,36,64,121,1,80,121,16,0,122,48,0,122,16,16,122,13,32,122,16,64,122,39,64,122,8,80,122,2,96,122, +16,128,122,39,128,122,8,144,122,2,160,122,16,192,122,34,192,122,2,208,122,16,0,123,34,0,123,2,16,123,12,64,123,32,64,123,0,80,123,16,0,124,16,16,124,10,32,124, +16,64,124,5,80,124,1,96,124,16,128,124,5,144,124,1,160,124,16,192,124,1,208,124,16,0,125,1,16,125,10,64,125,48,0,132,80,0,132,112,0,132,144,0,132,176,0,132, +208,0,132,240,0,132,16,1,132,48,1,132,80,1,132,112,1,132,48,16,132,80,16,132,112,16,132,144,16,132,176,16,132,208,16,132,240,16,132,16,17,132,45,17,132,74,17,132, +102,17,132,48,32,132,80,32,132,112,32,132,144,32,132,176,32,132,206,32,132,236,32,132,9,33,132,38,33,132,68,33,132,97,33,132,48,64,132,80,64,132,112,64,132,144,64,132, +176,64,132,208,64,132,240,64,132,16,65,132,48,65,132,80,65,132,111,65,132,16,72,132,48,72,132,80,72,132,112,72,132,138,72,132,161,72,132,48,80,132,80,80,132,111,80,132, +141,80,132,171,80,132,201,80,132,231,80,132,5,81,132,35,81,132,65,81,132,42,96,132,72,96,132,103,96,132,134,96,132,164,96,132,195,96,132,226,96,132,0,97,132,48,128,132, 
+80,128,132,112,128,132,144,128,132,176,128,132,208,128,132,240,128,132,16,129,132,48,129,132,80,129,132,111,129,132,16,130,132,48,130,132,80,130,132,112,130,132,138,130,132,161,130,132, +16,132,132,48,132,132,80,132,132,112,132,132,138,132,132,161,132,132,16,134,132,48,134,132,80,134,132,112,134,132,138,134,132,161,134,132,48,144,132,80,144,132,111,144,132,141,144,132, +171,144,132,201,144,132,231,144,132,5,145,132,35,145,132,65,145,132,42,160,132,72,160,132,103,160,132,134,160,132,164,160,132,195,160,132,226,160,132,0,161,132,48,192,132,80,192,132, +112,192,132,144,192,132,176,192,132,208,192,132,240,192,132,15,193,132,44,193,132,74,193,132,103,193,132,16,194,132,48,194,132,78,194,132,105,194,132,132,194,132,16,196,132,48,196,132, +78,196,132,105,196,132,132,196,132,16,198,132,48,198,132,78,198,132,105,198,132,132,198,132,42,208,132,72,208,132,103,208,132,134,208,132,164,208,132,195,208,132,226,208,132,0,209,132, +36,224,132,67,224,132,98,224,132,129,224,132,160,224,132,48,0,133,80,0,133,112,0,133,144,0,133,176,0,133,208,0,133,240,0,133,15,1,133,44,1,133,74,1,133,103,1,133, +16,2,133,48,2,133,78,2,133,105,2,133,132,2,133,16,4,133,48,4,133,78,4,133,105,4,133,132,4,133,16,6,133,48,6,133,78,6,133,105,6,133,132,6,133,16,8,133, +48,8,133,78,8,133,105,8,133,132,8,133,42,16,133,72,16,133,103,16,133,134,16,133,164,16,133,195,16,133,226,16,133,0,17,133,36,32,133,67,32,133,98,32,133,129,32,133, +160,32,133,48,64,133,80,64,133,112,64,133,144,64,133,175,64,133,205,64,133,236,64,133,9,65,133,39,65,133,70,65,133,99,65,133,16,66,133,45,66,133,73,66,133,100,66,133, +129,66,133,16,68,133,45,68,133,73,68,133,100,68,133,129,68,133,16,70,133,45,70,133,73,70,133,100,70,133,129,70,133,16,72,133,45,72,133,73,72,133,100,72,133,129,72,133, +37,80,133,68,80,133,99,80,133,130,80,133,161,80,133,192,80,133,16,0,134,48,0,134,80,0,134,112,0,134,144,0,134,176,0,134,208,0,134,240,0,134,14,1,134,16,16,134, 
+48,16,134,80,16,134,112,16,134,144,16,134,172,16,134,198,16,134,225,16,134,16,32,134,48,32,134,80,32,134,109,32,134,138,32,134,165,32,134,193,32,134,16,64,134,48,64,134, +80,64,134,112,64,134,144,64,134,176,64,134,207,64,134,234,64,134,3,65,134,16,72,134,48,72,134,65,72,134,16,80,134,46,80,134,75,80,134,104,80,134,133,80,134,162,80,134, +10,96,134,39,96,134,68,96,134,98,96,134,129,96,134,16,128,134,48,128,134,80,128,134,112,128,134,144,128,134,176,128,134,207,128,134,234,128,134,3,129,134,16,130,134,48,130,134, +65,130,134,16,132,134,48,132,134,65,132,134,16,134,134,48,134,134,65,134,134,16,144,134,46,144,134,75,144,134,104,144,134,133,144,134,162,144,134,10,160,134,39,160,134,68,160,134, +98,160,134,129,160,134,16,192,134,48,192,134,80,192,134,112,192,134,144,192,134,171,192,134,199,192,134,228,192,134,16,194,134,40,194,134,16,196,134,40,196,134,16,198,134,40,198,134, +10,208,134,39,208,134,68,208,134,98,208,134,129,208,134,4,224,134,34,224,134,64,224,134,16,0,135,48,0,135,80,0,135,112,0,135,144,0,135,171,0,135,199,0,135,228,0,135, +16,2,135,40,2,135,16,4,135,40,4,135,16,6,135,40,6,135,16,8,135,40,8,135,10,16,135,39,16,135,68,16,135,98,16,135,129,16,135,4,32,135,34,32,135,64,32,135, +16,64,135,48,64,135,79,64,135,108,64,135,138,64,135,166,64,135,195,64,135,225,64,135,15,66,135,36,66,135,15,68,135,36,68,135,15,70,135,36,70,135,15,72,135,36,72,135, +6,80,135,35,80,135,65,80,135,96,80,135,16,0,136,48,0,136,80,0,136,112,0,136,144,0,136,174,0,136,16,16,136,48,16,136,80,16,136,106,16,136,131,16,136,16,32,136, +47,32,136,73,32,136,100,32,136,16,64,136,48,64,136,80,64,136,112,64,136,140,64,136,163,64,136,16,72,136,16,80,136,41,80,136,69,80,136,97,80,136,8,96,136,35,96,136, +64,96,136,16,128,136,48,128,136,80,128,136,112,128,136,140,128,136,163,128,136,16,130,136,16,132,136,16,134,136,16,144,136,41,144,136,69,144,136,97,144,136,8,160,136,35,160,136, 
+64,160,136,16,192,136,48,192,136,79,192,136,106,192,136,133,192,136,14,194,136,14,196,136,14,198,136,8,208,136,35,208,136,64,208,136,3,224,136,32,224,136,16,0,137,48,0,137, +79,0,137,106,0,137,133,0,137,14,2,137,14,4,137,14,6,137,14,8,137,8,16,137,35,16,137,64,16,137,3,32,137,32,32,137,16,64,137,46,64,137,73,64,137,101,64,137, +130,64,137,9,66,137,9,68,137,9,70,137,9,72,137,4,80,137,32,80,137,16,0,138,48,0,138,80,0,138,112,0,138,16,16,138,48,16,138,70,16,138,16,32,138,41,32,138, +65,32,138,16,64,138,48,64,138,79,64,138,100,64,138,14,80,138,37,80,138,6,96,138,32,96,138,16,128,138,48,128,138,79,128,138,100,128,138,14,144,138,37,144,138,6,160,138, +32,160,138,16,192,138,47,192,138,71,192,138,96,192,138,6,208,138,32,208,138,1,224,138,16,0,139,47,0,139,71,0,139,96,0,139,6,16,139,32,16,139,1,32,139,16,64,139, +41,64,139,67,64,139,3,80,139,16,0,140,48,0,140,78,0,140,16,16,140,40,16,140,16,32,140,35,32,140,16,64,140,48,64,140,67,64,140,11,80,140,32,80,140,4,96,140, +16,128,140,48,128,140,67,128,140,11,144,140,32,144,140,4,160,140,16,192,140,41,192,140,4,208,140,0,224,140,16,0,141,41,0,141,4,16,141,0,32,141,15,64,141,36,64,141, +1,80,141,16,0,142,48,0,142,16,16,142,13,32,142,16,64,142,39,64,142,8,80,142,2,96,142,16,128,142,39,128,142,8,144,142,2,160,142,16,192,142,34,192,142,2,208,142, +16,0,143,34,0,143,2,16,143,12,64,143,32,64,143,0,80,143,16,0,144,16,16,144,9,32,144,16,64,144,5,80,144,0,96,144,16,128,144,5,144,144,0,160,144,15,192,144, +0,208,144,15,0,145,0,16,145,9,64,145,48,0,154,80,0,154,112,0,154,144,0,154,176,0,154,208,0,154,240,0,154,16,1,154,48,1,154,80,1,154,112,1,154,48,16,154, +80,16,154,112,16,154,144,16,154,176,16,154,208,16,154,240,16,154,12,17,154,39,17,154,68,17,154,48,32,154,80,32,154,112,32,154,144,32,154,174,32,154,203,32,154,233,32,154, +5,33,154,34,33,154,64,33,154,48,64,154,80,64,154,112,64,154,144,64,154,176,64,154,208,64,154,240,64,154,16,65,154,48,65,154,77,65,154,103,65,154,16,72,154,48,72,154, 
+80,72,154,106,72,154,131,72,154,48,80,154,79,80,154,109,80,154,139,80,154,169,80,154,198,80,154,228,80,154,2,81,154,32,81,154,41,96,154,71,96,154,102,96,154,133,96,154, +163,96,154,193,96,154,224,96,154,48,128,154,80,128,154,112,128,154,144,128,154,176,128,154,208,128,154,240,128,154,16,129,154,48,129,154,77,129,154,103,129,154,16,130,154,48,130,154, +80,130,154,106,130,154,131,130,154,16,132,154,48,132,154,80,132,154,106,132,154,131,132,154,16,134,154,48,134,154,80,134,154,106,134,154,131,134,154,48,144,154,79,144,154,109,144,154, +139,144,154,169,144,154,198,144,154,228,144,154,2,145,154,32,145,154,41,160,154,71,160,154,102,160,154,133,160,154,163,160,154,193,160,154,224,160,154,48,192,154,80,192,154,112,192,154, +144,192,154,176,192,154,208,192,154,239,192,154,11,193,154,40,193,154,70,193,154,98,193,154,16,194,154,48,194,154,74,194,154,100,194,154,16,196,154,48,196,154,74,196,154,100,196,154, +16,198,154,48,198,154,74,198,154,100,198,154,41,208,154,71,208,154,102,208,154,133,208,154,163,208,154,193,208,154,224,208,154,35,224,154,66,224,154,97,224,154,128,224,154,48,0,155, +80,0,155,112,0,155,144,0,155,176,0,155,208,0,155,239,0,155,11,1,155,40,1,155,70,1,155,98,1,155,16,2,155,48,2,155,74,2,155,100,2,155,16,4,155,48,4,155, +74,4,155,100,4,155,16,6,155,48,6,155,74,6,155,100,6,155,16,8,155,48,8,155,74,8,155,100,8,155,41,16,155,71,16,155,102,16,155,133,16,155,163,16,155,193,16,155, +224,16,155,35,32,155,66,32,155,97,32,155,128,32,155,48,64,155,80,64,155,112,64,155,144,64,155,173,64,155,203,64,155,233,64,155,6,65,155,36,65,155,66,65,155,96,65,155, +16,66,155,43,66,155,70,66,155,97,66,155,16,68,155,43,68,155,70,68,155,97,68,155,16,70,155,43,70,155,70,70,155,97,70,155,16,72,155,43,72,155,70,72,155,97,72,155, +37,80,155,67,80,155,98,80,155,129,80,155,160,80,155,16,0,156,48,0,156,80,0,156,112,0,156,144,0,156,176,0,156,208,0,156,16,16,156,48,16,156,80,16,156,112,16,156, 
+141,16,156,165,16,156,16,32,156,48,32,156,78,32,156,106,32,156,134,32,156,161,32,156,16,64,156,48,64,156,80,64,156,112,64,156,144,64,156,174,64,156,199,64,156,16,72,156, +40,72,156,16,80,156,44,80,156,73,80,156,101,80,156,130,80,156,10,96,156,37,96,156,67,96,156,97,96,156,16,128,156,48,128,156,80,128,156,112,128,156,144,128,156,174,128,156, +199,128,156,16,130,156,40,130,156,16,132,156,40,132,156,16,134,156,40,134,156,16,144,156,44,144,156,73,144,156,101,144,156,130,144,156,10,160,156,37,160,156,67,160,156,97,160,156, +16,192,156,48,192,156,80,192,156,112,192,156,140,192,156,167,192,156,194,192,156,16,194,156,35,194,156,16,196,156,35,196,156,16,198,156,35,198,156,10,208,156,37,208,156,67,208,156, +97,208,156,4,224,156,33,224,156,16,0,157,48,0,157,80,0,157,112,0,157,140,0,157,167,0,157,194,0,157,16,2,157,35,2,157,16,4,157,35,4,157,16,6,157,35,6,157, +16,8,157,35,8,157,10,16,157,37,16,157,67,16,157,97,16,157,4,32,157,33,32,157,16,64,157,48,64,157,77,64,157,106,64,157,135,64,157,163,64,157,192,64,157,12,66,157, +32,66,157,12,68,157,32,68,157,12,70,157,32,70,157,12,72,157,32,72,157,5,80,157,34,80,157,64,80,157,16,0,158,48,0,158,80,0,158,112,0,158,144,0,158,16,16,158, +48,16,158,76,16,158,99,16,158,16,32,158,44,32,158,69,32,158,16,64,158,48,64,158,80,64,158,108,64,158,132,64,158,15,80,158,39,80,158,66,80,158,7,96,158,34,96,158, +16,128,158,48,128,158,80,128,158,108,128,158,132,128,158,15,144,158,39,144,158,66,144,158,7,160,158,34,160,158,16,192,158,48,192,158,75,192,158,101,192,158,128,192,158,7,208,158, +34,208,158,2,224,158,16,0,159,48,0,159,75,0,159,101,0,159,128,0,159,7,16,159,34,16,159,2,32,159,16,64,159,44,64,159,70,64,159,98,64,159,3,80,159,16,0,160, +48,0,160,80,0,160,16,16,160,44,16,160,16,32,160,37,32,160,16,64,160,48,64,160,71,64,160,12,80,160,34,80,160,5,96,160,16,128,160,48,128,160,71,128,160,12,144,160, 
+34,144,160,5,160,160,16,192,160,43,192,160,66,192,160,5,208,160,1,224,160,16,0,161,43,0,161,66,0,161,5,16,161,1,32,161,16,64,161,38,64,161,64,64,161,2,80,161, +16,0,162,48,0,162,16,16,162,33,16,162,14,32,162,16,64,162,42,64,162,9,80,162,3,96,162,16,128,162,42,128,162,9,144,162,3,160,162,16,192,162,36,192,162,3,208,162, +16,0,163,36,0,163,3,16,163,13,64,163,33,64,163,0,80,163,16,0,164,16,16,164,10,32,164,16,64,164,5,80,164,1,96,164,16,128,164,5,144,164,1,160,164,16,192,164, +1,208,164,16,0,165,1,16,165,10,64,165,48,0,176,80,0,176,112,0,176,144,0,176,176,0,176,208,0,176,240,0,176,16,1,176,48,1,176,80,1,176,48,16,176,80,16,176, +112,16,176,144,16,176,176,16,176,208,16,176,236,16,176,6,17,176,33,17,176,48,32,176,80,32,176,112,32,176,143,32,176,171,32,176,200,32,176,229,32,176,1,33,176,48,64,176, +80,64,176,112,64,176,144,64,176,176,64,176,208,64,176,240,64,176,15,65,176,42,65,176,70,65,176,16,72,176,48,72,176,77,72,176,99,72,176,48,80,176,78,80,176,107,80,176, +137,80,176,166,80,176,196,80,176,226,80,176,40,96,176,70,96,176,101,96,176,131,96,176,161,96,176,192,96,176,48,128,176,80,128,176,112,128,176,144,128,176,176,128,176,208,128,176, +240,128,176,15,129,176,42,129,176,70,129,176,16,130,176,48,130,176,77,130,176,99,130,176,16,132,176,48,132,176,77,132,176,99,132,176,16,134,176,48,134,176,77,134,176,99,134,176, +48,144,176,78,144,176,107,144,176,137,144,176,166,144,176,196,144,176,226,144,176,40,160,176,70,160,176,101,160,176,131,160,176,161,160,176,192,160,176,48,192,176,80,192,176,112,192,176, +144,192,176,176,192,176,206,192,176,235,192,176,7,193,176,36,193,176,65,193,176,16,194,176,46,194,176,70,194,176,16,196,176,46,196,176,70,196,176,16,198,176,46,198,176,70,198,176, +40,208,176,70,208,176,101,208,176,131,208,176,161,208,176,192,208,176,35,224,176,65,224,176,96,224,176,128,224,176,48,0,177,80,0,177,112,0,177,144,0,177,176,0,177,206,0,177, 
+235,0,177,7,1,177,36,1,177,65,1,177,16,2,177,46,2,177,70,2,177,16,4,177,46,4,177,70,4,177,16,6,177,46,6,177,70,6,177,16,8,177,46,8,177,70,8,177, +40,16,177,70,16,177,101,16,177,131,16,177,161,16,177,192,16,177,35,32,177,65,32,177,96,32,177,128,32,177,48,64,177,80,64,177,112,64,177,142,64,177,171,64,177,200,64,177, +230,64,177,3,65,177,33,65,177,16,66,177,41,66,177,67,66,177,16,68,177,41,68,177,67,68,177,16,70,177,41,70,177,67,70,177,16,72,177,41,72,177,67,72,177,36,80,177, +67,80,177,97,80,177,128,80,177,16,0,178,48,0,178,80,0,178,112,0,178,144,0,178,176,0,178,16,16,178,48,16,178,80,16,178,109,16,178,135,16,178,16,32,178,48,32,178, +75,32,178,102,32,178,130,32,178,16,64,178,48,64,178,80,64,178,112,64,178,144,64,178,167,64,178,16,72,178,33,72,178,16,80,178,43,80,178,70,80,178,99,80,178,128,80,178, +9,96,178,36,96,178,65,96,178,16,128,178,48,128,178,80,128,178,112,128,178,144,128,178,167,128,178,16,130,178,33,130,178,16,132,178,33,132,178,16,134,178,33,134,178,16,144,178, +43,144,178,70,144,178,99,144,178,128,144,178,9,160,178,36,160,178,65,160,178,16,192,178,48,192,178,80,192,178,108,192,178,136,192,178,162,192,178,16,194,178,16,196,178,16,198,178, +9,208,178,36,208,178,65,208,178,3,224,178,32,224,178,16,0,179,48,0,179,80,0,179,108,0,179,136,0,179,162,0,179,16,2,179,16,4,179,16,6,179,16,8,179,9,16,179, +36,16,179,65,16,179,3,32,179,32,32,179,16,64,179,47,64,179,75,64,179,103,64,179,132,64,179,160,64,179,10,66,179,10,68,179,10,70,179,10,72,179,4,80,179,33,80,179, +16,0,180,48,0,180,80,0,180,112,0,180,16,16,180,48,16,180,70,16,180,16,32,180,41,32,180,65,32,180,16,64,180,48,64,180,79,64,180,100,64,180,14,80,180,37,80,180, +6,96,180,32,96,180,16,128,180,48,128,180,79,128,180,100,128,180,14,144,180,37,144,180,6,160,180,32,160,180,16,192,180,47,192,180,71,192,180,96,192,180,6,208,180,32,208,180, +1,224,180,16,0,181,47,0,181,71,0,181,96,0,181,6,16,181,32,16,181,1,32,181,16,64,181,41,64,181,67,64,181,3,80,181,16,0,182,48,0,182,16,16,182,38,16,182, 
+16,32,182,33,32,182,16,64,182,47,64,182,10,80,182,4,96,182,16,128,182,47,128,182,10,144,182,4,160,182,16,192,182,39,192,182,4,208,182,0,224,182,16,0,183,39,0,183, +4,16,183,0,32,183,15,64,183,35,64,183,1,80,183,16,0,184,46,0,184,16,16,184,11,32,184,16,64,184,35,64,184,6,80,184,1,96,184,16,128,184,35,128,184,6,144,184, +1,160,184,16,192,184,1,208,184,16,0,185,1,16,185,11,64,185,48,0,198,80,0,198,112,0,198,144,0,198,176,0,198,208,0,198,240,0,198,16,1,198,46,1,198,48,16,198, +80,16,198,112,16,198,144,16,198,176,16,198,202,16,198,230,16,198,0,17,198,48,32,198,80,32,198,111,32,198,140,32,198,168,32,198,196,32,198,225,32,198,48,64,198,80,64,198, +112,64,198,144,64,198,176,64,198,208,64,198,239,64,198,9,65,198,35,65,198,16,72,198,48,72,198,71,72,198,47,80,198,76,80,198,105,80,198,135,80,198,164,80,198,193,80,198, +39,96,198,69,96,198,99,96,198,130,96,198,160,96,198,48,128,198,80,128,198,112,128,198,144,128,198,176,128,198,208,128,198,239,128,198,9,129,198,35,129,198,16,130,198,48,130,198, +71,130,198,16,132,198,48,132,198,71,132,198,16,134,198,48,134,198,71,134,198,47,144,198,76,144,198,105,144,198,135,144,198,164,144,198,193,144,198,39,160,198,69,160,198,99,160,198, +130,160,198,160,160,198,48,192,198,80,192,198,112,192,198,144,192,198,174,192,198,202,192,198,231,192,198,3,193,198,16,194,198,43,194,198,66,194,198,16,196,198,43,196,198,66,196,198, +16,198,198,43,198,198,66,198,198,39,208,198,69,208,198,99,208,198,130,208,198,160,208,198,34,224,198,65,224,198,96,224,198,48,0,199,80,0,199,112,0,199,144,0,199,174,0,199, +202,0,199,231,0,199,3,1,199,16,2,199,43,2,199,66,2,199,16,4,199,43,4,199,66,4,199,16,6,199,43,6,199,66,6,199,16,8,199,43,8,199,66,8,199,39,16,199, +69,16,199,99,16,199,130,16,199,160,16,199,34,32,199,65,32,199,96,32,199,48,64,199,80,64,199,110,64,199,140,64,199,169,64,199,198,64,199,227,64,199,0,65,199,16,66,199, 
+38,66,199,64,66,199,16,68,199,38,68,199,64,68,199,16,70,199,38,70,199,64,70,199,16,72,199,38,72,199,64,72,199,35,80,199,66,80,199,96,80,199,16,0,200,48,0,200, +80,0,200,112,0,200,144,0,200,16,16,200,48,16,200,80,16,200,104,16,200,129,16,200,16,32,200,47,32,200,72,32,200,99,32,200,16,64,200,48,64,200,80,64,200,112,64,200, +138,64,200,16,80,200,41,80,200,68,80,200,96,80,200,8,96,200,35,96,200,64,96,200,16,128,200,48,128,200,80,128,200,112,128,200,138,128,200,16,144,200,41,144,200,68,144,200, +96,144,200,8,160,200,35,160,200,64,160,200,16,192,200,48,192,200,78,192,200,105,192,200,132,192,200,8,208,200,35,208,200,64,208,200,3,224,200,16,0,201,48,0,201,78,0,201, +105,0,201,132,0,201,8,16,201,35,16,201,64,16,201,3,32,201,16,64,201,45,64,201,73,64,201,100,64,201,129,64,201,4,80,201,32,80,201,16,0,202,48,0,202,80,0,202, +16,16,202,45,16,202,64,16,202,16,32,202,38,32,202,16,64,202,48,64,202,73,64,202,12,80,202,34,80,202,5,96,202,16,128,202,48,128,202,73,128,202,12,144,202,34,144,202, +5,160,202,16,192,202,44,192,202,67,192,202,5,208,202,1,224,202,16,0,203,44,0,203,67,0,203,5,16,203,1,32,203,16,64,203,39,64,203,64,64,203,2,80,203,16,0,204, +48,0,204,16,16,204,32,16,204,14,32,204,16,64,204,41,64,204,8,80,204,3,96,204,16,128,204,41,128,204,8,144,204,3,160,204,16,192,204,35,192,204,3,208,204,16,0,205, +35,0,205,3,16,205,13,64,205,32,64,205,0,80,205,16,0,220,48,0,220,80,0,220,112,0,220,144,0,220,176,0,220,208,0,220,240,0,220,14,1,220,16,16,220,48,16,220, +80,16,220,112,16,220,144,16,220,172,16,220,198,16,220,225,16,220,16,32,220,48,32,220,80,32,220,109,32,220,138,32,220,165,32,220,193,32,220,16,64,220,48,64,220,80,64,220, +112,64,220,144,64,220,176,64,220,207,64,220,234,64,220,3,65,220,16,72,220,48,72,220,65,72,220,16,80,220,46,80,220,75,80,220,104,80,220,133,80,220,162,80,220,10,96,220, +39,96,220,68,96,220,98,96,220,129,96,220,16,128,220,48,128,220,80,128,220,112,128,220,144,128,220,176,128,220,207,128,220,234,128,220,3,129,220,16,130,220,48,130,220,65,130,220, 
+16,132,220,48,132,220,65,132,220,16,134,220,48,134,220,65,134,220,16,144,220,46,144,220,75,144,220,104,144,220,133,144,220,162,144,220,10,160,220,39,160,220,68,160,220,98,160,220, +129,160,220,16,192,220,48,192,220,80,192,220,112,192,220,144,192,220,171,192,220,199,192,220,228,192,220,16,194,220,40,194,220,16,196,220,40,196,220,16,198,220,40,198,220,10,208,220, +39,208,220,68,208,220,98,208,220,129,208,220,4,224,220,34,224,220,64,224,220,16,0,221,48,0,221,80,0,221,112,0,221,144,0,221,171,0,221,199,0,221,228,0,221,16,2,221, +40,2,221,16,4,221,40,4,221,16,6,221,40,6,221,16,8,221,40,8,221,10,16,221,39,16,221,68,16,221,98,16,221,129,16,221,4,32,221,34,32,221,64,32,221,16,64,221, +48,64,221,79,64,221,108,64,221,138,64,221,166,64,221,195,64,221,225,64,221,15,66,221,36,66,221,15,68,221,36,68,221,15,70,221,36,70,221,15,72,221,36,72,221,6,80,221, +35,80,221,65,80,221,96,80,221,16,0,222,48,0,222,80,0,222,112,0,222,144,0,222,16,16,222,48,16,222,76,16,222,99,16,222,16,32,222,44,32,222,69,32,222,16,64,222, +48,64,222,80,64,222,108,64,222,132,64,222,15,80,222,39,80,222,66,80,222,7,96,222,34,96,222,16,128,222,48,128,222,80,128,222,108,128,222,132,128,222,15,144,222,39,144,222, +66,144,222,7,160,222,34,160,222,16,192,222,48,192,222,75,192,222,101,192,222,128,192,222,7,208,222,34,208,222,2,224,222,16,0,223,48,0,223,75,0,223,101,0,223,128,0,223, +7,16,223,34,16,223,2,32,223,16,64,223,44,64,223,70,64,223,98,64,223,3,80,223,16,0,224,48,0,224,78,0,224,16,16,224,40,16,224,16,32,224,35,32,224,16,64,224, +48,64,224,67,64,224,11,80,224,32,80,224,4,96,224,16,128,224,48,128,224,67,128,224,11,144,224,32,144,224,4,160,224,16,192,224,41,192,224,4,208,224,0,224,224,16,0,225, +41,0,225,4,16,225,0,32,225,15,64,225,36,64,225,1,80,225,16,0,226,46,0,226,16,16,226,11,32,226,16,64,226,35,64,226,6,80,226,1,96,226,16,128,226,35,128,226, +6,144,226,1,160,226,16,192,226,1,208,226,16,0,227,1,16,227,11,64,227 +}; diff --git a/transcoder/basisu_etc1_mods.inl b/transcoder/basisu_etc1_mods.inl new file mode 
100644 index 0000000..572a816 --- /dev/null +++ b/transcoder/basisu_etc1_mods.inl @@ -0,0 +1,257 @@ +static const uint8_t g_etc1_mod_tabs[255][8] = { +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,1,1,}, +{0,0,0,0,0,0,0,1,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,0,}, +{0,0,0,0,0,0,0,1,}, +{0,0,0,0,0,0,1,1,}, +{0,0,0,0,0,1,1,1,}, +{0,0,0,0,0,1,1,1,}, +{0,0,0,0,1,1,1,1,}, +{0,0,0,0,1,1,1,1,}, +{0,0,0,0,1,1,1,1,}, +{0,0,0,0,1,1,1,1,}, +{0,0,0,1,1,1,1,1,}, +{0,0,0,1,1,1,1,1,}, +{0,0,0,1,1,1,1,1,}, +{0,0,0,1,1,1,1,2,}, +{0,0,0,1,1,1,2,2,}, +{0,0,0,1,1,1,2,2,}, +{0,0,0,1,1,2,2,2,}, +{0,0,1,1,1,2,2,2,}, +{0,0,1,1,1,2,2,2,}, +{0,0,1,1,1,2,2,2,}, +{0,0,1,1,1,2,2,2,}, +{0,0,1,1,2,2,2,2,}, +{0,0,1,1,2,2,2,2,}, +{0,0,1,1,2,2,2,2,}, +{0,0,1,1,2,2,2,2,}, +{0,0,1,1,2,2,2,2,}, +{0,0,1,1,2,2,2,3,}, +{0,0,1,1,2,2,3,3,}, +{0,0,1,2,2,2,3,3,}, +{0,0,1,2,2,2,3,3,}, +{0,0,1,2,2,2,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,2,3,3,3,}, +{0,1,1,2,3,3,3,3,}, +{0,1,1,2,3,3,3,3,}, +{0,1,1,2,3,3,3,3,}, +{0,1,2,2,3,3,3,3,}, +{0,1,2,2,3,3,3,4,}, +{0,1,2,2,3,3,3,4,}, +{0,1,2,2,3,3,4,4,}, +{0,1,2,2,3,3,4,4,}, +{0,1,2,2,3,3,4,4,}, +{0,1,2,2,3,3,4,4,}, +{0,1,2,2,3,3,4,4,}, +{0,1,2,2,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,3,4,4,4,}, +{0,1,2,3,4,4,4,4,}, +{0,1,2,3,4,4,4,5,}, +{0,1,2,3,4,4,4,5,}, +{0,1,2,3,4,4,4,5,}, +{0,1,2,3,4,4,5,5,}, +{0,1,2,3,4,4,5,5,}, +{0,1,2,3,4,4,5,5,}, +{0,1,2,3,4,4,5,5,}, +{0,2,2,3,4,4,5,5,}, +{0,2,2,3,4,4,5,5,}, +{0,2,2,3,4,4,5,5,}, +{0,2,3,3,4,5,5,5,}, +{0,2,3,3,4,5,5,5,}, +{0,2,3,3,4,5,5,5,}, +{0,2,3,3,4,5,5,5,}, +{1,2,3,3,4,5,5,5,}, +{1,2,3,3,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, 
+{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,5,}, +{1,2,3,4,4,5,5,6,}, +{1,2,3,4,5,5,5,6,}, +{1,2,3,4,5,5,5,6,}, +{1,2,3,4,5,5,5,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,5,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,3,4,5,6,6,6,}, +{1,2,4,4,5,6,6,6,}, +{1,2,4,4,5,6,6,6,}, +{1,2,4,4,5,6,6,6,}, +{1,2,4,4,5,6,6,6,}, +{1,2,4,5,5,6,6,6,}, +{1,2,4,5,5,6,6,6,}, +{1,3,4,5,5,6,6,6,}, +{1,3,4,5,5,6,6,6,}, +{1,3,4,5,5,6,6,6,}, +{1,3,4,5,5,6,6,6,}, +{1,3,4,5,5,6,6,6,}, +{1,3,4,5,5,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,6,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,6,7,}, +{1,3,4,5,6,6,7,7,}, +{1,3,4,5,6,6,7,7,}, +{1,3,4,5,6,6,7,7,}, +{1,3,4,5,6,6,7,7,}, +{1,3,4,5,6,6,7,7,}, +{1,3,4,6,6,6,7,7,}, +{1,3,5,6,6,6,7,7,}, +{1,3,5,6,6,6,7,7,}, +{1,3,5,6,6,6,7,7,}, +{2,3,5,6,6,6,7,7,}, +{2,3,5,6,6,6,7,7,}, +{2,3,5,6,6,6,7,7,}, +{2,3,5,6,6,6,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,3,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, 
+{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,6,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,5,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,4,6,6,7,7,7,7,}, +{2,5,6,6,7,7,7,7,}, +{2,5,6,6,7,7,7,7,}, +{2,5,6,6,7,7,7,7,}, +{2,5,6,6,7,7,7,7,}, +{2,5,6,6,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,}, +{2,5,6,7,7,7,7,7,} +}; diff --git a/transcoder/basisu_idct.h b/transcoder/basisu_idct.h new file mode 100644 index 0000000..33b77d0 --- /dev/null +++ b/transcoder/basisu_idct.h @@ -0,0 +1,1446 @@ +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 2, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_2( + 
const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 7.071067691e-01f * v; + s1 += 7.071067691e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 7.071067691e-01f * v; + s1 += -7.071067691e-01f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 3, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_3( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 5.773502588e-01f * v; + s1 += 5.773502588e-01f * v; + s2 += 5.773502588e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 7.071067691e-01f * v; + s2 += -7.071068883e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.082482755e-01f * v; + s1 += -8.164966106e-01f * v; + s2 += 4.082486033e-01f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 4, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_4( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 5.000000000e-01f * v; + s1 += 5.000000000e-01f * v; + s2 += 5.000000000e-01f * v; 
+ s3 += 5.000000000e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 6.532814503e-01f * v; + s1 += 2.705980539e-01f * v; + s2 += -2.705981135e-01f * v; + s3 += -6.532815099e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.999999702e-01f * v; + s1 += -4.999999702e-01f * v; + s2 += -4.999999106e-01f * v; + s3 += 5.000001788e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 2.705980539e-01f * v; + s1 += -6.532814503e-01f * v; + s2 += 6.532815099e-01f * v; + s3 += -2.705983818e-01f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 5, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_5( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 4.472135901e-01f * v; + s1 += 4.472135901e-01f * v; + s2 += 4.472135901e-01f * v; + s3 += 4.472135901e-01f * v; + s4 += 4.472135901e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 6.015009880e-01f * v; + s1 += 3.717480302e-01f * v; + s3 += -3.717481494e-01f * v; + s4 += -6.015009284e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 5.116672516e-01f * v; + s1 += -1.954395324e-01f * v; + s2 += -6.324555278e-01f * v; + s3 += -1.954392791e-01f * v; + s4 += 5.116672516e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 3.717480302e-01f * v; + s1 += -6.015009284e-01f * v; + s3 += 6.015008688e-01f * v; + s4 += 
-3.717483282e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 1.954394877e-01f * v; + s1 += -5.116672516e-01f * v; + s2 += 6.324555278e-01f * v; + s3 += -5.116675496e-01f * v; + s4 += 1.954394132e-01f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 6, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_6( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 4.082483053e-01f * v; + s1 += 4.082483053e-01f * v; + s2 += 4.082483053e-01f * v; + s3 += 4.082483053e-01f * v; + s4 += 4.082483053e-01f * v; + s5 += 4.082483053e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 5.576775074e-01f * v; + s1 += 4.082482755e-01f * v; + s2 += 1.494291872e-01f * v; + s3 += -1.494293064e-01f * v; + s4 += -4.082482755e-01f * v; + s5 += -5.576775670e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.999999702e-01f * v; + s2 += -5.000000596e-01f * v; + s3 += -4.999999106e-01f * v; + s5 += 5.000000596e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 4.082482755e-01f * v; + s1 += -4.082482755e-01f * v; + s2 += -4.082483053e-01f * v; + s3 += 4.082484245e-01f * v; + s4 += 4.082480669e-01f * v; + s5 += -4.082485437e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 2.886750996e-01f * v; + s1 += -5.773502588e-01f * v; + s2 += 2.886753380e-01f * v; + s3 += 
2.886748910e-01f * v; + s4 += -5.773502588e-01f * v; + s5 += 2.886753976e-01f * v; + } + } + + { + float v = src[5 * src_stride]; + if (v != 0.0f) + { + s0 += 1.494291872e-01f * v; + s1 += -4.082483053e-01f * v; + s2 += 5.576775074e-01f * v; + s3 += -5.576776266e-01f * v; + s4 += 4.082483053e-01f * v; + s5 += -1.494295001e-01f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; + dst[5 * dst_stride] = s5; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 7, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_7( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + float s6 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 3.779644668e-01f * v; + s1 += 3.779644668e-01f * v; + s2 += 3.779644668e-01f * v; + s3 += 3.779644668e-01f * v; + s4 += 3.779644668e-01f * v; + s5 += 3.779644668e-01f * v; + s6 += 3.779644668e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 5.211208463e-01f * v; + s1 += 4.179065228e-01f * v; + s2 += 2.319205552e-01f * v; + s4 += -2.319206595e-01f * v; + s5 += -4.179066122e-01f * v; + s6 += -5.211208463e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.815880954e-01f * v; + s1 += 1.189424619e-01f * v; + s2 += -3.332694173e-01f * v; + s3 += -5.345224738e-01f * v; + s4 += -3.332692087e-01f * v; + s5 += 1.189427450e-01f * v; + s6 += 4.815880954e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 4.179065228e-01f * v; + s1 += -2.319206595e-01f * v; + s2 += -5.211208463e-01f * v; + s4 += 
5.211208463e-01f * v; + s5 += 2.319205403e-01f * v; + s6 += -4.179067314e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 3.332692981e-01f * v; + s1 += -4.815880954e-01f * v; + s2 += -1.189422309e-01f * v; + s3 += 5.345224738e-01f * v; + s4 += -1.189426631e-01f * v; + s5 += -4.815878570e-01f * v; + s6 += 3.332692981e-01f * v; + } + } + + { + float v = src[5 * src_stride]; + if (v != 0.0f) + { + s0 += 2.319205552e-01f * v; + s1 += -5.211208463e-01f * v; + s2 += 4.179064631e-01f * v; + s4 += -4.179064035e-01f * v; + s5 += 5.211209059e-01f * v; + s6 += -2.319207191e-01f * v; + } + } + + { + float v = src[6 * src_stride]; + if (v != 0.0f) + { + s0 += 1.189424619e-01f * v; + s1 += -3.332692087e-01f * v; + s2 += 4.815881252e-01f * v; + s3 += -5.345224738e-01f * v; + s4 += 4.815881550e-01f * v; + s5 += -3.332694471e-01f * v; + s6 += 1.189431697e-01f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; + dst[5 * dst_stride] = s5; + dst[6 * dst_stride] = s6; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 8, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_8( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + float s6 = 0.0f; + float s7 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 3.535533845e-01f * v; + s1 += 3.535533845e-01f * v; + s2 += 3.535533845e-01f * v; + s3 += 3.535533845e-01f * v; + s4 += 3.535533845e-01f * v; + s5 += 3.535533845e-01f * v; + s6 += 3.535533845e-01f * v; + s7 += 3.535533845e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 
0.0f) + { + s0 += 4.903926253e-01f * v; + s1 += 4.157347977e-01f * v; + s2 += 2.777850926e-01f * v; + s3 += 9.754511714e-02f * v; + s4 += -9.754516184e-02f * v; + s5 += -2.777851820e-01f * v; + s6 += -4.157348275e-01f * v; + s7 += -4.903926551e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.619397521e-01f * v; + s1 += 1.913417131e-01f * v; + s2 += -1.913417578e-01f * v; + s3 += -4.619398117e-01f * v; + s4 += -4.619397521e-01f * v; + s5 += -1.913415641e-01f * v; + s6 += 1.913418025e-01f * v; + s7 += 4.619397819e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 4.157347977e-01f * v; + s1 += -9.754516184e-02f * v; + s2 += -4.903926551e-01f * v; + s3 += -2.777850032e-01f * v; + s4 += 2.777852118e-01f * v; + s5 += 4.903926253e-01f * v; + s6 += 9.754503518e-02f * v; + s7 += -4.157348871e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 3.535533845e-01f * v; + s1 += -3.535533845e-01f * v; + s2 += -3.535533249e-01f * v; + s3 += 3.535535038e-01f * v; + s4 += 3.535533845e-01f * v; + s5 += -3.535536230e-01f * v; + s6 += -3.535532653e-01f * v; + s7 += 3.535534143e-01f * v; + } + } + + { + float v = src[5 * src_stride]; + if (v != 0.0f) + { + s0 += 2.777850926e-01f * v; + s1 += -4.903926551e-01f * v; + s2 += 9.754520655e-02f * v; + s3 += 4.157346785e-01f * v; + s4 += -4.157348871e-01f * v; + s5 += -9.754510969e-02f * v; + s6 += 4.903926551e-01f * v; + s7 += -2.777854204e-01f * v; + } + } + + { + float v = src[6 * src_stride]; + if (v != 0.0f) + { + s0 += 1.913417131e-01f * v; + s1 += -4.619397521e-01f * v; + s2 += 4.619397819e-01f * v; + s3 += -1.913419515e-01f * v; + s4 += -1.913414896e-01f * v; + s5 += 4.619396627e-01f * v; + s6 += -4.619398713e-01f * v; + s7 += 1.913419515e-01f * v; + } + } + + { + float v = src[7 * src_stride]; + if (v != 0.0f) + { + s0 += 9.754511714e-02f * v; + s1 += -2.777850032e-01f * v; + s2 += 4.157346785e-01f * v; + s3 += -4.903925955e-01f 
* v; + s4 += 4.903927147e-01f * v; + s5 += -4.157347977e-01f * v; + s6 += 2.777855694e-01f * v; + s7 += -9.754577279e-02f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; + dst[5 * dst_stride] = s5; + dst[6 * dst_stride] = s6; + dst[7 * dst_stride] = s7; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 9, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_9( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + float s6 = 0.0f; + float s7 = 0.0f; + float s8 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 3.333333433e-01f * v; + s1 += 3.333333433e-01f * v; + s2 += 3.333333433e-01f * v; + s3 += 3.333333433e-01f * v; + s4 += 3.333333433e-01f * v; + s5 += 3.333333433e-01f * v; + s6 += 3.333333433e-01f * v; + s7 += 3.333333433e-01f * v; + s8 += 3.333333433e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 4.642428160e-01f * v; + s1 += 4.082482755e-01f * v; + s2 += 3.030129671e-01f * v; + s3 += 1.612297893e-01f * v; + s5 += -1.612298936e-01f * v; + s6 += -3.030129969e-01f * v; + s7 += -4.082482755e-01f * v; + s8 += -4.642428458e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.429753423e-01f * v; + s1 += 2.357022464e-01f * v; + s2 += -8.185859025e-02f * v; + s3 += -3.611168861e-01f * v; + s4 += -4.714045227e-01f * v; + s5 += -3.611167669e-01f * v; + s6 += -8.185851574e-02f * v; + s7 += 2.357022166e-01f * v; + s8 += 4.429753721e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 4.082482755e-01f * v; + 
s2 += -4.082482755e-01f * v; + s3 += -4.082482159e-01f * v; + s5 += 4.082483649e-01f * v; + s6 += 4.082482755e-01f * v; + s8 += -4.082485437e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 3.611168265e-01f * v; + s1 += -2.357022911e-01f * v; + s2 += -4.429753125e-01f * v; + s3 += 8.185874671e-02f * v; + s4 += 4.714045227e-01f * v; + s5 += 8.185835928e-02f * v; + s6 += -4.429753721e-01f * v; + s7 += -2.357023507e-01f * v; + s8 += 3.611169457e-01f * v; + } + } + + { + float v = src[5 * src_stride]; + if (v != 0.0f) + { + s0 += 3.030129671e-01f * v; + s1 += -4.082482755e-01f * v; + s2 += -1.612298042e-01f * v; + s3 += 4.642428458e-01f * v; + s5 += -4.642428160e-01f * v; + s6 += 1.612296849e-01f * v; + s7 += 4.082482159e-01f * v; + s8 += -3.030129373e-01f * v; + } + } + + { + float v = src[6 * src_stride]; + if (v != 0.0f) + { + s0 += 2.357022464e-01f * v; + s1 += -4.714045227e-01f * v; + s2 += 2.357022166e-01f * v; + s3 += 2.357020825e-01f * v; + s4 += -4.714045227e-01f * v; + s5 += 2.357024848e-01f * v; + s6 += 2.357022017e-01f * v; + s7 += -4.714045227e-01f * v; + s8 += 2.357031256e-01f * v; + } + } + + { + float v = src[7 * src_stride]; + if (v != 0.0f) + { + s0 += 1.612297893e-01f * v; + s1 += -4.082482159e-01f * v; + s2 += 4.642428458e-01f * v; + s3 += -3.030130565e-01f * v; + s5 += 3.030129075e-01f * v; + s6 += -4.642427862e-01f * v; + s7 += 4.082485735e-01f * v; + s8 += -1.612301171e-01f * v; + } + } + + { + float v = src[8 * src_stride]; + if (v != 0.0f) + { + s0 += 8.185850084e-02f * v; + s1 += -2.357022166e-01f * v; + s2 += 3.611166775e-01f * v; + s3 += -4.429752231e-01f * v; + s4 += 4.714045227e-01f * v; + s5 += -4.429754615e-01f * v; + s6 += 3.611168563e-01f * v; + s7 += -2.357021123e-01f * v; + s8 += 8.185899258e-02f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; + dst[5 * dst_stride] = s5; + dst[6 * dst_stride] 
= s6; + dst[7 * dst_stride] = s7; + dst[8 * dst_stride] = s8; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 10, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_10( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + float s6 = 0.0f; + float s7 = 0.0f; + float s8 = 0.0f; + float s9 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 3.162277639e-01f * v; + s1 += 3.162277639e-01f * v; + s2 += 3.162277639e-01f * v; + s3 += 3.162277639e-01f * v; + s4 += 3.162277639e-01f * v; + s5 += 3.162277639e-01f * v; + s6 += 3.162277639e-01f * v; + s7 += 3.162277639e-01f * v; + s8 += 3.162277639e-01f * v; + s9 += 3.162277639e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 4.417076707e-01f * v; + s1 += 3.984702229e-01f * v; + s2 += 3.162277639e-01f * v; + s3 += 2.030306906e-01f * v; + s4 += 6.995963305e-02f * v; + s5 += -6.995966285e-02f * v; + s6 += -2.030307651e-01f * v; + s7 += -3.162277639e-01f * v; + s8 += -3.984702528e-01f * v; + s9 += -4.417076707e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.253254235e-01f * v; + s1 += 2.628655434e-01f * v; + s3 += -2.628656328e-01f * v; + s4 += -4.253253937e-01f * v; + s5 += -4.253253639e-01f * v; + s6 += -2.628654838e-01f * v; + s8 += 2.628656626e-01f * v; + s9 += 4.253254235e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 3.984702229e-01f * v; + s1 += 6.995963305e-02f * v; + s2 += -3.162277639e-01f * v; + s3 += -4.417076409e-01f * v; + s4 += -2.030306011e-01f * v; + s5 += 2.030307949e-01f * v; + s6 += 4.417076707e-01f * v; + s7 += 3.162277639e-01f * v; + s8 += 
-6.995979697e-02f * v; + s9 += -3.984701931e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 3.618034124e-01f * v; + s1 += -1.381966174e-01f * v; + s2 += -4.472135901e-01f * v; + s3 += -1.381964386e-01f * v; + s4 += 3.618033826e-01f * v; + s5 += 3.618032932e-01f * v; + s6 += -1.381967962e-01f * v; + s7 += -4.472135901e-01f * v; + s8 += -1.381963789e-01f * v; + s9 += 3.618034124e-01f * v; + } + } + + { + float v = src[5 * src_stride]; + if (v != 0.0f) + { + s0 += 3.162277639e-01f * v; + s1 += -3.162277639e-01f * v; + s2 += -3.162277043e-01f * v; + s3 += 3.162278533e-01f * v; + s4 += 3.162277639e-01f * v; + s5 += -3.162276745e-01f * v; + s6 += -3.162276447e-01f * v; + s7 += 3.162280619e-01f * v; + s8 += 3.162278533e-01f * v; + s9 += -3.162281811e-01f * v; + } + } + + { + float v = src[6 * src_stride]; + if (v != 0.0f) + { + s0 += 2.628655434e-01f * v; + s1 += -4.253253937e-01f * v; + s3 += 4.253253639e-01f * v; + s4 += -2.628657520e-01f * v; + s5 += -2.628654242e-01f * v; + s6 += 4.253254235e-01f * v; + s8 += -4.253252745e-01f * v; + s9 += 2.628654540e-01f * v; + } + } + + { + float v = src[7 * src_stride]; + if (v != 0.0f) + { + s0 += 2.030306906e-01f * v; + s1 += -4.417076409e-01f * v; + s2 += 3.162278533e-01f * v; + s3 += 6.995949894e-02f * v; + s4 += -3.984701633e-01f * v; + s5 += 3.984702528e-01f * v; + s6 += -6.996008009e-02f * v; + s7 += -3.162274361e-01f * v; + s8 += 4.417077899e-01f * v; + s9 += -2.030310780e-01f * v; + } + } + + { + float v = src[8 * src_stride]; + if (v != 0.0f) + { + s0 += 1.381965876e-01f * v; + s1 += -3.618033826e-01f * v; + s2 += 4.472135901e-01f * v; + s3 += -3.618035913e-01f * v; + s4 += 1.381965429e-01f * v; + s5 += 1.381962299e-01f * v; + s6 += -3.618031442e-01f * v; + s7 += 4.472135901e-01f * v; + s8 += -3.618036509e-01f * v; + s9 += 1.381966770e-01f * v; + } + } + + { + float v = src[9 * src_stride]; + if (v != 0.0f) + { + s0 += 6.995963305e-02f * v; + s1 += -2.030306011e-01f * v; + s2 += 
3.162277639e-01f * v; + s3 += -3.984701633e-01f * v; + s4 += 4.417076409e-01f * v; + s5 += -4.417076409e-01f * v; + s6 += 3.984701931e-01f * v; + s7 += -3.162280619e-01f * v; + s8 += 2.030308247e-01f * v; + s9 += -6.995939463e-02f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; + dst[5 * dst_stride] = s5; + dst[6 * dst_stride] = s6; + dst[7 * dst_stride] = s7; + dst[8 * dst_stride] = s8; + dst[9 * dst_stride] = s9; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 11, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = sqrt(1/N), alpha(k>0) = sqrt(2/N) +static inline void idct_1d_11( + const float* src, int src_stride, + float* dst, int dst_stride) +{ + float s0 = 0.0f; + float s1 = 0.0f; + float s2 = 0.0f; + float s3 = 0.0f; + float s4 = 0.0f; + float s5 = 0.0f; + float s6 = 0.0f; + float s7 = 0.0f; + float s8 = 0.0f; + float s9 = 0.0f; + float s10 = 0.0f; + + { + float v = src[0 * src_stride]; + if (v != 0.0f) + { + s0 += 3.015113473e-01f * v; + s1 += 3.015113473e-01f * v; + s2 += 3.015113473e-01f * v; + s3 += 3.015113473e-01f * v; + s4 += 3.015113473e-01f * v; + s5 += 3.015113473e-01f * v; + s6 += 3.015113473e-01f * v; + s7 += 3.015113473e-01f * v; + s8 += 3.015113473e-01f * v; + s9 += 3.015113473e-01f * v; + s10 += 3.015113473e-01f * v; + } + } + + { + float v = src[1 * src_stride]; + if (v != 0.0f) + { + s0 += 4.220612943e-01f * v; + s1 += 3.878683746e-01f * v; + s2 += 3.222526908e-01f * v; + s3 += 2.305300087e-01f * v; + s4 += 1.201311573e-01f * v; + s6 += -1.201311946e-01f * v; + s7 += -2.305300087e-01f * v; + s8 += -3.222527206e-01f * v; + s9 += -3.878683746e-01f * v; + s10 += -4.220612943e-01f * v; + } + } + + { + float v = src[2 * src_stride]; + if (v != 0.0f) + { + s0 += 4.091291726e-01f * v; + s1 += 
2.792335451e-01f * v; + s2 += 6.068321317e-02f * v; + s3 += -1.771336049e-01f * v; + s4 += -3.587117195e-01f * v; + s5 += -4.264014363e-01f * v; + s6 += -3.587116897e-01f * v; + s7 += -1.771335900e-01f * v; + s8 += 6.068333238e-02f * v; + s9 += 2.792335451e-01f * v; + s10 += 4.091292024e-01f * v; + } + } + + { + float v = src[3 * src_stride]; + if (v != 0.0f) + { + s0 += 3.878683746e-01f * v; + s1 += 1.201311573e-01f * v; + s2 += -2.305300087e-01f * v; + s3 += -4.220612943e-01f * v; + s4 += -3.222526908e-01f * v; + s6 += 3.222527504e-01f * v; + s7 += 4.220612645e-01f * v; + s8 += 2.305298299e-01f * v; + s9 += -1.201310679e-01f * v; + s10 += -3.878685534e-01f * v; + } + } + + { + float v = src[4 * src_stride]; + if (v != 0.0f) + { + s0 += 3.587117195e-01f * v; + s1 += -6.068325043e-02f * v; + s2 += -4.091292024e-01f * v; + s3 += -2.792334855e-01f * v; + s4 += 1.771336049e-01f * v; + s5 += 4.264014363e-01f * v; + s6 += 1.771334559e-01f * v; + s7 += -2.792335153e-01f * v; + s8 += -4.091291428e-01f * v; + s9 += -6.068325043e-02f * v; + s10 += 3.587118387e-01f * v; + } + } + + { + float v = src[5 * src_stride]; + if (v != 0.0f) + { + s0 += 3.222526908e-01f * v; + s1 += -2.305300087e-01f * v; + s2 += -3.878683448e-01f * v; + s3 += 1.201313213e-01f * v; + s4 += 4.220612645e-01f * v; + s6 += -4.220612943e-01f * v; + s7 += -1.201310530e-01f * v; + s8 += 3.878682852e-01f * v; + s9 += 2.305295914e-01f * v; + s10 += -3.222530484e-01f * v; + } + } + + { + float v = src[6 * src_stride]; + if (v != 0.0f) + { + s0 += 2.792335451e-01f * v; + s1 += -3.587117195e-01f * v; + s2 += -1.771335900e-01f * v; + s3 += 4.091292024e-01f * v; + s4 += 6.068318710e-02f * v; + s5 += -4.264014363e-01f * v; + s6 += 6.068341061e-02f * v; + s7 += 4.091290832e-01f * v; + s8 += -1.771339774e-01f * v; + s9 += -3.587118387e-01f * v; + s10 += 2.792341411e-01f * v; + } + } + + { + float v = src[7 * src_stride]; + if (v != 0.0f) + { + s0 += 2.305300087e-01f * v; + s1 += -4.220612943e-01f * v; + s2 += 
1.201313213e-01f * v; + s3 += 3.222525418e-01f * v; + s4 += -3.878685534e-01f * v; + s6 += 3.878683150e-01f * v; + s7 += -3.222530484e-01f * v; + s8 += -1.201303899e-01f * v; + s9 += 4.220611751e-01f * v; + s10 += -2.305305302e-01f * v; + } + } + + { + float v = src[8 * src_stride]; + if (v != 0.0f) + { + s0 += 1.771335304e-01f * v; + s1 += -4.091291726e-01f * v; + s2 += 3.587118089e-01f * v; + s3 += -6.068347394e-02f * v; + s4 += -2.792334855e-01f * v; + s5 += 4.264014363e-01f * v; + s6 += -2.792337239e-01f * v; + s7 += -6.068337709e-02f * v; + s8 += 3.587115407e-01f * v; + s9 += -4.091291726e-01f * v; + s10 += 1.771339774e-01f * v; + } + } + + { + float v = src[9 * src_stride]; + if (v != 0.0f) + { + s0 += 1.201311573e-01f * v; + s1 += -3.222526908e-01f * v; + s2 += 4.220612645e-01f * v; + s3 += -3.878685534e-01f * v; + s4 += 2.305301726e-01f * v; + s6 += -2.305298299e-01f * v; + s7 += 3.878681958e-01f * v; + s8 += -4.220613837e-01f * v; + s9 += 3.222527504e-01f * v; + s10 += -1.201314703e-01f * v; + } + } + + { + float v = src[10 * src_stride]; + if (v != 0.0f) + { + s0 += 6.068321317e-02f * v; + s1 += -1.771335900e-01f * v; + s2 += 2.792334557e-01f * v; + s3 += -3.587115407e-01f * v; + s4 += 4.091290832e-01f * v; + s5 += -4.264014363e-01f * v; + s6 += 4.091292620e-01f * v; + s7 += -3.587118387e-01f * v; + s8 += 2.792330980e-01f * v; + s9 += -1.771344692e-01f * v; + s10 += 6.068423390e-02f * v; + } + } + + dst[0 * dst_stride] = s0; + dst[1 * dst_stride] = s1; + dst[2 * dst_stride] = s2; + dst[3 * dst_stride] = s3; + dst[4 * dst_stride] = s4; + dst[5 * dst_stride] = s5; + dst[6 * dst_stride] = s6; + dst[7 * dst_stride] = s7; + dst[8 * dst_stride] = s8; + dst[9 * dst_stride] = s9; + dst[10 * dst_stride] = s10; +} + +// ------------------------------------------------------------ +// 1D ORTHONORMAL IDCT (DCT-III), SIZE 12, FLOAT +// out[x*dst_stride] = sum_k C[k][x] * src[k*src_stride] +// C[k][x] = alpha(k) * cos(pi * (2*x+1) * k / (2*N)), +// alpha(0) = 
// sqrt(1/N), alpha(k>0) = sqrt(2/N)
//
// idct_1d_12: 12-point inverse DCT over strided float data.
//   src, src_stride: the 12 input coefficients, read from src[k * src_stride] for k = 0..11.
//   dst, dst_stride: the 12 output samples, written to dst[x * dst_stride] for x = 0..11.
// An input coefficient that compares equal to 0.0f contributes nothing and is skipped outright.
// All 12 inputs are read before the first output is stored.
static inline void idct_1d_12(
	const float* src, int src_stride,
	float* dst, int dst_stride)
{
	enum { cN = 12 };

	// s_basis[k][x] is the weight that input coefficient k contributes to output sample x.
	// A 0.0f entry marks a term that is left out of the accumulation altogether (row 4 only),
	// so a non-finite input in that row never reaches those outputs.
	static const float s_basis[cN][cN] =
	{
		{ 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f, 2.886751294e-01f },
		{ 4.047556818e-01f, 3.771722317e-01f, 3.238851428e-01f, 2.485257983e-01f, 1.562298536e-01f, 5.328707024e-02f, -5.328710750e-02f, -1.562298536e-01f, -2.485258281e-01f, -3.238851428e-01f, -3.771722913e-01f, -4.047556818e-01f },
		{ 3.943375647e-01f, 2.886751294e-01f, 1.056623980e-01f, -1.056624874e-01f, -2.886751294e-01f, -3.943375945e-01f, -3.943375647e-01f, -2.886751592e-01f, -1.056624129e-01f, 1.056624204e-01f, 2.886752486e-01f, 3.943375647e-01f },
		{ 3.771722317e-01f, 1.562298536e-01f, -1.562298536e-01f, -3.771722913e-01f, -3.771722317e-01f, -1.562297344e-01f, 1.562299281e-01f, 3.771722615e-01f, 3.771722019e-01f, 1.562297940e-01f, -1.562300622e-01f, -3.771722317e-01f },
		{ 3.535533845e-01f, 0.0f, -3.535534441e-01f, -3.535533547e-01f, 0.0f, 3.535534739e-01f, 3.535533845e-01f, 0.0f, -3.535534143e-01f, -3.535534143e-01f, 0.0f, 3.535532951e-01f },
		{ 3.238851428e-01f, -1.562298536e-01f, -4.047556818e-01f, -5.328698456e-02f, 3.771722615e-01f, 2.485257536e-01f, -2.485258281e-01f, -3.771722317e-01f, 5.328687653e-02f, 4.047557116e-01f, 1.562295407e-01f, -3.238854110e-01f },
		{ 2.886751294e-01f, -2.886751294e-01f, -2.886751592e-01f, 2.886752486e-01f, 2.886749804e-01f, -2.886753380e-01f, -2.886750400e-01f, 2.886751592e-01f, 2.886749506e-01f, -2.886752486e-01f, -2.886748612e-01f, 2.886750698e-01f },
		{ 2.485257983e-01f, -3.771722913e-01f, -5.328698456e-02f, 4.047556818e-01f, -1.562300622e-01f, -3.238852322e-01f, 3.238853514e-01f, 1.562295407e-01f, -4.047557414e-01f, 5.328752100e-02f, 3.771720827e-01f, -2.485256344e-01f },
		{ 2.041241378e-01f, -4.082483053e-01f, 2.041243017e-01f, 2.041239887e-01f, -4.082483053e-01f, 2.041243464e-01f, 2.041241080e-01f, -4.082483053e-01f, 2.041242570e-01f, 2.041241974e-01f, -4.082483053e-01f, 2.041237950e-01f },
		{ 1.562298536e-01f, -3.771722317e-01f, 3.771722615e-01f, -1.562300622e-01f, -1.562296748e-01f, 3.771723211e-01f, -3.771723509e-01f, 1.562300622e-01f, 1.562293023e-01f, -3.771721721e-01f, 3.771724999e-01f, -1.562300622e-01f },
		{ 1.056623980e-01f, -2.886751592e-01f, 3.943375647e-01f, -3.943376541e-01f, 2.886751592e-01f, -1.056626216e-01f, -1.056624576e-01f, 2.886750400e-01f, -3.943376839e-01f, 3.943377137e-01f, -2.886756361e-01f, 1.056632623e-01f },
		{ 5.328707024e-02f, -1.562297344e-01f, 2.485257536e-01f, -3.238852322e-01f, 3.771723211e-01f, -4.047556818e-01f, 4.047556818e-01f, -3.771722913e-01f, 3.238852024e-01f, -2.485264540e-01f, 1.562305540e-01f, -5.328702182e-02f }
	};

	// Running sums for the 12 output samples; kept local so src and dst may alias.
	float acc[cN] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };

	for (int k = 0; k < cN; k++)
	{
		const float coeff = src[k * src_stride];
		if (coeff == 0.0f)
			continue;

		const float* pRow = s_basis[k];
		for (int x = 0; x < cN; x++)
		{
			const float wt = pRow[x];
			if (wt != 0.0f)
				acc[x] += wt * coeff;
		}
	}

	for (int x = 0; x < cN; x++)
		dst[x * dst_stride] = acc[x];
}

// Patch metadata that followed this function on the same physical line, kept verbatim:
// diff --git a/webgl/encoder/build_notes.txt b/webgl/encoder/build_notes.txt new file mode 100644 index 0000000..2db46f9 --- /dev/null +++ b/webgl/encoder/build_notes.txt @@ -0,0 +1,26
@@ +# Prereq: activate emsdk first (so emcmake/em++ are on PATH) +# Linux/macOS: +source /path/to/emsdk/emsdk_env.sh +# Windows PowerShell: +# & "C:\path\to\emsdk\emsdk_env.ps1" + +# ===== Release (fast; same behavior as your original file) ===== +emcmake cmake -S . -B build-release -DCMAKE_BUILD_TYPE=Release +cmake --build build-release -j + +# ===== Debug (symbols + assertions) ===== +emcmake cmake -S . -B build-debug -DCMAKE_BUILD_TYPE=Debug +cmake --build build-debug -j + +# ===== SAN (ASan + UBSan; great for catching bugs) ===== +emcmake cmake -S . -B build-san -DCMAKE_BUILD_TYPE=SAN +cmake --build build-san -j + +# Build a single target (optional) instead of all three: +cmake --build build-release -j --target basis_encoder.js +cmake --build build-release -j --target basis_encoder_threads.js +cmake --build build-release -j --target basis_encoder_threads_wasm64.js + +# Toggle Zstd (OFF = smaller binary, no KTX2 Zstd compression) +emcmake cmake -S . -B build-release -DCMAKE_BUILD_TYPE=Release -DKTX2_ZSTANDARD=OFF +cmake --build build-release -j diff --git a/webgl/texture_test/assets/base.basis b/webgl/texture_test/assets/base.basis new file mode 100644 index 0000000..e7e102e Binary files /dev/null and b/webgl/texture_test/assets/base.basis differ diff --git a/zstd/zstd_errors.h b/zstd/zstd_errors.h new file mode 100644 index 0000000..8ebc95c --- /dev/null +++ b/zstd/zstd_errors.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
 */

/* zstd error-code header: defines the symbol visibility/export macros used for the
 * error API, the ZSTD_ErrorCode enumeration, and declares ZSTD_getErrorString(). */

/* Include guard. */
#ifndef ZSTD_ERRORS_H_398273423
#define ZSTD_ERRORS_H_398273423

/* Give the declarations below C linkage when this header is compiled as C++. */
#if defined (__cplusplus)
extern "C" {
#endif

/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */
/* ZSTDERRORLIB_VISIBLE is left alone if already defined. Otherwise it becomes, in order of
 * preference: the value of the older ZSTDERRORLIB_VISIBILITY macro, the "default" visibility
 * attribute on GCC-compatible compilers (version >= 4, not MinGW), or nothing. */
#ifndef ZSTDERRORLIB_VISIBLE
   /* Backwards compatibility with old macro name */
#  ifdef ZSTDERRORLIB_VISIBILITY
#    define ZSTDERRORLIB_VISIBLE ZSTDERRORLIB_VISIBILITY
#  elif defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
#    define ZSTDERRORLIB_VISIBLE __attribute__ ((visibility ("default")))
#  else
#    define ZSTDERRORLIB_VISIBLE
#  endif
#endif

/* ZSTDERRORLIB_HIDDEN: the "hidden" visibility attribute under the same compiler test as
 * above, otherwise empty. Left alone if already defined. */
#ifndef ZSTDERRORLIB_HIDDEN
#  if defined(__GNUC__) && (__GNUC__ >= 4) && !defined(__MINGW32__)
#    define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden")))
#  else
#    define ZSTDERRORLIB_HIDDEN
#  endif
#endif

/* ZSTDERRORLIB_API: adds __declspec(dllexport) when ZSTD_DLL_EXPORT is 1, __declspec(dllimport)
 * when ZSTD_DLL_IMPORT is 1, and otherwise expands to ZSTDERRORLIB_VISIBLE alone. */
#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBLE
#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBLE /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
#else
#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE
#endif

/*-*********************************************
 *  Error codes list
 *-*********************************************
 *  Error codes _values_ are pinned down since v1.3.1 only.
 *  Therefore, don't rely on values if you may link to any version < v1.3.1.
 *
 *  Only values < 100 are considered stable.
 *
 *  note 1 : this API shall be used with static linking only.
 *           dynamic linking is not yet officially supported.
 *  note 2 : Prefer relying on the enum than on its value whenever possible
 *           This is the only supported way to use the error list < v1.3.1
 *  note 3 : ZSTD_isError() is always correct, whatever the library version.
 **********************************************/
/* Every enumerator below carries an explicit value; the numbering is not contiguous. */
typedef enum {
  ZSTD_error_no_error = 0,
  ZSTD_error_GENERIC = 1,
  ZSTD_error_prefix_unknown = 10,
  ZSTD_error_version_unsupported = 12,
  ZSTD_error_frameParameter_unsupported = 14,
  ZSTD_error_frameParameter_windowTooLarge = 16,
  ZSTD_error_corruption_detected = 20,
  ZSTD_error_checksum_wrong = 22,
  ZSTD_error_literals_headerWrong = 24,
  ZSTD_error_dictionary_corrupted = 30,
  ZSTD_error_dictionary_wrong = 32,
  ZSTD_error_dictionaryCreation_failed = 34,
  ZSTD_error_parameter_unsupported = 40,
  ZSTD_error_parameter_combination_unsupported = 41,
  ZSTD_error_parameter_outOfBound = 42,
  ZSTD_error_tableLog_tooLarge = 44,
  ZSTD_error_maxSymbolValue_tooLarge = 46,
  ZSTD_error_maxSymbolValue_tooSmall = 48,
  ZSTD_error_cannotProduce_uncompressedBlock = 49,
  ZSTD_error_stabilityCondition_notRespected = 50,
  ZSTD_error_stage_wrong = 60,
  ZSTD_error_init_missing = 62,
  ZSTD_error_memory_allocation = 64,
  ZSTD_error_workSpace_tooSmall= 66,
  ZSTD_error_dstSize_tooSmall = 70,
  ZSTD_error_srcSize_wrong = 72,
  ZSTD_error_dstBuffer_null = 74,
  ZSTD_error_noForwardProgress_destFull = 80,
  ZSTD_error_noForwardProgress_inputEmpty = 82,
  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
  ZSTD_error_frameIndex_tooLarge = 100,
  ZSTD_error_seekableIO = 102,
  ZSTD_error_dstBuffer_wrong = 104,
  ZSTD_error_srcBuffer_wrong = 105,
  ZSTD_error_sequenceProducer_failed = 106,
  ZSTD_error_externalSequences_invalid = 107,
  ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
} ZSTD_ErrorCode;

ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */


/* Close the extern "C" block opened near the top of this header. */
#if defined (__cplusplus)
}
#endif

#endif /* ZSTD_ERRORS_H_398273423 */