mirror of
https://github.com/BinomialLLC/basis_universal.git
synced 2026-06-08 08:33:53 +00:00
11066 lines
376 KiB
C++
11066 lines
376 KiB
C++
// File: basisu_astc_ldr_encode.cpp
// Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "basisu_enc.h"
|
||
#include "basisu_astc_ldr_encode.h"
|
||
#include "basisu_astc_hdr_common.h"
|
||
#include "basisu_astc_ldr_common.h"
|
||
#include "3rdparty/android_astc_decomp.h"
|
||
#include <queue>
|
||
|
||
#include "../zstd/zstd.h"
|
||
|
||
namespace basisu {
|
||
namespace astc_ldr {
|
||
|
||
// File-level configuration switches and limits for the ASTC LDR encoder.
// NOTE(review): the meaning of each value is inferred from its name only; confirm against the code that reads it.

// Development-time switches (both enabled here).
const bool g_devel_messages = true;
const bool ASTC_LDR_CONSISTENCY_CHECKING = true;

// Zero-initialized at program start; presumably set by the encoder's init routine - TODO confirm.
bool g_initialized;

// Expected hash container sizes (presumably used to pre-size hash tables - verify at the use sites).
const uint32_t EXPECTED_SUPERBUCKET_HASH_SIZE = 8192;
const uint32_t EXPECTED_SHORTLIST_HASH_SIZE = 4096;

// Upper bounds on the number of base partition patterns for 2 and 3 subsets.
const uint32_t MAX_BASE_PARTS2 = 128;
const uint32_t MAX_BASE_PARTS3 = 128;

const uint32_t PART_ESTIMATE_STAGE1_MULTIPLIER = 4;

// Maximum supported dimensions.
const uint32_t MAX_WIDTH = 65535;
const uint32_t MAX_HEIGHT = 65535;
|
||
|
||
// Codes one weight plane of a logical ASTC block as a quantized DCT of its (mean-removed) dequantized weight grid.
//
// The symbols are written to the bit coder `c` and mirrored into `syms` (which is cleared first):
//   - the scaled, rounded mean weight (DC), via put_truncated_binary();
//   - for each non-zero AC coefficient visited in zigzag order: the preceding zero-run (put_rice with
//     gw_dct.m_zero_run), one sign bit, then the magnitude (put_rice with gw_dct.m_coeff);
//   - a final zero-run if the scan ends on zeros (recorded in `syms` with m_coeff == INT16_MAX).
//
// gw_dct      - DCT helper; must contain an entry for this block's grid dimensions (asserted).
// q           - quality/quantization control, must be > 0 (asserted).
// plane_index - which weight plane of `log_blk` to code (weights are interleaved per plane).
// log_blk     - source logical block (grid size, weight ISE range, ISE weight symbols).
// pGrid_data  - grid data for this block; only m_weight_gamma is read here.
void code_block_weights(
    basist::astc_ldr_t::grid_weight_dct &gw_dct,
    float q, uint32_t plane_index,
    const astc_helpers::log_astc_block& log_blk,
    const basist::astc_ldr_t::astc_block_grid_data* pGrid_data,
    basisu::bitwise_coder& c,
    basist::astc_ldr_t::dct_syms& syms)
{
    assert(q > 0.0f);
    assert(pGrid_data);

    syms.clear();

    const uint32_t grid_width = log_blk.m_grid_width, grid_height = log_blk.m_grid_height;
    const uint32_t total_grid_samples = grid_width * grid_height;
    const uint32_t num_planes = log_blk.m_dual_plane ? 2 : 1;

    auto grid_dim_vals_iter = gw_dct.m_grid_dim_key_vals.find(basist::astc_ldr_t::grid_dim_key(grid_width, grid_height));
    assert(grid_dim_vals_iter != gw_dct.m_grid_dim_key_vals.end());

    auto& grid_dim_vals = grid_dim_vals_iter->second;

    // Dequantize this plane's ISE weight symbols and accumulate their sum (in linear grid order).
    const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range).m_ISE_to_val;

    float orig_weights[astc_helpers::MAX_BLOCK_PIXELS];
    float weight_sum = 0;

    for (uint32_t i = 0; i < total_grid_samples; i++)
    {
        const uint8_t raw_weight = dequant_tab[log_blk.m_weights[i * num_planes + plane_index]];

        orig_weights[i] = raw_weight;
        weight_sum += orig_weights[i];
    }

    // The DC (mean) is coded at a coarser scale for the low weight ranges.
    float scaled_weight_coding_scale = basist::astc_ldr_t::SCALED_WEIGHT_BASE_CODING_SCALE;
    if (log_blk.m_weight_ise_range <= astc_helpers::BISE_8_LEVELS)
        scaled_weight_coding_scale = 1.0f / 8.0f;

    const float max_scaled_mean_weight = 64.0f * scaled_weight_coding_scale;
    const uint32_t num_dc_levels = (uint32_t)max_scaled_mean_weight + 1;

    float scaled_mean_weight = std::round(scaled_weight_coding_scale * (weight_sum / total_grid_samples));
    scaled_mean_weight = basisu::clamp<float>(scaled_mean_weight, 0.0f, max_scaled_mean_weight);

    // Remove the *quantized* mean so encoder and decoder agree on the DC that the AC terms are relative to.
    const float mean_weight = scaled_mean_weight / scaled_weight_coding_scale;

    for (uint32_t i = 0; i < total_grid_samples; i++)
        orig_weights[i] -= mean_weight;

    const float span_len = gw_dct.get_max_span_len(log_blk, plane_index);

    float dct_weights[astc_helpers::MAX_BLOCK_PIXELS];

    // TODO - temp alloc
    basist::astc_ldr_t::fvec dct_work;
    grid_dim_vals.m_dct.forward(orig_weights, dct_weights, dct_work);

    const float level_scale = gw_dct.compute_level_scale(q, span_len, pGrid_data->m_weight_gamma, grid_width, grid_height, log_blk.m_weight_ise_range);

    int dct_quant_tab[astc_helpers::MAX_BLOCK_PIXELS];
    gw_dct.compute_quant_table(q, grid_width, grid_height, level_scale, dct_quant_tab);

#if defined(DEBUG) || defined(_DEBUG)
    // sanity checking
    basist::astc_ldr_t::sample_quant_table_state quant_state;
    quant_state.init(q, gw_dct.m_block_width, gw_dct.m_block_height, level_scale);
#endif

    // DC symbol
    const int dc_sym = (int)scaled_mean_weight;

    c.put_truncated_binary(dc_sym, num_dc_levels);

    syms.m_dc_sym = dc_sym;
    syms.m_num_dc_levels = num_dc_levels;
    assert(syms.m_num_dc_levels == gw_dct.get_num_weight_dc_levels(log_blk.m_weight_ise_range));

    // Quantize the AC coefficients. The DC slot is forced to 0 (it is carried by the mean above).
    int dct_coeffs[astc_helpers::MAX_BLOCK_PIXELS];

    for (uint32_t y = 0; y < grid_height; y++)
    {
        for (uint32_t x = 0; x < grid_width; x++)
        {
            const uint32_t ofs = x + y * grid_width;

            if (!ofs)
            {
                dct_coeffs[0] = 0;
                continue;
            }

            const int levels = dct_quant_tab[ofs];

#if defined(DEBUG) || defined(_DEBUG)
            // sanity checking
            assert(levels == gw_dct.sample_quant_table(quant_state, x, y));
#endif

            dct_coeffs[ofs] = gw_dct.quantize_deadzone(dct_weights[ofs], levels, basist::astc_ldr_t::DEADZONE_ALPHA, x, y);
        } // x
    } // y

    // Emit (zero run, sign, magnitude) for each non-zero coefficient in zigzag order.
    const basisu::int_vec& zigzag = grid_dim_vals.m_zigzag;
    assert(zigzag.size() == total_grid_samples);

    int total_zeros = 0;
    for (uint32_t i = 0; i < total_grid_samples; i++)
    {
        const uint32_t dct_idx = zigzag[i];
        if (!dct_idx)
            continue; // DC, already coded

        int coeff = dct_coeffs[dct_idx];
        if (!coeff)
        {
            total_zeros++;
            continue;
        }

        basist::astc_ldr_t::dct_syms::coeff cf;
        cf.m_num_zeros = basisu::safe_cast_uint16(total_zeros);
        cf.m_coeff = basisu::safe_cast_int16(coeff);
        syms.m_coeffs.push_back(cf);
        syms.m_max_coeff_mag = basisu::maximum(syms.m_max_coeff_mag, basisu::iabs(coeff));
        syms.m_max_zigzag_index = basisu::maximum(syms.m_max_zigzag_index, i);

        c.put_rice(total_zeros, gw_dct.m_zero_run);
        total_zeros = 0;

        c.put_bits(coeff < 0 ? 1 : 0, 1);

        if (coeff < 0)
            coeff = -coeff;

        c.put_rice(coeff, gw_dct.m_coeff);
    }

    // Trailing zeros: record the run with a sentinel coefficient, and code only the run length.
    if (total_zeros)
    {
        basist::astc_ldr_t::dct_syms::coeff cf;
        cf.m_num_zeros = basisu::safe_cast_uint16(total_zeros);
        cf.m_coeff = INT16_MAX;
        syms.m_coeffs.push_back(cf);

        c.put_rice(total_zeros, gw_dct.m_zero_run);
    }
}
|
||
|
||
void astc_ldr_requantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_ise_vals, uint32_t to_ise_range)
|
||
{
|
||
if (from_ise_range == to_ise_range)
|
||
{
|
||
if (pDst_ise_vals != pSrc_ise_vals)
|
||
memcpy(pDst_ise_vals, pSrc_ise_vals, n);
|
||
return;
|
||
}
|
||
|
||
// from/to BISE ranges not equal
|
||
if (from_ise_range == astc_helpers::BISE_64_LEVELS)
|
||
{
|
||
// from [0,64]
|
||
const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(to_ise_range).m_val_to_ise;
|
||
|
||
for (uint32_t i = 0; i < n; i++)
|
||
pDst_ise_vals[i] = quant_tab[pSrc_ise_vals[i]];
|
||
}
|
||
else if (to_ise_range == astc_helpers::BISE_64_LEVELS)
|
||
{
|
||
// to [0,64]
|
||
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val;
|
||
|
||
for (uint32_t i = 0; i < n; i++)
|
||
pDst_ise_vals[i] = dequant_tab[pSrc_ise_vals[i]];
|
||
}
|
||
else
|
||
{
|
||
// from/to any other
|
||
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val;
|
||
const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(to_ise_range).m_val_to_ise;
|
||
|
||
for (uint32_t i = 0; i < n; i++)
|
||
pDst_ise_vals[i] = quant_tab[dequant_tab[pSrc_ise_vals[i]]];
|
||
}
|
||
}
|
||
|
||
void astc_ldr_downsample_ise_weights(
|
||
uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
|
||
uint32_t block_w, uint32_t block_h,
|
||
uint32_t grid_w, uint32_t grid_h,
|
||
const uint8_t* pSrc_weights, uint8_t* pDst_weights,
|
||
const float* pDownsample_matrix)
|
||
{
|
||
assert((block_w <= astc_ldr::ASTC_LDR_MAX_BLOCK_WIDTH) && (block_h <= astc_ldr::ASTC_LDR_MAX_BLOCK_HEIGHT));
|
||
assert((grid_w >= 2) && (grid_w <= block_w));
|
||
assert((grid_h >= 2) && (grid_h <= block_h));
|
||
|
||
assert(((dequant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (dequant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) ||
|
||
(dequant_weight_ise_range == astc_helpers::BISE_64_LEVELS));
|
||
|
||
assert(((quant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (quant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE)) ||
|
||
(quant_weight_ise_range == astc_helpers::BISE_64_LEVELS));
|
||
|
||
assert(pDownsample_matrix);
|
||
|
||
if ((block_w == grid_w) && (block_h == grid_h))
|
||
{
|
||
if (dequant_weight_ise_range != quant_weight_ise_range)
|
||
{
|
||
astc_ldr_requantize_astc_weights(block_w * block_h, pSrc_weights, dequant_weight_ise_range, pDst_weights, quant_weight_ise_range);
|
||
}
|
||
else
|
||
{
|
||
if (pDst_weights != pSrc_weights)
|
||
memcpy(pDst_weights, pSrc_weights, block_w * block_h);
|
||
}
|
||
|
||
return;
|
||
}
|
||
|
||
uint8_t desired_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
if (dequant_weight_ise_range == astc_helpers::BISE_64_LEVELS)
|
||
{
|
||
memcpy(desired_weights, pSrc_weights, block_w * block_h);
|
||
}
|
||
else
|
||
{
|
||
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(dequant_weight_ise_range).m_ISE_to_val;
|
||
|
||
for (uint32_t by = 0; by < block_h; by++)
|
||
for (uint32_t bx = 0; bx < block_w; bx++)
|
||
desired_weights[bx + by * block_w] = dequant_tab[pSrc_weights[bx + by * block_w]];
|
||
}
|
||
|
||
if (quant_weight_ise_range == astc_helpers::BISE_64_LEVELS)
|
||
{
|
||
downsample_weight_grid(
|
||
pDownsample_matrix,
|
||
block_w, block_h, // source/from dimension (block size)
|
||
grid_w, grid_h, // dest/to dimension (grid size)
|
||
desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
|
||
pDst_weights); // [wy][wx]
|
||
}
|
||
else
|
||
{
|
||
uint8_t raw_downsampled_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
downsample_weight_grid(
|
||
pDownsample_matrix,
|
||
block_w, block_h, // source/from dimension (block size)
|
||
grid_w, grid_h, // dest/to dimension (grid size)
|
||
desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
|
||
raw_downsampled_weights); // [wy][wx]
|
||
|
||
const auto& weight_quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(quant_weight_ise_range).m_val_to_ise;
|
||
|
||
for (uint32_t gy = 0; gy < grid_h; gy++)
|
||
for (uint32_t gx = 0; gx < grid_w; gx++)
|
||
pDst_weights[gx + gy * grid_w] = weight_quant_tab[raw_downsampled_weights[gx + gy * grid_w]];
|
||
}
|
||
}
|
||
|
||
// Applies a dense (wx*wy) x (bx*by) row-major matrix to a block of integer residuals, producing one
// float per grid sample: pDst_weights[g] = sum_i pMatrix_weights[g * (bx*by) + i] * pSrc_weights[i].
// Zero matrix entries are skipped. Accumulation is in ascending i order, in single precision.
void downsample_weight_residual_grid(
    const float* pMatrix_weights,
    uint32_t bx, uint32_t by,   // source/from dimension (block size)
    uint32_t wx, uint32_t wy,   // dest/to dimension (grid size)
    const int* pSrc_weights,    // these are dequantized weights, NOT ISE symbols, [by][bx]
    float* pDst_weights)        // [wy][wx]
{
    const uint32_t total_block_samples = bx * by;
    const uint32_t total_grid_samples = wx * wy;

    // One matrix row (total_block_samples coefficients) per destination grid sample, in [wy][wx] order.
    const float* pRow = pMatrix_weights;

    for (uint32_t g = 0; g < total_grid_samples; g++, pRow += total_block_samples)
    {
        float total = 0.0f;

        for (uint32_t i = 0; i < total_block_samples; i++)
        {
            const float coeff = pRow[i];
            if (coeff)
                total += coeff * (float)pSrc_weights[i];
        }

        pDst_weights[g] = total;
    }
}
|
||
|
||
// Float-source variant of the dense matrix downsample: for each grid sample g,
// pDst_weights[g] = sum_i pMatrix_weights[g * (bx*by) + i] * pSrc_weights[i].
// Zero matrix entries are skipped (so a source value paired with a zero coefficient never contributes).
// Accumulation is in ascending i order, in single precision.
void downsample_weightsf(
    const float* pMatrix_weights,
    uint32_t bx, uint32_t by,   // source/from dimension (block size)
    uint32_t wx, uint32_t wy,   // dest/to dimension (grid size)
    const float* pSrc_weights,  // these are dequantized weights, NOT ISE symbols, [by][bx]
    float* pDst_weights)        // [wy][wx]
{
    const uint32_t total_block_samples = bx * by;
    const uint32_t total_grid_samples = wx * wy;

    // One matrix row (total_block_samples coefficients) per destination grid sample, in [wy][wx] order.
    const float* pRow = pMatrix_weights;

    for (uint32_t g = 0; g < total_grid_samples; g++, pRow += total_block_samples)
    {
        float total = 0.0f;

        for (uint32_t i = 0; i < total_block_samples; i++)
        {
            const float coeff = pRow[i];
            if (coeff)
                total += coeff * pSrc_weights[i];
        }

        pDst_weights[g] = total;
    }
}
|
||
|
||
// Returns the weighted squared error between two RGBA colors:
// the sum over the 4 components of (a[c] - b[c])^2 * p.m_comp_weights[c].
static inline uint32_t weighted_color_error(const color_rgba& a, const color_rgba& b, const astc_ldr::cem_encode_params& p)
{
    uint32_t total_e = 0;

    for (uint32_t c = 0; c < 4; c++)
    {
        // Difference is taken in signed int so it can go negative before squaring.
        const int ev = (int)a[c] - (int)b[c];

        total_e += (uint32_t)(ev * ev) * p.m_comp_weights[c];
    }

    return total_e;
}
|
||
|
||
uint64_t eval_error(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_helpers::log_astc_block& enc_log_block,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
const astc_ldr::cem_encode_params& params)
|
||
{
|
||
color_rgba dec_block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
bool status = astc_helpers::decode_block_xuastc_ldr(enc_log_block, dec_block_pixels, block_width, block_height, params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status)
|
||
{
|
||
// Shouldn't ever happen
|
||
assert(0);
|
||
return UINT64_MAX;
|
||
}
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
// Sanity check vs. unoptimized decoder
|
||
color_rgba dec_block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
bool alt_status = astc_helpers::decode_block(enc_log_block, dec_block_pixels_alt, block_width, block_height, params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!alt_status)
|
||
{
|
||
// Shouldn't ever happen
|
||
assert(0);
|
||
return UINT64_MAX;
|
||
}
|
||
|
||
if (memcmp(dec_block_pixels, dec_block_pixels_alt, sizeof(color_rgba) * block_width * block_height) != 0)
|
||
{
|
||
// Very bad
|
||
assert(0);
|
||
return UINT64_MAX;
|
||
}
|
||
#endif
|
||
|
||
uint64_t total_err = 0;
|
||
|
||
const uint32_t total_block_pixels = block_width * block_height;
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
total_err += weighted_color_error(dec_block_pixels[i], pixel_stats.m_pixels[i], params);
|
||
|
||
return total_err;
|
||
}
|
||
|
||
uint64_t eval_error(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
uint32_t cem_index,
|
||
bool dual_plane_flag, int ccs_index,
|
||
uint32_t endpoint_ise_range, uint32_t weight_ise_range,
|
||
uint32_t grid_width, uint32_t grid_height,
|
||
const uint8_t* pEndpoint_vals, const uint8_t* pWeight_grid_vals0, const uint8_t* pWeight_grid_vals1,
|
||
const astc_ldr::cem_encode_params& params)
|
||
{
|
||
const uint32_t total_block_pixels = block_width * block_height;
|
||
const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
astc_helpers::log_astc_block enc_log_block;
|
||
|
||
enc_log_block.clear();
|
||
enc_log_block.m_grid_width = (uint8_t)grid_width;
|
||
enc_log_block.m_grid_height = (uint8_t)grid_height;
|
||
enc_log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
|
||
enc_log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
|
||
enc_log_block.m_color_endpoint_modes[0] = (uint8_t)cem_index;
|
||
enc_log_block.m_num_partitions = 1;
|
||
|
||
memcpy(enc_log_block.m_endpoints, pEndpoint_vals, astc_helpers::get_num_cem_values(cem_index));
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
assert((ccs_index >= 0) && (ccs_index <= 3));
|
||
|
||
enc_log_block.m_dual_plane = true;
|
||
enc_log_block.m_color_component_selector = (uint8_t)ccs_index;
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
enc_log_block.m_weights[i * 2 + 0] = pWeight_grid_vals0[i];
|
||
enc_log_block.m_weights[i * 2 + 1] = pWeight_grid_vals1[i];
|
||
}
|
||
}
|
||
else
|
||
{
|
||
assert(ccs_index < 0);
|
||
|
||
memcpy(enc_log_block.m_weights, pWeight_grid_vals0, total_grid_pixels);
|
||
}
|
||
|
||
color_rgba decoded_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
bool status = astc_helpers::decode_block(enc_log_block, decoded_pixels, block_width, block_height, params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
assert(status);
|
||
|
||
if (!status)
|
||
return UINT64_MAX;
|
||
|
||
uint64_t total_err = 0;
|
||
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
total_err += weighted_color_error(pixel_stats.m_pixels[i], decoded_pixels[i], params);
|
||
|
||
return total_err;
|
||
}
|
||
|
||
// Converts a weighted sum of squared errors for one block into a weighted PSNR (dB, 255 peak).
// The weighted MSE is sse / (total_comp_weights * block_width * block_height).
// Returns 10000.0f when the weighted MSE is not greater than 1e-5 (i.e. effectively lossless).
float compute_psnr_from_wsse(uint32_t block_width, uint32_t block_height, uint64_t sse, float total_comp_weights)
{
    const uint32_t total_block_pixels = block_width * block_height;

    const float wmse = (float)sse / (total_comp_weights * (float)total_block_pixels);

    // Avoid log of (255 / ~0) for near-zero error.
    if (!(wmse > 1e-5f))
        return 10000.0f;

    return 20.0f * log10f(255.0f / sqrtf(wmse));
}
|
||
|
||
// quantized coordinate descent (QCD), quadratic objective
|
||
namespace qcd
|
||
{
|
||
struct qcd_min_solver
|
||
{
|
||
// geometry / sizes
|
||
int m_N = 0; // texels
|
||
int m_K = 0; // controls
|
||
int m_Q = 0; // label count
|
||
|
||
// inputs (not owned), (N x K) row-major
|
||
const float* m_pU = nullptr; // grid to texel upsample matrix
|
||
|
||
// cached
|
||
float_vec m_ucols; // N*K, column k at &m_ucols[k*m_N]
|
||
float_vec m_alpha; // K, ||u_k||^2 (>= eps)
|
||
float_vec m_labels; // Q, sorted unique u-labels (ints in [0..64]), ASTC raw [0,64] weights
|
||
|
||
bool m_ready_flag = false;
|
||
|
||
// init: cache columns, norms, and label set
|
||
bool init(const float* pU_rowmajor, int N, int K, const int* pLabels_u, int Q)
|
||
{
|
||
if ((!pU_rowmajor) || (!pLabels_u) || (N <= 0) || (K <= 0) || (Q <= 0))
|
||
return false;
|
||
|
||
m_pU = pU_rowmajor;
|
||
m_N = N;
|
||
m_K = K;
|
||
m_Q = Q;
|
||
|
||
// cache columns
|
||
m_ucols.assign(size_t(N) * K, 0.0f);
|
||
|
||
for (int k = 0; k < K; ++k)
|
||
{
|
||
float* pDst = &m_ucols[size_t(k) * size_t(N)];
|
||
const float* pSrc = m_pU + k; // first element of column k
|
||
for (int t = 0; t < N; ++t)
|
||
pDst[t] = pSrc[size_t(t) * size_t(K)];
|
||
}
|
||
|
||
// column norms
|
||
m_alpha.resize(K);
|
||
|
||
for (int k = 0; k < K; ++k)
|
||
{
|
||
const float* pUK = &m_ucols[size_t(k) * size_t(N)];
|
||
|
||
float a = 0.0f;
|
||
for (int t = 0; t < N; ++t)
|
||
a += pUK[t] * pUK[t];
|
||
|
||
if (!(a > 0.0f))
|
||
a = 1e-8f;
|
||
|
||
m_alpha[k] = a;
|
||
}
|
||
|
||
m_labels.assign(pLabels_u, pLabels_u + Q);
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
for (size_t i = 1; i < m_labels.size(); ++i)
|
||
{
|
||
assert(m_labels[i] > m_labels[i - 1]); // strictly increasing
|
||
assert((m_labels[i] >= 0) && (m_labels[i] <= 64));
|
||
}
|
||
#endif
|
||
|
||
m_Q = (int)m_labels.size();
|
||
if (m_Q <= 0)
|
||
return false;
|
||
|
||
m_ready_flag = true;
|
||
return true;
|
||
}
|
||
|
||
// compute residual r = U*g - w* (uses label IDs -> u-values)
|
||
void build_residual(const int* pG_idx, const float* pW_star, float* pR_out) const
|
||
{
|
||
assert(m_ready_flag && pG_idx && pW_star && pR_out);
|
||
|
||
// r = sum_k (u_label[pG_idx[k]] * ucol_k) - pW_star
|
||
std::fill(pR_out, pR_out + m_N, 0.0f);
|
||
|
||
for (int k = 0; k < m_K; ++k)
|
||
{
|
||
const float* pUK = &m_ucols[size_t(k) * size_t(m_N)];
|
||
const float s = m_labels[pG_idx[k]];
|
||
|
||
for (int t = 0; t < m_N; ++t)
|
||
pR_out[t] += s * pUK[t];
|
||
}
|
||
|
||
for (int t = 0; t < m_N; ++t)
|
||
pR_out[t] -= pW_star[t];
|
||
}
|
||
|
||
// one QCD sweep: returns num moves accepted (strict dE < -eps)
|
||
int sweep(int* pG_idx, float* pR_io, float accept_eps = 1e-6f) const
|
||
{
|
||
assert(m_ready_flag && pG_idx && pR_io);
|
||
int num_moved = 0;
|
||
|
||
for (int k = 0; k < m_K; ++k)
|
||
{
|
||
const float* pUK = &m_ucols[size_t(k) * size_t(m_N)];
|
||
|
||
// beta = <r, u_k>
|
||
float beta = 0.0f;
|
||
for (int t = 0; t < m_N; ++t)
|
||
beta += pR_io[t] * pUK[t];
|
||
|
||
const float a = m_alpha[k]; // >= 1e-8
|
||
|
||
const float cur_u = m_labels[pG_idx[k]];
|
||
const float s_star = cur_u - beta / a; // continuous minimizer (u-domain)
|
||
|
||
// nearest label index to s_star (binary search)
|
||
const int j0 = nearest_label_idx(s_star);
|
||
|
||
const int cand[3] =
|
||
{
|
||
j0,
|
||
(j0 + 1 < m_Q) ? (j0 + 1) : j0,
|
||
(j0 - 1 >= 0) ? (j0 - 1) : j0
|
||
};
|
||
|
||
int best_j = pG_idx[k];
|
||
float best_dE = 0.0f;
|
||
|
||
for (int c = 0; c < 3; ++c)
|
||
{
|
||
const int j = cand[c];
|
||
if (j == pG_idx[k])
|
||
continue;
|
||
|
||
const float s = m_labels[j];
|
||
const float d = s - cur_u; // u-change at coord k
|
||
const float dE = 2.0f * d * beta + d * d * a; // exact delta E
|
||
|
||
if ((best_j == pG_idx[k]) || (dE < best_dE))
|
||
{
|
||
best_dE = dE;
|
||
best_j = j;
|
||
}
|
||
}
|
||
|
||
if ((best_j != pG_idx[k]) && (best_dE < -accept_eps))
|
||
{
|
||
// commit: update residual and label ID
|
||
const float d = m_labels[best_j] - cur_u;
|
||
|
||
for (int t = 0; t < m_N; ++t)
|
||
pR_io[t] += d * pUK[t];
|
||
|
||
pG_idx[k] = best_j;
|
||
++num_moved;
|
||
}
|
||
} // k
|
||
|
||
return num_moved;
|
||
}
|
||
|
||
// utility: energy from residual (sum r^2)
|
||
float residual_energy(const float* pR) const
|
||
{
|
||
assert(pR);
|
||
|
||
float E = 0.0f;
|
||
for (int t = 0; t < m_N; ++t)
|
||
E += pR[t] * pR[t];
|
||
|
||
return E;
|
||
}
|
||
|
||
private:
|
||
// nearest label index by u-value (handles non-uniform spacing)
|
||
int nearest_label_idx(float x) const
|
||
{
|
||
const int Q = m_Q;
|
||
|
||
if (Q <= 1)
|
||
return 0;
|
||
if (x <= m_labels.front())
|
||
return 0;
|
||
if (x >= m_labels.back())
|
||
return Q - 1;
|
||
|
||
int lo = 0, hi = Q - 1;
|
||
while (hi - lo > 1)
|
||
{
|
||
const int mid = (lo + hi) >> 1;
|
||
(x >= m_labels[mid]) ? lo = mid : hi = mid;
|
||
}
|
||
|
||
const float dlo = std::fabs(x - m_labels[lo]);
|
||
const float dhi = std::fabs(x - m_labels[hi]);
|
||
return (dlo <= dhi) ? lo : hi;
|
||
}
|
||
};
|
||
|
||
} // namespace qcd
|
||
|
||
// 1-3 subsets, requires initial weights
|
||
bool polish_block_weights(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
astc_helpers::log_astc_block& enc_log_block, // assumes there is already a good encoding to improve here
|
||
const astc_ldr::cem_encode_params& params,
|
||
const astc_ldr::partition_pattern_vec* pPat,
|
||
bool& improved_flag,
|
||
bool gradient_descent_flag, bool polish_weights_flag, bool qcd_enabled_flag)
|
||
{
|
||
improved_flag = false;
|
||
|
||
if (!gradient_descent_flag && !polish_weights_flag && !qcd_enabled_flag)
|
||
return true;
|
||
|
||
const uint32_t grid_width = enc_log_block.m_grid_width, grid_height = enc_log_block.m_grid_height;
|
||
const uint32_t cem_index = enc_log_block.m_color_endpoint_modes[0];
|
||
const uint32_t num_subsets = enc_log_block.m_num_partitions;
|
||
const bool dual_plane_flag = enc_log_block.m_dual_plane;
|
||
//const uint32_t num_planes = dual_plane_flag ? 2 : 1;
|
||
const int ccs_index = dual_plane_flag ? enc_log_block.m_color_component_selector : -1;
|
||
|
||
const uint32_t endpoint_ise_range = enc_log_block.m_endpoint_ise_range;
|
||
const uint32_t weight_ise_range = enc_log_block.m_weight_ise_range;
|
||
|
||
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val;
|
||
const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_val_to_ise;
|
||
|
||
//const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
if (num_subsets > 1)
|
||
{
|
||
for (uint32_t i = 1; i < num_subsets; i++)
|
||
{
|
||
assert(enc_log_block.m_color_endpoint_modes[i] == cem_index);
|
||
}
|
||
}
|
||
#endif
|
||
|
||
//const astc_block_grid_data* pBlock_grid_data = find_astc_block_grid_data(block_width, block_height, grid_width, grid_height);
|
||
|
||
const uint32_t total_block_pixels = block_width * block_height;
|
||
const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
uint64_t cur_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params);
|
||
|
||
uint8_t weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
astc_helpers::extract_weights(enc_log_block, weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::extract_weights(enc_log_block, weights1, 1);
|
||
|
||
const bool global_gradient_desc_enabled = true;
|
||
const bool global_qcd_enabled = true;
|
||
const bool global_polish_weights_enabled = true;
|
||
|
||
const uint32_t NUM_WEIGHT_POLISH_PASSES = 1;
|
||
|
||
// Gradient descent
|
||
if ((gradient_descent_flag) && (global_gradient_desc_enabled))
|
||
{
|
||
// Downsample the residuals to grid res
|
||
vector2D<float> upsample_matrix;
|
||
compute_upsample_matrix(upsample_matrix, block_width, block_height, grid_width, grid_height);
|
||
|
||
// First compute the block's ideal raw weights given the current endpoints at full block/texel res
|
||
// TODO: Move to helper
|
||
uint8_t ideal_block_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], ideal_block_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
if (num_subsets == 1)
|
||
{
|
||
if (dual_plane_flag)
|
||
astc_ldr::eval_solution_dp(pixel_stats, cem_index, ccs_index, enc_log_block.m_endpoints, endpoint_ise_range, ideal_block_raw_weights0, ideal_block_raw_weights1, astc_helpers::BISE_64_LEVELS, params);
|
||
else
|
||
astc_ldr::eval_solution(pixel_stats, cem_index, enc_log_block.m_endpoints, endpoint_ise_range, ideal_block_raw_weights0, astc_helpers::BISE_64_LEVELS, params);
|
||
}
|
||
else
|
||
{
|
||
// Extract each subset's texels, compute the raw weights, place back into full res texel/block weight grid.
|
||
color_rgba part_pixels[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint32_t num_part_pixels[astc_helpers::MAX_PARTITIONS] = { 0 };
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const color_rgba& px = pixel_stats.m_pixels[x + y * block_width];
|
||
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_subsets);
|
||
|
||
// Sanity check
|
||
assert(part_index == (uint32_t)astc_helpers::compute_texel_partition(enc_log_block.m_partition_id, x, y, 0, num_subsets, astc_helpers::is_small_block(block_width, block_height)));
|
||
|
||
part_pixels[part_index][num_part_pixels[part_index]] = px;
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
|
||
astc_ldr::pixel_stats_t part_pixel_stats[astc_helpers::MAX_PARTITIONS];
|
||
|
||
for (uint32_t i = 0; i < num_subsets; i++)
|
||
part_pixel_stats[i].clear();
|
||
|
||
uint8_t part_raw_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
for (uint32_t part_index = 0; part_index < num_subsets; part_index++)
|
||
{
|
||
part_pixel_stats[part_index].init(num_part_pixels[part_index], &part_pixels[part_index][0]);
|
||
|
||
const uint8_t* pPart_endpoints = astc_helpers::get_endpoints(enc_log_block, part_index);
|
||
|
||
astc_ldr::eval_solution(part_pixel_stats[part_index], cem_index, pPart_endpoints, endpoint_ise_range, &part_raw_weights[part_index][0], astc_helpers::BISE_64_LEVELS, params);
|
||
|
||
} // part_index
|
||
|
||
clear_obj(num_part_pixels);
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_subsets);
|
||
|
||
ideal_block_raw_weights0[x + y * block_width] = part_raw_weights[part_index][num_part_pixels[part_index]];
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
}
|
||
|
||
#if 1
|
||
// Now compute the current block/texel res (upsampled) raw [0,64] weights given the current quantized grid weights. Dequant then upsample.
|
||
// This is what an ASTC decoder would use during unpacking.
|
||
uint8_t dequantized_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], dequantized_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t dequantized_block_weights_upsampled0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], dequantized_block_weights_upsampled1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
astc_ldr_requantize_astc_weights(total_grid_pixels, weights0, weight_ise_range, dequantized_grid_weights0, astc_helpers::BISE_64_LEVELS);
|
||
|
||
if (dual_plane_flag)
|
||
astc_ldr_requantize_astc_weights(total_grid_pixels, weights1, weight_ise_range, dequantized_grid_weights1, astc_helpers::BISE_64_LEVELS);
|
||
|
||
astc_helpers::upsample_weight_grid(
|
||
block_width, block_height, // destination/to dimension
|
||
grid_width, grid_height, // source/from dimension
|
||
dequantized_grid_weights0, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
|
||
dequantized_block_weights_upsampled0); // [by][bx]
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
astc_helpers::upsample_weight_grid(
|
||
block_width, block_height, // destination/to dimension
|
||
grid_width, grid_height, // source/from dimension
|
||
dequantized_grid_weights1, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
|
||
dequantized_block_weights_upsampled1); // [by][bx]
|
||
}
|
||
|
||
// Now compute residuals at the block res
|
||
int weight_block_raw_residuals0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], weight_block_raw_residuals1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
weight_block_raw_residuals0[i] = ideal_block_raw_weights0[i] - dequantized_block_weights_upsampled0[i];
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
weight_block_raw_residuals1[i] = ideal_block_raw_weights1[i] - dequantized_block_weights_upsampled1[i];
|
||
}
|
||
|
||
float weight_grid_residuals_downsampled0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], weight_grid_residuals_downsampled1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
basisu::vector<float> unweighted_downsample_matrix;
|
||
|
||
// TODO: precompute, store in weight grid data
|
||
compute_upsample_matrix_transposed(unweighted_downsample_matrix, block_width, block_height, grid_width, grid_height);
|
||
|
||
basisu::vector<float> diag_AtA(total_grid_pixels);
|
||
compute_diag_AtA_vector(block_width, block_height, grid_width, grid_height, upsample_matrix, diag_AtA.get_ptr());
|
||
|
||
downsample_weight_residual_grid(
|
||
unweighted_downsample_matrix.get_ptr(),
|
||
block_width, block_height, // source/from dimension (block size)
|
||
grid_width, grid_height, // dest/to dimension (grid size)
|
||
weight_block_raw_residuals0, // these are dequantized weights, NOT ISE symbols, [by][bx]
|
||
weight_grid_residuals_downsampled0); // [wy][wx]
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
weight_grid_residuals_downsampled0[i] /= diag_AtA[i];
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
downsample_weight_residual_grid(
|
||
unweighted_downsample_matrix.get_ptr(),
|
||
block_width, block_height, // source/from dimension (block size)
|
||
grid_width, grid_height, // dest/to dimension (grid size)
|
||
weight_block_raw_residuals1, // these are dequantized weights, NOT ISE symbols, [by][bx]
|
||
weight_grid_residuals_downsampled1); // [wy][wx]
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
weight_grid_residuals_downsampled1[i] /= diag_AtA[i];
|
||
}
|
||
|
||
// Apply the residuals at grid res and quantize
|
||
const float Q = 1.0f;
|
||
|
||
uint8_t refined_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], refined_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
float v = (float)dequant_tab[weights0[i]] + weight_grid_residuals_downsampled0[i] * Q;
|
||
int iv = clamp((int)std::roundf(v), 0, 64);
|
||
refined_grid_weights0[i] = quant_tab[iv];
|
||
}
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
float v = (float)dequant_tab[weights1[i]] + weight_grid_residuals_downsampled1[i] * Q;
|
||
int iv = clamp((int)std::roundf(v), 0, 64);
|
||
refined_grid_weights1[i] = quant_tab[iv];
|
||
}
|
||
}
|
||
#else
|
||
uint8_t refined_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], refined_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
refined_grid_weights0[i] = weights0[i];
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
refined_grid_weights1[i] = weights1[i];
|
||
}
|
||
#endif
|
||
|
||
astc_helpers::log_astc_block refined_log_block(enc_log_block);
|
||
|
||
// TODO: This refines both weight planes simultanously, probably not optimal, could do individually.
|
||
astc_helpers::set_weights(refined_log_block, refined_grid_weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::set_weights(refined_log_block, refined_grid_weights1, 1);
|
||
|
||
uint64_t refined_err = eval_error(block_width, block_height, refined_log_block, pixel_stats, params);
|
||
|
||
if (refined_err < cur_err)
|
||
{
|
||
cur_err = refined_err;
|
||
|
||
memcpy(weights0, refined_grid_weights0, total_grid_pixels);
|
||
|
||
if (dual_plane_flag)
|
||
memcpy(weights1, refined_grid_weights1, total_grid_pixels);
|
||
|
||
improved_flag = true;
|
||
}
|
||
|
||
// QCD - not a huge boost (.05-.75 dB), but on the toughest blocks it does help.
|
||
if ((qcd_enabled_flag) && (global_qcd_enabled))
|
||
{
|
||
float ideal_block_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], ideal_block_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
{
|
||
ideal_block_weights0[i] = (float)ideal_block_raw_weights0[i];
|
||
|
||
if (dual_plane_flag)
|
||
ideal_block_weights1[i] = (float)ideal_block_raw_weights1[i];
|
||
}
|
||
|
||
const float* pUpsample_matrix = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height)->m_upsample_matrix.get_ptr();
|
||
|
||
qcd::qcd_min_solver solver;
|
||
|
||
const uint32_t num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range);
|
||
|
||
assert(num_weight_levels <= 32);
|
||
int labels[32 + 1];
|
||
|
||
for (uint32_t i = 0; i < num_weight_levels; i++)
|
||
labels[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).get_rank_to_val(i);
|
||
|
||
solver.init(pUpsample_matrix, total_block_pixels, total_grid_pixels, labels, num_weight_levels);
|
||
|
||
int grid_idx0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], grid_idx1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_rank;
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
grid_idx0[i] = ise_to_rank[refined_grid_weights0[i]];
|
||
|
||
if (dual_plane_flag)
|
||
grid_idx1[i] = ise_to_rank[refined_grid_weights1[i]];
|
||
}
|
||
|
||
float resid0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], resid1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
solver.build_residual(grid_idx0, ideal_block_weights0, resid0);
|
||
|
||
const uint32_t MAX_QCD_SWEEPS = 5;
|
||
for (uint32_t t = 0; t < MAX_QCD_SWEEPS; t++)
|
||
{
|
||
int moved0 = solver.sweep(grid_idx0, resid0);
|
||
if (!moved0)
|
||
break;
|
||
}
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
solver.build_residual(grid_idx1, ideal_block_weights1, resid1);
|
||
|
||
for (uint32_t t = 0; t < MAX_QCD_SWEEPS; t++)
|
||
{
|
||
int moved1 = solver.sweep(grid_idx1, resid1);
|
||
if (!moved1)
|
||
break;
|
||
}
|
||
}
|
||
|
||
const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_rank_to_ISE;
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
refined_grid_weights0[i] = rank_to_ise[grid_idx0[i]];
|
||
|
||
if (dual_plane_flag)
|
||
refined_grid_weights1[i] = rank_to_ise[grid_idx1[i]];
|
||
}
|
||
|
||
refined_log_block = enc_log_block;
|
||
|
||
astc_helpers::set_weights(refined_log_block, refined_grid_weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::set_weights(refined_log_block, refined_grid_weights1, 1);
|
||
|
||
refined_err = eval_error(block_width, block_height, refined_log_block, pixel_stats, params);
|
||
|
||
if (refined_err < cur_err)
|
||
{
|
||
cur_err = refined_err;
|
||
|
||
memcpy(weights0, refined_grid_weights0, total_grid_pixels);
|
||
|
||
if (dual_plane_flag)
|
||
memcpy(weights1, refined_grid_weights1, total_grid_pixels);
|
||
|
||
improved_flag = true;
|
||
}
|
||
}
|
||
} // if (qcd_enabled)
|
||
|
||
if ((polish_weights_flag) && (global_polish_weights_enabled))
|
||
{
|
||
// Final, expensive, weight polish. Much can be done to improve this, but it's hopefully not ran much in the first place.
|
||
// TODO: The dB gain from this is large, must optimize.
|
||
for (uint32_t polish_pass = 0; polish_pass < NUM_WEIGHT_POLISH_PASSES; polish_pass++)
|
||
{
|
||
for (uint32_t y = 0; y < grid_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < grid_width; x++)
|
||
{
|
||
for (uint32_t plane_iter = 0; plane_iter < (dual_plane_flag ? 2u : 1u); plane_iter++)
|
||
{
|
||
uint8_t base_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], base_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
memcpy(base_grid_weights0, weights0, total_grid_pixels);
|
||
if (dual_plane_flag)
|
||
memcpy(base_grid_weights1, weights1, total_grid_pixels);
|
||
|
||
for (int delta = -1; delta <= 1; delta += 2)
|
||
{
|
||
uint8_t trial_grid_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], trial_grid_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
memcpy(trial_grid_weights0, base_grid_weights0, total_grid_pixels);
|
||
|
||
if (dual_plane_flag)
|
||
memcpy(trial_grid_weights1, base_grid_weights1, total_grid_pixels);
|
||
|
||
if (plane_iter == 0)
|
||
trial_grid_weights0[x + y * grid_width] = (uint8_t)astc_ldr::apply_delta_to_bise_weight_val(weight_ise_range, base_grid_weights0[x + y * grid_width], delta);
|
||
else
|
||
trial_grid_weights1[x + y * grid_width] = (uint8_t)astc_ldr::apply_delta_to_bise_weight_val(weight_ise_range, base_grid_weights1[x + y * grid_width], delta);
|
||
|
||
astc_helpers::log_astc_block trial_log_block(enc_log_block);
|
||
|
||
astc_helpers::set_weights(trial_log_block, trial_grid_weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::set_weights(trial_log_block, trial_grid_weights1, 1);
|
||
|
||
uint64_t trial_err = eval_error(block_width, block_height, trial_log_block, pixel_stats, params);
|
||
|
||
if (trial_err < cur_err)
|
||
{
|
||
cur_err = trial_err;
|
||
|
||
memcpy(weights0, trial_grid_weights0, total_grid_pixels);
|
||
|
||
if (dual_plane_flag)
|
||
memcpy(weights1, trial_grid_weights1, total_grid_pixels);
|
||
|
||
improved_flag = true;
|
||
}
|
||
|
||
} // delta
|
||
|
||
} // plane_iter
|
||
|
||
} // x
|
||
} // y
|
||
|
||
} // polish_pass
|
||
|
||
} // polish_flag
|
||
|
||
astc_helpers::log_astc_block new_log_block(enc_log_block);
|
||
|
||
astc_helpers::set_weights(new_log_block, weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::set_weights(new_log_block, weights1, 1);
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
uint64_t new_err = eval_error(block_width, block_height, new_log_block, pixel_stats, params);
|
||
|
||
assert(cur_err == new_err);
|
||
|
||
if (improved_flag)
|
||
{
|
||
uint64_t orig_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params);
|
||
|
||
assert(new_err < orig_err);
|
||
}
|
||
#endif
|
||
|
||
enc_log_block = new_log_block;
|
||
|
||
return true;
|
||
}
|
||
|
||
bool encode_trial_subsets(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
uint32_t cem_index, uint32_t num_parts,
|
||
uint32_t pat_seed_index, const astc_ldr::partition_pattern_vec* pPat, // seed index is a ASTC partition pattern index
|
||
uint32_t endpoint_ise_range, uint32_t weight_ise_range,
|
||
uint32_t grid_width, uint32_t grid_height,
|
||
astc_helpers::log_astc_block& enc_log_block,
|
||
const astc_ldr::cem_encode_params& params,
|
||
bool refine_only_flag = false,
|
||
bool gradient_descent_flag = true, bool polish_weights_flag = true, bool qcd_enabled_flag = true,
|
||
bool use_blue_contraction = true,
|
||
bool* pBase_ofs_clamped_flag = nullptr)
|
||
{
|
||
assert((num_parts >= 2) && (num_parts <= astc_helpers::MAX_PARTITIONS));
|
||
assert(pPat);
|
||
assert(pat_seed_index < astc_helpers::NUM_PARTITION_PATTERNS);
|
||
|
||
if (pBase_ofs_clamped_flag)
|
||
*pBase_ofs_clamped_flag = false;
|
||
|
||
const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);
|
||
//const uint32_t total_block_pixels = block_width * block_height;
|
||
const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
color_rgba part_pixels[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint32_t num_part_pixels[astc_helpers::MAX_PARTITIONS] = { 0 };
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const color_rgba& px = pixel_stats.m_pixels[x + y * block_width];
|
||
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_parts);
|
||
|
||
part_pixels[part_index][num_part_pixels[part_index]] = px;
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
for (uint32_t i = 0; i < num_parts; i++)
|
||
assert(num_part_pixels[i]);
|
||
#endif
|
||
|
||
astc_ldr::pixel_stats_t part_pixel_stats[astc_helpers::MAX_PARTITIONS];
|
||
|
||
for (uint32_t i = 0; i < num_parts; i++)
|
||
part_pixel_stats[i].clear();
|
||
|
||
uint8_t part_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS];
|
||
uint8_t part_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
|
||
{
|
||
part_pixel_stats[part_index].init(num_part_pixels[part_index], &part_pixels[part_index][0]);
|
||
|
||
if (!refine_only_flag)
|
||
{
|
||
bool base_ofs_clamped_flag = false;
|
||
|
||
// Encode at block res, but with quantized weights
|
||
uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, -1, part_pixel_stats[part_index], params,
|
||
endpoint_ise_range, weight_ise_range,
|
||
&part_endpoints[part_index][0], &part_weights[part_index][0], nullptr, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);
|
||
|
||
if (block_err == UINT64_MAX)
|
||
return false;
|
||
|
||
if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
|
||
*pBase_ofs_clamped_flag = true;
|
||
}
|
||
|
||
} // part_index
|
||
|
||
const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem_index);
|
||
|
||
if (!refine_only_flag)
|
||
{
|
||
uint8_t block_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
clear_obj(num_part_pixels);
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_parts);
|
||
|
||
block_weights[x + y * block_width] = part_weights[part_index][num_part_pixels[part_index]];
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
|
||
enc_log_block.clear();
|
||
|
||
enc_log_block.m_grid_width = (uint8_t)grid_width;
|
||
enc_log_block.m_grid_height = (uint8_t)grid_height;
|
||
enc_log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
|
||
enc_log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
|
||
|
||
enc_log_block.m_num_partitions = (uint8_t)num_parts;
|
||
for (uint32_t i = 0; i < num_parts; i++)
|
||
enc_log_block.m_color_endpoint_modes[i] = (uint8_t)cem_index;
|
||
enc_log_block.m_partition_id = (uint16_t)pat_seed_index;
|
||
|
||
if (is_downsampling)
|
||
{
|
||
// TODO: Make the downsample step faster
|
||
const float* pDownsample_matrix = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height)->m_downsample_matrix.get_ptr();
|
||
|
||
// Now downsample the weight grid (quantized to quantized)
|
||
astc_ldr_downsample_ise_weights(
|
||
weight_ise_range, weight_ise_range,
|
||
block_width, block_height,
|
||
grid_width, grid_height,
|
||
block_weights, enc_log_block.m_weights,
|
||
pDownsample_matrix);
|
||
}
|
||
else
|
||
{
|
||
memcpy(enc_log_block.m_weights, block_weights, total_grid_pixels);
|
||
}
|
||
|
||
for (uint32_t p = 0; p < num_parts; p++)
|
||
memcpy(enc_log_block.m_endpoints + num_endpoint_vals * p, &part_endpoints[p][0], num_endpoint_vals);
|
||
}
|
||
|
||
// attempt endpoint refinement given the current weights
|
||
// TODO: Expose to caller
|
||
const uint32_t NUM_REFINEMENT_PASSES = 3;
|
||
for (uint32_t refine_pass = 0; refine_pass < NUM_REFINEMENT_PASSES; refine_pass++)
|
||
{
|
||
uint8_t dequantized_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
dequantized_raw_weights0[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[enc_log_block.m_weights[i]];
|
||
|
||
astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights0, upsampled_weights0);
|
||
|
||
astc_helpers::log_astc_block alt_enc_log_block(enc_log_block);
|
||
|
||
uint8_t raw_part_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
clear_obj(num_part_pixels);
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_parts);
|
||
|
||
raw_part_weights[part_index][num_part_pixels[part_index]] = upsampled_weights0[x + y * block_width];
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
|
||
for (uint32_t part_index = 0; part_index < num_parts; part_index++)
|
||
{
|
||
assert(num_part_pixels[part_index] == part_pixel_stats[part_index].m_num_pixels);
|
||
|
||
astc_ldr::cem_encode_params temp_params(params);
|
||
temp_params.m_pForced_weight_vals0 = &raw_part_weights[part_index][0];
|
||
|
||
uint8_t temp_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
bool base_ofs_clamped_flag = false;
|
||
|
||
// Encode at block res, but with quantized weights
|
||
uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, -1, part_pixel_stats[part_index], temp_params,
|
||
endpoint_ise_range, astc_helpers::BISE_64_LEVELS,
|
||
&alt_enc_log_block.m_endpoints[num_endpoint_vals * part_index], temp_weights, nullptr, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);
|
||
|
||
if (block_err == UINT64_MAX)
|
||
return false;
|
||
|
||
if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
|
||
*pBase_ofs_clamped_flag = true;
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
for (uint32_t i = 0; i < part_pixel_stats[part_index].m_num_pixels; i++)
|
||
{
|
||
assert(temp_weights[i] == temp_params.m_pForced_weight_vals0[i]);
|
||
}
|
||
#endif
|
||
|
||
} // part_index
|
||
|
||
uint64_t cur_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params);
|
||
uint64_t ref_err = eval_error(block_width, block_height, alt_enc_log_block, pixel_stats, params);
|
||
|
||
if (ref_err < cur_err)
|
||
{
|
||
memcpy(&enc_log_block, &alt_enc_log_block, sizeof(astc_helpers::log_astc_block));
|
||
}
|
||
|
||
if (refine_pass == (NUM_REFINEMENT_PASSES - 1))
|
||
break;
|
||
|
||
if ((is_downsampling) && (gradient_descent_flag || polish_weights_flag))
|
||
{
|
||
bool improved_flag = false;
|
||
bool status = polish_block_weights(block_width, block_height, pixel_stats, enc_log_block, params, pPat, improved_flag, gradient_descent_flag, polish_weights_flag, qcd_enabled_flag);
|
||
if (!status)
|
||
{
|
||
assert(0);
|
||
}
|
||
|
||
if (!improved_flag)
|
||
break;
|
||
}
|
||
else
|
||
{
|
||
break;
|
||
}
|
||
} // refine_pass
|
||
|
||
return true;
|
||
}
|
||
|
||
bool encode_trial(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
uint32_t cem_index,
|
||
bool dual_plane_flag, int ccs_index,
|
||
uint32_t endpoint_ise_range, uint32_t weight_ise_range,
|
||
uint32_t grid_width, uint32_t grid_height,
|
||
astc_helpers::log_astc_block& enc_log_block,
|
||
const astc_ldr::cem_encode_params& params,
|
||
bool gradient_descent_flag = true, bool polish_weights_flag = true, bool qcd_enabled_flag = true,
|
||
bool use_blue_contraction = true,
|
||
bool* pBase_ofs_clamped_flag = nullptr)
|
||
{
|
||
assert(dual_plane_flag || (ccs_index == -1));
|
||
|
||
if (pBase_ofs_clamped_flag)
|
||
*pBase_ofs_clamped_flag = false;
|
||
|
||
const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pBlock_grid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height);
|
||
|
||
const float* pDownsample_matrix = nullptr;
|
||
if (is_downsampling)
|
||
pDownsample_matrix = pBlock_grid_data->m_downsample_matrix.get_ptr();
|
||
|
||
//const uint32_t total_block_pixels = block_width * block_height;
|
||
const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val;
|
||
//const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_val_to_ise;
|
||
|
||
enc_log_block.clear();
|
||
|
||
enc_log_block.m_grid_width = (uint8_t)grid_width;
|
||
enc_log_block.m_grid_height = (uint8_t)grid_height;
|
||
enc_log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
|
||
enc_log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
|
||
|
||
enc_log_block.m_dual_plane = dual_plane_flag;
|
||
if (dual_plane_flag)
|
||
{
|
||
assert((ccs_index >= 0) && (ccs_index <= 3));
|
||
enc_log_block.m_color_component_selector = (uint8_t)ccs_index;
|
||
}
|
||
else
|
||
{
|
||
assert(ccs_index == -1);
|
||
}
|
||
|
||
enc_log_block.m_num_partitions = 1;
|
||
enc_log_block.m_color_endpoint_modes[0] = (uint8_t)cem_index;
|
||
|
||
uint8_t fullres_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS];
|
||
uint8_t weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
if ((grid_width == block_width) && (grid_height == block_height))
|
||
{
|
||
bool base_ofs_clamped_flag = false;
|
||
|
||
uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, params,
|
||
endpoint_ise_range, weight_ise_range,
|
||
fullres_endpoints, weights0, weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);
|
||
|
||
if (block_err == UINT64_MAX)
|
||
return false;
|
||
|
||
if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
|
||
*pBase_ofs_clamped_flag = base_ofs_clamped_flag;
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
enc_log_block.m_weights[i * 2 + 0] = weights0[i];
|
||
enc_log_block.m_weights[i * 2 + 1] = weights1[i];
|
||
}
|
||
}
|
||
else
|
||
{
|
||
memcpy(enc_log_block.m_weights, weights0, total_grid_pixels);
|
||
}
|
||
|
||
memcpy(enc_log_block.m_endpoints, fullres_endpoints, astc_helpers::get_num_cem_values(cem_index));
|
||
|
||
return true;
|
||
}
|
||
|
||
// Handle downsampled weight grids case
|
||
|
||
uint8_t fullres_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t fullres_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
bool base_ofs_clamped_flag = false;
|
||
|
||
// Encode at block res, but with quantized weights
|
||
uint64_t block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, params,
|
||
endpoint_ise_range, weight_ise_range,
|
||
fullres_endpoints, fullres_raw_weights0, fullres_raw_weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);
|
||
|
||
if (block_err == UINT64_MAX)
|
||
return false;
|
||
|
||
if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
|
||
*pBase_ofs_clamped_flag = base_ofs_clamped_flag;
|
||
|
||
// Now downsample the weight grid (quantized to quantized)
|
||
astc_ldr_downsample_ise_weights(
|
||
weight_ise_range, weight_ise_range,
|
||
block_width, block_height,
|
||
grid_width, grid_height,
|
||
fullres_raw_weights0, weights0,
|
||
pDownsample_matrix);
|
||
|
||
astc_helpers::set_weights(enc_log_block, weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
astc_ldr_downsample_ise_weights(
|
||
weight_ise_range, weight_ise_range,
|
||
block_width, block_height,
|
||
grid_width, grid_height,
|
||
fullres_raw_weights1, weights1,
|
||
pDownsample_matrix);
|
||
}
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::set_weights(enc_log_block, weights1, 1);
|
||
|
||
memcpy(enc_log_block.m_endpoints, fullres_endpoints, astc_helpers::get_num_cem_values(cem_index));
|
||
|
||
// TODO: Expose to caller
|
||
const uint32_t NUM_OUTER_PASSES = 3;
|
||
for (uint32_t outer_pass = 0; outer_pass < NUM_OUTER_PASSES; outer_pass++)
|
||
{
|
||
// endpoint refinement, given current upsampled weights
|
||
{
|
||
astc_helpers::extract_weights(enc_log_block, weights0, 0);
|
||
|
||
if (dual_plane_flag)
|
||
astc_helpers::extract_weights(enc_log_block, weights1, 1);
|
||
|
||
// Plane 0
|
||
uint8_t dequantized_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
dequantized_raw_weights0[i] = dequant_tab[weights0[i]];
|
||
|
||
astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights0, upsampled_weights0);
|
||
|
||
// Plane 1
|
||
uint8_t dequantized_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t upsampled_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
dequantized_raw_weights1[i] = dequant_tab[weights1[i]];
|
||
astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights1, upsampled_weights1);
|
||
}
|
||
|
||
// Jam in the weights to the actual raw [0,64] weights the decoder is going to use after upsampling the grid.
|
||
astc_ldr::cem_encode_params refine_params(params);
|
||
refine_params.m_pForced_weight_vals0 = upsampled_weights0;
|
||
if (dual_plane_flag)
|
||
refine_params.m_pForced_weight_vals1 = upsampled_weights1;
|
||
|
||
uint8_t refined_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS];
|
||
uint8_t refined_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t refined_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
uint64_t refined_block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, refine_params,
|
||
endpoint_ise_range, astc_helpers::BISE_64_LEVELS,
|
||
refined_endpoints, refined_weights0, refined_weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);
|
||
assert(refined_block_err != UINT64_MAX);
|
||
|
||
if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
|
||
*pBase_ofs_clamped_flag = base_ofs_clamped_flag;
|
||
|
||
if (refined_block_err != UINT64_MAX)
|
||
{
|
||
uint64_t cur_err = eval_error(
|
||
block_width, block_height,
|
||
pixel_stats,
|
||
cem_index,
|
||
dual_plane_flag, ccs_index,
|
||
endpoint_ise_range, weight_ise_range,
|
||
grid_width, grid_height,
|
||
enc_log_block.m_endpoints, weights0, weights1,
|
||
params);
|
||
|
||
if (refined_block_err < cur_err)
|
||
{
|
||
memcpy(enc_log_block.m_endpoints, refined_endpoints, astc_helpers::get_num_cem_values(cem_index));
|
||
}
|
||
}
|
||
}
|
||
|
||
if (outer_pass == (NUM_OUTER_PASSES - 1))
|
||
break;
|
||
|
||
if ((!gradient_descent_flag) && (!polish_weights_flag))
|
||
break;
|
||
|
||
bool improved_flag = false;
|
||
|
||
bool status = polish_block_weights(
|
||
block_width, block_height,
|
||
pixel_stats,
|
||
enc_log_block, // assumes there is already a good encoding to improve here
|
||
params,
|
||
nullptr,
|
||
improved_flag,
|
||
gradient_descent_flag,
|
||
polish_weights_flag,
|
||
qcd_enabled_flag);
|
||
|
||
if (!status)
|
||
{
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
if (!improved_flag)
|
||
break;
|
||
|
||
} // outer_pass
|
||
|
||
return true;
|
||
}
|
||
|
||
// 1 part only, refines endpoints given current weights
|
||
bool encode_trial_refine_only(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
astc_helpers::log_astc_block& enc_log_block,
|
||
const astc_ldr::cem_encode_params& params,
|
||
bool use_blue_contraction = true,
|
||
bool* pBase_ofs_clamped_flag = nullptr)
|
||
{
|
||
assert(enc_log_block.m_num_partitions == 1);
|
||
|
||
if (pBase_ofs_clamped_flag)
|
||
*pBase_ofs_clamped_flag = false;
|
||
|
||
const uint32_t cem_index = enc_log_block.m_color_endpoint_modes[0];
|
||
const bool dual_plane_flag = enc_log_block.m_dual_plane;
|
||
const int ccs_index = dual_plane_flag ? enc_log_block.m_color_component_selector : -1;
|
||
const uint32_t endpoint_ise_range = enc_log_block.m_endpoint_ise_range;
|
||
const uint32_t weight_ise_range = enc_log_block.m_weight_ise_range;
|
||
const uint32_t grid_width = enc_log_block.m_grid_width;
|
||
const uint32_t grid_height = enc_log_block.m_grid_height;
|
||
|
||
//const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);
|
||
|
||
//const uint32_t total_block_pixels = block_width * block_height;
|
||
const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
uint8_t dequantized_raw_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t upsampled_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE
|
||
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
dequantized_raw_weights0[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[astc_helpers::get_weight(enc_log_block, 0, i)];
|
||
|
||
// suppress bogus gcc warning on dequantized_raw_weights0
|
||
#ifndef __clang__
|
||
#if defined(__GNUC__)
|
||
#pragma GCC diagnostic push
|
||
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
|
||
#endif
|
||
#endif
|
||
|
||
astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights0, upsampled_weights0);
|
||
|
||
#ifndef __clang__
|
||
#if defined(__GNUC__)
|
||
#pragma GCC diagnostic pop
|
||
#endif
|
||
#endif
|
||
|
||
uint8_t dequantized_raw_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t upsampled_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS]; // raw weights, NOT ISE
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
dequantized_raw_weights1[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[astc_helpers::get_weight(enc_log_block, 1, i)];
|
||
astc_helpers::upsample_weight_grid(block_width, block_height, grid_width, grid_height, dequantized_raw_weights1, upsampled_weights1);
|
||
}
|
||
|
||
astc_ldr::cem_encode_params refine_params(params);
|
||
refine_params.m_pForced_weight_vals0 = upsampled_weights0;
|
||
if (dual_plane_flag)
|
||
refine_params.m_pForced_weight_vals1 = upsampled_weights1;
|
||
|
||
uint8_t refined_endpoints[astc_helpers::MAX_CEM_ENDPOINT_VALS];
|
||
uint8_t refined_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint8_t refined_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
//bool use_blue_contraction = true;
|
||
|
||
bool base_ofs_clamped_flag = false;
|
||
|
||
uint64_t refined_block_err = astc_ldr::cem_encode_pixels(cem_index, ccs_index, pixel_stats, refine_params,
|
||
endpoint_ise_range, astc_helpers::BISE_64_LEVELS,
|
||
refined_endpoints, refined_weights0, refined_weights1, UINT64_MAX, use_blue_contraction, &base_ofs_clamped_flag);
|
||
assert(refined_block_err != UINT64_MAX);
|
||
|
||
if ((pBase_ofs_clamped_flag) && (base_ofs_clamped_flag))
|
||
*pBase_ofs_clamped_flag = base_ofs_clamped_flag;
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
for (uint32_t i = 0; i < total_grid_pixels; i++)
|
||
{
|
||
assert(refined_weights0[i] == upsampled_weights0[i]);
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
assert(refined_weights1[i] == upsampled_weights1[i]);
|
||
}
|
||
}
|
||
#endif
|
||
|
||
if (refined_block_err != UINT64_MAX)
|
||
{
|
||
astc_helpers::log_astc_block alt_enc_log_block(enc_log_block);
|
||
memcpy(alt_enc_log_block.m_endpoints, refined_endpoints, astc_helpers::get_num_cem_values(cem_index));
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
// refined_block_err was computed on the actual ASTC [0,64] upsampled weights the decoder would use. But double check this for sanity.
|
||
{
|
||
uint64_t ref_err = eval_error(block_width, block_height, alt_enc_log_block, pixel_stats, params);
|
||
assert(ref_err == refined_block_err);
|
||
}
|
||
#endif
|
||
|
||
uint64_t cur_err = eval_error(block_width, block_height, enc_log_block, pixel_stats, params);
|
||
|
||
if (refined_block_err < cur_err)
|
||
{
|
||
memcpy(enc_log_block.m_endpoints, refined_endpoints, astc_helpers::get_num_cem_values(cem_index));
|
||
}
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
struct log_surrogate_astc_blk
|
||
{
|
||
int m_grid_width, m_grid_height;
|
||
|
||
uint32_t m_cem_index; // base+scale or direct variants only
|
||
int m_ccs_index; // -1 for single plane
|
||
|
||
uint32_t m_num_endpoint_levels;
|
||
uint32_t m_num_weight_levels;
|
||
|
||
uint32_t m_num_parts; // 1-3
|
||
uint32_t m_seed_index; // ASTC seed index, 10-bits if m_num_parts > 1
|
||
|
||
vec4F m_endpoints[astc_helpers::MAX_PARTITIONS][2]; // [subset_index][l/h endpoint]
|
||
float m_scales[astc_helpers::MAX_PARTITIONS]; // scale factor used for each subset
|
||
|
||
float m_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
float m_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
void clear()
|
||
{
|
||
memset((void *)this, 0, sizeof(*this));
|
||
}
|
||
|
||
void decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partition_pattern_vec* pPat) const;
|
||
void decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partitions_data* pPat_data) const;
|
||
};
|
||
|
||
void upsample_surrogate_weights(
|
||
const astc_helpers::weighted_sample* pWeighted_samples,
|
||
const float* pSrc_weights,
|
||
float* pDst_weights,
|
||
uint32_t by, uint32_t bx,
|
||
uint32_t wx, uint32_t wy,
|
||
uint32_t num_weight_levels)
|
||
{
|
||
const uint32_t total_src_weights = wx * wy;
|
||
const float weight_levels_minus_1 = (float)(num_weight_levels - 1) * (1.0f / 16.0f);
|
||
const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1);
|
||
|
||
const astc_helpers::weighted_sample* pS = pWeighted_samples;
|
||
|
||
for (uint32_t y = 0; y < by; y++)
|
||
{
|
||
for (uint32_t x = 0; x < bx; x++, ++pS)
|
||
{
|
||
const uint32_t w00 = pS->m_weights[0][0];
|
||
const uint32_t w01 = pS->m_weights[0][1];
|
||
const uint32_t w10 = pS->m_weights[1][0];
|
||
const uint32_t w11 = pS->m_weights[1][1];
|
||
|
||
assert(w00 || w01 || w10 || w11);
|
||
|
||
const uint32_t sx = pS->m_src_x, sy = pS->m_src_y;
|
||
|
||
float total = 0.0f;
|
||
|
||
if (w00) total += pSrc_weights[bounds_check(sx + sy * wx, 0U, total_src_weights)] * (float)w00;
|
||
if (w01) total += pSrc_weights[bounds_check(sx + 1 + sy * wx, 0U, total_src_weights)] * (float)w01;
|
||
if (w10) total += pSrc_weights[bounds_check(sx + (sy + 1) * wx, 0U, total_src_weights)] * (float)w10;
|
||
if (w11) total += pSrc_weights[bounds_check(sx + 1 + (sy + 1) * wx, 0U, total_src_weights)] * (float)w11;
|
||
|
||
float w = (float)fast_roundf_pos_int(total * weight_levels_minus_1) * inv_weight_levels;
|
||
|
||
pDst_weights[x + y * bx] = w;
|
||
} // x
|
||
} // y
|
||
}
|
||
|
||
// Reconstructs block_width*block_height float texels into pPixels.
// Each channel is lerp(low endpoint, high endpoint, weight), where the endpoints come from the texel's
// subset and the weight comes from plane 1 for channel m_ccs_index and from plane 0 otherwise.
// pPat supplies the per-texel subset index and is only dereferenced when m_num_parts > 1.
void log_surrogate_astc_blk::decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partition_pattern_vec* pPat) const
{
	const uint32_t num_block_texels = block_width * block_height;
	const uint32_t num_grid_texels = m_grid_width * m_grid_height;

	const bool has_second_plane = (m_ccs_index >= 0);

	// Only consumed by the consistency assert below.
	const bool is_small_block = num_block_texels < 31; // astc_helpers::is_small_block(block_width, block_height);
	BASISU_NOTE_UNUSED(is_small_block);

	float plane0_full_res[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], plane1_full_res[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];

	// By default the stored weights are used directly.
	const float* pPlane0 = m_weights0;
	const float* pPlane1 = m_weights1;

	if (num_grid_texels < num_block_texels)
	{
		// The grid is smaller than the block: expand the weights to one per texel first.
		// TODO: Precompute these in tables
		astc_helpers::weighted_sample up_weights[astc_helpers::MAX_BLOCK_DIM * astc_helpers::MAX_BLOCK_DIM];
		astc_helpers::compute_upsample_weights(block_width, block_height, m_grid_width, m_grid_height, up_weights);

		upsample_surrogate_weights(up_weights, m_weights0, plane0_full_res, block_width, block_height, m_grid_width, m_grid_height, m_num_weight_levels);
		pPlane0 = plane0_full_res;

		if (has_second_plane)
		{
			upsample_surrogate_weights(up_weights, m_weights1, plane1_full_res, block_width, block_height, m_grid_width, m_grid_height, m_num_weight_levels);
			pPlane1 = plane1_full_res;
		}
	}

	uint32_t texel_index = 0;

	for (uint32_t y = 0; y < block_height; y++)
	{
		for (uint32_t x = 0; x < block_width; x++, texel_index++)
		{
			uint32_t subset = 0;

			if (m_num_parts > 1)
			{
				subset = (*pPat)(x, y);
				assert(subset < m_num_parts);

				assert(subset == (uint32_t)astc_helpers::compute_texel_partition(m_seed_index, x, y, 0, m_num_parts, is_small_block));
			}

			const vec4F& lo = m_endpoints[subset][0];
			const vec4F& hi = m_endpoints[subset][1];

			vec4F& out = pPixels[texel_index];

			for (uint32_t c = 0; c < 4; c++)
			{
				const float* pPlane = ((int)c == m_ccs_index) ? pPlane1 : pPlane0;
				const float w = pPlane[texel_index];

				//out[c] = lerp(lo[c], hi[c], w);

				const float one_minus_w = 1.0f - w;
				out[c] = lo[c] * one_minus_w + hi[c] * w;
			} // c

		} // x
	} // y
}
|
||
|
||
// Convenience overload: resolves the partition pattern from pPat_data and forwards to the
// pattern-based decode(). For single-subset blocks no pattern is needed, so pPat_data is not touched.
void log_surrogate_astc_blk::decode(uint32_t block_width, uint32_t block_height, vec4F* pPixels, const astc_ldr::partitions_data* pPat_data) const
{
	const astc_ldr::partition_pattern_vec* pPat = nullptr;

	if (m_num_parts != 1)
	{
		// Map the ASTC seed to its slot in the table of unique patterns.
		const uint32_t unique_pat_index = pPat_data->m_part_seed_to_unique_index[m_seed_index];
		assert(unique_pat_index < pPat_data->m_total_unique_patterns);

		pPat = &pPat_data->m_partition_pats[unique_pat_index];
	}

	decode(block_width, block_height, pPixels, pPat);
}
|
||
|
||
void downsample_float_weight_grid(
|
||
const float* pMatrix_weights,
|
||
uint32_t bx, uint32_t by, // source/from dimension (block size)
|
||
uint32_t wx, uint32_t wy, // dest/to dimension (grid size)
|
||
const float* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
|
||
float* pDst_weights, // [wy][wx]
|
||
uint32_t num_weight_levels)
|
||
{
|
||
const uint32_t total_block_samples = bx * by;
|
||
const float weight_levels_minus_1 = (float)(num_weight_levels - 1);
|
||
const float inv_weight_levels = 1.0f / (float)(num_weight_levels - 1);
|
||
|
||
for (uint32_t y = 0; y < wy; y++)
|
||
{
|
||
for (uint32_t x = 0; x < wx; x++)
|
||
{
|
||
float total = 0.0f;
|
||
|
||
// TODO - optimize!
|
||
for (uint32_t i = 0; i < total_block_samples; i++)
|
||
if (pMatrix_weights[i])
|
||
total += pMatrix_weights[i] * (float)pSrc_weights[i];
|
||
|
||
pDst_weights[x + y * wx] = (float)fast_roundf_pos_int(total * weight_levels_minus_1) * inv_weight_levels;
|
||
|
||
pMatrix_weights += total_block_samples;
|
||
}
|
||
}
|
||
}
|
||
|
||
float decode_surrogate_and_compute_error(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
log_surrogate_astc_blk& log_block,
|
||
const astc_ldr::partition_pattern_vec* pPat,
|
||
const astc_ldr::cem_encode_params& params)
|
||
{
|
||
vec4F dec_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
log_block.decode(block_width, block_height, dec_pixels, pPat);
|
||
|
||
const float wr = (float)params.m_comp_weights[0];
|
||
const float wg = (float)params.m_comp_weights[1];
|
||
const float wb = (float)params.m_comp_weights[2];
|
||
const float wa = (float)params.m_comp_weights[3];
|
||
|
||
float total_err = 0.0f;
|
||
for (uint32_t by = 0; by < block_height; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < block_width; bx++)
|
||
{
|
||
const vec4F& s = pixel_stats.m_pixels_f[bx + by * block_width];
|
||
const vec4F& d = dec_pixels[bx + by * block_width];
|
||
|
||
float dr = s[0] - d[0];
|
||
float dg = s[1] - d[1];
|
||
float db = s[2] - d[2];
|
||
float da = s[3] - d[3];
|
||
|
||
total_err += (wr * dr * dr) + (wg * dg * dg) + (wb * db * db) + (wa * da * da);
|
||
} // bx
|
||
|
||
} // by
|
||
|
||
return total_err;
|
||
}
|
||
|
||
// Returns WSSE error
|
||
float encode_surrogate_trial(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
uint32_t cem_index,
|
||
int ccs_index,
|
||
uint32_t endpoint_ise_range, uint32_t weight_ise_range,
|
||
uint32_t grid_width, uint32_t grid_height,
|
||
log_surrogate_astc_blk& log_block,
|
||
const astc_ldr::cem_encode_params& params,
|
||
uint32_t flags)
|
||
{
|
||
const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);
|
||
const bool dual_plane_flag = (ccs_index >= 0);
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pBlock_grid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height);
|
||
|
||
const float* pDownsample_matrix = nullptr;
|
||
if (is_downsampling)
|
||
pDownsample_matrix = pBlock_grid_data->m_downsample_matrix.get_ptr();
|
||
|
||
//const uint32_t total_block_pixels = block_width * block_height;
|
||
//const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
log_block.m_cem_index = cem_index;
|
||
log_block.m_ccs_index = ccs_index;
|
||
log_block.m_grid_width = grid_width;
|
||
log_block.m_grid_height = grid_height;
|
||
log_block.m_num_parts = 1;
|
||
log_block.m_seed_index = 0;
|
||
clear_obj(log_block.m_scales);
|
||
log_block.m_num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range);
|
||
log_block.m_num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range);
|
||
|
||
float wsse_err = 0.0f;
|
||
|
||
if (is_downsampling)
|
||
{
|
||
float temp_weights0[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], temp_weights1[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
astc_ldr::cem_surrogate_encode_pixels(
|
||
cem_index, ccs_index,
|
||
pixel_stats, params,
|
||
endpoint_ise_range, weight_ise_range,
|
||
log_block.m_endpoints[0][0], log_block.m_endpoints[0][1], log_block.m_scales[0], temp_weights0, temp_weights1,
|
||
flags);
|
||
|
||
downsample_float_weight_grid(
|
||
pDownsample_matrix,
|
||
block_width, block_height,
|
||
grid_width, grid_height,
|
||
temp_weights0,
|
||
log_block.m_weights0,
|
||
log_block.m_num_weight_levels);
|
||
|
||
if (dual_plane_flag)
|
||
{
|
||
downsample_float_weight_grid(
|
||
pDownsample_matrix,
|
||
block_width, block_height,
|
||
grid_width, grid_height,
|
||
temp_weights1,
|
||
log_block.m_weights1,
|
||
log_block.m_num_weight_levels);
|
||
}
|
||
|
||
wsse_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, nullptr, params);
|
||
}
|
||
else
|
||
{
|
||
wsse_err = astc_ldr::cem_surrogate_encode_pixels(
|
||
cem_index, ccs_index,
|
||
pixel_stats, params,
|
||
endpoint_ise_range, weight_ise_range,
|
||
log_block.m_endpoints[0][0], log_block.m_endpoints[0][1], log_block.m_scales[0], log_block.m_weights0, log_block.m_weights1,
|
||
flags);
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
{
|
||
float alt_wsse_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, nullptr, params);
|
||
assert(fabs(wsse_err - alt_wsse_err) < .00125f);
|
||
}
|
||
#endif
|
||
}
|
||
|
||
return wsse_err;
|
||
}
|
||
|
||
float encode_surrogate_trial_subsets(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
uint32_t cem_index,
|
||
uint32_t num_subsets, uint32_t pat_seed_index, const astc_ldr::partition_pattern_vec* pPat,
|
||
uint32_t endpoint_ise_range, uint32_t weight_ise_range,
|
||
uint32_t grid_width, uint32_t grid_height,
|
||
log_surrogate_astc_blk& log_block,
|
||
const astc_ldr::cem_encode_params& params,
|
||
uint32_t flags)
|
||
{
|
||
assert((num_subsets >= 2) && (num_subsets <= astc_helpers::MAX_PARTITIONS));
|
||
|
||
const bool is_downsampling = (grid_width < block_width) || (grid_height < block_height);
|
||
//const uint32_t total_block_pixels = block_width * block_height;
|
||
//const uint32_t total_grid_pixels = grid_width * grid_height;
|
||
|
||
const uint32_t num_weight_levels = astc_helpers::get_ise_levels(weight_ise_range);
|
||
const uint32_t num_endpoint_levels = astc_helpers::get_ise_levels(endpoint_ise_range);
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pBlock_grid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, grid_width, grid_height);
|
||
|
||
const float* pDownsample_matrix = nullptr;
|
||
if (is_downsampling)
|
||
pDownsample_matrix = pBlock_grid_data->m_downsample_matrix.get_ptr();
|
||
|
||
color_rgba part_pixels[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint32_t num_part_pixels[astc_helpers::MAX_PARTITIONS] = { 0 };
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const color_rgba& px = pixel_stats.m_pixels[x + y * block_width];
|
||
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_subsets);
|
||
|
||
part_pixels[part_index][num_part_pixels[part_index]] = px;
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
for (uint32_t i = 0; i < num_subsets; i++)
|
||
assert(num_part_pixels[i] > 0);
|
||
#endif
|
||
|
||
astc_ldr::pixel_stats_t part_pixel_stats[astc_helpers::MAX_PARTITIONS];
|
||
|
||
for (uint32_t i = 0; i < num_subsets; i++)
|
||
part_pixel_stats[i].clear();
|
||
|
||
float part_weights[astc_helpers::MAX_PARTITIONS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
float temp_block_weights[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
double total_subset_err = 0.0f;
|
||
for (uint32_t part_index = 0; part_index < num_subsets; part_index++)
|
||
{
|
||
part_pixel_stats[part_index].init(num_part_pixels[part_index], &part_pixels[part_index][0]);
|
||
|
||
float subset_err = astc_ldr::cem_surrogate_encode_pixels(
|
||
cem_index, -1,
|
||
part_pixel_stats[part_index], params,
|
||
endpoint_ise_range, weight_ise_range,
|
||
log_block.m_endpoints[part_index][0], log_block.m_endpoints[part_index][1],
|
||
log_block.m_scales[part_index], part_weights[part_index], temp_block_weights,
|
||
flags);
|
||
|
||
total_subset_err += subset_err;
|
||
|
||
} // part_index
|
||
|
||
float* pDst_weights = is_downsampling ? temp_block_weights : log_block.m_weights0;
|
||
|
||
clear_obj(num_part_pixels);
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const uint32_t part_index = (*pPat)(x, y);
|
||
assert(part_index < num_subsets);
|
||
|
||
pDst_weights[x + y * block_width] = part_weights[part_index][num_part_pixels[part_index]];
|
||
num_part_pixels[part_index]++;
|
||
} // x
|
||
} // y
|
||
|
||
log_block.m_cem_index = cem_index;
|
||
log_block.m_ccs_index = -1;
|
||
log_block.m_num_endpoint_levels = num_endpoint_levels;
|
||
log_block.m_num_weight_levels = num_weight_levels;
|
||
log_block.m_grid_width = grid_width;
|
||
log_block.m_grid_height = grid_height;
|
||
log_block.m_num_parts = num_subsets;
|
||
log_block.m_seed_index = pat_seed_index;
|
||
|
||
if (is_downsampling)
|
||
{
|
||
downsample_float_weight_grid(
|
||
pDownsample_matrix,
|
||
block_width, block_height,
|
||
grid_width, grid_height,
|
||
temp_block_weights,
|
||
log_block.m_weights0,
|
||
astc_helpers::get_ise_levels(weight_ise_range));
|
||
|
||
total_subset_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, pPat, params);
|
||
}
|
||
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
if (!is_downsampling)
|
||
{
|
||
float alt_subset_err = decode_surrogate_and_compute_error(block_width, block_height, pixel_stats, log_block, pPat, params);
|
||
|
||
assert(fabs(total_subset_err - alt_subset_err) < .00125f);
|
||
}
|
||
#endif
|
||
|
||
return (float)total_subset_err;
|
||
}
|
||
|
||
#if 0
// Disabled (compiled out). Scales axis by bu_math::inv_sqrt(axis.norm()); when |norm| is below
// SMALL_FLOAT_VAL it falls back to a vector with all components .5.
static inline vec4F vec4F_norm_approx(vec4F axis)
{
	const float l = axis.norm();

	if (fabs(l) >= SMALL_FLOAT_VAL)
		axis = axis * bu_math::inv_sqrt(l);
	else
		axis = vec4F(.5f);

	return axis;
}
#endif
|
||
|
||
static bool estimate_partition2(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixels,
|
||
int* pBest_parts, uint32_t num_best_parts, // unique indices, not ASTC seeds
|
||
const astc_ldr::partitions_data* pPart_data, bool brute_force_flag)
|
||
{
|
||
assert(num_best_parts && (num_best_parts <= pPart_data->m_total_unique_patterns));
|
||
|
||
const uint32_t num_block_pixels = block_width * block_height;
|
||
|
||
if (brute_force_flag)
|
||
{
|
||
int desired_parts[astc_ldr::ASTC_LDR_MAX_BLOCK_HEIGHT][astc_ldr::ASTC_LDR_MAX_BLOCK_WIDTH]; // [y][x]
|
||
|
||
for (uint32_t i = 0; i < num_block_pixels; i++)
|
||
{
|
||
float proj = (pixels.m_pixels_f[i] - pixels.m_mean_f).dot(pixels.m_mean_rel_axis4);
|
||
|
||
desired_parts[i / block_width][i % block_width] = proj < 0.0f;
|
||
}
|
||
|
||
uint32_t part_similarity[astc_helpers::NUM_PARTITION_PATTERNS];
|
||
|
||
for (uint32_t part_index = 0; part_index < pPart_data->m_total_unique_patterns; part_index++)
|
||
{
|
||
const astc_ldr::partition_pattern_vec& pat_vec = pPart_data->m_partition_pats[part_index];
|
||
|
||
int total_sim_non_inv = 0;
|
||
int total_sim_inv = 0;
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
int part = pat_vec[x + y * block_width];
|
||
|
||
if (part == desired_parts[y][x])
|
||
total_sim_non_inv++;
|
||
|
||
if ((part ^ 1) == desired_parts[y][x])
|
||
total_sim_inv++;
|
||
}
|
||
}
|
||
|
||
int total_sim = maximum(total_sim_non_inv, total_sim_inv);
|
||
|
||
part_similarity[part_index] = (total_sim << 16) | part_index;
|
||
|
||
} // part_index;
|
||
|
||
std::sort(part_similarity, part_similarity + pPart_data->m_total_unique_patterns);
|
||
|
||
for (uint32_t i = 0; i < num_best_parts; i++)
|
||
pBest_parts[i] = part_similarity[(pPart_data->m_total_unique_patterns - 1) - i] & 0xFFFF;
|
||
}
|
||
else
|
||
{
|
||
astc_ldr::partition_pattern_vec desired_part(block_width, block_height);
|
||
|
||
for (uint32_t i = 0; i < num_block_pixels; i++)
|
||
{
|
||
float proj = (pixels.m_pixels_f[i] - pixels.m_mean_f).dot(pixels.m_mean_rel_axis4);
|
||
|
||
desired_part.m_parts[i] = proj < 0.0f;
|
||
}
|
||
|
||
astc_ldr::vp_tree::result_queue results;
|
||
results.reserve(num_best_parts);
|
||
|
||
pPart_data->m_part_vp_tree.find_nearest(2, desired_part, results, num_best_parts);
|
||
|
||
assert(results.get_size() == num_best_parts);
|
||
|
||
const auto& elements = results.get_elements();
|
||
|
||
for (uint32_t i = 0; i < results.get_size(); i++)
|
||
pBest_parts[i] = elements[1 + i].m_pat_index;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
static bool estimate_partition3(
|
||
uint32_t block_width, uint32_t block_height,
|
||
const astc_ldr::pixel_stats_t& pixels,
|
||
int* pBest_parts, uint32_t num_best_parts,
|
||
const astc_ldr::partitions_data* pPart_data, bool brute_force_flag)
|
||
{
|
||
assert(num_best_parts && (num_best_parts <= pPart_data->m_total_unique_patterns));
|
||
|
||
vec4F training_vecs[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS], mean(0.0f);
|
||
|
||
const uint32_t num_block_pixels = block_width * block_height, NUM_SUBSETS = 3;
|
||
|
||
float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;
|
||
vec4F cluster_centroids[NUM_SUBSETS];
|
||
clear_obj(cluster_centroids);
|
||
|
||
for (uint32_t i = 0; i < num_block_pixels; i++)
|
||
{
|
||
vec4F& v = training_vecs[i];
|
||
|
||
v = pixels.m_pixels_f[i];
|
||
|
||
float inten = v.dot(vec4F(1.0f));
|
||
if (inten < darkest_inten)
|
||
{
|
||
darkest_inten = inten;
|
||
cluster_centroids[0] = v;
|
||
}
|
||
|
||
if (inten > brightest_inten)
|
||
{
|
||
brightest_inten = inten;
|
||
cluster_centroids[1] = v;
|
||
}
|
||
}
|
||
|
||
if (cluster_centroids[0] == cluster_centroids[1])
|
||
return false;
|
||
|
||
float furthest_dist2 = 0.0f;
|
||
for (uint32_t i = 0; i < num_block_pixels; i++)
|
||
{
|
||
vec4F& v = training_vecs[i];
|
||
|
||
float dist_a = v.squared_distance(cluster_centroids[0]);
|
||
if (dist_a == 0.0f)
|
||
continue;
|
||
|
||
float dist_b = v.squared_distance(cluster_centroids[1]);
|
||
if (dist_b == 0.0f)
|
||
continue;
|
||
|
||
float dist2 = dist_a + dist_b;
|
||
if (dist2 > furthest_dist2)
|
||
{
|
||
furthest_dist2 = dist2;
|
||
cluster_centroids[2] = v;
|
||
}
|
||
}
|
||
|
||
if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
|
||
return false;
|
||
|
||
uint32_t cluster_pixels[NUM_SUBSETS][astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
uint32_t num_cluster_pixels[NUM_SUBSETS];
|
||
vec4F new_cluster_means[NUM_SUBSETS];
|
||
|
||
const uint32_t NUM_ITERS = 4;
|
||
|
||
for (uint32_t s = 0; s < NUM_ITERS; s++)
|
||
{
|
||
memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
|
||
memset((void *)new_cluster_means, 0, sizeof(new_cluster_means));
|
||
|
||
for (uint32_t i = 0; i < num_block_pixels; i++)
|
||
{
|
||
float d[NUM_SUBSETS] = {
|
||
training_vecs[i].squared_distance(cluster_centroids[0]),
|
||
training_vecs[i].squared_distance(cluster_centroids[1]),
|
||
training_vecs[i].squared_distance(cluster_centroids[2]) };
|
||
|
||
float min_d = d[0];
|
||
uint32_t min_idx = 0;
|
||
for (uint32_t j = 1; j < NUM_SUBSETS; j++)
|
||
{
|
||
if (d[j] < min_d)
|
||
{
|
||
min_d = d[j];
|
||
min_idx = j;
|
||
}
|
||
}
|
||
|
||
cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
|
||
new_cluster_means[min_idx] += training_vecs[i];
|
||
num_cluster_pixels[min_idx]++;
|
||
} // i
|
||
|
||
// Can skip updating the centroids on the last iteration - all we care about is the final partitioning.
|
||
if (s == (NUM_ITERS - 1))
|
||
{
|
||
for (uint32_t j = 0; j < NUM_SUBSETS; j++)
|
||
{
|
||
if (!num_cluster_pixels[j])
|
||
return false;
|
||
}
|
||
}
|
||
else
|
||
{
|
||
for (uint32_t j = 0; j < NUM_SUBSETS; j++)
|
||
{
|
||
if (!num_cluster_pixels[j])
|
||
return false;
|
||
|
||
cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
|
||
} // j
|
||
}
|
||
|
||
} // s
|
||
|
||
astc_ldr::partition_pattern_vec desired_part(block_width, block_height);
|
||
|
||
for (uint32_t p = 0; p < NUM_SUBSETS; p++)
|
||
{
|
||
for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
|
||
{
|
||
const uint32_t pix_index = cluster_pixels[p][i];
|
||
desired_part[pix_index] = (uint8_t)p;
|
||
} // i
|
||
} // p
|
||
|
||
if (brute_force_flag)
|
||
{
|
||
astc_ldr::partition_pattern_vec desired_parts[astc_ldr::NUM_PART3_MAPPINGS];
|
||
for (uint32_t j = 0; j < astc_ldr::NUM_PART3_MAPPINGS; j++)
|
||
desired_parts[j] = desired_part.get_permuted3(j);
|
||
|
||
uint32_t part_similarity[astc_helpers::NUM_PARTITION_PATTERNS];
|
||
|
||
for (uint32_t part_index = 0; part_index < pPart_data->m_total_unique_patterns; part_index++)
|
||
{
|
||
const astc_ldr::partition_pattern_vec& pat = pPart_data->m_partition_pats[part_index];
|
||
|
||
uint32_t lowest_pat_dist = UINT32_MAX;
|
||
for (uint32_t p = 0; p < astc_ldr::NUM_PART3_MAPPINGS; p++)
|
||
{
|
||
uint32_t dist = pat.get_squared_distance(desired_parts[p]);
|
||
if (dist < lowest_pat_dist)
|
||
lowest_pat_dist = dist;
|
||
}
|
||
|
||
part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;
|
||
|
||
} // part_index;
|
||
|
||
std::sort(part_similarity, part_similarity + pPart_data->m_total_unique_patterns);
|
||
|
||
for (uint32_t i = 0; i < num_best_parts; i++)
|
||
pBest_parts[i] = part_similarity[i] & 0xFFFF;
|
||
}
|
||
else
|
||
{
|
||
astc_ldr::vp_tree::result_queue results;
|
||
results.reserve(num_best_parts);
|
||
|
||
pPart_data->m_part_vp_tree.find_nearest(3, desired_part, results, num_best_parts);
|
||
|
||
assert(results.get_size() == num_best_parts);
|
||
|
||
const auto& elements = results.get_elements();
|
||
|
||
for (uint32_t i = 0; i < results.get_size(); i++)
|
||
pBest_parts[i] = elements[1 + i].m_pat_index;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
//---------------------------------------------------------------------
|
||
|
||
// 3x3 Sobel kernel responding to horizontal change (left column negative, right column positive).
static const float g_sobel_x[3][3] = // [y][x]
{
	{ -1.0f, 0.0f, 1.0f },
	{ -2.0f, 0.0f, 2.0f },
	{ -1.0f, 0.0f, 1.0f }
};
|
||
|
||
// 3x3 Sobel kernel responding to vertical change (top row negative, bottom row positive).
static const float g_sobel_y[3][3] = // [y][x]
{
	{ -1.0f, -2.0f, -1.0f },
	{  0.0f,  0.0f,  0.0f },
	{  1.0f,  2.0f,  1.0f }
};
|
||
|
||
void compute_sobel(const image& orig, image& dest, const float* pMatrix_3x3)
|
||
{
|
||
const uint32_t width = orig.get_width();
|
||
const uint32_t height = orig.get_height();
|
||
|
||
dest.resize(width, height);
|
||
|
||
for (int y = 0; y < (int)height; y++)
|
||
{
|
||
for (int x = 0; x < (int)width; x++)
|
||
{
|
||
vec4F d(128.0f);
|
||
|
||
for (int my = -1; my <= 1; my++)
|
||
{
|
||
for (int mx = -1; mx <= 1; mx++)
|
||
{
|
||
float w = pMatrix_3x3[(my + 1) * 3 + (mx + 1)];
|
||
if (w == 0.0f)
|
||
continue;
|
||
|
||
const color_rgba& s = orig.get_clamped(x + mx, y + my);
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
d[c] += w * (float)s[c];
|
||
|
||
} // mx
|
||
|
||
} // my
|
||
|
||
dest(x, y).set(fast_roundf_int(d[0]), fast_roundf_int(d[1]), fast_roundf_int(d[2]), fast_roundf_int(d[3]));
|
||
|
||
} // x
|
||
} // y
|
||
}
|
||
|
||
// Converts block_width*block_height DCT coefficients to energies in place:
// element 0 (the DC term) is forced to zero, every other element is squared.
void compute_energy_from_dct(uint32_t block_width, uint32_t block_height, float* pDCT)
{
	const uint32_t num_coeffs = block_width * block_height;

	pDCT[0] = 0.0f;

	for (uint32_t i = 1; i < num_coeffs; i++)
	{
		const float coeff = pDCT[i];
		pDCT[i] = coeff * coeff;
	}
}
|
||
|
||
// Results scaled by # block texels (block-SSE in weight space)
// Sums the energy of the coefficients with x < grid_w and y < grid_h, i.e. the top-left
// grid_w x grid_h corner of the block_width x block_height energy array (row-major).
float compute_preserved_dct_energy(uint32_t block_width, uint32_t block_height, const float* pEnergy, uint32_t grid_w, uint32_t grid_h)
{
	// Clamp the kept region to the block so the loops only visit contributing entries.
	const uint32_t kept_w = (grid_w < block_width) ? grid_w : block_width;
	const uint32_t kept_h = (grid_h < block_height) ? grid_h : block_height;

	float kept_energy = 0.0f;

	for (uint32_t y = 0; y < kept_h; y++)
	{
		const float* pRow = pEnergy + y * block_width;

		for (uint32_t x = 0; x < kept_w; x++)
			kept_energy += pRow[x];
	}

	return kept_energy;
}
|
||
|
||
// Results scaled by # block texels (block-SSE in weight space)
// Sums the energy of every coefficient outside the top-left grid_w x grid_h corner of the
// block_width x block_height energy array (row-major), i.e. where x >= grid_w or y >= grid_h.
inline float compute_lost_dct_energy(uint32_t block_width, uint32_t block_height, const float* pEnergy, uint32_t grid_w, uint32_t grid_h)
{
	const uint32_t kept_w = (grid_w < block_width) ? grid_w : block_width;

	float lost_energy = 0.0f;

	for (uint32_t y = 0; y < block_height; y++)
	{
		const float* pRow = pEnergy + y * block_width;

		// Rows above grid_h lose only the columns past the kept width; rows below lose everything.
		const uint32_t first_lost_x = (y < grid_h) ? kept_w : 0;

		for (uint32_t x = first_lost_x; x < block_width; x++)
			lost_energy += pRow[x];
	}

	return lost_energy;
}
|
||
|
||
struct ldr_astc_lowlevel_block_encoder_params
|
||
{
|
||
ldr_astc_lowlevel_block_encoder_params()
|
||
{
|
||
clear();
|
||
}
|
||
|
||
void clear()
|
||
{
|
||
clear_obj(*this);
|
||
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
m_dp_active_chans[i] = true;
|
||
|
||
m_subsets_edge_filtering = true;
|
||
|
||
m_use_superbuckets = true;
|
||
m_bucket_pruning_passes = true;
|
||
m_use_dual_planes = true;
|
||
|
||
m_superbucket_max_to_retain[0] = 4;
|
||
m_superbucket_max_to_retain[1] = 8;
|
||
m_superbucket_max_to_retain[2] = 16;
|
||
|
||
m_shortlist_buckets_to_examine_fract = 1.0f; // after high-level bucket surrogate encoding and pruning stages, 1.0=effectively disabled
|
||
m_shortlist_buckets_to_examine_min = 1;
|
||
m_shortlist_buckets_to_examine_max = 1024;
|
||
|
||
// TODO: Expose these at a higher level. Add alpha specific?
|
||
m_num_similar_modes_in_bucket_to_shortlist_fract = .33f;
|
||
m_num_similar_modes_in_bucket_to_shortlist_fract_min = 2;
|
||
m_num_similar_modes_in_bucket_to_shortlist_fract_max = 4096;
|
||
|
||
m_final_shortlist_fraction[0] = .2f;
|
||
m_final_shortlist_fraction[1] = .3f;
|
||
m_final_shortlist_fraction[2] = .5f;
|
||
m_final_shortlist_min_size[0] = 1;
|
||
m_final_shortlist_min_size[1] = 1;
|
||
m_final_shortlist_min_size[2] = 1;
|
||
m_final_shortlist_max_size[0] = 4096;
|
||
m_final_shortlist_max_size[1] = 4096;
|
||
m_final_shortlist_max_size[2] = 4096;
|
||
|
||
m_gradient_descent_flag = true;
|
||
m_polish_weights_flag = true;
|
||
m_qcd_enabled_flag = true;
|
||
|
||
m_final_encode_try_base_ofs = true;
|
||
m_final_encode_always_try_rgb_direct = false; // if true, even if base_ofs succeeds, we try RGB/RGBA direct too
|
||
|
||
m_use_parts_std_dev_thresh = (8.0f / 255.0f);
|
||
m_use_parts_std_dev_thresh2 = (40.0f / 255.0f);
|
||
m_sobel_energy_thresh1 = 3200.0f;
|
||
m_sobel_energy_thresh2 = 30000.0f;
|
||
m_sobel_energy_thresh3 = 50000.0f;
|
||
|
||
m_part2_fraction_to_keep = 2;
|
||
m_part3_fraction_to_keep = 2;
|
||
m_base_parts2 = 32;
|
||
m_base_parts3 = 32;
|
||
|
||
// TODO: Prehaps expose this at a higher level.
|
||
m_use_blue_contraction = true;
|
||
}
|
||
|
||
uint32_t m_bx, m_by, m_block_width, m_block_height, m_total_block_pixels;
|
||
|
||
const image* m_pOrig_img_sobel_xy_t;
|
||
|
||
const astc_ldr::partitions_data* m_pPart_data_p2;
|
||
const astc_ldr::partitions_data* m_pPart_data_p3;
|
||
|
||
const astc_ldr::cem_encode_params* m_pEnc_params;
|
||
|
||
// RGB or alpha trial lists (shouldn't have both in same lists)
|
||
uint32_t m_num_trial_modes;
|
||
const basist::astc_ldr_t::trial_mode* m_pTrial_modes;
|
||
|
||
const basist::astc_ldr_t::grouped_trial_modes* m_pGrouped_trial_modes;
|
||
|
||
uint32_t m_superbucket_max_to_retain[3]; // [block_complexity_index]
|
||
|
||
float m_shortlist_buckets_to_examine_fract;
|
||
uint32_t m_shortlist_buckets_to_examine_min;
|
||
uint32_t m_shortlist_buckets_to_examine_max;
|
||
|
||
float m_num_similar_modes_in_bucket_to_shortlist_fract;
|
||
uint32_t m_num_similar_modes_in_bucket_to_shortlist_fract_min;
|
||
uint32_t m_num_similar_modes_in_bucket_to_shortlist_fract_max;
|
||
|
||
float m_final_shortlist_fraction[3];
|
||
uint32_t m_final_shortlist_min_size[3];
|
||
uint32_t m_final_shortlist_max_size[3];
|
||
|
||
bool m_use_superbuckets;
|
||
bool m_bucket_pruning_passes;
|
||
|
||
// true if this is a trial mode list containing alpha
|
||
bool m_alpha_cems;
|
||
|
||
bool m_use_alpha_or_opaque_modes; // true for only alpha cems, false of only opaque cems;
|
||
bool m_use_lum_direct_modes;
|
||
bool m_use_base_scale_modes;
|
||
bool m_use_direct_modes;
|
||
bool m_use_dual_planes;
|
||
|
||
bool m_grid_hv_filtering;
|
||
bool m_filter_horizontally_flag; // = h_energy_lost < v_energy_lost, if true it's visually better to resample the block on the X axis vs. Y
|
||
bool m_use_small_grids_only;
|
||
|
||
bool m_dp_active_chans[4];
|
||
|
||
bool m_subsets_enabled;
|
||
bool m_subsets_edge_filtering;
|
||
|
||
// TODO: Make polishing controllable per superpass.
|
||
bool m_gradient_descent_flag;
|
||
bool m_polish_weights_flag;
|
||
bool m_qcd_enabled_flag;
|
||
|
||
bool m_final_encode_try_base_ofs;
|
||
bool m_final_encode_always_try_rgb_direct;
|
||
|
||
bool m_brute_force_est_parts;
|
||
bool m_disable_part_est_stage2; // only use single stage partition estimation
|
||
|
||
bool m_use_blue_contraction; // currently global enable/disable
|
||
|
||
float m_use_parts_std_dev_thresh;
|
||
float m_use_parts_std_dev_thresh2;
|
||
float m_sobel_energy_thresh1;
|
||
float m_sobel_energy_thresh2;
|
||
float m_sobel_energy_thresh3;
|
||
|
||
uint32_t m_part2_fraction_to_keep;
|
||
uint32_t m_part3_fraction_to_keep;
|
||
uint32_t m_base_parts2;
|
||
uint32_t m_base_parts3;
|
||
|
||
float m_early_stop_wpsnr;
|
||
float m_early_stop2_wpsnr;
|
||
|
||
basist::astc_ldr_t::dct2f* m_pDCT2F; // at block size
|
||
};
|
||
|
||
struct trial_surrogate
|
||
{
|
||
uint32_t m_trial_mode_index;
|
||
float m_err;
|
||
|
||
log_surrogate_astc_blk m_log_blk;
|
||
|
||
void clear()
|
||
{
|
||
m_trial_mode_index = 0;
|
||
m_err = 0;
|
||
m_log_blk.clear();
|
||
}
|
||
|
||
bool operator < (const trial_surrogate& rhs) const
|
||
{
|
||
return m_err < rhs.m_err;
|
||
}
|
||
};
|
||
|
||
struct encode_block_output
|
||
{
|
||
int16_t m_trial_mode_index; // -1 = solid, no trial mode
|
||
uint16_t m_blur_id; // blur index
|
||
|
||
astc_helpers::log_astc_block m_log_blk;
|
||
|
||
// Packed per-plane DCT data
|
||
basist::astc_ldr_t::dct_syms m_packed_dct_plane_data[2];
|
||
|
||
uint64_t m_sse;
|
||
|
||
void clear()
|
||
{
|
||
m_trial_mode_index = -1;
|
||
m_blur_id = 0;
|
||
m_log_blk.clear();
|
||
m_sse = 0;
|
||
}
|
||
};
|
||
|
||
struct encode_block_stats
|
||
{
|
||
uint32_t m_total_superbuckets_created;
|
||
uint32_t m_total_buckets_created;
|
||
uint32_t m_total_surrogate_encodes;
|
||
uint32_t m_total_shortlist_candidates;
|
||
uint32_t m_total_full_encodes;
|
||
|
||
encode_block_stats() { clear(); }
|
||
|
||
void clear()
|
||
{
|
||
clear_obj(*this);
|
||
}
|
||
};
|
||
|
||
// Pair of per-channel MSE estimates: m_ep for endpoint quantization, m_wp for weight quantization.
struct chan_mse_est
{
	float m_ep;
	float m_wp;

	// Leaves both members uninitialized.
	chan_mse_est()
	{
	}

	chan_mse_est(float ep, float wp)
	{
		m_ep = ep;
		m_wp = wp;
	}
};
|
||
|
||
struct weight_terms
|
||
{
|
||
float m_mean;
|
||
float m_var;
|
||
float m_endpoint_factor;
|
||
float m_weight_spread_scale;
|
||
|
||
void calc(uint32_t n, const float* pWeights)
|
||
{
|
||
assert(n);
|
||
|
||
float weight_total = 0.0f;
|
||
for (uint32_t i = 0; i < n; i++)
|
||
{
|
||
assert(is_in_range(pWeights[i], 0.0f, 1.0f));
|
||
weight_total += pWeights[i];
|
||
}
|
||
m_mean = weight_total / (float)n;
|
||
|
||
float weight_var = 0.0f;
|
||
for (uint32_t i = 0; i < n; i++)
|
||
weight_var += squaref(pWeights[i] - m_mean);
|
||
m_var = weight_var / (float)n;
|
||
|
||
// drops below 2/3 on smooth blocks and tends to 2/3 when weights are well spread
|
||
m_endpoint_factor = (1.0f + 2.0f * m_var + 2.0f * m_mean * m_mean - 2.0f * m_mean) / (2.0f / 3.0f);
|
||
m_endpoint_factor = clamp<float>(m_endpoint_factor, .25f, 1.50f);
|
||
|
||
const float UNIFORM_VAR = 1.0f / 12.0f;
|
||
float s = m_var / UNIFORM_VAR;
|
||
|
||
// shrinks the weight term on smooth blocks and is ~1 when weights are spread.
|
||
m_weight_spread_scale = saturate(s);
|
||
}
|
||
};
|
||
|
||
// weight_gamma is block size/grid size specific factor (0,1] (the amount of MSE quant error remaining taking into account bilinear smoothing)
|
||
inline chan_mse_est compute_quantized_channel_mse_estimates(uint32_t num_endpoint_levels, uint32_t num_weight_levels, float span_size, float weight_gamma, const weight_terms* pWeight_terms = nullptr)
|
||
{
|
||
assert(num_endpoint_levels >= 2);
|
||
assert(num_weight_levels >= 2);
|
||
|
||
const float Dep = 1.0f / (float)(num_endpoint_levels - 1); // endpoint quant step
|
||
const float Dw = 1.0f / (float)(num_weight_levels - 1); // weight quant step
|
||
|
||
// Endpoint quant MSE estimate is not span dependent
|
||
float ep_lower = (Dep * Dep) / 12.0f * (2.0f / 3.0f);
|
||
|
||
// Weight quant MSE estimate is span dependent
|
||
float wq_lower = (Dw * Dw) / 12.0f * weight_gamma * (span_size * span_size);
|
||
|
||
if (pWeight_terms)
|
||
{
|
||
ep_lower *= pWeight_terms->m_endpoint_factor;
|
||
wq_lower *= pWeight_terms->m_weight_spread_scale;
|
||
}
|
||
|
||
return chan_mse_est(ep_lower, wq_lower);
|
||
}
|
||
|
||
inline float compute_quantized_channel_endpoint_mse_estimate(uint32_t num_endpoint_levels, const weight_terms* pWeight_terms = nullptr)
|
||
{
|
||
assert(num_endpoint_levels >= 2);
|
||
|
||
const float Dep = 1.0f / (float)(num_endpoint_levels - 1); // endpoint quant step
|
||
|
||
// Endpoint quant MSE estimate is not span dependent
|
||
float ep_lower = (Dep * Dep) / 12.0f * (2.0f / 3.0f);
|
||
|
||
if (pWeight_terms)
|
||
ep_lower *= pWeight_terms->m_endpoint_factor;
|
||
|
||
return ep_lower;
|
||
}
|
||
|
||
inline float compute_quantized_channel_weight_mse_estimate(uint32_t num_weight_levels, float span_size, float weight_gamma, const weight_terms* pWeight_terms = nullptr)
|
||
{
|
||
assert(num_weight_levels >= 2);
|
||
|
||
const float Dw = 1.0f / (float)(num_weight_levels - 1); // weight quant step
|
||
|
||
// Weight quant MSE estimate is span dependent
|
||
float wq_lower = (Dw * Dw) / 12.0f * weight_gamma * (span_size * span_size);
|
||
|
||
if (pWeight_terms)
|
||
wq_lower *= pWeight_terms->m_weight_spread_scale;
|
||
|
||
return wq_lower;
|
||
}
|
||
|
||
// NOTE(review): not referenced in this part of the file. Going by the name, presumably a (<1) discount applied to
// an error estimate when blue contraction / base+offset encodings are usable - confirm at the use site.
const float BLUE_CONTRACTION_BASE_OFS_DISCOUNT = .9f;
|
||
// NOTE(review): not referenced in this part of the file. Going by the name, presumably buckets whose error is
// worse than the best found by this factor are skipped - confirm at the use site.
const float SKIP_IF_BUCKET_WORSE_MULTIPLIER = 5.0f;
|
||
|
||
struct shortlist_bucket
|
||
{
|
||
bool m_examined_flag;
|
||
int8_t m_grid_width, m_grid_height;
|
||
int8_t m_ccs_index;
|
||
|
||
uint8_t m_cem_index;
|
||
uint8_t m_num_parts;
|
||
uint16_t m_unique_seed_index;
|
||
|
||
log_surrogate_astc_blk m_surrogate_log_blk;
|
||
float m_sse;
|
||
|
||
shortlist_bucket()
|
||
{
|
||
}
|
||
|
||
shortlist_bucket(int grid_width, int grid_height, uint32_t cem_index, int ccs_index, uint32_t num_parts, uint32_t unique_seed_index) :
|
||
m_grid_width((int8_t)grid_width), m_grid_height((int8_t)grid_height),
|
||
m_ccs_index((int8_t)ccs_index),
|
||
m_cem_index((uint8_t)cem_index),
|
||
m_num_parts((uint8_t)num_parts),
|
||
m_unique_seed_index((uint16_t)unique_seed_index)
|
||
{
|
||
m_surrogate_log_blk.clear();
|
||
m_sse = 0.0f;
|
||
m_examined_flag = false;
|
||
}
|
||
|
||
operator size_t() const
|
||
{
|
||
#define ADD_HASH(H) h ^= basist::hash_hsieh((uint8_t*)&(H), sizeof(H));
|
||
size_t h = 0;
|
||
ADD_HASH(m_grid_width);
|
||
ADD_HASH(m_grid_height);
|
||
ADD_HASH(m_ccs_index);
|
||
ADD_HASH(m_cem_index);
|
||
ADD_HASH(m_num_parts);
|
||
ADD_HASH(m_unique_seed_index);
|
||
#undef ADD_HASH
|
||
return h;
|
||
}
|
||
|
||
// equality for hashing
|
||
bool operator== (const shortlist_bucket& rhs) const
|
||
{
|
||
return (m_grid_width == rhs.m_grid_width) && (m_grid_height == rhs.m_grid_height) && (m_cem_index == rhs.m_cem_index) && (m_ccs_index == rhs.m_ccs_index) &&
|
||
(m_num_parts == rhs.m_num_parts) && (m_unique_seed_index == rhs.m_unique_seed_index);
|
||
}
|
||
};
|
||
|
||
typedef static_vector<uint16_t, 16> trial_mode_index_vec;
|
||
typedef basisu::hash_map<shortlist_bucket, trial_mode_index_vec > shortlist_bucket_hash_t;
|
||
|
||
#pragma pack(push, 1)
|
||
struct trial_mode_estimate_superbucket_key
|
||
{
|
||
// All member vars from beginning to m_last will be hashed. Be careful of alignment.
|
||
uint8_t m_cem_index;
|
||
int8_t m_ccs_index;
|
||
uint16_t m_subset_unique_index;
|
||
|
||
uint8_t m_num_subsets;
|
||
uint8_t m_last;
|
||
uint8_t m_unused[2];
|
||
|
||
trial_mode_estimate_superbucket_key()
|
||
{
|
||
static_assert((sizeof(*this) % 4) == 0, "struct size must be divisible by 4");
|
||
}
|
||
|
||
void clear()
|
||
{
|
||
clear_obj(*this);
|
||
}
|
||
|
||
operator size_t() const
|
||
{
|
||
return basist::hash_hsieh((const uint8_t*)this, BASISU_OFFSETOF(trial_mode_estimate_superbucket_key, m_last));
|
||
}
|
||
|
||
bool operator== (const trial_mode_estimate_superbucket_key& rhs) const
|
||
{
|
||
#define COMP(e) if (e != rhs.e) return false;
|
||
COMP(m_cem_index);
|
||
COMP(m_ccs_index);
|
||
COMP(m_subset_unique_index);
|
||
COMP(m_num_subsets);
|
||
#undef COMP
|
||
return true;
|
||
}
|
||
};
|
||
#pragma pack(pop)
|
||
|
||
struct trial_mode_estimate_superbucket_value
|
||
{
|
||
basisu::vector<uint32_t> m_trial_mode_list;
|
||
};
|
||
|
||
// Superbucket key -> list of the trial modes that fall in that superbucket.
using trial_mode_estimate_superbucket_hash = hash_map<trial_mode_estimate_superbucket_key, trial_mode_estimate_superbucket_value>;
|
||
|
||
struct trial_mode_estimate
|
||
{
|
||
trial_mode_estimate_superbucket_key m_superbucket_key;
|
||
|
||
uint32_t m_trial_mode_index;
|
||
float m_wsse;
|
||
|
||
bool operator< (const trial_mode_estimate& rhs) const
|
||
{
|
||
return m_wsse < rhs.m_wsse;
|
||
}
|
||
};
|
||
|
||
struct ranked_shortlist_bucket
|
||
{
|
||
shortlist_bucket m_bucket;
|
||
trial_mode_index_vec m_trial_mode_indices;
|
||
|
||
bool operator < (const ranked_shortlist_bucket& rhs) const { return m_bucket.m_sse < rhs.m_bucket.m_sse; }
|
||
};
|
||
|
||
struct ldr_astc_lowlevel_block_encoder
|
||
{
|
||
ldr_astc_lowlevel_block_encoder() :
|
||
m_used_flag(false)
|
||
{
|
||
clear();
|
||
}
|
||
|
||
// Warning: These objects can migrate between threads (be cautious of determinism issues with containers/hash tables!)
|
||
bool m_used_flag;
|
||
|
||
// Thread-local data follows
|
||
uint_vec m_trial_modes_to_estimate;
|
||
|
||
trial_mode_estimate_superbucket_hash m_superbucket_hash;
|
||
|
||
std::priority_queue<trial_mode_estimate> m_trial_mode_estimate_priority_queue;
|
||
|
||
basist::astc_ldr_t::fvec m_dct_work;
|
||
|
||
shortlist_bucket_hash_t m_shortlist_hash0;
|
||
shortlist_bucket_hash_t m_shortlist_hash1;
|
||
|
||
basisu::vector<trial_surrogate> m_trial_surrogates;
|
||
|
||
float m_sobel_energy;
|
||
float m_max_std_dev;
|
||
|
||
uint32_t m_block_complexity_index; // [0,2]
|
||
bool m_strong_edges;
|
||
bool m_very_strong_edges;
|
||
bool m_super_strong_edges;
|
||
|
||
bool m_used_superbuckets;
|
||
|
||
int m_best_parts2[2][MAX_BASE_PARTS2 * PART_ESTIMATE_STAGE1_MULTIPLIER]; // [rgb[a]direct/rgbs][est_part]
|
||
int m_num_est_parts2[2];
|
||
|
||
int m_best_parts3[2][MAX_BASE_PARTS3 * PART_ESTIMATE_STAGE1_MULTIPLIER]; // [rgb[a]direct/rgbs][est_part]
|
||
int m_num_est_parts3[2];
|
||
|
||
basisu::vector<ranked_shortlist_bucket> m_ranked_buckets;
|
||
|
||
void clear()
|
||
{
|
||
m_trial_modes_to_estimate.resize(0);
|
||
m_superbucket_hash.reset();
|
||
|
||
m_trial_surrogates.resize(0);
|
||
|
||
m_sobel_energy = 0;
|
||
m_max_std_dev = 0;
|
||
m_block_complexity_index = 0;
|
||
m_strong_edges = false;
|
||
m_very_strong_edges = false;
|
||
m_super_strong_edges = false;
|
||
|
||
m_used_superbuckets = false;
|
||
|
||
clear_obj(m_best_parts2);
|
||
clear_obj(m_num_est_parts2);
|
||
|
||
clear_obj(m_best_parts3);
|
||
clear_obj(m_num_est_parts3);
|
||
|
||
m_ranked_buckets.resize(0);
|
||
}
|
||
|
||
bool init(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
BASISU_NOTE_UNUSED(stats);
|
||
|
||
// TODO: This sums the *original* (not blurred) block's energy - precompute this? Replace with DCT?
|
||
m_sobel_energy = 0.0f;
|
||
for (uint32_t y = 0; y < p.m_block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < p.m_block_width; x++)
|
||
{
|
||
const color_rgba& s = p.m_pOrig_img_sobel_xy_t->get_clamped(p.m_bx * p.m_block_width + x, p.m_by * p.m_block_height + y);
|
||
|
||
// TODO: sum max of all channels instead?
|
||
m_sobel_energy += s[0] * s[0] + s[1] * s[1] + s[2] * s[2] + s[3] * s[3];
|
||
} // x
|
||
} // y
|
||
|
||
m_sobel_energy /= (float)p.m_total_block_pixels;
|
||
|
||
m_max_std_dev = 0.0f;
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
m_max_std_dev = maximum(m_max_std_dev, pixel_stats.m_rgba_stats[i].m_std_dev);
|
||
|
||
m_strong_edges = (m_max_std_dev > p.m_use_parts_std_dev_thresh) && (m_sobel_energy > p.m_sobel_energy_thresh1);
|
||
m_very_strong_edges = (m_max_std_dev > p.m_use_parts_std_dev_thresh2) && (m_sobel_energy > p.m_sobel_energy_thresh2);
|
||
m_super_strong_edges = (m_max_std_dev > p.m_use_parts_std_dev_thresh2) && (m_sobel_energy > p.m_sobel_energy_thresh3);
|
||
|
||
m_block_complexity_index = m_super_strong_edges ? 2 : (m_very_strong_edges ? 1 : 0);
|
||
|
||
return true;
|
||
}
|
||
|
||
bool partition_triage(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
|
||
clear_obj(m_num_est_parts2);
|
||
clear_obj(m_num_est_parts3);
|
||
|
||
if (!p.m_subsets_enabled)
|
||
return true;
|
||
|
||
if (p.m_subsets_edge_filtering)
|
||
{
|
||
if (!m_strong_edges)
|
||
return true;
|
||
}
|
||
|
||
assert(p.m_base_parts2 <= MAX_BASE_PARTS2);
|
||
assert(p.m_base_parts3 <= MAX_BASE_PARTS3);
|
||
|
||
// 2 subsets
|
||
int total_parts2 = m_super_strong_edges ? (p.m_base_parts2 * PART_ESTIMATE_STAGE1_MULTIPLIER) : (m_very_strong_edges ? (p.m_base_parts2 * 2) : p.m_base_parts2);
|
||
total_parts2 = minimum<uint32_t>(total_parts2, MAX_BASE_PARTS2 * PART_ESTIMATE_STAGE1_MULTIPLIER);
|
||
total_parts2 = minimum<uint32_t>(total_parts2, p.m_pPart_data_p2->m_total_unique_patterns);
|
||
|
||
const uint32_t surrogate_encode_flags = 0;
|
||
|
||
if (total_parts2)
|
||
{
|
||
int best_parts2_temp[MAX_BASE_PARTS2 * PART_ESTIMATE_STAGE1_MULTIPLIER];
|
||
assert(total_parts2 <= (int)std::size(best_parts2_temp));
|
||
|
||
// Stage 1: kmeans+vptree
|
||
const bool has_est_parts2 = estimate_partition2(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
best_parts2_temp, total_parts2,
|
||
p.m_pPart_data_p2, p.m_brute_force_est_parts);
|
||
|
||
if (has_est_parts2)
|
||
{
|
||
// Always try direct, optionally base+scale cem's
|
||
for (uint32_t s = 0; s < 2; s++)
|
||
{
|
||
if ((s) && (!p.m_use_base_scale_modes))
|
||
continue;
|
||
|
||
if (p.m_disable_part_est_stage2)
|
||
{
|
||
m_num_est_parts2[s] = total_parts2;
|
||
memcpy(m_best_parts2[s], best_parts2_temp, m_num_est_parts2[s] * sizeof(int));
|
||
continue;
|
||
}
|
||
|
||
uint32_t cem_to_surrogate_encode = p.m_alpha_cems ? astc_helpers::CEM_LDR_RGBA_DIRECT : astc_helpers::CEM_LDR_RGB_DIRECT;
|
||
if (s)
|
||
cem_to_surrogate_encode = p.m_alpha_cems ? astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A : astc_helpers::CEM_LDR_RGB_BASE_SCALE;
|
||
|
||
// Stage 2: Analytic surrogate WSSE
|
||
basisu::vector<float> part_sses(total_parts2);
|
||
|
||
for (int i = 0; i < total_parts2; i++)
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = p.m_pPart_data_p2;
|
||
|
||
const uint32_t unique_seed_index = best_parts2_temp[i];
|
||
const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[unique_seed_index];
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[unique_seed_index];
|
||
|
||
log_surrogate_astc_blk surrogate_log_blk;
|
||
float sse = encode_surrogate_trial_subsets(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
cem_to_surrogate_encode, 2, part_seed_index, pPat,
|
||
astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS,
|
||
p.m_block_width, p.m_block_height,
|
||
surrogate_log_blk,
|
||
*p.m_pEnc_params, surrogate_encode_flags);
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
|
||
part_sses[i] = sse;
|
||
} // i
|
||
|
||
basisu::vector<uint32_t> part_sses_ranks(total_parts2);
|
||
|
||
indirect_sort(total_parts2, part_sses_ranks.get_ptr(), part_sses.get_ptr());
|
||
|
||
m_num_est_parts2[s] = maximum<int>(1, (total_parts2 + p.m_part2_fraction_to_keep - 1) / p.m_part2_fraction_to_keep);
|
||
|
||
for (int i = 0; i < m_num_est_parts2[s]; i++)
|
||
{
|
||
const uint32_t rank_index = part_sses_ranks[i];
|
||
const uint32_t unique_seed_unique = best_parts2_temp[rank_index];
|
||
m_best_parts2[s][i] = unique_seed_unique;
|
||
} // i
|
||
|
||
} // s
|
||
|
||
} // if (has_est_parts2)
|
||
|
||
} // if (total_parts2)
|
||
|
||
// 3 subsets
|
||
int total_parts3 = m_super_strong_edges ? (p.m_base_parts3 * PART_ESTIMATE_STAGE1_MULTIPLIER) : (m_very_strong_edges ? (p.m_base_parts3 * 2) : p.m_base_parts3);
|
||
total_parts3 = minimum<uint32_t>(total_parts3, MAX_BASE_PARTS3 * PART_ESTIMATE_STAGE1_MULTIPLIER);
|
||
total_parts3 = minimum<uint32_t>(total_parts3, p.m_pPart_data_p3->m_total_unique_patterns);
|
||
|
||
if (total_parts3)
|
||
{
|
||
int best_parts3_temp[MAX_BASE_PARTS3 * PART_ESTIMATE_STAGE1_MULTIPLIER];
|
||
assert(total_parts3 <= (int)std::size(best_parts3_temp));
|
||
|
||
// Stage 1: kmeans+vptree
|
||
const bool has_est_parts3 = estimate_partition3(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
best_parts3_temp, total_parts3,
|
||
p.m_pPart_data_p3, p.m_brute_force_est_parts);
|
||
|
||
if (has_est_parts3)
|
||
{
|
||
// Always try direct, optionally base+scale cem's
|
||
for (uint32_t s = 0; s < 2; s++)
|
||
{
|
||
if ((s) && (!p.m_use_base_scale_modes))
|
||
continue;
|
||
|
||
if (p.m_disable_part_est_stage2)
|
||
{
|
||
m_num_est_parts3[s] = total_parts3;
|
||
memcpy(m_best_parts3[s], best_parts3_temp, m_num_est_parts3[s] * sizeof(int));
|
||
continue;
|
||
}
|
||
|
||
uint32_t cem_to_surrogate_encode = p.m_alpha_cems ? astc_helpers::CEM_LDR_RGBA_DIRECT : astc_helpers::CEM_LDR_RGB_DIRECT;
|
||
if (s)
|
||
cem_to_surrogate_encode = p.m_alpha_cems ? astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A : astc_helpers::CEM_LDR_RGB_BASE_SCALE;
|
||
|
||
// Stage 2: Analytic surrogate WSSE
|
||
basisu::vector<float> part_sses(total_parts3);
|
||
for (int i = 0; i < total_parts3; i++)
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = p.m_pPart_data_p3;
|
||
|
||
const uint32_t unique_seed_index = best_parts3_temp[i];
|
||
const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[unique_seed_index];
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[unique_seed_index];
|
||
|
||
log_surrogate_astc_blk surrogate_log_blk;
|
||
float sse = encode_surrogate_trial_subsets(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
cem_to_surrogate_encode, 3, part_seed_index, pPat,
|
||
astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS,
|
||
p.m_block_width, p.m_block_height,
|
||
surrogate_log_blk,
|
||
*p.m_pEnc_params, surrogate_encode_flags);
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
|
||
part_sses[i] = sse;
|
||
} // i
|
||
|
||
basisu::vector<uint32_t> part_sses_ranks(total_parts3);
|
||
|
||
indirect_sort(total_parts3, part_sses_ranks.get_ptr(), part_sses.get_ptr());
|
||
|
||
m_num_est_parts3[s] = maximum<int>(1, (total_parts3 + p.m_part3_fraction_to_keep - 1) / p.m_part3_fraction_to_keep);
|
||
|
||
for (int i = 0; i < m_num_est_parts3[s]; i++)
|
||
{
|
||
const uint32_t rank_index = part_sses_ranks[i];
|
||
const uint32_t unique_seed_unique = best_parts3_temp[rank_index];
|
||
m_best_parts3[s][i] = unique_seed_unique;
|
||
} // i
|
||
|
||
} // s
|
||
|
||
} // if (has_est_parts3)
|
||
|
||
} // if (total_parts3)
|
||
|
||
return true;
|
||
}
|
||
|
||
bool trivial_triage(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(pixel_stats);
|
||
BASISU_NOTE_UNUSED(stats);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
|
||
if (m_trial_modes_to_estimate.capacity() < 1024)
|
||
m_trial_modes_to_estimate.reserve(1024);
|
||
m_trial_modes_to_estimate.resize(0);
|
||
|
||
assert((astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET + 1) == basist::astc_ldr_t::OTM_NUM_CEMS);
|
||
|
||
for (uint32_t cem_index = astc_helpers::CEM_LDR_LUM_DIRECT; cem_index < basist::astc_ldr_t::OTM_NUM_CEMS; cem_index++)
|
||
{
|
||
if (astc_helpers::does_cem_have_alpha(cem_index) != p.m_alpha_cems)
|
||
continue;
|
||
|
||
const bool cem_has_alpha = astc_helpers::does_cem_have_alpha(cem_index);
|
||
if (cem_has_alpha != p.m_use_alpha_or_opaque_modes)
|
||
continue;
|
||
|
||
bool accept_flag = false;
|
||
switch (cem_index)
|
||
{
|
||
case astc_helpers::CEM_LDR_LUM_DIRECT:
|
||
case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT:
|
||
{
|
||
accept_flag = p.m_use_lum_direct_modes;
|
||
break;
|
||
}
|
||
case astc_helpers::CEM_LDR_RGB_DIRECT:
|
||
case astc_helpers::CEM_LDR_RGBA_DIRECT:
|
||
{
|
||
accept_flag = p.m_use_direct_modes;
|
||
break;
|
||
}
|
||
case astc_helpers::CEM_LDR_RGB_BASE_SCALE:
|
||
case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
|
||
{
|
||
accept_flag = p.m_use_base_scale_modes;
|
||
break;
|
||
}
|
||
default:
|
||
break;
|
||
}
|
||
|
||
if (!accept_flag)
|
||
continue;
|
||
|
||
const uint32_t s = astc_helpers::cem_is_ldr_base_scale(cem_index) ? 1 : 0;
|
||
|
||
for (uint32_t subsets_index = 0; subsets_index < basist::astc_ldr_t::OTM_NUM_SUBSETS; subsets_index++)
|
||
{
|
||
if (subsets_index == 1)
|
||
{
|
||
if (!m_num_est_parts2[s])
|
||
continue;
|
||
}
|
||
else if (subsets_index == 2)
|
||
{
|
||
if (!m_num_est_parts3[s])
|
||
continue;
|
||
}
|
||
|
||
const uint32_t ccs_max_index = (p.m_use_dual_planes ? basist::astc_ldr_t::OTM_NUM_CCS : 1);
|
||
for (uint32_t ccs_index = 0; ccs_index < ccs_max_index; ccs_index++)
|
||
{
|
||
if (ccs_index)
|
||
{
|
||
if (!p.m_dp_active_chans[ccs_index - 1])
|
||
continue;
|
||
}
|
||
|
||
for (uint32_t grid_size_index = 0; grid_size_index < basist::astc_ldr_t::OTM_NUM_GRID_SIZES; grid_size_index++)
|
||
{
|
||
if (grid_size_index) // if large grid
|
||
{
|
||
if (p.m_use_small_grids_only)
|
||
continue;
|
||
}
|
||
|
||
for (uint32_t grid_anisos_index = 0; grid_anisos_index < basist::astc_ldr_t::OTM_NUM_GRID_ANISOS; grid_anisos_index++)
|
||
{
|
||
if (p.m_grid_hv_filtering)
|
||
{
|
||
if (grid_anisos_index == 1)
|
||
{
|
||
// W>=H
|
||
if (p.m_filter_horizontally_flag)
|
||
continue;
|
||
}
|
||
else if (grid_anisos_index == 2)
|
||
{
|
||
// W<H
|
||
if (!p.m_filter_horizontally_flag)
|
||
continue;
|
||
}
|
||
}
|
||
|
||
m_trial_modes_to_estimate.append(p.m_pGrouped_trial_modes->m_tm_groups[cem_index][subsets_index][ccs_index][grid_size_index][grid_anisos_index]);
|
||
|
||
} // grid_aniso_index
|
||
|
||
} // grid_size_index
|
||
|
||
} // ccs_index
|
||
|
||
} // subsets_index
|
||
|
||
} // cem_iter
|
||
|
||
if (!m_trial_modes_to_estimate.size())
|
||
{
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
bool analytic_triage(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
|
||
//--------------------------------- superbucket analytical estimation
|
||
|
||
shortlist_bucket_hash_t& shortlist_buckets = m_shortlist_hash0;
|
||
|
||
if (m_shortlist_hash0.get_table_size() != EXPECTED_SHORTLIST_HASH_SIZE)
|
||
{
|
||
const bool was_allocated = m_shortlist_hash0.get_table_size() > 0;
|
||
|
||
m_shortlist_hash0.clear();
|
||
m_shortlist_hash0.reserve(EXPECTED_SHORTLIST_HASH_SIZE / 2);
|
||
|
||
if ((g_devel_messages) && (was_allocated))
|
||
fmt_debug_printf("shortlist hash0 thrash\n");
|
||
}
|
||
else
|
||
{
|
||
m_shortlist_hash0.reset();
|
||
}
|
||
|
||
m_used_superbuckets = false;
|
||
|
||
if (p.m_use_superbuckets)
|
||
{
|
||
m_used_superbuckets = true;
|
||
|
||
// This may thrash if it grows larger on another thread, but we must avoid determinism issues.
|
||
if (m_superbucket_hash.get_table_size() != EXPECTED_SUPERBUCKET_HASH_SIZE)
|
||
{
|
||
const bool was_allocated = m_superbucket_hash.get_table_size() > 0;
|
||
|
||
m_superbucket_hash.clear();
|
||
m_superbucket_hash.reserve(EXPECTED_SUPERBUCKET_HASH_SIZE >> 1);
|
||
|
||
if ((g_devel_messages) && (was_allocated))
|
||
fmt_debug_printf("superbucket hash thrash\n");
|
||
}
|
||
else
|
||
{
|
||
m_superbucket_hash.reset();
|
||
}
|
||
|
||
trial_mode_estimate_superbucket_key new_key;
|
||
new_key.clear();
|
||
|
||
trial_mode_estimate_superbucket_value new_val;
|
||
|
||
// Create superbuckets
|
||
uint32_t max_superbucket_tm_indices = 0;
|
||
for (uint32_t j = 0; j < m_trial_modes_to_estimate.size(); j++)
|
||
{
|
||
const uint32_t trial_mode_iter = m_trial_modes_to_estimate[j];
|
||
|
||
assert(trial_mode_iter < p.m_num_trial_modes);
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_iter];
|
||
|
||
new_key.m_cem_index = safe_cast_uint8(tm.m_cem);
|
||
new_key.m_ccs_index = safe_cast_int8(tm.m_ccs_index);
|
||
|
||
new_key.m_subset_unique_index = 0;
|
||
new_key.m_num_subsets = (uint8_t)tm.m_num_parts;
|
||
|
||
if (tm.m_num_parts == 1)
|
||
{
|
||
auto ins_res = m_superbucket_hash.insert(new_key, new_val);
|
||
const bool created_flag = ins_res.second;
|
||
|
||
assert(ins_res.first->first.m_cem_index == tm.m_cem);
|
||
assert(ins_res.first->first.m_ccs_index == tm.m_ccs_index);
|
||
assert(ins_res.first->first.m_num_subsets == tm.m_num_parts);
|
||
|
||
trial_mode_estimate_superbucket_value& v = (ins_res.first)->second;
|
||
|
||
if (created_flag)
|
||
v.m_trial_mode_list.reserve(256);
|
||
|
||
v.m_trial_mode_list.push_back(trial_mode_iter);
|
||
|
||
max_superbucket_tm_indices = maximum(max_superbucket_tm_indices, v.m_trial_mode_list.size_u32());
|
||
}
|
||
else
|
||
{
|
||
//const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t s = astc_helpers::cem_is_ldr_base_scale(tm.m_cem) ? 1 : 0;
|
||
const uint32_t num_est_parts_to_try = (tm.m_num_parts == 2) ? m_num_est_parts2[s] : m_num_est_parts3[s];
|
||
|
||
for (uint32_t est_part_iter = 0; est_part_iter < num_est_parts_to_try; est_part_iter++)
|
||
{
|
||
const uint32_t part_unique_index = (tm.m_num_parts == 2) ? m_best_parts2[s][est_part_iter] : m_best_parts3[s][est_part_iter];
|
||
|
||
new_key.m_subset_unique_index = safe_cast_uint16(part_unique_index);
|
||
|
||
auto ins_res = m_superbucket_hash.insert(new_key, new_val);
|
||
const bool created_flag = ins_res.second;
|
||
|
||
assert(ins_res.first->first.m_cem_index == tm.m_cem);
|
||
assert(ins_res.first->first.m_ccs_index == tm.m_ccs_index);
|
||
assert(ins_res.first->first.m_num_subsets == tm.m_num_parts);
|
||
|
||
trial_mode_estimate_superbucket_value& v = (ins_res.first)->second;
|
||
if (created_flag)
|
||
v.m_trial_mode_list.reserve(256);
|
||
|
||
v.m_trial_mode_list.push_back(trial_mode_iter);
|
||
|
||
max_superbucket_tm_indices = maximum(max_superbucket_tm_indices, v.m_trial_mode_list.size_u32());
|
||
|
||
} // est_part_iter
|
||
}
|
||
|
||
} // j
|
||
|
||
//fmt_debug_printf("Total superbucket entries: {}\n", m_superbucket_hash.size());
|
||
//fmt_debug_printf("Max superbucket tm indices: {}\n", max_superbucket_tm_indices);
|
||
|
||
const uint32_t total_block_texels = p.m_total_block_pixels;
|
||
const float inv_total_block_texels = 1.0f / (float)total_block_texels;
|
||
|
||
while (m_trial_mode_estimate_priority_queue.size())
|
||
m_trial_mode_estimate_priority_queue.pop();
|
||
|
||
const uint32_t max_priority_queue_size = p.m_superbucket_max_to_retain[m_block_complexity_index];
|
||
|
||
// purposely downscale lost scale energy relative to the other error sources
|
||
// this biased the encoder towards smaller grids
|
||
const float SLAM_TO_LINE_WEIGHT = 1.5f; // upweight STL relative to other errors to give the estimator more of a signal especially for dual plane
|
||
const float QUANT_ERROR_WEIGHT = 1.0f; // quant error is naturally quite pessimistic
|
||
const float SCALE_ERROR_WEIGHT = 3.0f; // weight grid downsample (scale) error
|
||
|
||
// Discount for blue contraction encoding and base+offset CEM's.
|
||
const float BLUE_CONTRACTION_ENDPOINT_QUANT_DISCOUNT = .5f;
|
||
|
||
// Iterate over all superbuckets, surrogate encode to compute slam to line error, DCT of weight grid(s) to estimate energy lost during weight grid downsampling.
|
||
// TODO: priority queue and aggressive early outs
|
||
for (auto superbucket_iter = m_superbucket_hash.begin(); superbucket_iter != m_superbucket_hash.end(); ++superbucket_iter)
|
||
{
|
||
const trial_mode_estimate_superbucket_key& key = superbucket_iter->first;
|
||
const trial_mode_estimate_superbucket_value& val = superbucket_iter->second;
|
||
|
||
//const bool cem_has_alpha = astc_helpers::does_cem_have_alpha(key.m_cem_index);
|
||
|
||
log_surrogate_astc_blk log_blk;
|
||
|
||
const astc_ldr::partitions_data* pPart_data = nullptr;
|
||
const astc_ldr::partition_pattern_vec* pPat = nullptr;
|
||
|
||
//const uint32_t num_planes = (key.m_ccs_index >= 0) ? 2 : 1;
|
||
|
||
const float worst_wsse_found_so_far = (m_trial_mode_estimate_priority_queue.size() >= max_priority_queue_size) ? m_trial_mode_estimate_priority_queue.top().m_wsse : 1e+9f;
|
||
|
||
float slam_to_line_wsse = 0;
|
||
if (key.m_num_subsets == 1)
|
||
{
|
||
slam_to_line_wsse = encode_surrogate_trial(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
key.m_cem_index,
|
||
key.m_ccs_index,
|
||
astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS,
|
||
p.m_block_width, p.m_block_height,
|
||
log_blk,
|
||
*p.m_pEnc_params,
|
||
astc_ldr::cFlagDisableQuant);
|
||
}
|
||
else
|
||
{
|
||
pPart_data = (key.m_num_subsets == 3) ? p.m_pPart_data_p3 : p.m_pPart_data_p2;
|
||
|
||
const uint32_t unique_seed_index = key.m_subset_unique_index;
|
||
const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[unique_seed_index];
|
||
|
||
pPat = &pPart_data->m_partition_pats[unique_seed_index];
|
||
|
||
slam_to_line_wsse = encode_surrogate_trial_subsets(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
key.m_cem_index, key.m_num_subsets, part_seed_index, pPat,
|
||
astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_64_LEVELS,
|
||
p.m_block_width, p.m_block_height,
|
||
log_blk,
|
||
*p.m_pEnc_params,
|
||
astc_ldr::cFlagDisableQuant);
|
||
}
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
|
||
// Early out: Slam to line error is so high it's impossible for any blocks in this bucket to win.
|
||
if ((SLAM_TO_LINE_WEIGHT * slam_to_line_wsse) >= worst_wsse_found_so_far)
|
||
continue;
|
||
|
||
bool can_use_base_ofs = false;
|
||
if ((key.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (key.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
{
|
||
float max_span_size = 0.0f;
|
||
|
||
for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++)
|
||
{
|
||
const vec4F subset_chan_spans(log_blk.m_endpoints[subset_index][1] - log_blk.m_endpoints[subset_index][0]);
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(subset_chan_spans[c]);
|
||
max_span_size = maximum(max_span_size, span_size);
|
||
}
|
||
}
|
||
|
||
can_use_base_ofs = (max_span_size < .25f);
|
||
}
|
||
|
||
assert(p.m_pDCT2F);
|
||
|
||
assert((p.m_pDCT2F->rows() == p.m_block_height) && (p.m_pDCT2F->cols() == p.m_block_width));
|
||
|
||
float weight0_energy[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
float weight1_energy[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
basist::astc_ldr_t::fvec& dct_work = m_dct_work;
|
||
|
||
// Forward DCT in normalized weight (surrogate) space
|
||
p.m_pDCT2F->forward(log_blk.m_weights0, weight0_energy, dct_work);
|
||
compute_energy_from_dct(p.m_block_width, p.m_block_height, weight0_energy);
|
||
|
||
if (key.m_ccs_index >= 0)
|
||
{
|
||
p.m_pDCT2F->forward(log_blk.m_weights1, weight1_energy, dct_work);
|
||
compute_energy_from_dct(p.m_block_width, p.m_block_height, weight1_energy);
|
||
}
|
||
|
||
weight_terms weight0_terms, weight1_terms;
|
||
weight_terms* pWeight0_terms = &weight0_terms;
|
||
weight_terms* pWeight1_terms = nullptr;
|
||
weight0_terms.calc(total_block_texels, log_blk.m_weights0);
|
||
if (key.m_ccs_index >= 0)
|
||
{
|
||
weight1_terms.calc(total_block_texels, log_blk.m_weights1);
|
||
pWeight1_terms = &weight1_terms;
|
||
}
|
||
|
||
// Precompute subset span and total pixels info
|
||
vec4F subset_spans[astc_helpers::MAX_PARTITIONS];
|
||
uint32_t subset_pixels[astc_helpers::MAX_PARTITIONS];
|
||
|
||
for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++)
|
||
{
|
||
subset_spans[subset_index] = log_blk.m_endpoints[subset_index][1] - log_blk.m_endpoints[subset_index][0];
|
||
|
||
uint32_t total_subset_pixels = p.m_total_block_pixels;
|
||
if (key.m_num_subsets > 1)
|
||
total_subset_pixels = pPart_data->m_partition_pat_histograms[key.m_subset_unique_index].m_hist[subset_index];
|
||
|
||
subset_pixels[subset_index] = total_subset_pixels;
|
||
}
|
||
|
||
// Loop through all trial modes in this superbucket. TODO: Sort by endpoint levels?
|
||
for (uint32_t k = 0; k < val.m_trial_mode_list.size(); k++)
|
||
{
|
||
const uint32_t trial_mode_index = val.m_trial_mode_list[k];
|
||
assert(trial_mode_index < p.m_num_trial_modes);
|
||
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index];
|
||
|
||
assert(tm.m_cem == key.m_cem_index);
|
||
assert(tm.m_ccs_index == key.m_ccs_index);
|
||
assert(tm.m_num_parts == key.m_num_subsets);
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(p.m_block_width, p.m_block_height, tm.m_grid_width, tm.m_grid_height);
|
||
|
||
const uint32_t total_endpoint_levels = astc_helpers::get_ise_levels(tm.m_endpoint_ise_range);
|
||
const uint32_t total_weight_levels = astc_helpers::get_ise_levels(tm.m_weight_ise_range);
|
||
|
||
const uint32_t num_effective_e_levels = can_use_base_ofs ? minimum<uint32_t>(total_endpoint_levels * 2, 256) : total_endpoint_levels;
|
||
float qe0 = compute_quantized_channel_endpoint_mse_estimate(num_effective_e_levels);
|
||
const float qe1 = (key.m_ccs_index >= 0) ? (qe0 * pWeight1_terms->m_endpoint_factor) : 0.0f;
|
||
qe0 *= pWeight0_terms->m_endpoint_factor;
|
||
|
||
float total_e_quant_wsse = 0.0f;
|
||
|
||
for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++)
|
||
{
|
||
const vec4F& subset_chan_spans = subset_spans[subset_index];
|
||
const uint32_t total_subset_pixels = subset_pixels[subset_index];
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(subset_chan_spans[c]);
|
||
|
||
if ((span_size == 0.0f) && ((log_blk.m_endpoints[subset_index][1][c] == 0.0f) || (log_blk.m_endpoints[subset_index][1][c] == 1.0f)))
|
||
continue;
|
||
|
||
// Scale channel MSE by chan weight and the # of subset pixels to get weighted SSE
|
||
const float chan_N = (float)p.m_pEnc_params->m_comp_weights[c] * (float)total_subset_pixels;
|
||
|
||
total_e_quant_wsse += ((key.m_ccs_index == (int)c) ? qe1 : qe0) * chan_N;
|
||
|
||
} // chan_index
|
||
}
|
||
|
||
if ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
total_e_quant_wsse *= BLUE_CONTRACTION_ENDPOINT_QUANT_DISCOUNT;
|
||
|
||
float total_wsse_so_far = (SLAM_TO_LINE_WEIGHT * slam_to_line_wsse) + (QUANT_ERROR_WEIGHT * total_e_quant_wsse);
|
||
if (total_wsse_so_far >= worst_wsse_found_so_far)
|
||
continue;
|
||
|
||
float lost_weight_energy0 = compute_lost_dct_energy(p.m_block_width, p.m_block_height, weight0_energy, tm.m_grid_width, tm.m_grid_height) * inv_total_block_texels;
|
||
|
||
float lost_weight_energy1 = 0;
|
||
if (key.m_ccs_index >= 0)
|
||
lost_weight_energy1 = compute_lost_dct_energy(p.m_block_width, p.m_block_height, weight1_energy, tm.m_grid_width, tm.m_grid_height) * inv_total_block_texels;
|
||
|
||
// Add up:
|
||
// slam to line error WSSE (weighted sum of squared errors)
|
||
// weight quant error WSSE
|
||
// endpoint quant error WSSE
|
||
// weight grid rescale error WSSE (scaled by span^2)
|
||
float total_scale_wsse = 0.0f;
|
||
|
||
for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++)
|
||
{
|
||
const vec4F& subset_chan_spans = subset_spans[subset_index];
|
||
const uint32_t total_subset_pixels = subset_pixels[subset_index];
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(subset_chan_spans[c]);
|
||
|
||
if ((span_size == 0.0f) && ((log_blk.m_endpoints[subset_index][1][c] == 0.0f) || (log_blk.m_endpoints[subset_index][1][c] == 1.0f)))
|
||
{
|
||
// Won't have any E/W quant err at extremes (0.0 or 1.0 are always perfectly represented), no weight downsample error either.
|
||
//chan_mse.m_ep = 0.0f;
|
||
//chan_mse.m_wp = 0.0f;
|
||
}
|
||
else
|
||
{
|
||
// Scale channel MSE by chan weight and the # of subset pixels to get weighted SSE
|
||
const float chan_N = (float)p.m_pEnc_params->m_comp_weights[c] * (float)total_subset_pixels;
|
||
|
||
// sum in the plane's lost weight energy, scaled by span_size^2 * chan_weight * num_texels_covered
|
||
if (key.m_ccs_index == (int)c)
|
||
total_scale_wsse += lost_weight_energy1 * square(span_size) * chan_N;
|
||
else
|
||
total_scale_wsse += lost_weight_energy0 * square(span_size) * chan_N;
|
||
}
|
||
|
||
} // chan_index
|
||
}
|
||
|
||
total_wsse_so_far += (SCALE_ERROR_WEIGHT * total_scale_wsse);
|
||
if (total_wsse_so_far >= worst_wsse_found_so_far)
|
||
continue;
|
||
|
||
float total_w_quant_wsse = 0.0f;
|
||
for (uint32_t subset_index = 0; subset_index < key.m_num_subsets; subset_index++)
|
||
{
|
||
const vec4F& subset_chan_spans = subset_spans[subset_index];
|
||
const uint32_t total_subset_pixels = subset_pixels[subset_index];
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(subset_chan_spans[c]);
|
||
|
||
if ((span_size == 0.0f) && ((log_blk.m_endpoints[subset_index][1][c] == 0.0f) || (log_blk.m_endpoints[subset_index][1][c] == 1.0f)))
|
||
{
|
||
// Won't have any E/W quant err at extremes (0.0 or 1.0 are always perfectly represented), no weight downsample error either.
|
||
//chan_mse.m_ep = 0.0f;
|
||
//chan_mse.m_wp = 0.0f;
|
||
}
|
||
else
|
||
{
|
||
// span_size != 0 here - estimate weight/endpoint quantization errors
|
||
float chan_w_mse = compute_quantized_channel_weight_mse_estimate(
|
||
total_weight_levels, span_size,
|
||
pGrid_data->m_weight_gamma, (key.m_ccs_index == (int)c) ? pWeight1_terms : pWeight0_terms);
|
||
|
||
// Scale channel MSE by chan weight and the # of subset pixels to get weighted SSE
|
||
const float chan_N = (float)p.m_pEnc_params->m_comp_weights[c] * (float)total_subset_pixels;
|
||
|
||
total_w_quant_wsse += chan_w_mse * chan_N;
|
||
}
|
||
|
||
} // chan_index
|
||
|
||
} // subset_index
|
||
|
||
const float total_wsse = total_wsse_so_far + (QUANT_ERROR_WEIGHT * total_w_quant_wsse);
|
||
|
||
if (m_trial_mode_estimate_priority_queue.size() >= max_priority_queue_size)
|
||
{
|
||
if (total_wsse < m_trial_mode_estimate_priority_queue.top().m_wsse)
|
||
{
|
||
m_trial_mode_estimate_priority_queue.pop();
|
||
|
||
trial_mode_estimate est;
|
||
est.m_superbucket_key = key;
|
||
est.m_trial_mode_index = trial_mode_index;
|
||
est.m_wsse = total_wsse;
|
||
|
||
m_trial_mode_estimate_priority_queue.push(est);
|
||
}
|
||
}
|
||
else
|
||
{
|
||
trial_mode_estimate est;
|
||
est.m_superbucket_key = key;
|
||
est.m_trial_mode_index = trial_mode_index;
|
||
est.m_wsse = total_wsse;
|
||
|
||
m_trial_mode_estimate_priority_queue.push(est);
|
||
}
|
||
|
||
} // k
|
||
|
||
} // superbucket_iter
|
||
|
||
stats.m_total_superbuckets_created += m_superbucket_hash.size_u32();
|
||
|
||
const uint32_t total_estimates_to_retain = (uint32_t)m_trial_mode_estimate_priority_queue.size();
|
||
assert(total_estimates_to_retain);
|
||
|
||
for (uint32_t i = 0; i < total_estimates_to_retain; i++)
|
||
{
|
||
const trial_mode_estimate &est = m_trial_mode_estimate_priority_queue.top();
|
||
|
||
const trial_mode_estimate_superbucket_key& key = est.m_superbucket_key;
|
||
const uint32_t trial_mode_iter = est.m_trial_mode_index;
|
||
|
||
assert(trial_mode_iter < p.m_num_trial_modes);
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_iter];
|
||
|
||
assert(tm.m_cem == key.m_cem_index);
|
||
assert(tm.m_ccs_index == key.m_ccs_index);
|
||
assert(tm.m_num_parts == key.m_num_subsets);
|
||
|
||
const uint32_t part_unique_index = key.m_subset_unique_index;
|
||
|
||
auto ins_res = shortlist_buckets.insert(shortlist_bucket(tm.m_grid_width, tm.m_grid_height, tm.m_cem, tm.m_ccs_index, tm.m_num_parts, part_unique_index));
|
||
|
||
ins_res.first->second.push_back(safe_cast_uint16(trial_mode_iter));
|
||
|
||
m_trial_mode_estimate_priority_queue.pop();
|
||
}
|
||
}
|
||
else
|
||
{
|
||
for (uint32_t j = 0; j < m_trial_modes_to_estimate.size(); j++)
|
||
{
|
||
const uint32_t trial_mode_iter = m_trial_modes_to_estimate[j];
|
||
|
||
assert(trial_mode_iter < p.m_num_trial_modes);
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_iter];
|
||
|
||
if (tm.m_num_parts > 1)
|
||
{
|
||
//const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t s = astc_helpers::cem_is_ldr_base_scale(tm.m_cem) ? 1 : 0;
|
||
const uint32_t num_est_parts_to_try = (tm.m_num_parts == 2) ? m_num_est_parts2[s] : m_num_est_parts3[s];
|
||
|
||
for (uint32_t est_part_iter = 0; est_part_iter < num_est_parts_to_try; est_part_iter++)
|
||
{
|
||
const uint32_t part_unique_index = (tm.m_num_parts == 2) ? m_best_parts2[s][est_part_iter] : m_best_parts3[s][est_part_iter];
|
||
|
||
auto ins_res = shortlist_buckets.insert(shortlist_bucket(tm.m_grid_width, tm.m_grid_height, tm.m_cem, tm.m_ccs_index, tm.m_num_parts, part_unique_index));
|
||
|
||
ins_res.first->second.push_back(safe_cast_uint16(trial_mode_iter));
|
||
|
||
} // est_part_iter
|
||
|
||
}
|
||
else
|
||
{
|
||
auto ins_res = shortlist_buckets.insert(shortlist_bucket(tm.m_grid_width, tm.m_grid_height, tm.m_cem, tm.m_ccs_index, 1, 0));
|
||
ins_res.first->second.push_back(safe_cast_uint16(trial_mode_iter));
|
||
|
||
}
|
||
}
|
||
}
|
||
|
||
stats.m_total_buckets_created += (uint32_t)shortlist_buckets.size();
|
||
|
||
#if 0
|
||
// TEMP
|
||
uint32_t max_bucket_tm_indices = 0;
|
||
for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it)
|
||
{
|
||
shortlist_bucket& bucket = it->first;
|
||
trial_mode_index_vec& trial_mode_indices = it->second;
|
||
max_bucket_tm_indices = maximum<uint32_t>(max_bucket_tm_indices, trial_mode_indices.size_u32());
|
||
}
|
||
|
||
fmt_debug_printf("max_bucket_tm_indices: {}\n", max_bucket_tm_indices);
|
||
#endif
|
||
|
||
return true;
|
||
}
|
||
|
||
bool surrogate_encode_shortlist_bucket_representatives(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
|
||
shortlist_bucket_hash_t& shortlist_buckets = m_shortlist_hash0;
|
||
|
||
// Surrogate encode a representative for each bucket.
|
||
for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it)
|
||
{
|
||
shortlist_bucket& bucket = it->first;
|
||
//const uint_vec& trial_mode_indices = it->second;
|
||
const trial_mode_index_vec& trial_mode_indices = it->second;
|
||
|
||
// Choose bucket's largest endpoint/weight ise ranges (finest quant levels) - anything in the bucket will quite likely encode to worse SSE, which we can rapidly estimate.
|
||
uint32_t max_endpoint_ise_range = 0, max_weight_ise_range = 0;
|
||
for (uint32_t i = 0; i < trial_mode_indices.size(); i++)
|
||
{
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_indices[i]];
|
||
|
||
max_endpoint_ise_range = maximum(max_endpoint_ise_range, tm.m_endpoint_ise_range);
|
||
max_weight_ise_range = maximum(max_weight_ise_range, tm.m_weight_ise_range);
|
||
}
|
||
|
||
log_surrogate_astc_blk& log_block = bucket.m_surrogate_log_blk;
|
||
|
||
if (bucket.m_num_parts == 1)
|
||
{
|
||
bucket.m_sse = encode_surrogate_trial(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
bucket.m_cem_index,
|
||
bucket.m_ccs_index,
|
||
max_endpoint_ise_range, max_weight_ise_range,
|
||
bucket.m_grid_width, bucket.m_grid_height,
|
||
log_block,
|
||
*p.m_pEnc_params, 0);
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
}
|
||
else
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (bucket.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[bucket.m_unique_seed_index];
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[bucket.m_unique_seed_index];
|
||
|
||
bucket.m_sse = encode_surrogate_trial_subsets(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
bucket.m_cem_index, bucket.m_num_parts, part_seed_index, pPat,
|
||
max_endpoint_ise_range, max_weight_ise_range,
|
||
bucket.m_grid_width, bucket.m_grid_height,
|
||
log_block,
|
||
*p.m_pEnc_params, 0);
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
}
|
||
|
||
if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
{
|
||
// blue contraction/base+offset discount
|
||
bucket.m_sse *= BLUE_CONTRACTION_BASE_OFS_DISCOUNT;
|
||
}
|
||
|
||
} // it
|
||
|
||
return true;
|
||
}
|
||
|
||
bool prune_shortlist_buckets(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(pixel_stats);
|
||
BASISU_NOTE_UNUSED(stats);
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
|
||
shortlist_bucket_hash_t& shortlist_buckets = m_shortlist_hash0;
|
||
|
||
if (p.m_bucket_pruning_passes)
|
||
{
|
||
shortlist_bucket_hash_t& new_shortlist_buckets = m_shortlist_hash1;
|
||
|
||
if (m_shortlist_hash1.get_table_size() != EXPECTED_SHORTLIST_HASH_SIZE)
|
||
{
|
||
const bool was_allocated = m_shortlist_hash1.get_table_size() > 0;
|
||
|
||
m_shortlist_hash1.clear();
|
||
m_shortlist_hash1.reserve(EXPECTED_SHORTLIST_HASH_SIZE / 2);
|
||
|
||
if ((g_devel_messages) && (was_allocated))
|
||
fmt_debug_printf("shortlist hash1 thrash\n");
|
||
}
|
||
else
|
||
{
|
||
m_shortlist_hash1.reset();
|
||
}
|
||
|
||
const uint32_t NUM_PRUNE_PASSES = 3;
|
||
for (uint32_t prune_pass = 0; prune_pass < NUM_PRUNE_PASSES; prune_pass++)
|
||
{
|
||
for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it)
|
||
it->first.m_examined_flag = false;
|
||
|
||
new_shortlist_buckets.reset();
|
||
|
||
for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it)
|
||
{
|
||
shortlist_bucket& bucket = it->first;
|
||
|
||
if (bucket.m_examined_flag)
|
||
continue;
|
||
|
||
if (prune_pass == 0)
|
||
{
|
||
// Prune pass 0: Dual plane groups: only accept best CCS index
|
||
if (bucket.m_ccs_index >= 0)
|
||
{
|
||
shortlist_bucket_hash_t::iterator ccs_buckets[4];
|
||
|
||
int best_ccs_index = -1;
|
||
float best_ccs_err = BIG_FLOAT_VAL;
|
||
|
||
bool skip_bucket = false;
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
auto ccs_res_it = shortlist_buckets.find(shortlist_bucket(bucket.m_grid_width, bucket.m_grid_height, bucket.m_cem_index, c, bucket.m_num_parts, bucket.m_unique_seed_index));
|
||
ccs_buckets[c] = ccs_res_it;
|
||
|
||
if (ccs_res_it == shortlist_buckets.end())
|
||
continue;
|
||
|
||
assert(!ccs_res_it->first.m_examined_flag);
|
||
|
||
ccs_res_it->first.m_examined_flag = true;
|
||
|
||
float ccs_sse_err = ccs_res_it->first.m_sse;
|
||
if (ccs_sse_err < best_ccs_err)
|
||
{
|
||
best_ccs_err = ccs_sse_err;
|
||
best_ccs_index = c;
|
||
}
|
||
} // c
|
||
|
||
if (!skip_bucket)
|
||
{
|
||
assert(best_ccs_index >= 0);
|
||
|
||
shortlist_bucket_hash_t::iterator best_ccs_it = ccs_buckets[best_ccs_index];
|
||
assert(best_ccs_it != shortlist_buckets.end());
|
||
|
||
new_shortlist_buckets.insert(best_ccs_it->first, best_ccs_it->second);
|
||
}
|
||
}
|
||
else
|
||
{
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
}
|
||
else if (prune_pass == 1)
|
||
{
|
||
// Prune pass 1: Same # of weight samples, compare WxH vs. HxW
|
||
if (bucket.m_grid_width != bucket.m_grid_height)
|
||
{
|
||
auto alt_res_it = shortlist_buckets.find(shortlist_bucket(bucket.m_grid_height, bucket.m_grid_width, bucket.m_cem_index, bucket.m_ccs_index, bucket.m_num_parts, bucket.m_unique_seed_index));
|
||
if (alt_res_it == shortlist_buckets.end())
|
||
{
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
else
|
||
{
|
||
assert(!alt_res_it->first.m_examined_flag);
|
||
alt_res_it->first.m_examined_flag = true;
|
||
|
||
const float fract = (bucket.m_sse > 0.0f) ? (alt_res_it->first.m_sse / bucket.m_sse) : 0.0f;
|
||
|
||
const float ALT_RES_SSE_THRESH = .2f;
|
||
if (fract < (1.0f - ALT_RES_SSE_THRESH))
|
||
new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second);
|
||
else if (fract > (1.0f + ALT_RES_SSE_THRESH))
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
else
|
||
{
|
||
new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second);
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
}
|
||
}
|
||
else
|
||
{
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
|
||
}
|
||
else if (prune_pass == 2)
|
||
{
|
||
// Prune pass 2: RGB Direct vs. Scale bucket groups
|
||
|
||
if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE) ||
|
||
(bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A))
|
||
{
|
||
uint32_t alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_BASE_SCALE;
|
||
|
||
// Check for pairs: CEM_LDR_RGB_DIRECT vs. CEM_LDR_RGB_BASE_SCALE, or CEM_LDR_RGBA_DIRECT vs. CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A.
|
||
switch (bucket.m_cem_index)
|
||
{
|
||
case astc_helpers::CEM_LDR_RGB_DIRECT:
|
||
alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_BASE_SCALE;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGB_BASE_SCALE:
|
||
alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_DIRECT;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGBA_DIRECT:
|
||
alt_cem_index_to_find = astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
|
||
alt_cem_index_to_find = astc_helpers::CEM_LDR_RGBA_DIRECT;
|
||
break;
|
||
default:
|
||
assert(0);
|
||
break;
|
||
}
|
||
|
||
auto alt_res_it = shortlist_buckets.find(shortlist_bucket(bucket.m_grid_width, bucket.m_grid_height, alt_cem_index_to_find, bucket.m_ccs_index, bucket.m_num_parts, bucket.m_unique_seed_index));
|
||
|
||
if (alt_res_it == shortlist_buckets.end())
|
||
{
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
else
|
||
{
|
||
assert(!alt_res_it->first.m_examined_flag);
|
||
|
||
alt_res_it->first.m_examined_flag = true;
|
||
|
||
// Compare the two buckets, decide if one or another can be tossed as not worth it.
|
||
const float fract = (bucket.m_sse > 0.0f) ? (alt_res_it->first.m_sse / bucket.m_sse) : 0.0f;
|
||
|
||
const float ALT_RES_SSE_THRESH = .1f;
|
||
if (fract < (1.0f - ALT_RES_SSE_THRESH))
|
||
new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second);
|
||
else if (fract > (1.0f + ALT_RES_SSE_THRESH))
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
else
|
||
{
|
||
new_shortlist_buckets.insert(alt_res_it->first, alt_res_it->second);
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
}
|
||
}
|
||
else
|
||
{
|
||
new_shortlist_buckets.insert(it->first, it->second);
|
||
}
|
||
|
||
} // if (prune_pass
|
||
|
||
it->first.m_examined_flag = true;
|
||
}
|
||
|
||
new_shortlist_buckets.swap(shortlist_buckets);
|
||
} // prune_pass
|
||
} // if (g_bucket_pruning_passes)
|
||
|
||
assert(shortlist_buckets.size());
|
||
|
||
if (m_ranked_buckets.capacity() < shortlist_buckets.size())
|
||
m_ranked_buckets.reserve(shortlist_buckets.size());
|
||
|
||
for (auto it = shortlist_buckets.begin(); it != shortlist_buckets.end(); ++it)
|
||
{
|
||
shortlist_bucket& bucket = it->first;
|
||
const trial_mode_index_vec& trial_mode_indices = it->second;
|
||
|
||
ranked_shortlist_bucket* pDst = m_ranked_buckets.enlarge(1);
|
||
pDst->m_bucket = bucket;
|
||
pDst->m_trial_mode_indices = trial_mode_indices;
|
||
}
|
||
|
||
assert(m_ranked_buckets.size());
|
||
|
||
// Sort the buckets by their surrogate encoded SSE to rank them.
|
||
std::sort(m_ranked_buckets.begin(), m_ranked_buckets.end());
|
||
|
||
return true;
|
||
}
|
||
|
||
bool rank_and_sort_shortlist_buckets(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
BASISU_NOTE_UNUSED(blur_id);
|
||
BASISU_NOTE_UNUSED(out_blocks);
|
||
|
||
basisu::vector<trial_surrogate>& shortlist_trials = m_trial_surrogates;
|
||
|
||
// TODO: Tune this further. Memory here adds up across all encoding threads.
|
||
{
|
||
//const float reserve_factor = (sizeof(void*) > 4) ? .5f : .25f;
|
||
const uint32_t reserve_size = 64;// maximum(256, (int)(p.m_num_trial_modes * reserve_factor));
|
||
|
||
if (shortlist_trials.capacity() < reserve_size)
|
||
shortlist_trials.reserve(reserve_size);
|
||
|
||
shortlist_trials.resize(0);
|
||
}
|
||
|
||
uint32_t num_buckets_to_examine = fast_roundf_int((float)m_ranked_buckets.size_u32() * p.m_shortlist_buckets_to_examine_fract);
|
||
num_buckets_to_examine = clamp<uint32_t>(num_buckets_to_examine, p.m_shortlist_buckets_to_examine_min, p.m_shortlist_buckets_to_examine_max);
|
||
|
||
num_buckets_to_examine = clamp<uint32_t>(num_buckets_to_examine, 1, m_ranked_buckets.size_u32());
|
||
|
||
float best_err_so_far = BIG_FLOAT_VAL;
|
||
|
||
for (uint32_t bucket_index = 0; bucket_index < num_buckets_to_examine; bucket_index++)
|
||
{
|
||
const shortlist_bucket& bucket = m_ranked_buckets[bucket_index].m_bucket;
|
||
const trial_mode_index_vec& bucket_trial_mode_indices = m_ranked_buckets[bucket_index].m_trial_mode_indices;
|
||
|
||
if (best_err_so_far != BIG_FLOAT_VAL)
|
||
{
|
||
if (bucket.m_sse > best_err_so_far * SKIP_IF_BUCKET_WORSE_MULTIPLIER)
|
||
continue;
|
||
}
|
||
best_err_so_far = minimum(best_err_so_far, bucket.m_sse);
|
||
|
||
if (bucket_trial_mode_indices.size() == 1)
|
||
{
|
||
// Bucket only contains 1 mode, so we've already encoded its surrogate.
|
||
trial_surrogate& s = *shortlist_trials.try_enlarge(1);
|
||
|
||
s.m_trial_mode_index = bucket_trial_mode_indices[0];
|
||
s.m_err = bucket.m_sse;
|
||
s.m_log_blk = bucket.m_surrogate_log_blk;
|
||
continue;
|
||
}
|
||
|
||
//-----
|
||
// We have a bucket sharing all config except for ISE weight/endpoint levels. Decide how many to place on the shortlist using analytic weighted MSE/SSE estimates.
|
||
|
||
const uint32_t num_modes_in_bucket = bucket_trial_mode_indices.size_u32();
|
||
|
||
uint32_t num_modes_in_bucket_to_shortlist = fast_roundf_pos_int(num_modes_in_bucket * p.m_num_similar_modes_in_bucket_to_shortlist_fract);
|
||
|
||
num_modes_in_bucket_to_shortlist = clamp<uint32_t>(num_modes_in_bucket_to_shortlist, p.m_num_similar_modes_in_bucket_to_shortlist_fract_min, p.m_num_similar_modes_in_bucket_to_shortlist_fract_max);
|
||
|
||
num_modes_in_bucket_to_shortlist = clamp<uint32_t>(num_modes_in_bucket_to_shortlist, 1, num_modes_in_bucket);
|
||
|
||
basisu::vector<uint32_t> bucket_indices(num_modes_in_bucket);
|
||
for (uint32_t i = 0; i < num_modes_in_bucket; i++)
|
||
bucket_indices[i] = i;
|
||
|
||
if (num_modes_in_bucket_to_shortlist < num_modes_in_bucket)
|
||
{
|
||
basisu::vector<float> sse_estimates(num_modes_in_bucket);
|
||
|
||
const uint32_t bucket_surrogate_endpoint_levels = bucket.m_surrogate_log_blk.m_num_endpoint_levels;
|
||
const uint32_t bucket_surrogate_weight_levels = bucket.m_surrogate_log_blk.m_num_weight_levels;
|
||
const float bucket_surrogate_base_sse = bucket.m_sse;
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(p.m_block_width, p.m_block_height, bucket.m_grid_width, bucket.m_grid_height);
|
||
const astc_ldr::partitions_data* pBucket_part_data = (bucket.m_num_parts == 1) ? nullptr : ((bucket.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3);
|
||
|
||
bool can_use_base_ofs = false;
|
||
if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
{
|
||
float max_span_size = 0.0f;
|
||
for (uint32_t part_iter = 0; part_iter < bucket.m_num_parts; part_iter++)
|
||
{
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] - bucket.m_surrogate_log_blk.m_endpoints[part_iter][0][c]);
|
||
max_span_size = maximum(max_span_size, span_size);
|
||
}
|
||
}
|
||
|
||
can_use_base_ofs = max_span_size < .25f;
|
||
}
|
||
|
||
chan_mse_est bucket_sse_est(0.0f, 0.0f);
|
||
for (uint32_t part_iter = 0; part_iter < bucket.m_num_parts; part_iter++)
|
||
{
|
||
uint32_t total_texels_in_part = p.m_block_width * p.m_block_height;
|
||
if (bucket.m_num_parts > 1)
|
||
{
|
||
total_texels_in_part = pBucket_part_data->m_partition_pat_histograms[bucket.m_unique_seed_index].m_hist[part_iter];
|
||
assert(total_texels_in_part && total_texels_in_part < p.m_block_width * p.m_block_height);
|
||
}
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] - bucket.m_surrogate_log_blk.m_endpoints[part_iter][0][c]);
|
||
|
||
chan_mse_est chan_mse_est(compute_quantized_channel_mse_estimates(
|
||
can_use_base_ofs ? minimum<uint32_t>(bucket_surrogate_endpoint_levels * 2, 256) : bucket_surrogate_endpoint_levels,
|
||
bucket_surrogate_weight_levels,
|
||
span_size, pGrid_data->m_weight_gamma));
|
||
|
||
if (span_size == 0.0f)
|
||
{
|
||
if ((bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 1.0f) || (bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 0.0f))
|
||
{
|
||
chan_mse_est.m_ep = 0.0f;
|
||
chan_mse_est.m_wp = 0.0f;
|
||
}
|
||
}
|
||
|
||
bucket_sse_est.m_ep += chan_mse_est.m_ep * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part;
|
||
bucket_sse_est.m_wp += chan_mse_est.m_wp * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part;
|
||
} // c
|
||
|
||
} // part_iter
|
||
|
||
#if 0
|
||
fmt_debug_printf("----------------\n");
|
||
|
||
fmt_debug_printf("bucket endpoint levels: {}, weight levels: {}, surrogate sse: {}, ep_est: {}, wp_est: {}, avg RGB subset0 span: {}\n",
|
||
bucket_surrogate_endpoint_levels, bucket_surrogate_weight_levels,
|
||
bucket.m_sse,
|
||
bucket_sse_est.m_ep, bucket_sse_est.m_wp,
|
||
(fabs(bucket.m_surrogate_log_blk.m_endpoints[0][1][0] - bucket.m_surrogate_log_blk.m_endpoints[0][0][0]) +
|
||
fabs(bucket.m_surrogate_log_blk.m_endpoints[0][1][1] - bucket.m_surrogate_log_blk.m_endpoints[0][0][1]) +
|
||
fabs(bucket.m_surrogate_log_blk.m_endpoints[0][1][2] - bucket.m_surrogate_log_blk.m_endpoints[0][0][2])) / 3.0f);
|
||
#endif
|
||
|
||
for (uint32_t j = 0; j < bucket_trial_mode_indices.size(); j++)
|
||
{
|
||
const uint32_t trial_mode_index = bucket_trial_mode_indices[j];
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index];
|
||
|
||
const uint32_t trial_mode_endpoint_levels = astc_helpers::get_ise_levels(tm.m_endpoint_ise_range);
|
||
const uint32_t trial_mode_weight_levels = astc_helpers::get_ise_levels(tm.m_weight_ise_range);
|
||
|
||
assert(trial_mode_endpoint_levels <= bucket_surrogate_endpoint_levels);
|
||
assert(trial_mode_weight_levels <= bucket_surrogate_weight_levels);
|
||
|
||
chan_mse_est mode_sse_est(0.0f, 0.0f);
|
||
for (uint32_t part_iter = 0; part_iter < bucket.m_num_parts; part_iter++)
|
||
{
|
||
uint32_t total_texels_in_part = p.m_block_width * p.m_block_height;
|
||
if (bucket.m_num_parts > 1)
|
||
{
|
||
total_texels_in_part = pBucket_part_data->m_partition_pat_histograms[bucket.m_unique_seed_index].m_hist[part_iter];
|
||
assert(total_texels_in_part && total_texels_in_part < p.m_block_width * p.m_block_height);
|
||
}
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
float span_size = fabs(bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] - bucket.m_surrogate_log_blk.m_endpoints[part_iter][0][c]);
|
||
|
||
chan_mse_est chan_mse_est(compute_quantized_channel_mse_estimates(
|
||
can_use_base_ofs ? minimum<uint32_t>(trial_mode_endpoint_levels * 2, 256) : trial_mode_endpoint_levels,
|
||
trial_mode_weight_levels,
|
||
span_size, pGrid_data->m_weight_gamma));
|
||
|
||
if (span_size == 0.0f)
|
||
{
|
||
if ((bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 1.0f) || (bucket.m_surrogate_log_blk.m_endpoints[part_iter][1][c] == 0.0f))
|
||
{
|
||
chan_mse_est.m_ep = 0.0f;
|
||
chan_mse_est.m_wp = 0.0f;
|
||
}
|
||
}
|
||
|
||
mode_sse_est.m_ep += chan_mse_est.m_ep * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part;
|
||
mode_sse_est.m_wp += chan_mse_est.m_wp * (float)p.m_pEnc_params->m_comp_weights[c] * total_texels_in_part;
|
||
} // c
|
||
|
||
} // part_iter
|
||
|
||
// Remove the bucket's base estimated endpoint/weight quant
|
||
if (trial_mode_endpoint_levels == bucket_surrogate_endpoint_levels)
|
||
{
|
||
mode_sse_est.m_ep = 0.0f;
|
||
}
|
||
else
|
||
{
|
||
mode_sse_est.m_ep -= bucket_sse_est.m_ep;
|
||
|
||
if (mode_sse_est.m_ep < 0.0f)
|
||
mode_sse_est.m_ep = 0.0f;
|
||
}
|
||
|
||
if (trial_mode_weight_levels == bucket_surrogate_weight_levels)
|
||
{
|
||
mode_sse_est.m_wp = 0.0f;
|
||
}
|
||
else
|
||
{
|
||
mode_sse_est.m_wp -= bucket_sse_est.m_wp;
|
||
|
||
if (mode_sse_est.m_wp < 0.0f)
|
||
mode_sse_est.m_wp = 0.0f;
|
||
}
|
||
|
||
float mode_total_sse_est = bucket_surrogate_base_sse + mode_sse_est.m_ep + mode_sse_est.m_wp;
|
||
|
||
sse_estimates[j] = mode_total_sse_est;
|
||
|
||
#if 0
|
||
// TEMP comparison code
|
||
float actual_sse = 0.0f;
|
||
|
||
{
|
||
log_surrogate_astc_blk temp_surrogate_log_blk;
|
||
if (bucket.m_num_parts == 1)
|
||
{
|
||
actual_sse = encode_surrogate_trial(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
bucket.m_cem_index,
|
||
bucket.m_ccs_index,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
bucket.m_grid_width, bucket.m_grid_height,
|
||
temp_surrogate_log_blk,
|
||
*p.m_pEnc_params);
|
||
}
|
||
else
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (bucket.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[bucket.m_unique_seed_index];
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[bucket.m_unique_seed_index];
|
||
|
||
actual_sse = encode_surrogate_trial_subsets(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
bucket.m_cem_index, bucket.m_num_parts, part_seed_index, pPat,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
bucket.m_grid_width, bucket.m_grid_height,
|
||
temp_surrogate_log_blk,
|
||
*p.m_pEnc_params, 0);
|
||
}
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
}
|
||
|
||
fmt_debug_printf("sse: {}, actual sse: {}, endpoint levels: {} weight levels: {}\n", sse_estimates[j], actual_sse, trial_mode_endpoint_levels, trial_mode_weight_levels);
|
||
#endif
|
||
|
||
} // j
|
||
|
||
#if 0
|
||
fmt_debug_printf("\n");
|
||
#endif
|
||
|
||
indirect_sort(num_modes_in_bucket, bucket_indices.get_ptr(), sse_estimates.get_ptr());
|
||
|
||
} // if (num_modes_in_bucket_to_shortlist < num_modes_in_bucket)
|
||
|
||
// Surrogate encode the best looking buckets after factoring in estimate SSE errors.
|
||
|
||
for (uint32_t q = 0; q < num_modes_in_bucket_to_shortlist; q++)
|
||
{
|
||
const uint32_t j = bucket_indices[q];
|
||
|
||
trial_surrogate& s = *shortlist_trials.try_enlarge(1);
|
||
|
||
const uint32_t trial_mode_index = bucket_trial_mode_indices[j];
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index];
|
||
|
||
s.m_trial_mode_index = trial_mode_index;
|
||
|
||
if (bucket.m_num_parts == 1)
|
||
{
|
||
s.m_err = encode_surrogate_trial(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
bucket.m_cem_index,
|
||
bucket.m_ccs_index,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
bucket.m_grid_width, bucket.m_grid_height,
|
||
s.m_log_blk,
|
||
*p.m_pEnc_params, 0);
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
}
|
||
else
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (bucket.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = pPart_data->m_unique_index_to_part_seed[bucket.m_unique_seed_index];
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[bucket.m_unique_seed_index];
|
||
|
||
s.m_err = encode_surrogate_trial_subsets(
|
||
p.m_block_width, p.m_block_height,
|
||
pixel_stats,
|
||
bucket.m_cem_index, bucket.m_num_parts, part_seed_index, pPat,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
bucket.m_grid_width, bucket.m_grid_height,
|
||
s.m_log_blk,
|
||
*p.m_pEnc_params, 0);
|
||
|
||
stats.m_total_surrogate_encodes++;
|
||
}
|
||
|
||
if ((bucket.m_cem_index == astc_helpers::CEM_LDR_RGB_DIRECT) || (bucket.m_cem_index == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
{
|
||
// blue contraction/base+offset discount
|
||
s.m_err *= BLUE_CONTRACTION_BASE_OFS_DISCOUNT;
|
||
}
|
||
|
||
} // j
|
||
|
||
} // bucket_index
|
||
|
||
if (!shortlist_trials.size())
|
||
return false;
|
||
|
||
shortlist_trials.sort();
|
||
|
||
stats.m_total_shortlist_candidates += shortlist_trials.size_u32();
|
||
|
||
return true;
|
||
}
|
||
|
||
bool final_polish_encode_from_shortlist(
|
||
const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
basisu::vector<trial_surrogate>& shortlist_trials = m_trial_surrogates;
|
||
|
||
// TODO: Diversity selection
|
||
const float shortlist_fract = p.m_final_shortlist_fraction[m_block_complexity_index];
|
||
|
||
uint32_t max_shortlist_trials = (uint32_t)std::roundf((float)shortlist_trials.size_u32() * shortlist_fract);
|
||
|
||
max_shortlist_trials = clamp<uint32_t>(max_shortlist_trials, p.m_final_shortlist_min_size[m_block_complexity_index], p.m_final_shortlist_max_size[m_block_complexity_index]);
|
||
|
||
uint32_t total_shortlist_trials = clamp<uint32_t>(max_shortlist_trials, 1, shortlist_trials.size_u32());
|
||
|
||
const uint32_t EARLY_STOP2_SHORTLIST_ITER_INDEX = 5;
|
||
|
||
// Now do the real encodes on the top surrogate shortlist trials.
|
||
for (uint32_t shortlist_iter = 0; shortlist_iter < total_shortlist_trials; shortlist_iter++)
|
||
{
|
||
const uint32_t trial_mode_index = shortlist_trials[shortlist_iter].m_trial_mode_index;
|
||
const basist::astc_ldr_t::trial_mode& tm = p.m_pTrial_modes[trial_mode_index];
|
||
|
||
astc_helpers::log_astc_block log_astc_blk;
|
||
|
||
bool base_ofs_succeeded_flag = false;
|
||
|
||
if ((p.m_final_encode_try_base_ofs) && ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT)))
|
||
{
|
||
// Add RGB/RGBA BASE PLUS OFFSET variant.
|
||
astc_helpers::log_astc_block log_astc_blk_alt;
|
||
|
||
const uint32_t base_ofs_cem_index = (tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) ? astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET : astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET;
|
||
|
||
bool base_ofs_clamped_flag = false;
|
||
|
||
bool alt_enc_trial_status;
|
||
if (tm.m_num_parts > 1)
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = shortlist_trials[shortlist_iter].m_log_blk.m_seed_index;
|
||
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index];
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[part_unique_index];
|
||
|
||
alt_enc_trial_status = encode_trial_subsets(
|
||
p.m_block_width, p.m_block_height, pixel_stats, base_ofs_cem_index, tm.m_num_parts,
|
||
part_seed_index, pPat,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
tm.m_grid_width, tm.m_grid_height, log_astc_blk_alt, *p.m_pEnc_params, false,
|
||
p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag,
|
||
p.m_use_blue_contraction, &base_ofs_clamped_flag);
|
||
}
|
||
else
|
||
{
|
||
alt_enc_trial_status = encode_trial(
|
||
p.m_block_width, p.m_block_height, pixel_stats, base_ofs_cem_index,
|
||
tm.m_ccs_index != -1, tm.m_ccs_index,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
tm.m_grid_width, tm.m_grid_height, log_astc_blk_alt, *p.m_pEnc_params,
|
||
p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag,
|
||
p.m_use_blue_contraction, &base_ofs_clamped_flag);
|
||
}
|
||
|
||
assert(alt_enc_trial_status);
|
||
|
||
if (alt_enc_trial_status)
|
||
{
|
||
stats.m_total_full_encodes++;
|
||
|
||
encode_block_output* pOut_block2 = out_blocks.enlarge(1);
|
||
pOut_block2->clear();
|
||
pOut_block2->m_trial_mode_index = safe_cast_int16(trial_mode_index);
|
||
pOut_block2->m_log_blk = log_astc_blk_alt;
|
||
pOut_block2->m_blur_id = safe_cast_uint16(blur_id);
|
||
pOut_block2->m_sse = eval_error(p.m_block_width, p.m_block_height, log_astc_blk_alt, pixel_stats, *p.m_pEnc_params);
|
||
|
||
if ((p.m_early_stop_wpsnr) || (p.m_early_stop2_wpsnr))
|
||
{
|
||
const float wpsnr = compute_psnr_from_wsse(p.m_block_width, p.m_block_height, pOut_block2->m_sse, p.m_pEnc_params->get_total_comp_weights());
|
||
|
||
if ((p.m_early_stop_wpsnr) && (wpsnr >= p.m_early_stop_wpsnr))
|
||
break;
|
||
|
||
if (shortlist_iter >= EARLY_STOP2_SHORTLIST_ITER_INDEX)
|
||
{
|
||
if ((p.m_early_stop2_wpsnr) && (wpsnr >= p.m_early_stop2_wpsnr))
|
||
break;
|
||
}
|
||
}
|
||
|
||
base_ofs_succeeded_flag = !base_ofs_clamped_flag;
|
||
}
|
||
|
||
} // (p.m_final_encode_try_base_ofs)
|
||
|
||
if ((p.m_final_encode_always_try_rgb_direct) || (!base_ofs_succeeded_flag))
|
||
{
|
||
bool enc_trial_status;
|
||
|
||
if (tm.m_num_parts > 1)
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? p.m_pPart_data_p2 : p.m_pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = shortlist_trials[shortlist_iter].m_log_blk.m_seed_index;
|
||
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index];
|
||
assert(part_unique_index < astc_helpers::NUM_PARTITION_PATTERNS);
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[part_unique_index];
|
||
|
||
enc_trial_status = encode_trial_subsets(
|
||
p.m_block_width, p.m_block_height, pixel_stats, tm.m_cem, tm.m_num_parts,
|
||
part_seed_index, pPat,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
tm.m_grid_width, tm.m_grid_height, log_astc_blk, *p.m_pEnc_params, false,
|
||
p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag,
|
||
p.m_use_blue_contraction);
|
||
}
|
||
else
|
||
{
|
||
enc_trial_status = encode_trial(
|
||
p.m_block_width, p.m_block_height, pixel_stats, tm.m_cem,
|
||
tm.m_ccs_index != -1, tm.m_ccs_index,
|
||
tm.m_endpoint_ise_range, tm.m_weight_ise_range,
|
||
tm.m_grid_width, tm.m_grid_height, log_astc_blk, *p.m_pEnc_params,
|
||
p.m_gradient_descent_flag, p.m_polish_weights_flag, p.m_qcd_enabled_flag,
|
||
p.m_use_blue_contraction);
|
||
}
|
||
|
||
assert(enc_trial_status);
|
||
|
||
if (!enc_trial_status)
|
||
return false;
|
||
|
||
stats.m_total_full_encodes++;
|
||
|
||
{
|
||
encode_block_output* pOut_block1 = out_blocks.enlarge(1);
|
||
pOut_block1->clear();
|
||
pOut_block1->m_trial_mode_index = safe_cast_int16(trial_mode_index);
|
||
pOut_block1->m_log_blk = log_astc_blk;
|
||
pOut_block1->m_blur_id = safe_cast_uint16(blur_id);
|
||
pOut_block1->m_sse = eval_error(p.m_block_width, p.m_block_height, log_astc_blk, pixel_stats, *p.m_pEnc_params);
|
||
|
||
if ((p.m_early_stop_wpsnr) || (p.m_early_stop2_wpsnr))
|
||
{
|
||
const float wpsnr = compute_psnr_from_wsse(p.m_block_width, p.m_block_height, pOut_block1->m_sse, p.m_pEnc_params->get_total_comp_weights());
|
||
|
||
if ((p.m_early_stop_wpsnr) && (wpsnr >= p.m_early_stop_wpsnr))
|
||
break;
|
||
|
||
if (shortlist_iter >= EARLY_STOP2_SHORTLIST_ITER_INDEX)
|
||
{
|
||
if ((p.m_early_stop2_wpsnr) && (wpsnr >= p.m_early_stop2_wpsnr))
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
} // if (!skip_encode_flag)
|
||
|
||
} // shortlist_iter
|
||
|
||
return true;
|
||
}
|
||
|
||
bool full_encode(const ldr_astc_lowlevel_block_encoder_params& p,
|
||
const astc_ldr::pixel_stats_t& pixel_stats,
|
||
basisu::vector<encode_block_output>& out_blocks,
|
||
uint32_t blur_id,
|
||
encode_block_stats& stats)
|
||
{
|
||
clear();
|
||
|
||
if (!init(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!partition_triage(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!trivial_triage(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!analytic_triage(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!surrogate_encode_shortlist_bucket_representatives(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!prune_shortlist_buckets(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!rank_and_sort_shortlist_buckets(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
if (!final_polish_encode_from_shortlist(p, pixel_stats, out_blocks, blur_id, stats))
|
||
return false;
|
||
|
||
return true;
|
||
}
|
||
};
|
||
|
||
class ldr_astc_lowlevel_block_encoder_pool
|
||
{
|
||
public:
|
||
ldr_astc_lowlevel_block_encoder_pool()
|
||
{
|
||
}
|
||
|
||
void init(uint32_t total_threads)
|
||
{
|
||
std::lock_guard g(m_mutex);
|
||
|
||
m_pool.resize(total_threads);
|
||
|
||
for (uint32_t i = 0; i < total_threads; i++)
|
||
m_pool[i].m_used_flag = false;
|
||
}
|
||
|
||
void deinit()
|
||
{
|
||
std::lock_guard g(m_mutex);
|
||
|
||
for (uint32_t i = 0; i < m_pool.size(); i++)
|
||
{
|
||
if (m_pool[i].m_used_flag)
|
||
{
|
||
assert(0);
|
||
debug_printf("ldr_astc_lowlevel_block_encoder_pool::deinit: Pool entry still marked as used\n");
|
||
}
|
||
|
||
m_pool[i].m_used_flag = false;
|
||
}
|
||
|
||
m_pool.resize(0);
|
||
}
|
||
|
||
ldr_astc_lowlevel_block_encoder* acquire()
|
||
{
|
||
std::lock_guard g(m_mutex);
|
||
|
||
assert(m_pool.size());
|
||
|
||
ldr_astc_lowlevel_block_encoder* pRes = nullptr;
|
||
|
||
for (uint32_t i = 0; i < m_pool.size(); i++)
|
||
{
|
||
if (!m_pool[i].m_used_flag)
|
||
{
|
||
pRes = &m_pool[i];
|
||
pRes->m_used_flag = true;
|
||
|
||
break;
|
||
}
|
||
}
|
||
|
||
assert(pRes);
|
||
|
||
return pRes;
|
||
}
|
||
|
||
bool release(ldr_astc_lowlevel_block_encoder* pTemps)
|
||
{
|
||
std::lock_guard g(m_mutex);
|
||
|
||
assert(m_pool.size());
|
||
|
||
if ((pTemps < m_pool.begin()) || (pTemps >= m_pool.end()))
|
||
{
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
size_t idx = pTemps - m_pool.begin();
|
||
if (idx >= m_pool.size())
|
||
{
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
m_pool[idx].m_used_flag = false;
|
||
|
||
return true;
|
||
}
|
||
|
||
private:
|
||
std::mutex m_mutex;
|
||
basisu::vector<ldr_astc_lowlevel_block_encoder> m_pool;
|
||
};
|
||
|
||
class scoped_ldr_astc_lowlevel_block_encoder
|
||
{
|
||
public:
|
||
scoped_ldr_astc_lowlevel_block_encoder(ldr_astc_lowlevel_block_encoder_pool& pool) :
|
||
m_pool(pool)
|
||
{
|
||
m_pTemps = pool.acquire();
|
||
}
|
||
|
||
~scoped_ldr_astc_lowlevel_block_encoder()
|
||
{
|
||
m_pool.release(m_pTemps);
|
||
}
|
||
|
||
ldr_astc_lowlevel_block_encoder_pool& get_pool() const
|
||
{
|
||
return m_pool;
|
||
}
|
||
|
||
ldr_astc_lowlevel_block_encoder* get_ptr()
|
||
{
|
||
return m_pTemps;
|
||
}
|
||
|
||
private:
|
||
ldr_astc_lowlevel_block_encoder_pool& m_pool;
|
||
ldr_astc_lowlevel_block_encoder* m_pTemps;
|
||
};
|
||
|
||
|
||
//-------------------------------------------------------------------
|
||
|
||
#pragma pack(push, 1)
// Byte-packed description of one encoder trial mode (seven single-byte fields, no padding).
// The value ranges below are the ones noted by the original author.
struct trial_mode_desc
{
    uint8_t m_unique_cem_index; // LDR base CEM's, 0-5
    uint8_t m_ccs;              // 0 if SP, 1-4 for DP
    uint8_t m_subsets;          // 1-3
    uint8_t m_eise;             // endpoint ise range, 4-20
    uint8_t m_wise;             // weight ise range, 0-11
    uint8_t m_grid_w;           // grid resolution, 4-12
    uint8_t m_grid_h;           // grid resolution, 4-12
};
#pragma pack(pop)

// Lock in the packed layout so an accidental field change is caught at compile time.
static_assert(sizeof(trial_mode_desc) == 7, "trial_mode_desc must stay 7 tightly packed bytes");
|
||
|
||
// Maps an ASTC color endpoint mode (0-15) to a compact "unique LDR CEM" index (0-5),
// or -1 for CEMs that have no compact index (the HDR modes and the base+offset variants).
static constexpr int s_astc_cem_to_unique_ldr_index[16] =
{
    0,  // CEM_LDR_LUM_DIRECT
    -1, // CEM_LDR_LUM_BASE_PLUS_OFS
    -1, // CEM_HDR_LUM_LARGE_RANGE
    -1, // CEM_HDR_LUM_SMALL_RANGE
    1,  // CEM_LDR_LUM_ALPHA_DIRECT
    -1, // CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS
    2,  // CEM_LDR_RGB_BASE_SCALE
    -1, // CEM_HDR_RGB_BASE_SCALE
    3,  // CEM_LDR_RGB_DIRECT
    -1, // CEM_LDR_RGB_BASE_PLUS_OFFSET
    4,  // CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A
    -1, // CEM_HDR_RGB
    5,  // CEM_LDR_RGBA_DIRECT
    -1, // CEM_LDR_RGBA_BASE_PLUS_OFFSET
    -1, // CEM_HDR_RGB_LDR_ALPHA
    -1, // CEM_HDR_RGB_HDR_ALPHA
};
|
||
|
||
#if 0
// Disabled local copy of the inverse of s_astc_cem_to_unique_ldr_index (unique LDR index -> ASTC CEM).
// pack_tm_desc() checks against basist::astc_ldr_t::s_unique_ldr_index_to_astc_cem instead.
static const int s_unique_ldr_index_to_astc_cem[6] =
{
    astc_helpers::CEM_LDR_LUM_DIRECT,                // 0
    astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT,          // 1
    astc_helpers::CEM_LDR_RGB_BASE_SCALE,            // 2
    astc_helpers::CEM_LDR_RGB_DIRECT,                // 3
    astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A, // 4
    astc_helpers::CEM_LDR_RGBA_DIRECT                // 5
};
#endif
|
||
|
||
static uint32_t pack_tm_desc(
|
||
uint32_t grid_width, uint32_t grid_height,
|
||
uint32_t cem_index, uint32_t ccs_index, uint32_t num_subsets,
|
||
uint32_t endpoint_ise_range, uint32_t weight_ise_range)
|
||
{
|
||
assert((grid_width >= 2) && (grid_width <= 12));
|
||
assert((grid_height >= 2) && (grid_height <= 12));
|
||
assert((cem_index < 16) && astc_helpers::is_cem_ldr(cem_index));
|
||
assert((num_subsets >= 1) && (num_subsets <= 3));
|
||
assert(ccs_index <= 4); // 0 for SP, 1-4 for DP
|
||
assert((endpoint_ise_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
|
||
assert((weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE) && (weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE));
|
||
|
||
grid_width -= 2;
|
||
grid_height -= 2;
|
||
assert((grid_width <= 10) && (grid_height <= 10));
|
||
|
||
const int unique_cem_index = s_astc_cem_to_unique_ldr_index[cem_index];
|
||
assert((unique_cem_index >= 0) && (unique_cem_index <= 5));
|
||
assert(basist::astc_ldr_t::s_unique_ldr_index_to_astc_cem[unique_cem_index] == (int)cem_index);
|
||
|
||
num_subsets--;
|
||
|
||
endpoint_ise_range -= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE;
|
||
|
||
uint32_t cur_bit_ofs = 0;
|
||
|
||
#define BU_PACK_FIELD(val, bits) do { uint32_t v = (uint32_t)(val); assert(v < (1u << bits)); packed_id |= (v << cur_bit_ofs); cur_bit_ofs += (bits); } while(0)
|
||
|
||
uint32_t packed_id = 0;
|
||
BU_PACK_FIELD(endpoint_ise_range, basist::astc_ldr_t::CFG_PACK_EISE_BITS);
|
||
BU_PACK_FIELD(weight_ise_range, basist::astc_ldr_t::CFG_PACK_WISE_BITS);
|
||
BU_PACK_FIELD(ccs_index, basist::astc_ldr_t::CFG_PACK_CCS_BITS);
|
||
BU_PACK_FIELD(num_subsets, basist::astc_ldr_t::CFG_PACK_SUBSETS_BITS);
|
||
BU_PACK_FIELD(unique_cem_index, basist::astc_ldr_t::CFG_PACK_CEM_BITS);
|
||
// must be at the top
|
||
BU_PACK_FIELD(grid_width * 11 + grid_height, basist::astc_ldr_t::CFG_PACK_GRID_BITS);
|
||
#undef BU_PACK_FIELD
|
||
|
||
assert(cur_bit_ofs == 24);
|
||
|
||
return packed_id;
|
||
}
|
||
|
||
void create_encoder_trial_modes_full_eval(uint32_t block_width, uint32_t block_height,
|
||
basisu::vector<basist::astc_ldr_t::trial_mode>& encoder_trial_modes, basist::astc_ldr_t::grouped_trial_modes& grouped_encoder_trial_modes,
|
||
bool print_debug_info = true, bool print_modes = false)
|
||
{
|
||
interval_timer itm;
|
||
itm.start();
|
||
|
||
encoder_trial_modes.resize(0);
|
||
grouped_encoder_trial_modes.clear();
|
||
|
||
uint32_t max_grid_width = 0, max_grid_height = 0;
|
||
uint32_t total_evals = 0, total_partial_evals = 0, total_evals_succeeded = 0;
|
||
uint32_t mode_index = 0;
|
||
uint_vec packed_mode_ids;
|
||
|
||
for (uint32_t alpha_iter = 0; alpha_iter < 2; alpha_iter++)
|
||
{
|
||
if (print_modes)
|
||
{
|
||
if (alpha_iter)
|
||
fmt_debug_printf("ALPHA TRIAL MODES\n");
|
||
else
|
||
fmt_debug_printf("RGB TRIAL MODES\n");
|
||
}
|
||
|
||
astc_helpers::astc_block phys_block;
|
||
|
||
for (uint32_t cem_mode_iter = 0; cem_mode_iter < 3; cem_mode_iter++)
|
||
{
|
||
const uint32_t s_rgb_cems[3] = { astc_helpers::CEM_LDR_LUM_DIRECT, astc_helpers::CEM_LDR_RGB_BASE_SCALE, astc_helpers::CEM_LDR_RGB_DIRECT };
|
||
const uint32_t s_alpha_cems[3] = { astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT, astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A, astc_helpers::CEM_LDR_RGBA_DIRECT };
|
||
|
||
const uint32_t cem_index = alpha_iter ? s_alpha_cems[cem_mode_iter] : s_rgb_cems[cem_mode_iter];
|
||
|
||
uint32_t num_dp_chans = 0;
|
||
bool cem_supports_dual_plane = false;
|
||
bool cem_supports_subsets = false;
|
||
|
||
// base+ofs variants are automatically used later as alternates to RGB/RGBA direct modes
|
||
switch (cem_index)
|
||
{
|
||
case astc_helpers::CEM_LDR_LUM_DIRECT:
|
||
num_dp_chans = 0; // only a single component, so only a single plane
|
||
cem_supports_dual_plane = false;
|
||
cem_supports_subsets = true;
|
||
break;
|
||
case astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT:
|
||
num_dp_chans = 1; // CCS can only be 3
|
||
cem_supports_dual_plane = true;
|
||
cem_supports_subsets = true;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGB_DIRECT:
|
||
num_dp_chans = 3;
|
||
cem_supports_dual_plane = true;
|
||
cem_supports_subsets = true;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGB_BASE_SCALE:
|
||
num_dp_chans = 3;
|
||
cem_supports_dual_plane = true;
|
||
cem_supports_subsets = true;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGBA_DIRECT:
|
||
num_dp_chans = 4;
|
||
cem_supports_dual_plane = true;
|
||
cem_supports_subsets = true;
|
||
break;
|
||
case astc_helpers::CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
|
||
num_dp_chans = 4;
|
||
cem_supports_dual_plane = true;
|
||
cem_supports_subsets = true;
|
||
break;
|
||
default:
|
||
assert(0);
|
||
break;
|
||
}
|
||
|
||
for (int dp = 0; dp < (cem_supports_dual_plane ? 2 : 1); dp++)
|
||
{
|
||
const bool use_subsets = !dp && cem_supports_subsets;
|
||
|
||
for (int subsets = 1; subsets <= (use_subsets ? 3 : 1); subsets++)
|
||
{
|
||
for (uint32_t grid_height = 2; grid_height <= block_height; grid_height++)
|
||
{
|
||
for (uint32_t grid_width = 2; grid_width <= block_width; grid_width++)
|
||
{
|
||
for (uint32_t dp_chan_index = 0; dp_chan_index < (dp ? num_dp_chans : 1); dp_chan_index++)
|
||
{
|
||
astc_helpers::log_astc_block log_block;
|
||
log_block.clear();
|
||
|
||
log_block.m_grid_width = (uint8_t)grid_width;
|
||
log_block.m_grid_height = (uint8_t)grid_height;
|
||
|
||
log_block.m_num_partitions = (uint8_t)subsets;
|
||
|
||
for (int i = 0; i < subsets; i++)
|
||
log_block.m_color_endpoint_modes[i] = (uint8_t)cem_index;
|
||
|
||
log_block.m_dual_plane = dp > 0;
|
||
|
||
if (log_block.m_dual_plane)
|
||
{
|
||
uint32_t ccs_index = dp_chan_index;
|
||
|
||
if (cem_index == astc_helpers::CEM_LDR_LUM_ALPHA_DIRECT)
|
||
{
|
||
// must be 3 for LA if DP is enabled
|
||
ccs_index = 3;
|
||
}
|
||
|
||
log_block.m_color_component_selector = (uint8_t)ccs_index;
|
||
}
|
||
|
||
for (uint32_t weight_ise_range = astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE; weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE; weight_ise_range++)
|
||
{
|
||
log_block.m_weight_ise_range = (uint8_t)weight_ise_range;
|
||
log_block.m_endpoint_ise_range = astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE; // dummy value
|
||
|
||
total_partial_evals++;
|
||
|
||
bool success = astc_helpers::pack_astc_block(phys_block, log_block, nullptr, nullptr, astc_helpers::cValidateEarlyOutAtEndpointISEChecks);
|
||
if (!success)
|
||
continue;
|
||
|
||
// in reality only 1 endpoint ISE range is valid here
|
||
for (uint32_t endpoint_ise_range = astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE; endpoint_ise_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE; endpoint_ise_range++)
|
||
{
|
||
log_block.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;
|
||
|
||
total_evals++;
|
||
|
||
success = astc_helpers::pack_astc_block(phys_block, log_block, nullptr, nullptr, astc_helpers::cValidateSkipFinalEndpointWeightPacking);
|
||
if (!success)
|
||
continue;
|
||
|
||
total_evals_succeeded++;
|
||
|
||
if (print_modes)
|
||
{
|
||
fmt_debug_printf("{}: CEM: {} DP: {}, CCS: {}, SUBSETS: {}, GRID: {}x{}, ENDPOINTS: {}, WEIGHTS: {}\n",
|
||
mode_index,
|
||
log_block.m_color_endpoint_modes[0],
|
||
log_block.m_dual_plane,
|
||
log_block.m_color_component_selector,
|
||
log_block.m_num_partitions,
|
||
log_block.m_grid_width, log_block.m_grid_height,
|
||
astc_helpers::get_ise_levels(log_block.m_endpoint_ise_range),
|
||
astc_helpers::get_ise_levels(log_block.m_weight_ise_range));
|
||
}
|
||
|
||
basist::astc_ldr_t::trial_mode m;
|
||
m.m_ccs_index = log_block.m_dual_plane ? log_block.m_color_component_selector : -1;
|
||
m.m_cem = log_block.m_color_endpoint_modes[0];
|
||
m.m_endpoint_ise_range = log_block.m_endpoint_ise_range;
|
||
m.m_weight_ise_range = log_block.m_weight_ise_range;
|
||
m.m_grid_width = grid_width;
|
||
m.m_grid_height = grid_height;
|
||
m.m_num_parts = log_block.m_num_partitions;
|
||
|
||
uint32_t packed_index = pack_tm_desc(
|
||
log_block.m_grid_width, log_block.m_grid_height,
|
||
log_block.m_color_endpoint_modes[0], log_block.m_dual_plane ? (log_block.m_color_component_selector + 1) : 0, log_block.m_num_partitions,
|
||
log_block.m_endpoint_ise_range, log_block.m_weight_ise_range);
|
||
|
||
assert(packed_index <= 0xFFFFFF);
|
||
packed_mode_ids.push_back(packed_index);
|
||
|
||
grouped_encoder_trial_modes.add(block_width, block_height, m, encoder_trial_modes.size_u32());
|
||
|
||
encoder_trial_modes.push_back(m);
|
||
|
||
max_grid_width = maximum(max_grid_width, grid_width);
|
||
max_grid_height = maximum(max_grid_height, grid_height);
|
||
|
||
++mode_index;
|
||
|
||
} // weight_ise_range
|
||
} // endpoint_ise_range
|
||
|
||
} // ccs_index
|
||
|
||
} // grid_width
|
||
|
||
} // grid_height
|
||
|
||
} // subsets
|
||
|
||
} // dp
|
||
|
||
} // cem_mode_iter
|
||
|
||
} // alpha_iter
|
||
|
||
#if 0
|
||
packed_mode_ids.sort();
|
||
|
||
for (uint32_t i = 0; i < packed_mode_ids.size(); i++)
|
||
{
|
||
uint32_t packed_index = packed_mode_ids[i];
|
||
|
||
fmt_debug_printf("{},{},{},", packed_index & 0xFF, (packed_index >> 8) & 0xFF, (packed_index >> 16) & 0xFF);
|
||
if ((i & 15) == 15)
|
||
fmt_debug_printf("\n");
|
||
}
|
||
#endif
|
||
|
||
if (print_debug_info)
|
||
{
|
||
fmt_debug_printf("create_encoder_trial_modes_full_eval() time: {} secs\n", itm.get_elapsed_secs());
|
||
|
||
fmt_debug_printf("create_encoder_trial_modes_full_eval() - ASTC {}x{} modes\n", block_width, block_height);
|
||
fmt_debug_printf("total_evals: {}, total_partial_evals: {}, total_evals_succeeded: {}\n", total_evals, total_partial_evals, total_evals_succeeded);
|
||
fmt_debug_printf("Total trial modes: {}\n", (uint32_t)encoder_trial_modes.size());
|
||
fmt_debug_printf("Total used trial mode groups: {}\n", grouped_encoder_trial_modes.count_used_groups());
|
||
fmt_debug_printf("Max ever grid dimensions: {}x{}\n", max_grid_width, max_grid_height);
|
||
}
|
||
|
||
// sanity check
|
||
assert(encoder_trial_modes.size() < 11000);
|
||
}
|
||
|
||
// All unordered pairs of distinct RGBA channel indices (0=R, 1=G, 2=B, 3=A).
// The first three entries are the pairs that involve only R, G and B.
constexpr uint32_t TOTAL_RGBA_CHAN_PAIRS = 6;
//const uint32_t TOTAL_RGB_CHAN_PAIRS = 3;
static constexpr uint8_t g_rgba_chan_pairs[TOTAL_RGBA_CHAN_PAIRS][2] =
{
    { 0, 1 },
    { 0, 2 },
    { 1, 2 },
    { 0, 3 },
    { 1, 3 },
    { 2, 3 }
};
|
||
|
||
bool encoder_trial_mode_test()
|
||
{
|
||
for (uint32_t w = 4; w <= 12; w++)
|
||
{
|
||
for (uint32_t h = 4; h <= 12; h++)
|
||
{
|
||
if (!astc_helpers::is_valid_block_size(w, h))
|
||
continue;
|
||
|
||
basisu::vector<basist::astc_ldr_t::trial_mode> encoder_trial_modes_orig;
|
||
basist::astc_ldr_t::grouped_trial_modes grouped_encoder_trial_modes_orig;
|
||
|
||
create_encoder_trial_modes_full_eval(w, h,
|
||
encoder_trial_modes_orig, grouped_encoder_trial_modes_orig,
|
||
false, false);
|
||
|
||
fmt_debug_printf("Testing block size {}x{}, {} total modes\n", w, h, encoder_trial_modes_orig.size_u32());
|
||
|
||
basisu::hash_map<basist::astc_ldr_t::trial_mode> trial_mode_hash;
|
||
for (uint32_t i = 0; i < encoder_trial_modes_orig.size(); i++)
|
||
{
|
||
trial_mode_hash.insert(encoder_trial_modes_orig[i]);
|
||
}
|
||
|
||
basisu::vector<basist::astc_ldr_t::trial_mode> encoder_trial_modes_new;
|
||
basist::astc_ldr_t::grouped_trial_modes grouped_encoder_trial_modes_new;
|
||
|
||
basist::astc_ldr_t::create_encoder_trial_modes_table(w, h,
|
||
encoder_trial_modes_new, grouped_encoder_trial_modes_new,
|
||
false, false);
|
||
|
||
if (encoder_trial_modes_new.size() != encoder_trial_modes_orig.size())
|
||
{
|
||
fmt_error_printf("trial mode test failed!\n");
|
||
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
for (uint32_t i = 0; i < encoder_trial_modes_new.size(); i++)
|
||
{
|
||
const basist::astc_ldr_t::trial_mode& tm = encoder_trial_modes_new[i];
|
||
if (trial_mode_hash.find(tm) == trial_mode_hash.end())
|
||
{
|
||
fmt_error_printf("trial mode test failed!\n");
|
||
|
||
assert(0);
|
||
return false;
|
||
}
|
||
}
|
||
|
||
} // h
|
||
} // w
|
||
|
||
fmt_debug_printf("trial mode test succeeded\n");
|
||
return true;
|
||
}
|
||
|
||
//----------------------------------------------------------------------------------
|
||
|
||
struct ldr_astc_block_encode_image_high_level_config
|
||
{
|
||
uint32_t m_block_width = 6;
|
||
uint32_t m_block_height = 6;
|
||
|
||
bool m_second_superpass_refinement = true;
|
||
float m_second_superpass_fract_to_recompress = .075f;
|
||
|
||
bool m_third_superpass_try_neighbors = true;
|
||
|
||
float m_base_q = 75.0f;
|
||
bool m_use_dct = false;
|
||
|
||
bool m_subsets_enabled = true;
|
||
bool m_subsets_edge_filtering = true;
|
||
|
||
bool m_filter_by_pca_angles_flag = true;
|
||
float m_use_direct_angle_thresh = 2.0f;
|
||
float m_use_base_scale_angle_thresh = 7.0f;
|
||
|
||
bool m_force_all_dual_plane_chan_evals = false; // much slower, test on base
|
||
bool m_disable_rgb_dual_plane = false; // DP can be on alpha only, if block has alpha
|
||
float m_strong_dp_decorr_thresh_rgb = .998f;
|
||
|
||
bool m_use_base_ofs = true;
|
||
bool m_use_blue_contraction = true;
|
||
|
||
bool m_grid_hv_filtering = true;
|
||
bool m_low_freq_block_filtering = true;
|
||
|
||
uint32_t m_superbucket_max_to_retain[3] = { 4, 8, 16 };
|
||
|
||
float m_final_shortlist_fraction[3] = { .25f, .33f, .5f };
|
||
uint32_t m_final_shortlist_min_size[3] = { 1, 1, 1 };
|
||
uint32_t m_final_shortlist_max_size[3] = { 4096, 4096, 4096 };
|
||
|
||
uint32_t m_part2_fraction_to_keep = 2;
|
||
uint32_t m_part3_fraction_to_keep = 2;
|
||
uint32_t m_base_parts2 = 32;
|
||
uint32_t m_base_parts3 = 32;
|
||
|
||
float m_early_stop_wpsnr = 0.0f;
|
||
float m_early_stop2_wpsnr = 0.0f;
|
||
|
||
bool m_blurring_enabled = false;
|
||
bool m_blurring_enabled_p2 = false;
|
||
|
||
bool m_gradient_descent_flag = true;
|
||
bool m_polish_weights_flag = true;
|
||
bool m_qcd_enabled_flag = true; // gradient descent must be enabled too
|
||
bool m_bucket_pruning_passes = true;
|
||
|
||
// 2nd superpass options
|
||
uint32_t m_base_parts2_p2 = 64;
|
||
uint32_t m_base_parts3_p2 = 64;
|
||
uint32_t m_superbucket_max_to_retain_p2[3] = { 16, 32, 256 };
|
||
uint32_t m_final_shortlist_max_size_p2[3] = { 4096, 4096, 4096 };
|
||
uint32_t m_second_pass_total_weight_refine_passes = astc_ldr::WEIGHT_REFINER_MAX_PASSES;
|
||
bool m_second_pass_force_subsets_enabled = true;
|
||
bool m_force_all_dp_chans_p2 = false;
|
||
bool m_final_encode_always_try_rgb_direct = false;
|
||
bool m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
// only store the single best result per block
|
||
//bool m_save_single_result = false;
|
||
|
||
bool m_debug_images = false;
|
||
bool m_debug_output = false;
|
||
|
||
std::string m_debug_file_prefix;
|
||
|
||
job_pool* m_pJob_pool;
|
||
|
||
//saliency_map m_saliency_map;
|
||
|
||
astc_ldr::cem_encode_params m_cem_enc_params;
|
||
};
|
||
|
||
struct ldr_astc_block_encode_image_output
|
||
{
|
||
ldr_astc_block_encode_image_output()
|
||
{
|
||
}
|
||
|
||
~ldr_astc_block_encode_image_output()
|
||
{
|
||
interval_timer itm;
|
||
itm.start();
|
||
|
||
const int num_blocks_x = m_image_block_info.get_width();
|
||
const int num_blocks_y = m_image_block_info.get_height();
|
||
|
||
for (int y = num_blocks_y - 1; y >= 0; --y)
|
||
{
|
||
for (int x = num_blocks_x - 1; x >= 0; --x)
|
||
{
|
||
auto& out_blocks = m_image_block_info(x, y).m_out_blocks;
|
||
out_blocks.clear();
|
||
}
|
||
} // y
|
||
|
||
//fmt_debug_printf("Cleared enc_out image block info: {3.3} secs\n", itm.get_elapsed_secs());
|
||
}
|
||
|
||
astc_ldr::partitions_data m_part_data_p2;
|
||
astc_ldr::partitions_data m_part_data_p3;
|
||
|
||
basisu::vector<basist::astc_ldr_t::trial_mode> m_encoder_trial_modes;
|
||
basist::astc_ldr_t::grouped_trial_modes m_grouped_encoder_trial_modes;
|
||
|
||
vector2D<astc_helpers::astc_block> m_packed_phys_blocks;
|
||
|
||
struct block_info
|
||
{
|
||
block_info()
|
||
{
|
||
m_pixel_stats.clear();
|
||
}
|
||
|
||
astc_ldr::pixel_stats_t m_pixel_stats; // of original/input block
|
||
|
||
basisu::vector<encode_block_output> m_out_blocks;
|
||
|
||
uint32_t m_packed_out_block_index = 0; // index of best out block by WSSE
|
||
|
||
bool m_low_freq_block_flag = false;
|
||
bool m_super_strong_edges = false;
|
||
bool m_very_strong_edges = false;
|
||
bool m_strong_edges = false;
|
||
};
|
||
|
||
vector2D<block_info> m_image_block_info;
|
||
|
||
struct block_info_superpass1
|
||
{
|
||
int m_config_reuse_neighbor_out_block_indices[basist::astc_ldr_t::cMaxConfigReuseNeighbors] = { cInvalidIndex, cInvalidIndex, cInvalidIndex };
|
||
|
||
bool m_config_reuse_new_neighbor_out_block_flags[basist::astc_ldr_t::cMaxConfigReuseNeighbors] = { false, false, false };
|
||
|
||
basisu::vector<encode_block_output> m_new_out_config_reuse_blocks;
|
||
basisu::vector<encode_block_output> m_new_out_config_endpoint_reuse_blocks;
|
||
};
|
||
|
||
vector2D<block_info_superpass1> m_image_block_info_superpass2;
|
||
|
||
private:
|
||
ldr_astc_block_encode_image_output(const ldr_astc_block_encode_image_output&);
|
||
ldr_astc_block_encode_image_output& operator= (const ldr_astc_block_encode_image_output&);
|
||
};
|
||
|
||
// NOTE(review): compile-time switch whose consumers lie outside this excerpt; presumably selects
// per-block (selective) use of the blurred source images rather than blurring everywhere - confirm at the use sites.
constexpr bool selective_blurring = true;
|
||
|
||
bool ldr_astc_block_encode_image(
|
||
const image& orig_img,
|
||
const ldr_astc_block_encode_image_high_level_config& enc_cfg,
|
||
ldr_astc_block_encode_image_output& enc_out)
|
||
{
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("ldr_astc_block_encode_image:\n");
|
||
|
||
const uint32_t block_width = enc_cfg.m_block_width, block_height = enc_cfg.m_block_height;
|
||
const uint32_t width = orig_img.get_width(), height = orig_img.get_height();
|
||
const uint32_t total_pixels = width * height;
|
||
const uint32_t total_block_pixels = enc_cfg.m_block_width * enc_cfg.m_block_height;
|
||
const uint32_t num_blocks_x = orig_img.get_block_width(enc_cfg.m_block_width);
|
||
const uint32_t num_blocks_y = orig_img.get_block_height(enc_cfg.m_block_height);
|
||
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("ASTC base bitrate: {3.3} bpp\n", 128.0f / (float)(enc_cfg.m_block_width * enc_cfg.m_block_height));
|
||
|
||
fmt_debug_printf("ASTC block size: {}x{}\n", enc_cfg.m_block_width, enc_cfg.m_block_height);
|
||
}
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("Image has alpha: {}\n", orig_img.has_alpha());
|
||
|
||
astc_ldr::partitions_data* pPart_data_p2 = &enc_out.m_part_data_p2;
|
||
pPart_data_p2->init(2, enc_cfg.m_block_width, enc_cfg.m_block_height);
|
||
|
||
astc_ldr::partitions_data* pPart_data_p3 = &enc_out.m_part_data_p3;
|
||
pPart_data_p3->init(3, enc_cfg.m_block_width, enc_cfg.m_block_height);
|
||
|
||
// blurring coefficients
|
||
const float bw0 = 1.15f;
|
||
const float bw1 = 1.25f, bw1_a = 1.0f;
|
||
const float bw2 = 1.25f;
|
||
|
||
// TODO: Make this optional/tune this, add only 2 level blurring support
|
||
image orig_img_blurred2, orig_img_blurred3, orig_img_blurred4, orig_img_blurred5;
|
||
|
||
if ((enc_cfg.m_blurring_enabled) || (enc_cfg.m_blurring_enabled_p2))
|
||
{
|
||
orig_img_blurred2.resize(orig_img.get_width(), orig_img.get_height());
|
||
orig_img_blurred3.resize(orig_img.get_width(), orig_img.get_height());
|
||
orig_img_blurred4.resize(orig_img.get_width(), orig_img.get_height());
|
||
orig_img_blurred5.resize(orig_img.get_width(), orig_img.get_height());
|
||
|
||
image_resample(orig_img, orig_img_blurred2, true, "gaussian", bw0);
|
||
image_resample(orig_img, orig_img_blurred3, true, "gaussian", bw1, false, 0, 4, bw1_a);
|
||
image_resample(orig_img, orig_img_blurred4, true, "gaussian", bw1_a, false, 0, 4, bw1);
|
||
image_resample(orig_img, orig_img_blurred5, true, "gaussian", bw2, false);
|
||
}
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
{
|
||
save_png(enc_cfg.m_debug_file_prefix + "dbg_astc_ldr_orig_img.png", orig_img);
|
||
|
||
if ((enc_cfg.m_blurring_enabled) || (enc_cfg.m_blurring_enabled_p2))
|
||
{
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred2.png", orig_img_blurred2);
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred3.png", orig_img_blurred3);
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred4.png", orig_img_blurred4);
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_blurred5.png", orig_img_blurred5);
|
||
}
|
||
}
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("Dimensions: {}x{}, Blocks: {}x{}, Total blocks: {}\n", width, height, num_blocks_x, num_blocks_y, total_blocks);
|
||
|
||
image orig_img_sobel_x, orig_img_sobel_y;
|
||
compute_sobel(orig_img, orig_img_sobel_x, &g_sobel_x[0][0]);
|
||
compute_sobel(orig_img, orig_img_sobel_y, &g_sobel_y[0][0]);
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
{
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_sobel_x.png", orig_img_sobel_x);
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_sobel_y.png", orig_img_sobel_y);
|
||
}
|
||
|
||
image orig_img_sobel_xy(width, height);
|
||
for (uint32_t y = 0; y < height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < width; x++)
|
||
{
|
||
const color_rgba& sx = orig_img_sobel_x(x, y);
|
||
const color_rgba& sy = orig_img_sobel_y(x, y);
|
||
|
||
orig_img_sobel_xy(x, y).set(
|
||
iabs((int)sx.r - 128) + iabs((int)sy.r - 128),
|
||
iabs((int)sx.g - 128) + iabs((int)sy.g - 128),
|
||
iabs((int)sx.b - 128) + iabs((int)sy.b - 128),
|
||
iabs((int)sx.a - 128) + iabs((int)sy.a - 128));
|
||
}
|
||
}
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_orig_sobel_xy.png", orig_img_sobel_xy);
|
||
|
||
vector2D<astc_helpers::astc_block>& packed_blocks = enc_out.m_packed_phys_blocks;
|
||
packed_blocks.resize(num_blocks_x, num_blocks_y);
|
||
memset(packed_blocks.get_ptr(), 0, packed_blocks.size_in_bytes());
|
||
|
||
assert(enc_cfg.m_pJob_pool);
|
||
job_pool& job_pool = *enc_cfg.m_pJob_pool;
|
||
|
||
std::atomic<bool> encoder_failed_flag;
|
||
encoder_failed_flag.store(false);
|
||
|
||
std::mutex global_mutex;
|
||
|
||
basisu::vector<basist::astc_ldr_t::trial_mode>& encoder_trial_modes = enc_out.m_encoder_trial_modes;
|
||
encoder_trial_modes.reserve(4096);
|
||
|
||
basist::astc_ldr_t::grouped_trial_modes& grouped_encoder_trial_modes = enc_out.m_grouped_encoder_trial_modes;
|
||
basist::astc_ldr_t::create_encoder_trial_modes_table(block_width, block_height, encoder_trial_modes, grouped_encoder_trial_modes, enc_cfg.m_debug_output, false);
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
{
|
||
uint32_t total_actual_modes = encoder_trial_modes.size_u32();
|
||
|
||
if (enc_cfg.m_use_base_ofs)
|
||
{
|
||
for (uint32_t i = 0; i < encoder_trial_modes.size(); i++)
|
||
{
|
||
const auto& tm = encoder_trial_modes[i];
|
||
|
||
switch (tm.m_cem)
|
||
{
|
||
case astc_helpers::CEM_LDR_RGBA_DIRECT:
|
||
case astc_helpers::CEM_LDR_RGB_DIRECT:
|
||
// add base+ofs variant
|
||
total_actual_modes++;
|
||
break;
|
||
default:
|
||
break;
|
||
}
|
||
} // i
|
||
}
|
||
|
||
fmt_debug_printf("Base encoder trial modes: {}, grand total including base+ofs CEM's: {}\n", encoder_trial_modes.size_u32(), total_actual_modes);
|
||
}
|
||
|
||
uint32_t total_used_bc = 0;
|
||
|
||
uint_vec used_rgb_direct_count;
|
||
used_rgb_direct_count.resize(encoder_trial_modes.size());
|
||
|
||
uint_vec used_base_offset_count;
|
||
used_base_offset_count.resize(encoder_trial_modes.size());
|
||
|
||
uint32_t total_void_extent_blocks_skipped = 0;
|
||
|
||
uint32_t total_superbuckets_created = 0;
|
||
uint32_t total_buckets_created = 0;
|
||
uint32_t total_surrogate_encodes = 0;
|
||
uint32_t total_full_encodes = 0;
|
||
uint32_t total_shortlist_candidates = 0;
|
||
uint32_t total_full_encodes_pass1 = 0;
|
||
uint32_t total_full_encodes_pass2 = 0;
|
||
|
||
uint32_t total_blur_encodes = 0;
|
||
uint32_t total_blurred_blocks1 = 0;
|
||
uint32_t total_blurred_blocks2 = 0;
|
||
uint32_t total_blurred_blocks3 = 0;
|
||
uint32_t total_blurred_blocks4 = 0;
|
||
|
||
basist::astc_ldr_t::dct2f dct;
|
||
dct.init(enc_cfg.m_block_height, enc_cfg.m_block_width);
|
||
|
||
image vis_part_usage_img, vis_part_pat_img, vis_strong_edge, vis_dct_low_freq_block, vis_dp_img, vis_base_ofs_img;
|
||
if (enc_cfg.m_debug_images)
|
||
{
|
||
vis_part_usage_img.resize(block_width * num_blocks_x, block_height * num_blocks_y);
|
||
vis_part_pat_img.resize(block_width * num_blocks_x, block_height * num_blocks_y);
|
||
vis_strong_edge.resize(block_width * num_blocks_x, block_height * num_blocks_y);
|
||
vis_dct_low_freq_block.resize(block_width * num_blocks_x, block_height * num_blocks_y);
|
||
vis_dp_img.resize(block_width * num_blocks_x, block_height * num_blocks_y);
|
||
vis_base_ofs_img.resize(block_width * num_blocks_x, block_height * num_blocks_y);
|
||
}
|
||
|
||
ldr_astc_lowlevel_block_encoder_pool encoder_pool;
|
||
assert(job_pool.get_total_threads());
|
||
encoder_pool.init((uint32_t)job_pool.get_total_threads());
|
||
|
||
basist::astc_ldr_t::grid_weight_dct grid_coder;
|
||
grid_coder.init(block_width, block_height);
|
||
|
||
struct output_block_devel_desc
|
||
{
|
||
const basist::astc_ldr_t::trial_mode* m_pTrial_modes;
|
||
int m_trial_mode_index; // this is the index of the mode it tried to encode, but the actual output/enc block could have used base+ofs
|
||
bool m_had_alpha;
|
||
|
||
bool m_low_freq_block_flag;
|
||
bool m_super_strong_edges;
|
||
bool m_very_strong_edges;
|
||
bool m_strong_edges;
|
||
|
||
void clear()
|
||
{
|
||
clear_obj(*this);
|
||
}
|
||
};
|
||
|
||
enc_out.m_image_block_info.resize(0, 0);
|
||
enc_out.m_image_block_info.resize(num_blocks_x, num_blocks_y);
|
||
|
||
#if 0
|
||
for (uint32_t y = 0; y < num_blocks_y; y++)
|
||
{
|
||
for (uint32_t x = 0; x < num_blocks_x; x++)
|
||
{
|
||
auto& out_blocks = enc_out.m_image_block_info(x, y).m_out_blocks;
|
||
out_blocks.reserve(16);
|
||
out_blocks.resize(0);
|
||
}
|
||
} // y
|
||
#endif
|
||
|
||
vector2D<bool> superpass2_recompress_block_flags;
|
||
|
||
if (enc_cfg.m_second_superpass_refinement)
|
||
superpass2_recompress_block_flags.resize(num_blocks_x, num_blocks_y);
|
||
|
||
if (enc_cfg.m_third_superpass_try_neighbors)
|
||
enc_out.m_image_block_info_superpass2.resize(num_blocks_x, num_blocks_y);
|
||
|
||
interval_timer itm;
|
||
itm.start();
|
||
|
||
//--------------------------------------------------------------------------------------
|
||
// ASTC compression loop
|
||
|
||
vector2D<output_block_devel_desc> output_block_devel_info(num_blocks_x, num_blocks_y);
|
||
|
||
uint32_t total_superpasses = 1;
|
||
if (enc_cfg.m_third_superpass_try_neighbors)
|
||
total_superpasses = 3;
|
||
else if (enc_cfg.m_second_superpass_refinement)
|
||
total_superpasses = 2;
|
||
|
||
uint32_t total_blocks_to_recompress = 0;
|
||
|
||
for (uint32_t superpass_index = 0; superpass_index < total_superpasses; superpass_index++)
|
||
{
|
||
if (superpass_index == 1)
|
||
{
|
||
if (!enc_cfg.m_second_superpass_refinement)
|
||
continue;
|
||
if (!total_blocks_to_recompress)
|
||
continue;
|
||
}
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("ASTC packing superpass: {}\n", 1 + superpass_index);
|
||
|
||
uint32_t total_blocks_done = 0;
|
||
float last_printed_progress_val = -100.0f;
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
job_pool.add_job([superpass_index,
|
||
//width, height,
|
||
bx, by,
|
||
//num_blocks_x, num_blocks_y,
|
||
total_blocks, block_width, block_height, total_block_pixels, &packed_blocks, &global_mutex,
|
||
&orig_img, &orig_img_sobel_xy, &orig_img_blurred2, &orig_img_blurred3, &orig_img_blurred4, &orig_img_blurred5,
|
||
&enc_cfg, &encoder_failed_flag, pPart_data_p2, pPart_data_p3,
|
||
&total_blocks_done, &total_superbuckets_created, &total_buckets_created, &total_surrogate_encodes, &total_full_encodes, &total_shortlist_candidates,
|
||
&encoder_trial_modes,
|
||
&total_blur_encodes, &total_blurred_blocks1,
|
||
&total_full_encodes_pass1, &total_full_encodes_pass2,
|
||
&dct, &vis_dct_low_freq_block,
|
||
&encoder_pool, &grid_coder, &grouped_encoder_trial_modes,
|
||
&enc_out, &output_block_devel_info, &total_void_extent_blocks_skipped, &superpass2_recompress_block_flags, &total_blocks_to_recompress, &last_printed_progress_val]
|
||
{
|
||
if (encoder_failed_flag)
|
||
return;
|
||
|
||
//const uint32_t base_x = bx * block_width, base_y = by * block_height;
|
||
|
||
color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
orig_img.extract_block_clamped(block_pixels, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
if (superpass_index == 2)
|
||
{
|
||
// Superpass 2: Encode to best neighbor configurations
|
||
const ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by);
|
||
|
||
ldr_astc_block_encode_image_output::block_info_superpass1& out_block_info_superpass1 = enc_out.m_image_block_info_superpass2(bx, by);
|
||
|
||
const astc_ldr::pixel_stats_t& pixel_stats = out_block_info.m_pixel_stats;
|
||
|
||
const bool is_purely_solid_block = (pixel_stats.m_min == pixel_stats.m_max);
|
||
|
||
// if void extent, just skip
|
||
if (is_purely_solid_block)
|
||
return;
|
||
|
||
//const basisu::vector<encode_block_output>& out_blocks = out_block_info.m_out_blocks;
|
||
|
||
for (uint32_t neighbor_index = 0; neighbor_index < basist::astc_ldr_t::cMaxConfigReuseNeighbors; neighbor_index++)
|
||
{
|
||
const ldr_astc_block_encode_image_output::block_info* pNeighbor_out_block_info = nullptr;
|
||
|
||
if (neighbor_index == 0)
|
||
{
|
||
// Left
|
||
if (bx)
|
||
pNeighbor_out_block_info = &enc_out.m_image_block_info(bx - 1, by);
|
||
}
|
||
else if (neighbor_index == 1)
|
||
{
|
||
// Up
|
||
if (by)
|
||
pNeighbor_out_block_info = &enc_out.m_image_block_info(bx, by - 1);
|
||
}
|
||
else
|
||
{
|
||
assert(neighbor_index == 2);
|
||
|
||
// Diagonal
|
||
if ((bx) && (by))
|
||
pNeighbor_out_block_info = &enc_out.m_image_block_info(bx - 1, by - 1);
|
||
}
|
||
|
||
if (!pNeighbor_out_block_info)
|
||
continue;
|
||
|
||
const encode_block_output& neighbor_output = pNeighbor_out_block_info->m_out_blocks[pNeighbor_out_block_info->m_packed_out_block_index];
|
||
|
||
// Best neighbor was solid, skip it (TODO: reusing it is possible)
|
||
if (neighbor_output.m_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
const uint32_t neighbor_tm_index = neighbor_output.m_trial_mode_index;
|
||
assert(neighbor_tm_index < encoder_trial_modes.size());
|
||
|
||
//const trial_mode& neighbor_tm = encoder_trial_modes[neighbor_tm_index]; // do not use the tm's cem, it may be base+ofs, use the log blk instead
|
||
|
||
const astc_helpers::log_astc_block& neighbor_log_blk = neighbor_output.m_log_blk;
|
||
assert(!neighbor_log_blk.m_solid_color_flag_ldr);
|
||
|
||
const uint32_t neighbor_actual_cem = neighbor_log_blk.m_color_endpoint_modes[0];
|
||
const uint32_t neighbor_partition_id = neighbor_log_blk.m_partition_id;
|
||
|
||
// See if we've already encoded this full config
|
||
int already_existing_out_block_index = cInvalidIndex;
|
||
for (uint32_t i = 0; i < out_block_info.m_out_blocks.size(); i++)
|
||
{
|
||
if ((out_block_info.m_out_blocks[i].m_trial_mode_index == (int)neighbor_tm_index) &&
|
||
(out_block_info.m_out_blocks[i].m_log_blk.m_color_endpoint_modes[0] == neighbor_actual_cem) &&
|
||
(out_block_info.m_out_blocks[i].m_log_blk.m_partition_id == neighbor_partition_id))
|
||
{
|
||
already_existing_out_block_index = i;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (already_existing_out_block_index != cInvalidIndex)
|
||
{
|
||
// We already have an output block using this neighbor trial mode, skip
|
||
out_block_info_superpass1.m_config_reuse_neighbor_out_block_indices[neighbor_index] = (uint32_t)already_existing_out_block_index;
|
||
out_block_info_superpass1.m_config_reuse_new_neighbor_out_block_flags[neighbor_index] = false;
|
||
}
|
||
else
|
||
{
|
||
// Re-encode using the neighbor's full config (tm, base+ofs, partition ID)
|
||
astc_helpers::log_astc_block new_log_block;
|
||
|
||
bool status = false;
|
||
|
||
if (neighbor_log_blk.m_num_partitions > 1)
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (neighbor_log_blk.m_num_partitions == 2) ? pPart_data_p2 : pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = neighbor_log_blk.m_partition_id;
|
||
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index];
|
||
|
||
assert(part_unique_index < astc_helpers::NUM_PARTITION_PATTERNS);
|
||
const astc_ldr::partition_pattern_vec* pPat = &pPart_data->m_partition_pats[part_unique_index];
|
||
|
||
bool refine_only_flag = false;
|
||
|
||
status = encode_trial_subsets(
|
||
block_width, block_height,
|
||
pixel_stats,
|
||
neighbor_log_blk.m_color_endpoint_modes[0], neighbor_log_blk.m_num_partitions, neighbor_log_blk.m_partition_id, pPat,
|
||
neighbor_log_blk.m_endpoint_ise_range, neighbor_log_blk.m_weight_ise_range,
|
||
neighbor_log_blk.m_grid_width, neighbor_log_blk.m_grid_height,
|
||
new_log_block,
|
||
enc_cfg.m_cem_enc_params,
|
||
refine_only_flag,
|
||
enc_cfg.m_gradient_descent_flag, enc_cfg.m_polish_weights_flag, enc_cfg.m_qcd_enabled_flag,
|
||
enc_cfg.m_use_blue_contraction);
|
||
}
|
||
else
|
||
{
|
||
status = encode_trial(
|
||
block_width, block_height,
|
||
pixel_stats,
|
||
neighbor_log_blk.m_color_endpoint_modes[0],
|
||
neighbor_log_blk.m_dual_plane, neighbor_log_blk.m_dual_plane ? neighbor_log_blk.m_color_component_selector : -1,
|
||
neighbor_log_blk.m_endpoint_ise_range, neighbor_log_blk.m_weight_ise_range,
|
||
neighbor_log_blk.m_grid_width, neighbor_log_blk.m_grid_height,
|
||
new_log_block,
|
||
enc_cfg.m_cem_enc_params,
|
||
enc_cfg.m_gradient_descent_flag, enc_cfg.m_polish_weights_flag, enc_cfg.m_qcd_enabled_flag,
|
||
enc_cfg.m_use_blue_contraction);
|
||
}
|
||
|
||
if (!status)
|
||
{
|
||
fmt_debug_printf("encode_trial/encode_trial_subsets failed in superpass 1!\n");
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
out_block_info_superpass1.m_config_reuse_neighbor_out_block_indices[neighbor_index] = out_block_info_superpass1.m_new_out_config_reuse_blocks.size_u32();
|
||
out_block_info_superpass1.m_config_reuse_new_neighbor_out_block_flags[neighbor_index] = true;
|
||
|
||
encode_block_output& new_output_blk = *out_block_info_superpass1.m_new_out_config_reuse_blocks.enlarge(1);
|
||
|
||
new_output_blk.clear();
|
||
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, new_log_block.m_grid_width, new_log_block.m_grid_height);
|
||
|
||
const uint32_t num_planes = (new_log_block.m_dual_plane ? 2 : 1);
|
||
|
||
for (uint32_t plane_index = 0; plane_index < num_planes; plane_index++)
|
||
{
|
||
bitwise_coder c;
|
||
basist::astc_ldr_t::dct_syms syms;
|
||
code_block_weights(grid_coder, enc_cfg.m_base_q, plane_index, new_log_block, pGrid_data, c, syms);
|
||
|
||
new_output_blk.m_packed_dct_plane_data[plane_index] = syms;
|
||
|
||
c.flush();
|
||
|
||
basist::bitwise_decoder d;
|
||
d.init(c.get_bytes().data(), c.get_bytes().size_u32());
|
||
|
||
// ensure existing weights get blown away
|
||
for (uint32_t i = 0; i < (uint32_t)(new_log_block.m_grid_width * new_log_block.m_grid_height); i++)
|
||
new_log_block.m_weights[i * num_planes + plane_index] = 0;
|
||
|
||
basist::astc_ldr_t::fvec dct_temp;
|
||
bool dec_status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, new_log_block, &d, pGrid_data, nullptr, dct_temp, nullptr);
|
||
|
||
assert(dec_status);
|
||
if (!dec_status)
|
||
{
|
||
error_printf("grid_coder.decode_block_weights() failed!\n");
|
||
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
} // if (enc_cfg.m_use_dct)
|
||
|
||
new_output_blk.m_trial_mode_index = safe_cast_int16(neighbor_tm_index);
|
||
new_output_blk.m_log_blk = new_log_block;
|
||
//new_output_blk.m_trial_surrogate.clear();
|
||
|
||
new_output_blk.m_sse = eval_error(block_width, block_height, new_log_block, pixel_stats, enc_cfg.m_cem_enc_params);
|
||
|
||
{
|
||
std::lock_guard g(global_mutex);
|
||
|
||
total_full_encodes_pass2++;
|
||
}
|
||
} // if (already_existing_out_block_index != cInvalidIndex)
|
||
|
||
{
|
||
// Re-encode using the neighbor's full config (tm, base+ofs, partition ID) AND its endpoints
|
||
astc_helpers::log_astc_block new_log_block(neighbor_log_blk);
|
||
|
||
// Start with fresh 0 weights, then polish them.
|
||
clear_obj(new_log_block.m_weights);
|
||
|
||
//const bool use_blue_contraction = enc_cfg.m_use_blue_contraction;
|
||
|
||
bool improved_flag = false;
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = nullptr;
|
||
if (neighbor_log_blk.m_num_partitions > 1)
|
||
{
|
||
const astc_ldr::partitions_data* pPart_data = (neighbor_log_blk.m_num_partitions == 2) ? pPart_data_p2 : pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = neighbor_log_blk.m_partition_id;
|
||
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index];
|
||
|
||
assert(part_unique_index < astc_helpers::NUM_PARTITION_PATTERNS);
|
||
pPat = &pPart_data->m_partition_pats[part_unique_index];
|
||
}
|
||
|
||
bool status = polish_block_weights(
|
||
block_width, block_height,
|
||
pixel_stats,
|
||
new_log_block,
|
||
enc_cfg.m_cem_enc_params, pPat, improved_flag,
|
||
enc_cfg.m_gradient_descent_flag, enc_cfg.m_polish_weights_flag, enc_cfg.m_qcd_enabled_flag);
|
||
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("polish_block_weights failed in superpass 1!\n");
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
encode_block_output& new_output_blk = *out_block_info_superpass1.m_new_out_config_endpoint_reuse_blocks.enlarge(1);
|
||
|
||
new_output_blk.clear();
|
||
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, new_log_block.m_grid_width, new_log_block.m_grid_height);
|
||
|
||
const uint32_t num_planes = (new_log_block.m_dual_plane ? 2 : 1);
|
||
|
||
for (uint32_t plane_index = 0; plane_index < num_planes; plane_index++)
|
||
{
|
||
bitwise_coder c;
|
||
basist::astc_ldr_t::dct_syms syms;
|
||
code_block_weights(grid_coder, enc_cfg.m_base_q, plane_index, new_log_block, pGrid_data, c, syms);
|
||
|
||
new_output_blk.m_packed_dct_plane_data[plane_index] = syms;
|
||
|
||
c.flush();
|
||
|
||
basist::bitwise_decoder d;
|
||
d.init(c.get_bytes().data(), c.get_bytes().size_u32());
|
||
|
||
// ensure existing weights get blown away
|
||
for (uint32_t i = 0; i < (uint32_t)(new_log_block.m_grid_width * new_log_block.m_grid_height); i++)
|
||
new_log_block.m_weights[i * num_planes + plane_index] = 0;
|
||
|
||
basist::astc_ldr_t::fvec dct_temp;
|
||
bool dec_status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, new_log_block, &d, pGrid_data, nullptr, dct_temp, nullptr);
|
||
|
||
assert(dec_status);
|
||
if (!dec_status)
|
||
{
|
||
error_printf("grid_coder.decode_block_weights() failed!\n");
|
||
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
} // if (enc_cfg.m_use_dct)
|
||
|
||
new_output_blk.m_trial_mode_index = safe_cast_int16(neighbor_tm_index);
|
||
new_output_blk.m_log_blk = new_log_block;
|
||
//new_output_blk.m_trial_surrogate.clear();
|
||
|
||
new_output_blk.m_sse = eval_error(block_width, block_height, new_log_block, pixel_stats, enc_cfg.m_cem_enc_params);
|
||
|
||
{
|
||
std::lock_guard g(global_mutex);
|
||
|
||
total_full_encodes_pass2++;
|
||
}
|
||
}
|
||
|
||
} // neighbor_index
|
||
}
|
||
else
|
||
{
|
||
if (superpass_index == 1)
|
||
{
|
||
if (!superpass2_recompress_block_flags(bx, by))
|
||
return;
|
||
}
|
||
|
||
// Superpass 0/1: core ASTC encoding
|
||
basisu::vector<encode_block_output>& out_blocks = enc_out.m_image_block_info(bx, by).m_out_blocks;
|
||
out_blocks.resize(0);
|
||
|
||
astc_ldr::pixel_stats_t& pixel_stats = enc_out.m_image_block_info(bx, by).m_pixel_stats;
|
||
|
||
if (superpass_index == 0)
|
||
pixel_stats.init(total_block_pixels, block_pixels);
|
||
|
||
const bool is_purely_solid_block = (pixel_stats.m_min == pixel_stats.m_max);
|
||
|
||
// early out on totally solid blocks
|
||
if (is_purely_solid_block)
|
||
{
|
||
encode_block_output* pOut = out_blocks.enlarge(1);
|
||
pOut->clear();
|
||
|
||
astc_helpers::log_astc_block& log_blk = pOut->m_log_blk;
|
||
|
||
log_blk.clear();
|
||
log_blk.m_solid_color_flag_ldr = true;
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
log_blk.m_solid_color[c] = pixel_stats.m_min[c];
|
||
|
||
// Expand each component to 16-bits
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
log_blk.m_solid_color[c] |= (uint16_t)(log_blk.m_solid_color[c]) << 8u;
|
||
|
||
pOut->m_sse = eval_error(block_width, block_height, log_blk, pixel_stats, enc_cfg.m_cem_enc_params);
|
||
|
||
ldr_astc_block_encode_image_output::block_info& block_info_out = enc_out.m_image_block_info(bx, by);
|
||
|
||
block_info_out.m_low_freq_block_flag = true;
|
||
block_info_out.m_super_strong_edges = false;
|
||
block_info_out.m_very_strong_edges = false;
|
||
block_info_out.m_strong_edges = false;
|
||
block_info_out.m_packed_out_block_index = 0;
|
||
|
||
// Create packed ASTC block
|
||
astc_helpers::astc_block& best_phys_block = packed_blocks(bx, by);
|
||
bool pack_success = astc_helpers::pack_astc_block(best_phys_block, log_blk);
|
||
if (!pack_success)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
output_block_devel_desc& out_devel_desc = output_block_devel_info(bx, by);
|
||
out_devel_desc.m_low_freq_block_flag = true;
|
||
out_devel_desc.m_super_strong_edges = false;
|
||
out_devel_desc.m_very_strong_edges = false;
|
||
out_devel_desc.m_strong_edges = false;
|
||
|
||
{
|
||
std::lock_guard g(global_mutex);
|
||
|
||
total_void_extent_blocks_skipped++;
|
||
|
||
total_blocks_done++;
|
||
}
|
||
|
||
return;
|
||
}
|
||
|
||
float max_std_dev = 0.0f;
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
max_std_dev = maximum(max_std_dev, pixel_stats.m_rgba_stats[i].m_std_dev);
|
||
|
||
bool is_lum_only = true;
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const color_rgba& c = pixel_stats.m_pixels[x + y * block_width];
|
||
bool is_lum_texel = (c.r == c.g) && (c.r == c.b);
|
||
if (!is_lum_texel)
|
||
{
|
||
is_lum_only = false;
|
||
break;
|
||
}
|
||
}
|
||
if (is_lum_only)
|
||
break;
|
||
}
|
||
|
||
basisu::vector<float> block_dct_energy(total_block_pixels);
|
||
|
||
bool filter_horizontally_flag = false;
|
||
bool low_freq_block_flag = 0;
|
||
|
||
{
|
||
basisu::vector<float> block_floats(total_block_pixels);
|
||
basisu::vector<float> block_dct(total_block_pixels);
|
||
basist::astc_ldr_t::fvec work;
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
block_floats[i] = pixel_stats.m_pixels_f[i][c];
|
||
|
||
dct.forward(block_floats.data(), block_dct.data(), work);
|
||
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
block_dct_energy[x + y * block_width] += (float)enc_cfg.m_cem_enc_params.m_comp_weights[c] * squaref(block_dct[x + y * block_width]);
|
||
|
||
} // c
|
||
|
||
// Wipe DC
|
||
block_dct_energy[0] = 0.0f;
|
||
|
||
float tot_energy = compute_preserved_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), block_width, block_height);
|
||
|
||
float h_energy_lost = compute_lost_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), block_width / 2, block_height);
|
||
float v_energy_lost = compute_lost_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), block_width, block_height / 2);
|
||
|
||
filter_horizontally_flag = h_energy_lost < v_energy_lost;
|
||
|
||
float hv2_lost_energy_fract = compute_lost_dct_energy(block_width, block_height, block_dct_energy.get_ptr(), 2, 2);
|
||
if (tot_energy)
|
||
hv2_lost_energy_fract /= tot_energy;
|
||
|
||
if ((hv2_lost_energy_fract < .03f) || (max_std_dev < (1.0f / 255.0f)))
|
||
low_freq_block_flag = true;
|
||
}
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
vis_dct_low_freq_block.fill_box(bx * block_width, by * block_height, block_width, block_height, low_freq_block_flag ? color_rgba(255, 0, 0, 255) : g_black_color);
|
||
|
||
bool active_chan_flags[4] = { };
|
||
|
||
// The number of channels with non-zero spans
|
||
uint32_t total_active_chans = 0;
|
||
// The indices of the channels with non-zero spans.
|
||
//uint32_t active_chan_list[4] = { 0 };
|
||
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
{
|
||
if (pixel_stats.m_rgba_stats[i].m_range > 0.0f)
|
||
{
|
||
assert(pixel_stats.m_max[i] != pixel_stats.m_min[i]);
|
||
|
||
active_chan_flags[i] = true;
|
||
|
||
//active_chan_list[total_active_chans] = i;
|
||
total_active_chans++;
|
||
}
|
||
else
|
||
{
|
||
assert(pixel_stats.m_max[i] == pixel_stats.m_min[i]);
|
||
}
|
||
}
|
||
|
||
basisu::comparative_stats<float> cross_chan_stats[TOTAL_RGBA_CHAN_PAIRS];
|
||
|
||
// def=max correlation for each channel pair (or 1 if one of the channels is inactive)
|
||
float chan_pair_correlations[6] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f };
|
||
// 0=0, 1
|
||
// 1=0, 2
|
||
// 2=1, 2
|
||
// 3=0, 3
|
||
// 4=1, 3
|
||
// 5=2, 3
|
||
|
||
float min_corr = 1.0f, max_corr = 0.0f;
|
||
|
||
for (uint32_t pair_index = 0; pair_index < TOTAL_RGBA_CHAN_PAIRS; pair_index++)
|
||
{
|
||
const uint32_t chanA = g_rgba_chan_pairs[pair_index][0];
|
||
const uint32_t chanB = g_rgba_chan_pairs[pair_index][1];
|
||
|
||
// If both channels were active, we've got usable correlation statistics.
|
||
if (active_chan_flags[chanA] && active_chan_flags[chanB])
|
||
{
|
||
// TODO: This can be directly derived from the 3D/4D covariance matrix entries.
|
||
cross_chan_stats[pair_index].calc_pearson(total_block_pixels,
|
||
&pixel_stats.m_pixels_f[0][chanA],
|
||
&pixel_stats.m_pixels_f[0][chanB],
|
||
4, 4,
|
||
&pixel_stats.m_rgba_stats[chanA],
|
||
&pixel_stats.m_rgba_stats[chanB]);
|
||
|
||
chan_pair_correlations[pair_index] = fabsf(cross_chan_stats[pair_index].m_pearson);
|
||
|
||
const float c = fabsf((float)cross_chan_stats[pair_index].m_pearson);
|
||
min_corr = minimum(min_corr, c);
|
||
max_corr = maximum(max_corr, c);
|
||
}
|
||
}
|
||
|
||
// min_corr stays at 1.0f if no channel pair had both channels active (e.g. a solid block)
|
||
|
||
// Pick the trial modes the encoder will use: RGB or RGBA (we don't currently support trying both)
|
||
|
||
const bool used_alpha_encoder_modes = pixel_stats.m_has_alpha;
|
||
|
||
float sobel_energy = 0.0f;
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const color_rgba& s = orig_img_sobel_xy.get_clamped(bx * block_width + x, by * block_height + y);
|
||
sobel_energy += s[0] * s[0] + s[1] * s[1] + s[2] * s[2] + s[3] * s[3];
|
||
} // x
|
||
} // y
|
||
|
||
sobel_energy /= (float)total_block_pixels;
|
||
|
||
// Configure low-level block encoder.
|
||
ldr_astc_lowlevel_block_encoder_params enc_blk_params;
|
||
|
||
enc_blk_params.m_block_width = block_width;
|
||
enc_blk_params.m_block_height = block_height;
|
||
enc_blk_params.m_total_block_pixels = total_block_pixels;
|
||
enc_blk_params.m_bx = bx;
|
||
enc_blk_params.m_by = by;
|
||
|
||
enc_blk_params.m_pOrig_img_sobel_xy_t = &orig_img_sobel_xy;
|
||
|
||
enc_blk_params.m_num_trial_modes = encoder_trial_modes.size_u32();
|
||
enc_blk_params.m_pTrial_modes = encoder_trial_modes.get_ptr();
|
||
enc_blk_params.m_pGrouped_trial_modes = &grouped_encoder_trial_modes;
|
||
|
||
enc_blk_params.m_pPart_data_p2 = pPart_data_p2;
|
||
enc_blk_params.m_pPart_data_p3 = pPart_data_p3;
|
||
enc_blk_params.m_pEnc_params = &enc_cfg.m_cem_enc_params;
|
||
|
||
float ang_dot = saturate(pixel_stats.m_zero_rel_axis3.dot3(pixel_stats.m_mean_rel_axis3));
|
||
const float pca_axis_angles = acosf(ang_dot) * (180.0f / (float)cPiD);
|
||
|
||
enc_blk_params.m_use_alpha_or_opaque_modes = used_alpha_encoder_modes;
|
||
enc_blk_params.m_use_lum_direct_modes = is_lum_only;
|
||
|
||
const bool filter_by_pca_angles_flag = (superpass_index == 1) ? enc_cfg.m_filter_by_pca_angles_flag_p2 : enc_cfg.m_filter_by_pca_angles_flag;
|
||
if (!filter_by_pca_angles_flag)
|
||
{
|
||
enc_blk_params.m_use_direct_modes = true;
|
||
enc_blk_params.m_use_base_scale_modes = true;
|
||
}
|
||
else
|
||
{
|
||
// TODO: Make selective based off edge blocks?
|
||
enc_blk_params.m_use_direct_modes = (!total_active_chans) || (pca_axis_angles > enc_cfg.m_use_direct_angle_thresh);
|
||
enc_blk_params.m_use_base_scale_modes = (pca_axis_angles <= enc_cfg.m_use_base_scale_angle_thresh);
|
||
}
|
||
|
||
enc_blk_params.m_grid_hv_filtering = enc_cfg.m_grid_hv_filtering;
|
||
enc_blk_params.m_filter_horizontally_flag = filter_horizontally_flag;
|
||
|
||
enc_blk_params.m_use_small_grids_only = low_freq_block_flag && enc_cfg.m_low_freq_block_filtering;
|
||
|
||
enc_blk_params.m_subsets_enabled = enc_cfg.m_subsets_enabled && (!low_freq_block_flag || !enc_cfg.m_subsets_edge_filtering);
|
||
|
||
enc_blk_params.m_subsets_edge_filtering = enc_cfg.m_subsets_edge_filtering;
|
||
|
||
enc_blk_params.m_use_blue_contraction = enc_cfg.m_use_blue_contraction;
|
||
enc_blk_params.m_final_encode_try_base_ofs = enc_cfg.m_use_base_ofs;
|
||
|
||
memcpy(enc_blk_params.m_superbucket_max_to_retain, enc_cfg.m_superbucket_max_to_retain, sizeof(enc_cfg.m_superbucket_max_to_retain));
|
||
|
||
memcpy(enc_blk_params.m_final_shortlist_fraction, enc_cfg.m_final_shortlist_fraction, sizeof(enc_blk_params.m_final_shortlist_fraction));
|
||
memcpy(enc_blk_params.m_final_shortlist_min_size, enc_cfg.m_final_shortlist_min_size, sizeof(enc_cfg.m_final_shortlist_min_size));
|
||
memcpy(enc_blk_params.m_final_shortlist_max_size, enc_cfg.m_final_shortlist_max_size, sizeof(enc_blk_params.m_final_shortlist_max_size));
|
||
|
||
enc_blk_params.m_part2_fraction_to_keep = enc_cfg.m_part2_fraction_to_keep;
|
||
enc_blk_params.m_part3_fraction_to_keep = enc_cfg.m_part3_fraction_to_keep;
|
||
enc_blk_params.m_base_parts2 = enc_cfg.m_base_parts2;
|
||
enc_blk_params.m_base_parts3 = enc_cfg.m_base_parts3;
|
||
enc_blk_params.m_gradient_descent_flag = enc_cfg.m_gradient_descent_flag;
|
||
enc_blk_params.m_polish_weights_flag = enc_cfg.m_polish_weights_flag;
|
||
enc_blk_params.m_qcd_enabled_flag = enc_cfg.m_qcd_enabled_flag;
|
||
enc_blk_params.m_bucket_pruning_passes = enc_cfg.m_bucket_pruning_passes;
|
||
|
||
enc_blk_params.m_alpha_cems = used_alpha_encoder_modes;
|
||
|
||
enc_blk_params.m_early_stop_wpsnr = enc_cfg.m_early_stop_wpsnr;
|
||
enc_blk_params.m_early_stop2_wpsnr = enc_cfg.m_early_stop2_wpsnr;
|
||
|
||
enc_blk_params.m_final_encode_always_try_rgb_direct = enc_cfg.m_final_encode_always_try_rgb_direct;
|
||
|
||
enc_blk_params.m_pDCT2F = &dct;
|
||
|
||
// Determine DP usage
|
||
if (enc_cfg.m_force_all_dual_plane_chan_evals)
|
||
{
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
enc_blk_params.m_dp_active_chans[i] = active_chan_flags[i];
|
||
}
|
||
else
|
||
{
|
||
for (uint32_t i = 0; i < 3; i++)
|
||
enc_blk_params.m_dp_active_chans[i] = false;
|
||
|
||
// Being very conservative with alpha here - always let the analytical evaluator consider it.
|
||
enc_blk_params.m_dp_active_chans[3] = pixel_stats.m_has_alpha;
|
||
|
||
if (!enc_cfg.m_disable_rgb_dual_plane)
|
||
{
|
||
const float rg_corr = chan_pair_correlations[0];
|
||
const float rb_corr = chan_pair_correlations[1];
|
||
const float gb_corr = chan_pair_correlations[2];
|
||
|
||
int desired_dp_chan_rgb = -1;
|
||
|
||
float min_p = minimum(rg_corr, rb_corr, gb_corr);
|
||
|
||
if (min_p < enc_cfg.m_strong_dp_decorr_thresh_rgb)
|
||
{
|
||
const bool has_r = active_chan_flags[0], has_g = active_chan_flags[1];
|
||
//const bool has_b = active_chan_flags[2];
|
||
|
||
uint32_t total_active_chans_rgb = 0;
|
||
for (uint32_t i = 0; i < 3; i++)
|
||
total_active_chans_rgb += active_chan_flags[i];
|
||
|
||
if (total_active_chans_rgb == 2)
|
||
{
|
||
if (!has_r)
|
||
desired_dp_chan_rgb = 1;
|
||
else if (!has_g)
|
||
desired_dp_chan_rgb = 0;
|
||
else
|
||
desired_dp_chan_rgb = 0;
|
||
}
|
||
else if (total_active_chans_rgb == 3)
|
||
{
|
||
// see if rg/rb is weakly correlated vs. gb
|
||
if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
|
||
desired_dp_chan_rgb = 0;
|
||
// see if gr/gb is weakly correlated vs. rb
|
||
else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
|
||
desired_dp_chan_rgb = 1;
|
||
// assume b is weakest
|
||
else
|
||
desired_dp_chan_rgb = 2;
|
||
}
|
||
}
|
||
|
||
if (desired_dp_chan_rgb != -1)
|
||
{
|
||
assert(active_chan_flags[desired_dp_chan_rgb]);
|
||
enc_blk_params.m_dp_active_chans[desired_dp_chan_rgb] = true;
|
||
}
|
||
}
|
||
}
|
||
|
||
if (!enc_blk_params.m_dp_active_chans[0] && !enc_blk_params.m_dp_active_chans[1] && !enc_blk_params.m_dp_active_chans[2] && !enc_blk_params.m_dp_active_chans[3])
|
||
{
|
||
enc_blk_params.m_use_dual_planes = false;
|
||
}
|
||
|
||
astc_ldr::cem_encode_params temp_cem_enc_params;
|
||
if (superpass_index == 1)
|
||
{
|
||
enc_blk_params.m_base_parts2 = enc_cfg.m_base_parts2_p2;
|
||
enc_blk_params.m_base_parts3 = enc_cfg.m_base_parts3_p2;
|
||
enc_blk_params.m_part2_fraction_to_keep = 1;
|
||
enc_blk_params.m_part3_fraction_to_keep = 1;
|
||
|
||
memcpy(enc_blk_params.m_superbucket_max_to_retain, enc_cfg.m_superbucket_max_to_retain_p2, sizeof(enc_cfg.m_superbucket_max_to_retain_p2));
|
||
memcpy(enc_blk_params.m_final_shortlist_max_size, enc_cfg.m_final_shortlist_max_size_p2, sizeof(enc_cfg.m_final_shortlist_max_size_p2));
|
||
|
||
if (enc_cfg.m_second_pass_force_subsets_enabled)
|
||
enc_blk_params.m_subsets_enabled = true;
|
||
enc_blk_params.m_subsets_edge_filtering = false;
|
||
|
||
if (enc_cfg.m_force_all_dp_chans_p2)
|
||
{
|
||
enc_blk_params.m_dp_active_chans[0] = active_chan_flags[0];
|
||
enc_blk_params.m_dp_active_chans[1] = active_chan_flags[1];
|
||
enc_blk_params.m_dp_active_chans[2] = active_chan_flags[2];
|
||
enc_blk_params.m_dp_active_chans[3] = active_chan_flags[3];
|
||
enc_blk_params.m_use_dual_planes = true;
|
||
|
||
if (!enc_blk_params.m_dp_active_chans[0] && !enc_blk_params.m_dp_active_chans[1] && !enc_blk_params.m_dp_active_chans[2] && !enc_blk_params.m_dp_active_chans[3])
|
||
{
|
||
enc_blk_params.m_use_dual_planes = false;
|
||
}
|
||
}
|
||
|
||
enc_blk_params.m_gradient_descent_flag = true;
|
||
enc_blk_params.m_polish_weights_flag = true;
|
||
|
||
enc_blk_params.m_use_direct_modes = true;
|
||
enc_blk_params.m_use_base_scale_modes = true;
|
||
|
||
enc_blk_params.m_early_stop_wpsnr = enc_cfg.m_early_stop_wpsnr + 2.0f;
|
||
enc_blk_params.m_early_stop2_wpsnr = enc_cfg.m_early_stop2_wpsnr + 2.0f;
|
||
|
||
if (enc_cfg.m_second_pass_total_weight_refine_passes)
|
||
{
|
||
temp_cem_enc_params = enc_cfg.m_cem_enc_params;
|
||
enc_blk_params.m_pEnc_params = &temp_cem_enc_params;
|
||
|
||
temp_cem_enc_params.m_total_weight_refine_passes = enc_cfg.m_second_pass_total_weight_refine_passes;
|
||
temp_cem_enc_params.m_worst_weight_nudging_flag = true;
|
||
temp_cem_enc_params.m_endpoint_refinement_flag = true;
|
||
}
|
||
}
|
||
|
||
scoped_ldr_astc_lowlevel_block_encoder scoped_block_encoder(encoder_pool);
|
||
if (scoped_block_encoder.get_ptr() == nullptr)
|
||
{
|
||
error_printf("Failed allocating thread local encode block temps\n");
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
// solid color
|
||
{
|
||
encode_block_output* pOut = out_blocks.enlarge(1);
|
||
pOut->clear();
|
||
|
||
astc_helpers::log_astc_block& log_blk = pOut->m_log_blk;
|
||
|
||
log_blk.clear();
|
||
log_blk.m_solid_color_flag_ldr = true;
|
||
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
log_blk.m_solid_color[c] = (uint16_t)clamp((int)std::round(pixel_stats.m_mean_f[c] * 255.0f), 0, 255);
|
||
|
||
// Expand each component to 16-bits
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
log_blk.m_solid_color[c] |= (uint16_t)(log_blk.m_solid_color[c]) << 8u;
|
||
|
||
pOut->m_sse = eval_error(block_width, block_height, log_blk, pixel_stats, enc_cfg.m_cem_enc_params);
|
||
}
|
||
|
||
encode_block_stats enc_block_stats;
|
||
|
||
bool enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats, out_blocks, 0, enc_block_stats);
|
||
if (!enc_status)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
#if 1
|
||
// --------------------- BLOCK BLURRING
|
||
// TODO - very slow, needs more configuration and tuning, experimental
|
||
const float BLUR_STD_DEV_THRESH = (15.0f / 255.0f);
|
||
const float BLUR_SOBEL_ENERGY_THRESH = 15000.0f;
|
||
|
||
const bool use_blurs = (enc_cfg.m_blurring_enabled && (!selective_blurring || ((max_std_dev > BLUR_STD_DEV_THRESH) && (sobel_energy > BLUR_SOBEL_ENERGY_THRESH)))) ||
|
||
(enc_cfg.m_blurring_enabled_p2 && (superpass_index == 1));
|
||
|
||
if (use_blurs)
|
||
{
|
||
{
|
||
assert(orig_img_blurred2.get_width());
|
||
|
||
color_rgba block_pixels_blurred2[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
orig_img_blurred2.extract_block_clamped(block_pixels_blurred2, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
astc_ldr::pixel_stats_t pixel_stats_blurred2;
|
||
pixel_stats_blurred2.init(total_block_pixels, block_pixels_blurred2);
|
||
|
||
enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred2, out_blocks, 1, enc_block_stats);
|
||
if (!enc_status)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
|
||
{
|
||
assert(orig_img_blurred3.get_width());
|
||
|
||
color_rgba block_pixels_blurred3[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
orig_img_blurred3.extract_block_clamped(block_pixels_blurred3, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
astc_ldr::pixel_stats_t pixel_stats_blurred3;
|
||
pixel_stats_blurred3.init(total_block_pixels, block_pixels_blurred3);
|
||
|
||
enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred3, out_blocks, 2, enc_block_stats);
|
||
if (!enc_status)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
|
||
{
|
||
assert(orig_img_blurred4.get_width());
|
||
|
||
color_rgba block_pixels_blurred4[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
orig_img_blurred4.extract_block_clamped(block_pixels_blurred4, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
astc_ldr::pixel_stats_t pixel_stats_blurred4;
|
||
pixel_stats_blurred4.init(total_block_pixels, block_pixels_blurred4);
|
||
|
||
enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred4, out_blocks, 3, enc_block_stats);
|
||
if (!enc_status)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
|
||
{
|
||
assert(orig_img_blurred5.get_width());
|
||
|
||
color_rgba block_pixels_blurred5[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
orig_img_blurred5.extract_block_clamped(block_pixels_blurred5, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
astc_ldr::pixel_stats_t pixel_stats_blurred5;
|
||
pixel_stats_blurred5.init(total_block_pixels, block_pixels_blurred5);
|
||
|
||
enc_status = scoped_block_encoder.get_ptr()->full_encode(enc_blk_params, pixel_stats_blurred5, out_blocks, 4, enc_block_stats);
|
||
if (!enc_status)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
}
|
||
#endif
|
||
|
||
// --------------------- WEIGHT GRID DCT CODING
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
// apply DCT to weights
|
||
for (uint32_t out_block_iter = 0; out_block_iter < out_blocks.size_u32(); out_block_iter++)
|
||
{
|
||
if (out_blocks[out_block_iter].m_trial_mode_index < 0)
|
||
continue;
|
||
|
||
astc_helpers::log_astc_block& log_astc_blk = out_blocks[out_block_iter].m_log_blk;
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, log_astc_blk.m_grid_width, log_astc_blk.m_grid_height);
|
||
|
||
const uint32_t num_planes = (log_astc_blk.m_dual_plane ? 2 : 1);
|
||
for (uint32_t plane_index = 0; plane_index < num_planes; plane_index++)
|
||
{
|
||
bitwise_coder c;
|
||
basist::astc_ldr_t::dct_syms syms;
|
||
code_block_weights(grid_coder, enc_cfg.m_base_q, plane_index, log_astc_blk, pGrid_data, c, syms);
|
||
|
||
out_blocks[out_block_iter].m_packed_dct_plane_data[plane_index] = syms;
|
||
|
||
c.flush();
|
||
|
||
basist::bitwise_decoder d;
|
||
d.init(c.get_bytes().data(), c.get_bytes().size_u32());
|
||
|
||
// ensure existing weights get blown away
|
||
for (uint32_t i = 0; i < (uint32_t)(log_astc_blk.m_grid_width * log_astc_blk.m_grid_height); i++)
|
||
log_astc_blk.m_weights[i * num_planes + plane_index] = 0;
|
||
|
||
basist::astc_ldr_t::fvec dct_temp;
|
||
bool status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, log_astc_blk, &d, pGrid_data, nullptr, dct_temp, nullptr);
|
||
|
||
assert(status);
|
||
if (!status)
|
||
{
|
||
error_printf("grid_coder.decode_block_weights() failed!\n");
|
||
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
#if 0
|
||
{
|
||
astc_helpers::log_astc_block alt_log_astc_blk(log_astc_blk);
|
||
|
||
for (uint32_t i = 0; i < (uint32_t)(log_astc_blk.m_grid_width * log_astc_blk.m_grid_height); i++)
|
||
alt_log_astc_blk.m_weights[i * num_planes + plane_index] = 0;
|
||
|
||
status = grid_coder.decode_block_weights(q, plane_index, alt_log_astc_blk, nullptr, pGrid_data, &out_block_dct_stats[out_block_iter], &syms);
|
||
assert(status);
|
||
|
||
for (uint32_t i = 0; i < (uint32_t)(log_astc_blk.m_grid_width * log_astc_blk.m_grid_height); i++)
|
||
{
|
||
assert(log_astc_blk.m_weights[i * num_planes + plane_index] == alt_log_astc_blk.m_weights[i * num_planes + plane_index]);
|
||
}
|
||
|
||
}
|
||
#endif
|
||
// TODO: in theory, endpoints can be refined if they don't change the DCT span.
|
||
}
|
||
|
||
out_blocks[out_block_iter].m_sse = eval_error(block_width, block_height, log_astc_blk, pixel_stats, enc_cfg.m_cem_enc_params);
|
||
|
||
} // for
|
||
|
||
} // use_dct
|
||
|
||
// Find best output block
|
||
uint64_t best_out_blocks_err = UINT64_MAX;
|
||
uint32_t best_out_blocks_index = 0;
|
||
astc_helpers::log_astc_block best_out_blocks_log_astc_blk;
|
||
|
||
for (uint32_t out_block_iter = 0; out_block_iter < out_blocks.size_u32(); out_block_iter++)
|
||
{
|
||
const astc_helpers::log_astc_block& log_astc_blk = out_blocks[out_block_iter].m_log_blk;
|
||
|
||
color_rgba dec_pixels[astc_helpers::MAX_BLOCK_DIM * astc_helpers::MAX_BLOCK_DIM];
|
||
bool dec_status = astc_helpers::decode_block(log_astc_blk, dec_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
|
||
assert(dec_status);
|
||
if (!dec_status)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
uint64_t total_err = 0;
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
total_err += weighted_color_error(block_pixels[i], dec_pixels[i], enc_cfg.m_cem_enc_params);
|
||
|
||
// if not blurred
|
||
if (out_blocks[out_block_iter].m_blur_id == 0)
|
||
{
|
||
if (out_blocks[out_block_iter].m_sse != total_err)
|
||
{
|
||
assert(0);
|
||
fmt_error_printf("output block SSE invalid\n");
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
}
|
||
|
||
// Replace m_sse with the actual WSSE vs. the original source block (in case it was blurred)
|
||
out_blocks[out_block_iter].m_sse = total_err;
|
||
|
||
if (total_err < best_out_blocks_err)
|
||
{
|
||
best_out_blocks_err = total_err;
|
||
best_out_blocks_log_astc_blk = log_astc_blk;
|
||
best_out_blocks_index = out_block_iter;
|
||
}
|
||
} // out_block_iter
|
||
|
||
#if 0
|
||
// TODO: Save memory, only minimally tested
|
||
if (enc_cfg.m_save_single_result)
|
||
{
|
||
basisu::vector<encode_block_output> new_out_blocks(1);
|
||
new_out_blocks[0] = out_blocks[best_out_blocks_index];
|
||
|
||
std::swap(out_blocks, new_out_blocks);
|
||
|
||
best_out_blocks_index = 0;
|
||
}
|
||
#endif
|
||
|
||
ldr_astc_block_encode_image_output::block_info& block_info_out = enc_out.m_image_block_info(bx, by);
|
||
|
||
block_info_out.m_low_freq_block_flag = low_freq_block_flag;
|
||
block_info_out.m_super_strong_edges = scoped_block_encoder.get_ptr()->m_super_strong_edges;
|
||
block_info_out.m_very_strong_edges = scoped_block_encoder.get_ptr()->m_very_strong_edges;
|
||
block_info_out.m_strong_edges = scoped_block_encoder.get_ptr()->m_strong_edges;
|
||
block_info_out.m_packed_out_block_index = best_out_blocks_index;
|
||
|
||
// Create packed ASTC block
|
||
astc_helpers::astc_block& best_phys_block = packed_blocks(bx, by);
|
||
bool pack_success = astc_helpers::pack_astc_block(best_phys_block, best_out_blocks_log_astc_blk);
|
||
if (!pack_success)
|
||
{
|
||
encoder_failed_flag.store(true);
|
||
return;
|
||
}
|
||
|
||
output_block_devel_desc& out_devel_desc = output_block_devel_info(bx, by);
|
||
out_devel_desc.m_low_freq_block_flag = low_freq_block_flag;
|
||
out_devel_desc.m_super_strong_edges = scoped_block_encoder.get_ptr()->m_super_strong_edges;
|
||
out_devel_desc.m_very_strong_edges = scoped_block_encoder.get_ptr()->m_very_strong_edges;
|
||
out_devel_desc.m_strong_edges = scoped_block_encoder.get_ptr()->m_strong_edges;
|
||
|
||
// Critical Section
|
||
{
|
||
std::lock_guard g(global_mutex);
|
||
|
||
if (use_blurs)
|
||
total_blur_encodes++;
|
||
|
||
if (out_blocks[best_out_blocks_index].m_blur_id)
|
||
total_blurred_blocks1++;
|
||
|
||
if (superpass_index == 0)
|
||
{
|
||
// TODO: Add 2nd pass statistics
|
||
total_superbuckets_created += enc_block_stats.m_total_superbuckets_created;
|
||
total_buckets_created += enc_block_stats.m_total_buckets_created;
|
||
total_surrogate_encodes += enc_block_stats.m_total_surrogate_encodes;
|
||
total_full_encodes += enc_block_stats.m_total_full_encodes;
|
||
total_shortlist_candidates += enc_block_stats.m_total_shortlist_candidates;
|
||
}
|
||
else if (superpass_index == 1)
|
||
{
|
||
total_full_encodes_pass1 += enc_block_stats.m_total_full_encodes;
|
||
}
|
||
|
||
total_blocks_done++;
|
||
if (enc_cfg.m_debug_output)
|
||
{
|
||
if (superpass_index == 1)
|
||
{
|
||
if ((total_blocks_done & 63) == 63)
|
||
{
|
||
float new_val = ((float)total_blocks_done * 100.0f) / (float)total_blocks_to_recompress;
|
||
if ((new_val - last_printed_progress_val) >= 5.0f)
|
||
{
|
||
last_printed_progress_val = new_val;
|
||
fmt_printf("{3.2}%\n", new_val);
|
||
}
|
||
}
|
||
}
|
||
else if ((total_blocks_done & 255) == 255)
|
||
{
|
||
float new_val = ((float)total_blocks_done * 100.0f) / (float)total_blocks;
|
||
if ((new_val - last_printed_progress_val) >= 5.0f)
|
||
{
|
||
last_printed_progress_val = new_val;
|
||
fmt_printf("{3.2}%\n", new_val);
|
||
}
|
||
}
|
||
}
|
||
|
||
} // lock_guard (global_mutex)
|
||
|
||
} // if (superpass_index == ...)
|
||
|
||
});
|
||
|
||
if (encoder_failed_flag)
|
||
break;
|
||
|
||
} // bx
|
||
|
||
if (encoder_failed_flag)
|
||
break;
|
||
|
||
} // by
|
||
|
||
if (encoder_failed_flag)
|
||
{
|
||
fmt_error_printf("Main compressor block loop failed!\n");
|
||
return false;
|
||
}
|
||
|
||
job_pool.wait_for_all();
|
||
|
||
if (encoder_failed_flag)
|
||
{
|
||
fmt_error_printf("Main compressor block loop failed!\n");
|
||
return false;
|
||
}
|
||
|
||
if ((superpass_index == 0) && (enc_cfg.m_second_superpass_refinement) && (enc_cfg.m_second_superpass_fract_to_recompress > 0.0f))
|
||
{
|
||
uint_vec block_wsse_indices(total_blocks);
|
||
|
||
float_vec block_wsses(total_blocks);
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by);
|
||
|
||
float wsse = (float)out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse;
|
||
|
||
block_wsses[bx + by * num_blocks_x] = wsse;
|
||
} // bx
|
||
} // by
|
||
|
||
indirect_sort(total_blocks, block_wsse_indices.data(), block_wsses.data());
|
||
|
||
if (block_wsses[block_wsse_indices[total_blocks - 1]] > 0.0f)
|
||
{
|
||
total_blocks_to_recompress = clamp<uint32_t>((uint32_t)std::round((float)total_blocks * enc_cfg.m_second_superpass_fract_to_recompress), 0, total_blocks);
|
||
|
||
image vis_recomp_img;
|
||
if (enc_cfg.m_debug_images)
|
||
vis_recomp_img.resize(width, height);
|
||
|
||
for (uint32_t i = 0; i < total_blocks_to_recompress; i++)
|
||
{
|
||
const uint32_t block_index = block_wsse_indices[total_blocks - 1 - i];
|
||
|
||
const uint32_t block_x = block_index % num_blocks_x;
|
||
const uint32_t block_y = block_index / num_blocks_x;
|
||
|
||
superpass2_recompress_block_flags(block_x, block_y) = true;
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
vis_recomp_img.fill_box(block_x * block_width, block_y * block_height, block_width, block_height, color_rgba(255, 255, 255, 255));
|
||
}
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_recomp_img.png", vis_recomp_img);
|
||
}
|
||
}
|
||
|
||
} // superpass_index
|
||
|
||
if (enc_cfg.m_third_superpass_try_neighbors)
|
||
{
|
||
uint32_t total_superpass1_improved_blocks1 = 0;
|
||
uint32_t total_superpass1_improved_blocks2 = 0;
|
||
|
||
// Merge pass 2's output into pass 0's/1's output, which can be done safely now.
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by);
|
||
|
||
const ldr_astc_block_encode_image_output::block_info_superpass1& out_block_info_superpass1 = enc_out.m_image_block_info_superpass2(bx, by);
|
||
|
||
for (uint32_t neighbor_index = 0; neighbor_index < basist::astc_ldr_t::cMaxConfigReuseNeighbors; neighbor_index++)
|
||
{
|
||
const int new_neighbor_index = out_block_info_superpass1.m_config_reuse_neighbor_out_block_indices[neighbor_index];
|
||
|
||
if (new_neighbor_index == cInvalidIndex)
|
||
{
|
||
// Can't reuse neighbor's best output block
|
||
continue;
|
||
}
|
||
|
||
if (!out_block_info_superpass1.m_config_reuse_new_neighbor_out_block_flags[neighbor_index])
|
||
{
|
||
// Reuses an existing, already encoded output block which matches the neighbor
|
||
assert((size_t)new_neighbor_index < out_block_info.m_out_blocks.size());
|
||
continue;
|
||
}
|
||
|
||
const uint32_t new_out_block_index = out_block_info.m_out_blocks.size_u32();
|
||
|
||
const encode_block_output& new_output_blk = out_block_info_superpass1.m_new_out_config_reuse_blocks[new_neighbor_index];
|
||
|
||
out_block_info.m_out_blocks.push_back(new_output_blk);
|
||
|
||
#define BU_CHECK_NEIGHBOR_BEST (1)
|
||
|
||
#if BU_CHECK_NEIGHBOR_BEST
|
||
// See if the solution has improved
|
||
if (new_output_blk.m_sse < out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse)
|
||
{
|
||
total_superpass1_improved_blocks1++;
|
||
|
||
// Warning: This invalidates the neighbor indices
|
||
out_block_info.m_packed_out_block_index = new_out_block_index;
|
||
|
||
//astc_helpers::astc_block& packed_block = enc_out.m_packed_phys_blocks(bx, by);
|
||
|
||
bool pack_success = astc_helpers::pack_astc_block((astc_helpers::astc_block&)packed_blocks(bx, by), new_output_blk.m_log_blk);
|
||
if (!pack_success)
|
||
{
|
||
fmt_error_printf("astc_helpers::pack_astc_block failed\n");
|
||
|
||
return false;
|
||
}
|
||
}
|
||
#endif
|
||
|
||
} // neighbor_index
|
||
|
||
for (uint32_t j = 0; j < out_block_info_superpass1.m_new_out_config_endpoint_reuse_blocks.size(); j++)
|
||
{
|
||
const uint32_t new_out_block_index = out_block_info.m_out_blocks.size_u32();
|
||
|
||
const encode_block_output& new_output_blk = out_block_info_superpass1.m_new_out_config_endpoint_reuse_blocks[j];
|
||
|
||
out_block_info.m_out_blocks.push_back(new_output_blk);
|
||
|
||
#define BU_CHECK_NEIGHBOR_BEST (1)
|
||
|
||
#if BU_CHECK_NEIGHBOR_BEST
|
||
// See if the solution has improved
|
||
if (new_output_blk.m_sse < out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse)
|
||
{
|
||
total_superpass1_improved_blocks2++;
|
||
|
||
// Warning: This invalidates the neighbor indices
|
||
out_block_info.m_packed_out_block_index = new_out_block_index;
|
||
|
||
//astc_helpers::astc_block& packed_block = enc_out.m_packed_phys_blocks(bx, by);
|
||
|
||
bool pack_success = astc_helpers::pack_astc_block((astc_helpers::astc_block&)packed_blocks(bx, by), new_output_blk.m_log_blk);
|
||
if (!pack_success)
|
||
{
|
||
fmt_error_printf("astc_helpers::pack_astc_block failed\n");
|
||
|
||
return false;
|
||
}
|
||
}
|
||
#endif
|
||
|
||
} // j
|
||
|
||
} // bx
|
||
} // by
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("Total superpass 1 improved blocks 1: {} {3.2}%\n", total_superpass1_improved_blocks1, ((float)total_superpass1_improved_blocks1 * 100.0f) / (float)(total_blocks));
|
||
fmt_debug_printf("Total superpass 1 improved blocks 2: {} {3.2}%\n", total_superpass1_improved_blocks2, ((float)total_superpass1_improved_blocks2 * 100.0f) / (float)(total_blocks));
|
||
}
|
||
}
|
||
|
||
if (ASTC_LDR_CONSISTENCY_CHECKING)
|
||
{
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("consistency checking\n");
|
||
|
||
// Consistency/sanity cross checking
|
||
//uint32_t total_blocks_using_neighbor_config = 0;
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by);
|
||
|
||
#if BU_CHECK_NEIGHBOR_BEST
|
||
uint64_t best_sse = UINT64_MAX;
|
||
uint32_t best_out_block_index = 0;
|
||
|
||
for (uint32_t i = 0; i < out_block_info.m_out_blocks.size(); i++)
|
||
{
|
||
if (out_block_info.m_out_blocks[i].m_sse < best_sse)
|
||
{
|
||
best_sse = out_block_info.m_out_blocks[i].m_sse;
|
||
best_out_block_index = i;
|
||
}
|
||
} // i
|
||
|
||
if (best_out_block_index != out_block_info.m_packed_out_block_index)
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
#endif
|
||
|
||
if (out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_sse !=
|
||
eval_error(block_width, block_height, out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_log_blk, out_block_info.m_pixel_stats, enc_cfg.m_cem_enc_params))
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
// Ensure packed output block matches the expected best WSSE block.
|
||
astc_helpers::astc_block packed_block;
|
||
bool pack_success = astc_helpers::pack_astc_block(packed_block, out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_log_blk);
|
||
if (!pack_success)
|
||
{
|
||
fmt_error_printf("astc_helpers::pack_astc_block failed\n");
|
||
return false;
|
||
}
|
||
|
||
if (memcmp(&packed_block, &enc_out.m_packed_phys_blocks(bx, by), sizeof(astc_helpers::astc_block)) != 0)
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
// DCT check
|
||
if ((enc_cfg.m_use_dct) && (out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_trial_mode_index >= 0))
|
||
{
|
||
const auto& best_log_blk = out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_log_blk;
|
||
if (best_log_blk.m_solid_color_flag_ldr)
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
const basist::astc_ldr_t::astc_block_grid_data* pGrid_data = basist::astc_ldr_t::find_astc_block_grid_data(block_width, block_height, best_log_blk.m_grid_width, best_log_blk.m_grid_height);
|
||
const uint32_t total_planes = best_log_blk.m_num_partitions ? (best_log_blk.m_dual_plane ? 2 : 1) : 0;
|
||
|
||
astc_helpers::log_astc_block verify_log_blk(best_log_blk);
|
||
|
||
for (uint32_t plane_index = 0; plane_index < total_planes; plane_index++)
|
||
{
|
||
if (!out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_packed_dct_plane_data[plane_index].m_coeffs.size())
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
basist::astc_ldr_t::fvec dct_temp;
|
||
bool dec_status = grid_coder.decode_block_weights(enc_cfg.m_base_q, plane_index, verify_log_blk, nullptr, pGrid_data, nullptr, dct_temp,
|
||
&out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index].m_packed_dct_plane_data[plane_index]);
|
||
|
||
if (!dec_status)
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
for (uint32_t i = 0; i < (uint32_t)(best_log_blk.m_grid_width * best_log_blk.m_grid_height); i++)
|
||
{
|
||
if (best_log_blk.m_weights[i * total_planes + plane_index] != verify_log_blk.m_weights[i * total_planes + plane_index])
|
||
{
|
||
fmt_error_printf("consistency check failed\n");
|
||
assert(0);
|
||
return false;
|
||
}
|
||
}
|
||
|
||
} // plane_index
|
||
}
|
||
|
||
} // bx
|
||
} // by
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("consistency checking PASSED\n");
|
||
}
|
||
|
||
//fmt_debug_printf("Total blocks using neighbor config: {} {3.2}%\n", total_blocks_using_neighbor_config, ((float)total_blocks_using_neighbor_config * 100.0f) / (float)(total_blocks));
|
||
|
||
// Debug output
|
||
uint_vec trial_mode_hist;
|
||
trial_mode_hist.resize(encoder_trial_modes.size());
|
||
uint32_t total_alpha_blocks = 0;
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const ldr_astc_block_encode_image_output::block_info& out_block_info = enc_out.m_image_block_info(bx, by);
|
||
const astc_ldr::pixel_stats_t& pixel_stats = out_block_info.m_pixel_stats;
|
||
|
||
const encode_block_output& best_out_block = out_block_info.m_out_blocks[out_block_info.m_packed_out_block_index];
|
||
const astc_helpers::log_astc_block& best_out_blocks_log_astc_blk = best_out_block.m_log_blk;
|
||
|
||
if (pixel_stats.m_has_alpha)
|
||
total_alpha_blocks++;
|
||
|
||
output_block_devel_desc& out_devel_desc = output_block_devel_info(bx, by);
|
||
out_devel_desc.m_had_alpha = pixel_stats.m_has_alpha;
|
||
out_devel_desc.m_trial_mode_index = best_out_block.m_trial_mode_index;
|
||
out_devel_desc.m_pTrial_modes = encoder_trial_modes.data();
|
||
|
||
if (out_devel_desc.m_trial_mode_index >= 0)
|
||
trial_mode_hist[out_devel_desc.m_trial_mode_index]++;
|
||
|
||
//const float total_astc_weight_bits = log2f((float)astc_helpers::get_ise_levels(best_out_block.m_log_blk.m_weight_ise_range)) *
|
||
// best_out_block.m_log_blk.m_grid_width * best_out_block.m_log_blk.m_grid_height * (best_out_block.m_log_blk.m_dual_plane ? 2 : 1);
|
||
|
||
//bool used_blue_contraction = astc_ldr::used_blue_contraction(best_out_blocks_log_astc_blk.m_color_endpoint_modes[0], best_out_blocks_log_astc_blk.m_endpoints, best_out_blocks_log_astc_blk.m_endpoint_ise_range);
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
{
|
||
color_rgba vis_col(g_black_color);
|
||
color_rgba vis2_col(g_black_color);
|
||
color_rgba dp_vis(g_black_color);
|
||
color_rgba base_ofs_vis(g_black_color);
|
||
//color_rgba dct_bits_abs_vis(g_black_color);
|
||
//color_rgba dct_bits_vs_astc_vis(g_black_color);
|
||
|
||
const astc_ldr::partition_pattern_vec* pPat = nullptr;
|
||
|
||
if (best_out_blocks_log_astc_blk.m_num_partitions == 2)
|
||
{
|
||
vis_col.set(0, 255, 0, 255);
|
||
|
||
const astc_ldr::partitions_data* pPart_data = pPart_data_p2;
|
||
|
||
const uint32_t part_seed_index = best_out_blocks_log_astc_blk.m_partition_id;
|
||
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index];
|
||
|
||
pPat = &pPart_data->m_partition_pats[part_unique_index];
|
||
}
|
||
else if (best_out_blocks_log_astc_blk.m_num_partitions == 3)
|
||
{
|
||
vis_col.set(0, 0, 255, 255);
|
||
|
||
const astc_ldr::partitions_data* pPart_data = pPart_data_p3;
|
||
|
||
const uint32_t part_seed_index = best_out_blocks_log_astc_blk.m_partition_id;
|
||
const uint32_t part_unique_index = pPart_data->m_part_seed_to_unique_index[part_seed_index];
|
||
|
||
pPat = &pPart_data->m_partition_pats[part_unique_index];
|
||
}
|
||
|
||
// vis_col.r = enc_blk_params.m_use_base_scale_modes ? 255 : 0;
|
||
// vis_col.g = enc_blk_params.m_use_direct_modes ? 255 : 0;
|
||
|
||
if (!out_devel_desc.m_low_freq_block_flag)
|
||
{
|
||
if (out_devel_desc.m_super_strong_edges)
|
||
vis2_col.set(255, 0, 255, 255);
|
||
else if (out_devel_desc.m_very_strong_edges)
|
||
vis2_col.set(255, 0, 0, 255);
|
||
else if (out_devel_desc.m_strong_edges)
|
||
vis2_col.set(0, 255, 0, 255);
|
||
}
|
||
|
||
if (pPat)
|
||
{
|
||
for (uint32_t y = 0; y < block_height; y++)
|
||
{
|
||
for (uint32_t x = 0; x < block_width; x++)
|
||
{
|
||
const uint32_t subset_idx = (*pPat)(x, y);
|
||
|
||
color_rgba c(g_black_color);
|
||
|
||
if (best_out_blocks_log_astc_blk.m_num_partitions == 2)
|
||
{
|
||
assert(subset_idx < 2);
|
||
c = subset_idx ? color_rgba(255, 0, 0, 255) : color_rgba(0, 255, 0, 255);
|
||
}
|
||
else
|
||
{
|
||
assert(best_out_blocks_log_astc_blk.m_num_partitions == 3);
|
||
assert(subset_idx < 3);
|
||
|
||
if (subset_idx == 2)
|
||
c = color_rgba(0, 0, 255, 255);
|
||
else if (subset_idx == 1)
|
||
c = color_rgba(32, 0, 190, 255);
|
||
else
|
||
c = color_rgba(64, 0, 64, 255);
|
||
}
|
||
|
||
vis_part_pat_img.set_clipped(bx * block_width + x, by * block_height + y, c);
|
||
}
|
||
}
|
||
}
|
||
|
||
if (best_out_blocks_log_astc_blk.m_dual_plane)
|
||
dp_vis.g = 255;
|
||
|
||
if ((best_out_blocks_log_astc_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) ||
|
||
(best_out_blocks_log_astc_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET))
|
||
{
|
||
base_ofs_vis.b = 255;
|
||
}
|
||
|
||
vis_part_usage_img.fill_box(bx * block_width, by * block_height, block_width, block_height, vis_col);
|
||
vis_strong_edge.fill_box(bx * block_width, by * block_height, block_width, block_height, vis2_col);
|
||
vis_dp_img.fill_box(bx * block_width, by * block_height, block_width, block_height, dp_vis);
|
||
vis_base_ofs_img.fill_box(bx * block_width, by * block_height, block_width, block_height, base_ofs_vis);
|
||
}
|
||
|
||
} // bx
|
||
|
||
} // by
|
||
|
||
const double total_enc_time = itm.get_elapsed_secs();
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
fmt_debug_printf("ASTC packing complete\n");
|
||
|
||
image unpacked_img(width, height);
|
||
|
||
// Unpack packed image, validate ASTC data with several decoders.
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const astc_helpers::astc_block* pPhys_block = &packed_blocks(bx, by);
|
||
|
||
astc_helpers::log_astc_block log_blk;
|
||
bool status = astc_helpers::unpack_block(pPhys_block, log_blk, block_width, block_height);
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("unpack_block() failed\n");
|
||
return false;
|
||
}
|
||
|
||
// Decode with our generic ASTC decoder.
|
||
color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
status = astc_helpers::decode_block(log_blk, block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("decode_block() failed\n");
|
||
return false;
|
||
}
|
||
|
||
unpacked_img.set_block_clipped(block_pixels, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
// Decode with the Android testing framework ASTC decoder
|
||
{
|
||
uint8_t dec_pixels_android[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS * 4];
|
||
|
||
bool android_success = basisu_astc::astc::decompress_ldr(dec_pixels_android, (const uint8_t*)pPhys_block, enc_cfg.m_cem_enc_params.m_decode_mode_srgb, block_width, block_height);
|
||
if (!android_success)
|
||
{
|
||
fmt_error_printf("Android ASTC decoder failed!\n");
|
||
return false;
|
||
}
|
||
|
||
if (memcmp(dec_pixels_android, block_pixels, total_block_pixels * 4) != 0)
|
||
{
|
||
fmt_error_printf("Android ASTC decoder mismatch!\n");
|
||
return false;
|
||
}
|
||
}
|
||
|
||
// Decode with our optimized XUASTC LDR decoder
|
||
{
|
||
color_rgba block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
status = astc_helpers::decode_block_xuastc_ldr(log_blk, block_pixels_alt, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("decode_block_xuastc_ldr() failed\n");
|
||
return false;
|
||
}
|
||
|
||
if (memcmp(block_pixels, block_pixels_alt, total_block_pixels * 4) != 0)
|
||
{
|
||
fmt_error_printf("XUASTC LDR ASTC decoder mismatch!\n");
|
||
return false;
|
||
}
|
||
}
|
||
|
||
} // bx
|
||
} // by
|
||
|
||
if (enc_cfg.m_debug_images)
|
||
{
|
||
save_png(enc_cfg.m_debug_file_prefix + "dbg_astc_ldr_unpacked_img.png", unpacked_img);
|
||
|
||
if (vis_part_usage_img.is_valid())
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_part_usage.png", vis_part_usage_img);
|
||
|
||
if (vis_part_pat_img.is_valid())
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_part_pat_img.png", vis_part_pat_img);
|
||
|
||
if (vis_strong_edge.is_valid())
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_strong_edge.png", vis_strong_edge);
|
||
|
||
if (vis_dct_low_freq_block.is_valid())
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_dct_low_freq_block.png", vis_dct_low_freq_block);
|
||
|
||
if (vis_dp_img.is_valid())
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_dp.png", vis_dp_img);
|
||
|
||
if (vis_base_ofs_img.is_valid())
|
||
save_png(enc_cfg.m_debug_file_prefix + "vis_base_ofs.png", vis_base_ofs_img);
|
||
}
|
||
|
||
if (enc_cfg.m_debug_output)
|
||
{
|
||
uint32_t cem_used_hist[16] = { 0 };
|
||
uint32_t cem_used_bc[16] = { 0 };
|
||
uint32_t cem_used_subsets[16] = { 0 };
|
||
uint32_t cem_used_dp[16] = { 0 };
|
||
uint32_t total_dp = 0, total_base_ofs = 0;
|
||
uint32_t subset_used_hist[4] = { 0 };
|
||
uint32_t grid_usage_hist[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS * astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS + 1] = { 0 };
|
||
|
||
uint32_t total_header_bits = 0;
|
||
uint32_t total_weight_bits = 0;
|
||
uint32_t total_endpoint_bits = 0;
|
||
|
||
uint32_t total_void_extent = 0;
|
||
|
||
uint32_t used_endpoint_levels_hist[astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + 1] = { 0 };
|
||
uint32_t used_weight_levels_hist[astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE + 1] = { 0 };
|
||
|
||
uint32_t total_blocks_using_subsets = 0;
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const output_block_devel_desc& desc = output_block_devel_info(bx, by);
|
||
|
||
const astc_helpers::astc_block* pPhys_block = &packed_blocks(bx, by);
|
||
|
||
astc_helpers::log_astc_block log_blk;
|
||
bool status = astc_helpers::unpack_block(pPhys_block, log_blk, block_width, block_height);
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("unpack_block() failed\n");
|
||
return false;
|
||
}
|
||
|
||
if (desc.m_trial_mode_index < 0)
|
||
{
|
||
total_void_extent++;
|
||
continue;
|
||
}
|
||
else
|
||
{
|
||
const basist::astc_ldr_t::trial_mode& tm = desc.m_pTrial_modes[desc.m_trial_mode_index];
|
||
|
||
const uint32_t actual_cem = log_blk.m_color_endpoint_modes[0];
|
||
//assert(tm.m_cem == log_blk.m_color_endpoint_modes[0]); // may differ due to base+ofs usage
|
||
|
||
assert((tm.m_ccs_index >= 0) == log_blk.m_dual_plane);
|
||
assert((!log_blk.m_dual_plane) || (tm.m_ccs_index == log_blk.m_color_component_selector));
|
||
assert(tm.m_endpoint_ise_range == log_blk.m_endpoint_ise_range);
|
||
assert(tm.m_weight_ise_range == log_blk.m_weight_ise_range);
|
||
assert(tm.m_grid_width == log_blk.m_grid_width);
|
||
assert(tm.m_grid_height == log_blk.m_grid_height);
|
||
assert(tm.m_num_parts == log_blk.m_num_partitions);
|
||
|
||
used_weight_levels_hist[open_range_check<int>(tm.m_weight_ise_range - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE, std::size(used_weight_levels_hist))]++;
|
||
used_endpoint_levels_hist[open_range_check<int>(tm.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE, std::size(used_endpoint_levels_hist))]++;
|
||
|
||
cem_used_hist[actual_cem]++;
|
||
if (log_blk.m_dual_plane)
|
||
total_dp++;
|
||
|
||
subset_used_hist[open_range_check<size_t>(log_blk.m_num_partitions - 1, std::size(subset_used_hist))]++;
|
||
|
||
bool used_bc = false;
|
||
for (uint32_t i = 0; i < tm.m_num_parts; i++)
|
||
{
|
||
if (astc_helpers::used_blue_contraction(actual_cem, log_blk.m_endpoints + i * astc_helpers::get_num_cem_values(actual_cem), log_blk.m_endpoint_ise_range))
|
||
{
|
||
used_bc = true;
|
||
}
|
||
}
|
||
|
||
if (used_bc)
|
||
cem_used_bc[actual_cem]++;
|
||
|
||
if (tm.m_num_parts > 1)
|
||
cem_used_subsets[actual_cem]++;
|
||
|
||
// TODO: add CCS index histogram per CEM
|
||
if (log_blk.m_dual_plane)
|
||
cem_used_dp[actual_cem]++;
|
||
|
||
if ((actual_cem == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) ||
|
||
(actual_cem == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET))
|
||
{
|
||
total_base_ofs++;
|
||
}
|
||
|
||
grid_usage_hist[open_range_check<size_t>(log_blk.m_grid_width * log_blk.m_grid_height, std::size(grid_usage_hist))]++;
|
||
|
||
if (tm.m_num_parts > 1)
|
||
total_blocks_using_subsets++;
|
||
}
|
||
|
||
astc_helpers::pack_stats pack_stats;
|
||
pack_stats.clear();
|
||
|
||
astc_helpers::astc_block temp_phys_block;
|
||
int expected_endpoint_range = 0;
|
||
status = astc_helpers::pack_astc_block(temp_phys_block, log_blk, &expected_endpoint_range, &pack_stats);
|
||
assert(status);
|
||
|
||
total_header_bits += pack_stats.m_header_bits;
|
||
total_weight_bits += pack_stats.m_weight_bits;
|
||
total_endpoint_bits += pack_stats.m_endpoint_bits;
|
||
|
||
} // bx
|
||
} // by
|
||
|
||
uint32_t total_used_modes = 0;
|
||
|
||
fmt_debug_printf("--------------------- Trial Modes:\n");
|
||
|
||
for (uint32_t i = 0; i < trial_mode_hist.size(); i++)
|
||
{
|
||
if (!trial_mode_hist[i])
|
||
continue;
|
||
|
||
if (trial_mode_hist[i])
|
||
total_used_modes++;
|
||
|
||
#if 0
|
||
const uint32_t total_mode_blocks = trial_mode_hist[i];
|
||
|
||
const uint32_t num_subsets = encoder_trial_modes[i].m_num_parts;
|
||
const uint32_t cem_index = encoder_trial_modes[i].m_cem;
|
||
|
||
fmt_debug_printf("{}: {} {3.2}%: cem: {}, grid {}x{}, e: {} w: {}, ccs: {}, parts: {}, total base+ofs: {}, total direct: {}\n", i, total_mode_blocks, (float)total_mode_blocks * 100.0f / (float)total_blocks,
|
||
encoder_trial_modes[i].m_cem,
|
||
encoder_trial_modes[i].m_grid_width, encoder_trial_modes[i].m_grid_height,
|
||
astc_helpers::get_ise_levels(encoder_trial_modes[i].m_endpoint_ise_range), astc_helpers::get_ise_levels(encoder_trial_modes[i].m_weight_ise_range),
|
||
encoder_trial_modes[i].m_ccs_index,
|
||
encoder_trial_modes[i].m_num_parts,
|
||
used_base_offset_count[i],
|
||
used_rgb_direct_count[i]);
|
||
#endif
|
||
}
|
||
|
||
fmt_debug_printf("\n");
|
||
|
||
fmt_debug_printf("Used endpoint ISE levels:\n");
|
||
for (uint32_t i = 0; i < std::size(used_endpoint_levels_hist); i++)
|
||
fmt_debug_printf("{} levels: {}\n", astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + i), used_endpoint_levels_hist[i]);
|
||
|
||
fmt_debug_printf("\nUsed weight ISE levels:\n");
|
||
for (uint32_t i = 0; i < std::size(used_weight_levels_hist); i++)
|
||
fmt_debug_printf("{} levels: {}\n", astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE + i), used_weight_levels_hist[i]);
|
||
|
||
const uint32_t total_blocks_excluding_void_extent = total_blocks - total_void_extent;
|
||
|
||
fmt_debug_printf("\nTotal blocks: {}, excluding void extent: {}\n", total_blocks, total_blocks_excluding_void_extent);
|
||
fmt_debug_printf("Total void extent blocks skipped by compressor: {}\n", total_void_extent_blocks_skipped);
|
||
fmt_debug_printf("Total final void extent blocks: {}\n", total_void_extent);
|
||
fmt_debug_printf("Total input blocks with alpha: {} {3.1}%\n", total_alpha_blocks, (float)total_alpha_blocks * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("\nASTC phys avg block stats (including void extent):\n");
|
||
fmt_debug_printf("Total header bits: {}, {} per block, {} per pixel\n", total_header_bits, (float)total_header_bits / (float)total_blocks, (float)total_header_bits / (float)(total_pixels));
|
||
fmt_debug_printf("Total weight bits: {}, {} per block, {} per pixel\n", total_weight_bits, (float)total_weight_bits / (float)total_blocks, (float)total_weight_bits / (float)(total_pixels));
|
||
fmt_debug_printf("Total endpoint bits: {}, {} per block, {} per pixel\n", total_endpoint_bits, (float)total_endpoint_bits / (float)total_blocks, (float)total_endpoint_bits / (float)(total_pixels));
|
||
fmt_debug_printf("Total header+endpoint bits: {}, {} per block, {} per pixel\n", total_header_bits + total_endpoint_bits,
|
||
(float)(total_header_bits + total_endpoint_bits) / (float)total_blocks, (float)(total_header_bits + total_endpoint_bits) / (float)(total_pixels));
|
||
fmt_debug_printf("Total header+endpoint+weight bits: {}, {} per block, {} per pixel\n", total_header_bits + total_endpoint_bits + total_weight_bits,
|
||
(float)(total_header_bits + total_endpoint_bits + total_weight_bits) / (float)total_blocks, (float)(total_header_bits + total_endpoint_bits + total_weight_bits) / (float)(total_pixels));
|
||
|
||
fmt_debug_printf("\nEncoder stats:\n");
|
||
fmt_debug_printf("Total utilized encoder trial modes: {} {3.2}%\n", total_used_modes, (float)total_used_modes * 100.0f / (float)encoder_trial_modes.size());
|
||
|
||
const uint32_t total_blurred_blocks = total_blurred_blocks1 + total_blurred_blocks2 + total_blurred_blocks3 + total_blurred_blocks4;
|
||
|
||
fmt_debug_printf("\nTotal blur encodes: {} ({3.2}%)\n", total_blur_encodes, (float)total_blur_encodes * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total blurred blocks: {} ({3.2}%)\n", total_blurred_blocks, (float)total_blurred_blocks * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total blurred1 blocks: {} ({3.2}%)\n", total_blurred_blocks1, (float)total_blurred_blocks1 * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total blurred2 blocks: {} ({3.2}%)\n", total_blurred_blocks2, (float)total_blurred_blocks2 * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total blurred3 blocks: {} ({3.2}%)\n", total_blurred_blocks3, (float)total_blurred_blocks3 * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total blurred4 blocks: {} ({3.2}%)\n", total_blurred_blocks4, (float)total_blurred_blocks4 * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("\nTotal superbuckets created: {} ({4.1} per block)\n", total_superbuckets_created, (float)total_superbuckets_created / (float)total_blocks);
|
||
fmt_debug_printf("Total shortlist buckets created: {} ({4.1} per block)\n", total_buckets_created, (float)total_buckets_created / (float)total_blocks);
|
||
fmt_debug_printf("Total surrogate encodes: {} ({4.1} per block)\n", total_surrogate_encodes, (float)total_surrogate_encodes / (float)total_blocks);
|
||
fmt_debug_printf("Total shortlist candidates (before full encoding): {} ({4.1} per block)\n", total_shortlist_candidates, (float)total_shortlist_candidates / (float)total_blocks);
|
||
fmt_debug_printf("Total full encodes on superpass 0: {} ({4.1} per block)\n", total_full_encodes, (float)total_full_encodes / (float)total_blocks);
|
||
fmt_debug_printf("Total full encodes on superpass 1: {} ({4.1} per block)\n", total_full_encodes_pass1, (float)total_full_encodes_pass1 / (float)total_blocks);
|
||
fmt_debug_printf("Total full encodes on superpass 2: {} ({4.1} per block)\n", total_full_encodes_pass2, (float)total_full_encodes_pass2 / (float)total_blocks);
|
||
|
||
debug_printf("\nTotal final encoded ASTC blocks using blue contraction: %u (%.2f%%)\n", total_used_bc, 100.0f * (float)total_used_bc / (float)total_blocks);
|
||
|
||
fmt_debug_printf("Total final encoded ASTC blocks using dual planes: {} {3.2}%\n", total_dp, (float)total_dp * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total final encoded ASTC blocks using base+ofs: {} {3.2}%\n", total_dp, (float)total_base_ofs * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total final encoded ASTC blocks using subsets: {} {3.2}%\n", total_blocks_using_subsets, (float)total_blocks_using_subsets * 100.0f / (float)total_blocks);
|
||
|
||
debug_printf("\nSubset usage histogram:\n");
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
fmt_debug_printf("{} subsets: {} {3.2}%\n", i + 1, subset_used_hist[i], (float)subset_used_hist[i] * 100.0f / (float)total_blocks);
|
||
debug_printf("\n");
|
||
|
||
debug_printf("CEM usage histogram:\n");
|
||
for (uint32_t i = 0; i < 16; i++)
|
||
{
|
||
if (astc_helpers::is_cem_hdr(i))
|
||
continue;
|
||
|
||
std::string n(astc_helpers::get_cem_name(i));
|
||
while (n.size() < 40)
|
||
n.push_back(' ');
|
||
|
||
fmt_debug_printf("{}: {} {3.2}%, Used BC: {3.2}%, Used subsets: {3.2}%, Used DP: {3.2}%\n",
|
||
n,
|
||
cem_used_hist[i],
|
||
(float)cem_used_hist[i] * 100.0f / (float)total_blocks,
|
||
(float)cem_used_bc[i] * 100.0f / (float)total_blocks,
|
||
(float)cem_used_subsets[i] * 100.0f / (float)total_blocks,
|
||
(float)cem_used_dp[i] * 100.0f / (float)total_blocks);
|
||
}
|
||
debug_printf("\n");
|
||
|
||
debug_printf("Grid samples histogram:\n");
|
||
for (uint32_t i = 1; i <= block_width * block_height; i++)
|
||
{
|
||
if (grid_usage_hist[i])
|
||
fmt_debug_printf("{} samples: {} {3.2}%\n", i, grid_usage_hist[i], (float)grid_usage_hist[i] * 100.0f / (float)total_blocks);
|
||
}
|
||
debug_printf("\n");
|
||
|
||
fmt_debug_printf("orig vs. ASTC compressed:\n");
|
||
print_image_metrics(orig_img, unpacked_img);
|
||
|
||
fmt_debug_printf("Total encode time: {.3} secs, {.3} ms per block, {.1} blocks/sec\n", total_enc_time, total_enc_time * 1000.0f / total_blocks, total_blocks / total_enc_time);
|
||
|
||
fmt_debug_printf("OK\n");
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
//const uint32_t rice_zero_run_m = 3, rice_dct_coeff_m = 2;
|
||
|
||
// Breaks a trial mode down into the five indices used to address
// grouped_enc_trial_modes.m_tm_groups, and returns the uint_vec stored at that slot.
//
// Outputs (all written unconditionally):
//   cem_index    - tm.m_cem
//   subset_index - tm.m_num_parts - 1
//   ccs_index    - tm.m_ccs_index + 1 (so a CCS of -1 lands on index 0)
//   grid_size    - 1 if both grid dimensions are at least (block dimension - 1), otherwise 0
//   grid_aniso   - result of calc_grid_aniso_val() for this grid/block size
//
// The first three indices are range checked with asserts only.
const uint_vec& separate_tm_index(uint32_t block_width, uint32_t block_height, const basist::astc_ldr_t::grouped_trial_modes& grouped_enc_trial_modes, const basist::astc_ldr_t::trial_mode& tm,
	uint32_t& cem_index, uint32_t& subset_index, uint32_t& ccs_index, uint32_t& grid_size, uint32_t& grid_aniso)
{
	// Color endpoint mode.
	cem_index = tm.m_cem;
	assert(cem_index < basist::astc_ldr_t::OTM_NUM_CEMS);

	// Zero-based subset (partition) count.
	subset_index = tm.m_num_parts - 1;
	assert(subset_index < basist::astc_ldr_t::OTM_NUM_SUBSETS);

	// CCS is biased by one before being used as a table index.
	ccs_index = tm.m_ccs_index + 1;
	assert(ccs_index < basist::astc_ldr_t::OTM_NUM_CCS);

	// Grid size class: set only when both grid axes are within one sample of the block size.
	const bool grid_width_near_block = (tm.m_grid_width >= (block_width - 1));
	const bool grid_height_near_block = (tm.m_grid_height >= (block_height - 1));
	grid_size = (grid_width_near_block && grid_height_near_block) ? 1 : 0;

	grid_aniso = basist::astc_ldr_t::calc_grid_aniso_val(tm.m_grid_width, tm.m_grid_height, block_width, block_height);

	return grouped_enc_trial_modes.m_tm_groups[cem_index][subset_index][ccs_index][grid_size][grid_aniso];
}
|
||
|
||
// Returns true if the two logical blocks have the same configuration: first CEM,
// dual plane flag, color component selector, partition count and ID, weight grid
// dimensions, and endpoint/weight ISE ranges. Endpoint and weight values are not examined.
//
// trial_log_blk must not be a solid color block (asserted). A solid color neighbor
// never matches.
static bool compare_log_block_configs(const astc_helpers::log_astc_block& trial_log_blk, const astc_helpers::log_astc_block& neighbor_log_blk)
{
	assert(!trial_log_blk.m_solid_color_flag_ldr);

	if (neighbor_log_blk.m_solid_color_flag_ldr)
		return false;

	const astc_helpers::log_astc_block& t = trial_log_blk;
	const astc_helpers::log_astc_block& n = neighbor_log_blk;

	// Bail out on the first field that differs.
	if (t.m_color_endpoint_modes[0] != n.m_color_endpoint_modes[0])
		return false;

	if (t.m_dual_plane != n.m_dual_plane)
		return false;

	if (t.m_color_component_selector != n.m_color_component_selector)
		return false;

	if (t.m_num_partitions != n.m_num_partitions)
		return false;

	if (t.m_partition_id != n.m_partition_id)
		return false;

	if (t.m_grid_width != n.m_grid_width)
		return false;

	if (t.m_grid_height != n.m_grid_height)
		return false;

	if (t.m_endpoint_ise_range != n.m_endpoint_ise_range)
		return false;

	if (t.m_weight_ise_range != n.m_weight_ise_range)
		return false;

	return true;
}
|
||
|
||
// Returns true if both blocks share the same configuration (see compare_log_block_configs())
// and their m_endpoints arrays are bytewise identical over the values in use.
// Weights are not compared.
static bool compare_log_block_configs_and_endpoints(const astc_helpers::log_astc_block& trial_log_blk, const astc_helpers::log_astc_block& neighbor_log_blk)
{
	const bool configs_match = compare_log_block_configs(trial_log_blk, neighbor_log_blk);
	if (!configs_match)
		return false;

	// Values in use = number of partitions * values per CEM (the first CEM is used for all partitions here).
	const uint32_t num_endpoint_vals = trial_log_blk.m_num_partitions * astc_helpers::get_num_cem_values(trial_log_blk.m_color_endpoint_modes[0]);

	return memcmp(trial_log_blk.m_endpoints, neighbor_log_blk.m_endpoints, num_endpoint_vals) == 0;
}
|
||
|
||
// Full equality test between two logical blocks.
//
// Solid color blocks are equal only to other solid color blocks with the same four
// m_solid_color values. Non-solid blocks are equal when their configuration, the
// endpoint values in use, and the weight values in use (both planes when dual plane
// is enabled) all match.
static bool compare_log_blocks_for_equality(const astc_helpers::log_astc_block& trial_log_blk, const astc_helpers::log_astc_block& neighbor_log_blk)
{
	if (trial_log_blk.m_solid_color_flag_ldr || neighbor_log_blk.m_solid_color_flag_ldr)
	{
		// A solid block can never equal a non-solid one.
		if (!(trial_log_blk.m_solid_color_flag_ldr && neighbor_log_blk.m_solid_color_flag_ldr))
			return false;

		// Both solid: compare all four color components.
		uint32_t num_matching = 0;
		while ((num_matching < 4) && (trial_log_blk.m_solid_color[num_matching] == neighbor_log_blk.m_solid_color[num_matching]))
			num_matching++;

		return num_matching == 4;
	}

	assert(!trial_log_blk.m_solid_color_flag_ldr && !neighbor_log_blk.m_solid_color_flag_ldr);

	// Configuration must match field for field.
	if (trial_log_blk.m_color_endpoint_modes[0] != neighbor_log_blk.m_color_endpoint_modes[0])
		return false;

	if (trial_log_blk.m_dual_plane != neighbor_log_blk.m_dual_plane)
		return false;

	if (trial_log_blk.m_color_component_selector != neighbor_log_blk.m_color_component_selector)
		return false;

	if (trial_log_blk.m_num_partitions != neighbor_log_blk.m_num_partitions)
		return false;

	if (trial_log_blk.m_partition_id != neighbor_log_blk.m_partition_id)
		return false;

	if (trial_log_blk.m_grid_width != neighbor_log_blk.m_grid_width)
		return false;

	if (trial_log_blk.m_grid_height != neighbor_log_blk.m_grid_height)
		return false;

	if (trial_log_blk.m_endpoint_ise_range != neighbor_log_blk.m_endpoint_ise_range)
		return false;

	if (trial_log_blk.m_weight_ise_range != neighbor_log_blk.m_weight_ise_range)
		return false;

	// Endpoint values in use: partitions * values per CEM.
	const uint32_t num_endpoint_vals = trial_log_blk.m_num_partitions * astc_helpers::get_num_cem_values(trial_log_blk.m_color_endpoint_modes[0]);
	if (memcmp(trial_log_blk.m_endpoints, neighbor_log_blk.m_endpoints, num_endpoint_vals) != 0)
		return false;

	// Weight values in use: one grid per plane.
	const uint32_t num_weights = (trial_log_blk.m_dual_plane ? 2 : 1) * (trial_log_blk.m_grid_width * trial_log_blk.m_grid_height);

	return memcmp(trial_log_blk.m_weights, neighbor_log_blk.m_weights, num_weights) == 0;
}
|
||
|
||
void configure_encoder_effort_level(int level, ldr_astc_block_encode_image_high_level_config& cfg)
|
||
{
|
||
switch (level)
|
||
{
|
||
case 10:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = true;
|
||
cfg.m_filter_by_pca_angles_flag = false;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 256;
|
||
cfg.m_superbucket_max_to_retain[1] = 256;
|
||
cfg.m_superbucket_max_to_retain[2] = 256;
|
||
|
||
cfg.m_base_parts2 = 128;
|
||
cfg.m_base_parts3 = 128;
|
||
cfg.m_part2_fraction_to_keep = 1;
|
||
cfg.m_part3_fraction_to_keep = 1;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 128;
|
||
cfg.m_final_shortlist_max_size[1] = 128;
|
||
cfg.m_final_shortlist_max_size[2] = 128;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 1024;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 1024;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 1024;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 256;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 256;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 256;
|
||
cfg.m_base_parts2_p2 = 128;
|
||
cfg.m_base_parts3_p2 = 128;
|
||
cfg.m_force_all_dp_chans_p2 = true;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = false;
|
||
|
||
cfg.m_final_encode_always_try_rgb_direct = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 90.0f;
|
||
cfg.m_early_stop2_wpsnr = 90.0f;
|
||
cfg.m_grid_hv_filtering = false;
|
||
cfg.m_low_freq_block_filtering = false;
|
||
|
||
break;
|
||
}
|
||
case 9:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 8;
|
||
cfg.m_superbucket_max_to_retain[1] = 16;
|
||
cfg.m_superbucket_max_to_retain[2] = 32;
|
||
|
||
cfg.m_base_parts2 = 32;
|
||
cfg.m_base_parts3 = 32;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 4;
|
||
cfg.m_final_shortlist_max_size[1] = 12;
|
||
cfg.m_final_shortlist_max_size[2] = 24;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 16;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 64;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 256;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 32;
|
||
cfg.m_base_parts2_p2 = 64;
|
||
cfg.m_base_parts3_p2 = 64;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = false;
|
||
|
||
cfg.m_final_encode_always_try_rgb_direct = false;
|
||
|
||
cfg.m_early_stop_wpsnr = 75.0f;
|
||
cfg.m_early_stop2_wpsnr = 70.0f;
|
||
|
||
break;
|
||
}
|
||
case 8:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 4;
|
||
cfg.m_superbucket_max_to_retain[1] = 8;
|
||
cfg.m_superbucket_max_to_retain[2] = 16;
|
||
|
||
cfg.m_base_parts2 = 16;
|
||
cfg.m_base_parts3 = 16;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 3;
|
||
cfg.m_final_shortlist_max_size[1] = 8;
|
||
cfg.m_final_shortlist_max_size[2] = 12;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 16;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 64;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 256;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 32;
|
||
cfg.m_base_parts2_p2 = 64;
|
||
cfg.m_base_parts3_p2 = 64;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = false;
|
||
|
||
cfg.m_final_encode_always_try_rgb_direct = false;
|
||
|
||
cfg.m_early_stop_wpsnr = 75.0f;
|
||
cfg.m_early_stop2_wpsnr = 70.0f;
|
||
break;
|
||
}
|
||
case 7:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_disable_rgb_dual_plane = false;
|
||
cfg.m_strong_dp_decorr_thresh_rgb = .9f;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 3;
|
||
cfg.m_superbucket_max_to_retain[1] = 7;
|
||
cfg.m_superbucket_max_to_retain[2] = 12;
|
||
|
||
cfg.m_base_parts2 = 12;
|
||
cfg.m_base_parts3 = 12;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 2;
|
||
cfg.m_final_shortlist_max_size[1] = 4;
|
||
cfg.m_final_shortlist_max_size[2] = 8;
|
||
|
||
cfg.m_gradient_descent_flag = true;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = true;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 4;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 16;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 32;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 4;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 32;
|
||
cfg.m_base_parts2_p2 = 32;
|
||
cfg.m_base_parts3_p2 = 8;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 65.0f;
|
||
cfg.m_early_stop2_wpsnr = 60.0f;
|
||
break;
|
||
}
|
||
case 6:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_disable_rgb_dual_plane = false;
|
||
cfg.m_strong_dp_decorr_thresh_rgb = .75f;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 2;
|
||
cfg.m_superbucket_max_to_retain[1] = 5;
|
||
cfg.m_superbucket_max_to_retain[2] = 10;
|
||
|
||
cfg.m_base_parts2 = 12;
|
||
cfg.m_base_parts3 = 10;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 4;
|
||
cfg.m_final_shortlist_max_size[2] = 8;
|
||
|
||
cfg.m_gradient_descent_flag = true;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = true;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 2;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 8;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 2;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 16;
|
||
cfg.m_base_parts2_p2 = 32;
|
||
cfg.m_base_parts3_p2 = 8;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 65.0f;
|
||
cfg.m_early_stop2_wpsnr = 60.0f;
|
||
break;
|
||
}
|
||
case 5:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_disable_rgb_dual_plane = false;
|
||
cfg.m_strong_dp_decorr_thresh_rgb = .75f;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 1;
|
||
cfg.m_superbucket_max_to_retain[1] = 4;
|
||
cfg.m_superbucket_max_to_retain[2] = 8;
|
||
|
||
cfg.m_base_parts2 = 12;
|
||
cfg.m_base_parts3 = 8;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 4;
|
||
cfg.m_final_shortlist_max_size[2] = 8;
|
||
|
||
cfg.m_gradient_descent_flag = true;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = false;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 2;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 8;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 2;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 16;
|
||
cfg.m_base_parts2_p2 = 32;
|
||
cfg.m_base_parts3_p2 = 8;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 65.0f;
|
||
cfg.m_early_stop2_wpsnr = 60.0f;
|
||
break;
|
||
}
|
||
case 4:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = true;
|
||
|
||
cfg.m_disable_rgb_dual_plane = false;
|
||
cfg.m_strong_dp_decorr_thresh_rgb = .75f;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 1;
|
||
cfg.m_superbucket_max_to_retain[1] = 4;
|
||
cfg.m_superbucket_max_to_retain[2] = 8;
|
||
|
||
cfg.m_base_parts2 = 8;
|
||
cfg.m_base_parts3 = 4;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 4;
|
||
cfg.m_final_shortlist_max_size[2] = 8;
|
||
|
||
cfg.m_gradient_descent_flag = true;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = false;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 2;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 8;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 2;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 16;
|
||
cfg.m_base_parts2_p2 = 32;
|
||
cfg.m_base_parts3_p2 = 8;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 65.0f;
|
||
cfg.m_early_stop2_wpsnr = 60.0f;
|
||
break;
|
||
}
|
||
default:
|
||
case 3:
|
||
{
|
||
cfg.m_second_superpass_refinement = true;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = false;
|
||
|
||
cfg.m_disable_rgb_dual_plane = false;
|
||
cfg.m_strong_dp_decorr_thresh_rgb = .75f;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 1;
|
||
cfg.m_superbucket_max_to_retain[1] = 4;
|
||
cfg.m_superbucket_max_to_retain[2] = 8;
|
||
|
||
cfg.m_base_parts2 = 4;
|
||
cfg.m_base_parts3 = 2;
|
||
cfg.m_part2_fraction_to_keep = 2;
|
||
cfg.m_part3_fraction_to_keep = 2;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 4;
|
||
cfg.m_final_shortlist_max_size[2] = 8;
|
||
|
||
cfg.m_gradient_descent_flag = true;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = false;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .075f;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 2;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 8;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 16;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 2;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 16;
|
||
cfg.m_base_parts2_p2 = 32;
|
||
cfg.m_base_parts3_p2 = 8;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 65.0f;
|
||
cfg.m_early_stop2_wpsnr = 60.0f;
|
||
break;
|
||
}
|
||
case 2:
|
||
{
|
||
// Level 2+ have subsets and RGB dual-plane enabled
|
||
cfg.m_second_superpass_refinement = false;
|
||
cfg.m_third_superpass_try_neighbors = true;
|
||
|
||
cfg.m_subsets_enabled = true;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = false;
|
||
cfg.m_disable_rgb_dual_plane = false;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 1;
|
||
cfg.m_superbucket_max_to_retain[1] = 2;
|
||
cfg.m_superbucket_max_to_retain[2] = 3;
|
||
|
||
cfg.m_base_parts2 = 1;
|
||
cfg.m_base_parts3 = 0;
|
||
cfg.m_part2_fraction_to_keep = 1;
|
||
cfg.m_part3_fraction_to_keep = 1;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 2;
|
||
cfg.m_final_shortlist_max_size[2] = 3;
|
||
|
||
cfg.m_gradient_descent_flag = false;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = false;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
// Second superpass
|
||
cfg.m_second_superpass_fract_to_recompress = .04f;
|
||
cfg.m_second_pass_force_subsets_enabled = true;
|
||
cfg.m_superbucket_max_to_retain_p2[0] = 1;
|
||
cfg.m_superbucket_max_to_retain_p2[1] = 2;
|
||
cfg.m_superbucket_max_to_retain_p2[2] = 8;
|
||
cfg.m_final_shortlist_max_size_p2[0] = 1;
|
||
cfg.m_final_shortlist_max_size_p2[1] = 2;
|
||
cfg.m_final_shortlist_max_size_p2[2] = 8;
|
||
cfg.m_base_parts2_p2 = 16;
|
||
cfg.m_base_parts3_p2 = 0;
|
||
cfg.m_force_all_dp_chans_p2 = false;
|
||
cfg.m_filter_by_pca_angles_flag_p2 = true;
|
||
|
||
cfg.m_early_stop_wpsnr = 45.0f;
|
||
cfg.m_early_stop2_wpsnr = 40.0f;
|
||
break;
|
||
}
|
||
case 1:
|
||
{
|
||
cfg.m_second_superpass_refinement = false;
|
||
cfg.m_third_superpass_try_neighbors = false;
|
||
|
||
cfg.m_subsets_enabled = false;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = false;
|
||
cfg.m_disable_rgb_dual_plane = true;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 1;
|
||
cfg.m_superbucket_max_to_retain[1] = 1;
|
||
cfg.m_superbucket_max_to_retain[2] = 1;
|
||
|
||
cfg.m_base_parts2 = 0;
|
||
cfg.m_base_parts3 = 0;
|
||
cfg.m_part2_fraction_to_keep = 1;
|
||
cfg.m_part3_fraction_to_keep = 1;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 1;
|
||
cfg.m_final_shortlist_max_size[2] = 1;
|
||
|
||
cfg.m_gradient_descent_flag = false;
|
||
cfg.m_polish_weights_flag = true;
|
||
cfg.m_qcd_enabled_flag = false;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
cfg.m_early_stop_wpsnr = 45.0f;
|
||
cfg.m_early_stop2_wpsnr = 40.0f;
|
||
break;
|
||
}
|
||
case 0:
|
||
{
|
||
cfg.m_second_superpass_refinement = false;
|
||
cfg.m_third_superpass_try_neighbors = false;
|
||
|
||
cfg.m_subsets_enabled = false;
|
||
cfg.m_use_blue_contraction = true;
|
||
cfg.m_use_base_ofs = false;
|
||
cfg.m_disable_rgb_dual_plane = true;
|
||
|
||
cfg.m_force_all_dual_plane_chan_evals = false;
|
||
cfg.m_filter_by_pca_angles_flag = true;
|
||
|
||
cfg.m_superbucket_max_to_retain[0] = 1;
|
||
cfg.m_superbucket_max_to_retain[1] = 1;
|
||
cfg.m_superbucket_max_to_retain[2] = 1;
|
||
|
||
cfg.m_base_parts2 = 0;
|
||
cfg.m_base_parts3 = 0;
|
||
cfg.m_part2_fraction_to_keep = 1;
|
||
cfg.m_part3_fraction_to_keep = 1;
|
||
|
||
cfg.m_final_shortlist_fraction[0] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[1] = 1.0f;
|
||
cfg.m_final_shortlist_fraction[2] = 1.0f;
|
||
|
||
cfg.m_final_shortlist_max_size[0] = 1;
|
||
cfg.m_final_shortlist_max_size[1] = 1;
|
||
cfg.m_final_shortlist_max_size[2] = 1;
|
||
|
||
cfg.m_gradient_descent_flag = false;
|
||
cfg.m_polish_weights_flag = false;
|
||
cfg.m_qcd_enabled_flag = false;
|
||
|
||
cfg.m_bucket_pruning_passes = false;
|
||
cfg.m_cem_enc_params.m_max_ls_passes = 1;
|
||
|
||
cfg.m_early_stop_wpsnr = 45.0f;
|
||
cfg.m_early_stop2_wpsnr = 40.0f;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
static bool zstd_compress(const uint8_t* pData, size_t data_len, uint8_vec& comp_data, int zstd_level)
|
||
{
|
||
if (!data_len)
|
||
{
|
||
comp_data.resize(0);
|
||
return true;
|
||
}
|
||
|
||
assert(pData);
|
||
|
||
comp_data.resize(ZSTD_compressBound(data_len));
|
||
|
||
size_t result = ZSTD_compress(comp_data.data(), comp_data.size(), pData, data_len, zstd_level);
|
||
|
||
if (ZSTD_isError(result))
|
||
{
|
||
comp_data.resize(0);
|
||
return false;
|
||
}
|
||
|
||
if (result > UINT32_MAX)
|
||
{
|
||
comp_data.resize(0);
|
||
return false;
|
||
}
|
||
|
||
comp_data.resize(result);
|
||
return true;
|
||
}
|
||
|
||
static bool zstd_compress(const bitwise_coder& coder, uint8_vec& comp_data, int zstd_level)
|
||
{
|
||
return zstd_compress(coder.get_bytes().data(), coder.get_bytes().size(), comp_data, zstd_level);
|
||
}
|
||
|
||
static bool zstd_compress(const uint8_vec& vec, uint8_vec& comp_data, int zstd_level)
|
||
{
|
||
return zstd_compress(vec.data(), vec.size(), comp_data, zstd_level);
|
||
}
|
||
|
||
// Writes total_values values from pVals to coder, using the bit/trit/quint layout that
// astc_helpers::g_ise_range_table gives for endpoint_range (column 0 = low bit count,
// column 1 = nonzero if trits are used, column 2 = nonzero if quints are used).
//
// Each value is split into its low ep_bits bits and a remaining trit/quint digit. Output order:
//   1. every complete digit group: 5 trits packed base-3 into 8 bits, or 3 quints packed base-5 into 7 bits,
//   2. the leftover partial group (if any), in the fewest bits that can hold it,
//   3. the low bits of every value, ep_bits bits each.
//
// total_values must be in [1, 64]. Returns the total number of bits written.
static uint32_t encode_values(bitwise_coder& coder, uint32_t total_values, const uint8_t* pVals, uint32_t endpoint_range)
{
	const uint32_t MAX_VALS = 64;

	assert((total_values) && (total_values <= MAX_VALS));

	const auto& range_row = astc_helpers::g_ise_range_table[endpoint_range];
	const uint32_t ep_bits = range_row[0];
	const uint32_t ep_trits = range_row[1];
	const uint32_t ep_quints = range_row[2];

	// Radix of the non-binary digit: 3 for trits, 5 for quints, 0 when the range is bits-only.
	// Trits take precedence if both columns are set.
	const uint32_t radix = ep_trits ? 3 : (ep_quints ? 5 : 0);

	// A group is complete once the running multiplier hits 3^5 (trits) or 5^3 (quints).
	const uint32_t full_group_mul = ep_trits ? 243 : 125;
	const uint32_t full_group_bits = ep_trits ? 8 : 7;

	const uint32_t low_mask = (1 << ep_bits) - 1;

	uint32_t low_bits[MAX_VALS], groups[(MAX_VALS + 2) / 3];
	uint32_t num_groups = 0, accum = 0, mul = 1;

	// Pass 1: split each value, accumulating digits into packed groups.
	for (uint32_t idx = 0; idx < total_values; idx++)
	{
		const uint32_t v = pVals[idx];

		low_bits[idx] = v & low_mask;

		if (!radix)
			continue;

		const uint32_t digit = v >> ep_bits;
		assert(digit < radix);

		accum += digit * mul;
		mul *= radix;

		if (mul == full_group_mul)
		{
			assert(num_groups < BASISU_ARRAY_SIZE(groups));
			groups[num_groups++] = accum;

			accum = 0;
			mul = 1;
		}
	}

	uint32_t bits_written = 0;

	// Pass 2: complete groups first.
	for (uint32_t g = 0; g < num_groups; g++)
		coder.put_bits(groups[g], full_group_bits);

	bits_written += num_groups * full_group_bits;

	// Then the trailing partial group, if the value count wasn't a multiple of the group size.
	if (mul > 1)
	{
		uint32_t partial_bits;

		if (ep_trits)
		{
			switch (mul)
			{
			case 3:  partial_bits = 2; break; // 1 trit
			case 9:  partial_bits = 4; break; // 2 trits
			case 27: partial_bits = 5; break; // 3 trits
			default: partial_bits = 7; break; // mul == 81, 4 trits
			}
		}
		else
		{
			// mul == 5 (1 quint), otherwise mul == 25 (2 quints)
			partial_bits = (mul == 5) ? 3 : 5;
		}

		coder.put_bits(accum, partial_bits);
		bits_written += partial_bits;
	}

	// Finally the low bits of each value.
	for (uint32_t idx = 0; idx < total_values; idx++)
		coder.put_bits(low_bits[idx], ep_bits);

	bits_written += total_values * ep_bits;

	return bits_written;
}
|
||
|
||
static bool compress_image_full_zstd(
|
||
const image& orig_img, uint8_vec& comp_data, vector2D<astc_helpers::log_astc_block>& coded_blocks,
|
||
const astc_ldr_encode_config& global_cfg,
|
||
job_pool& job_pool,
|
||
ldr_astc_block_encode_image_high_level_config& enc_cfg, const ldr_astc_block_encode_image_output& enc_out)
|
||
{
|
||
BASISU_NOTE_UNUSED(job_pool);
|
||
|
||
const uint32_t width = orig_img.get_width(), height = orig_img.get_height();
|
||
|
||
const uint32_t block_width = global_cfg.m_astc_block_width;
|
||
const uint32_t block_height = global_cfg.m_astc_block_height;
|
||
const uint32_t total_block_pixels = block_width * block_height;
|
||
|
||
const uint32_t total_pixels = width * height;
|
||
const uint32_t num_blocks_x = (width + block_width - 1) / block_width;
|
||
const uint32_t num_blocks_y = (height + block_height - 1) / block_height;
|
||
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
|
||
const bool has_alpha = orig_img.has_alpha();
|
||
|
||
// Mode
|
||
uint8_vec mode_bytes;
|
||
mode_bytes.reserve(8192);
|
||
|
||
bitwise_coder raw_bits;
|
||
raw_bits.init(8192);
|
||
|
||
uint8_vec solid_dpcm_bytes;
|
||
solid_dpcm_bytes.reserve(8192);
|
||
|
||
// Endpoints
|
||
uint8_vec endpoint_dpcm_reuse_indices;
|
||
endpoint_dpcm_reuse_indices.reserve(8192);
|
||
|
||
bitwise_coder use_bc_bits;
|
||
use_bc_bits.init(1024);
|
||
|
||
bitwise_coder endpoint_dpcm_3bit;
|
||
endpoint_dpcm_3bit.init(1024);
|
||
|
||
bitwise_coder endpoint_dpcm_4bit;
|
||
endpoint_dpcm_4bit.init(1024);
|
||
|
||
uint8_vec endpoint_dpcm_5bit;
|
||
endpoint_dpcm_5bit.reserve(8192);
|
||
|
||
uint8_vec endpoint_dpcm_6bit;
|
||
endpoint_dpcm_6bit.reserve(8192);
|
||
|
||
uint8_vec endpoint_dpcm_7bit;
|
||
endpoint_dpcm_7bit.reserve(8192);
|
||
|
||
uint8_vec endpoint_dpcm_8bit;
|
||
endpoint_dpcm_8bit.reserve(8192);
|
||
|
||
// Weights
|
||
bitwise_coder mean0_bits;
|
||
uint8_vec mean1_bytes;
|
||
uint8_vec run_bytes;
|
||
uint8_vec coeff_bytes;
|
||
bitwise_coder sign_bits;
|
||
bitwise_coder weight2_bits;
|
||
bitwise_coder weight3_bits;
|
||
bitwise_coder weight4_bits;
|
||
uint8_vec weight8_bits;
|
||
|
||
mean0_bits.init(1024);
|
||
mean1_bytes.reserve(1024);
|
||
run_bytes.reserve(8192);
|
||
coeff_bytes.reserve(8192);
|
||
sign_bits.init(1024);
|
||
weight2_bits.init(1024);
|
||
weight3_bits.init(1024);
|
||
weight4_bits.init(1024);
|
||
weight8_bits.reserve(8192);
|
||
|
||
const float replacement_min_psnr = has_alpha ? global_cfg.m_replacement_min_psnr_alpha : global_cfg.m_replacement_min_psnr;
|
||
const float psnr_trial_diff_thresh = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_alpha : global_cfg.m_psnr_trial_diff_thresh;
|
||
const float psnr_trial_diff_thresh_edge = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_edge_alpha : global_cfg.m_psnr_trial_diff_thresh_edge;
|
||
const float total_comp_weights = enc_cfg.m_cem_enc_params.get_total_comp_weights();
|
||
|
||
basist::astc_ldr_t::grid_weight_dct grid_dct;
|
||
grid_dct.init(block_width, block_height);
|
||
|
||
coded_blocks.resize(num_blocks_x, num_blocks_y);
|
||
for (uint32_t y = 0; y < num_blocks_y; y++)
|
||
for (uint32_t x = 0; x < num_blocks_x; x++)
|
||
coded_blocks(x, y).clear();
|
||
|
||
vector2D<basist::astc_ldr_t::prev_block_state_full_zstd> prev_block_states(num_blocks_x, num_blocks_y);
|
||
|
||
int part2_hash[basist::astc_ldr_t::PART_HASH_SIZE];
|
||
std::fill(part2_hash, part2_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1);
|
||
|
||
int part3_hash[basist::astc_ldr_t::PART_HASH_SIZE];
|
||
std::fill(part3_hash, part3_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1);
|
||
|
||
int tm_hash[basist::astc_ldr_t::TM_HASH_SIZE];
|
||
std::fill(tm_hash, tm_hash + basist::astc_ldr_t::TM_HASH_SIZE, -1);
|
||
|
||
const bool use_run_commands_global_enable = true;
|
||
const bool endpoint_dpcm_global_enable = true;
|
||
|
||
uint32_t cur_run_len = 0;
|
||
|
||
uint32_t total_runs = 0, total_run_blocks = 0, total_nonrun_blocks = 0;
|
||
uint32_t total_lossy_replacements = 0;
|
||
uint32_t total_solid_blocks = 0;
|
||
uint32_t total_full_reuse_commands = 0;
|
||
uint32_t total_raw_commands = 0;
|
||
uint32_t total_reuse_full_cfg_emitted = 0;
|
||
uint32_t total_full_cfg_emitted = 0;
|
||
uint32_t num_part_hash_probes = 0;
|
||
uint32_t num_part_hash_hits = 0;
|
||
uint32_t total_used_endpoint_dpcm = 0;
|
||
uint32_t total_used_endpoint_raw = 0;
|
||
uint32_t total_used_dct = 0;
|
||
uint32_t total_used_weight_dpcm = 0;
|
||
uint32_t num_tm_hash_hits = 0, num_tm_hash_probes = 0;
|
||
|
||
raw_bits.put_bits(basist::astc_ldr_t::FULL_ZSTD_HEADER_MARKER, basist::astc_ldr_t::FULL_ZSTD_HEADER_MARKER_BITS);
|
||
|
||
const int block_dim_index = astc_helpers::find_astc_block_size_index(block_width, block_height);
|
||
assert((block_dim_index >= 0) && (block_dim_index < (int)astc_helpers::NUM_ASTC_BLOCK_SIZES));
|
||
|
||
raw_bits.put_bits(block_dim_index, 4);
|
||
|
||
raw_bits.put_bits(enc_cfg.m_cem_enc_params.m_decode_mode_srgb, 1);
|
||
|
||
raw_bits.put_bits(width, 16);
|
||
raw_bits.put_bits(height, 16);
|
||
|
||
raw_bits.put_bits(has_alpha, 1);
|
||
|
||
raw_bits.put_bits(enc_cfg.m_use_dct, 1);
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
const int int_q = clamp<int>((int)std::round(global_cfg.m_dct_quality * 2.0f), 0, 200);
|
||
raw_bits.put_bits(int_q, 8);
|
||
}
|
||
|
||
const uint32_t FULL_ZSTD_MAX_RUN_LEN = 64;
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
//const uint32_t base_y = by * block_height;
|
||
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
//const uint32_t base_x = bx * block_width;
|
||
//raw_bits.put_bits(0xA1, 8);
|
||
|
||
basist::astc_ldr_t::prev_block_state_full_zstd& prev_state = prev_block_states(bx, by);
|
||
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pLeft_state = bx ? &prev_block_states(bx - 1, by) : nullptr;
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pUpper_state = by ? &prev_block_states(bx, by - 1) : nullptr;
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pDiag_state = (bx && by) ? &prev_block_states(bx - 1, by - 1) : nullptr;
|
||
|
||
const ldr_astc_block_encode_image_output::block_info& blk_info = enc_out.m_image_block_info(bx, by);
|
||
|
||
uint32_t best_packed_out_block_index = blk_info.m_packed_out_block_index;
|
||
|
||
// check for run
|
||
if ((use_run_commands_global_enable) && (bx || by))
|
||
{
|
||
const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index];
|
||
const astc_helpers::log_astc_block& cur_log_blk = blk_out.m_log_blk;
|
||
|
||
const astc_helpers::log_astc_block& prev_log_blk = bx ? coded_blocks(bx - 1, by) : coded_blocks(0, by - 1);
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pPrev_block_state = bx ? pLeft_state : pUpper_state;
|
||
|
||
assert(pPrev_block_state);
|
||
|
||
if (compare_log_blocks_for_equality(cur_log_blk, prev_log_blk))
|
||
{
|
||
// Left or upper is exactly the same logical block, so expand the run.
|
||
cur_run_len++;
|
||
|
||
// Accept the previous block (left or upper) as if it's been coded normally.
|
||
|
||
coded_blocks(bx, by) = prev_log_blk;
|
||
|
||
//prev_state.m_was_solid_color = pPrev_block_state->m_was_solid_color;
|
||
prev_state.m_tm_index = pPrev_block_state->m_tm_index;
|
||
//prev_state.m_base_cem_index = pPrev_block_state->m_base_cem_index;
|
||
|
||
if (cur_run_len == FULL_ZSTD_MAX_RUN_LEN)
|
||
{
|
||
total_runs++;
|
||
total_run_blocks += cur_run_len;
|
||
mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RUN | ((cur_run_len - 1) << 2)));
|
||
cur_run_len = 0;
|
||
}
|
||
|
||
continue;
|
||
}
|
||
}
|
||
|
||
if (cur_run_len)
|
||
{
|
||
assert(cur_run_len <= FULL_ZSTD_MAX_RUN_LEN);
|
||
|
||
total_runs++;
|
||
total_run_blocks += cur_run_len;
|
||
mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RUN | ((cur_run_len - 1) << 2)));
|
||
cur_run_len = 0;
|
||
}
|
||
|
||
total_nonrun_blocks++;
|
||
|
||
// TODO: Move this to a prepass that's shared between arith/zstd
|
||
const float ref_wmse = (float)blk_info.m_out_blocks[best_packed_out_block_index].m_sse / (total_comp_weights * (float)total_block_pixels);
|
||
const float ref_wpsnr = (ref_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(ref_wmse)) : 10000.0f;
|
||
|
||
if ((global_cfg.m_lossy_supercompression) && (ref_wpsnr >= replacement_min_psnr) &&
|
||
(!blk_info.m_out_blocks[blk_info.m_packed_out_block_index].m_log_blk.m_solid_color_flag_ldr))
|
||
{
|
||
const float psnr_thresh = blk_info.m_strong_edges ? psnr_trial_diff_thresh_edge : psnr_trial_diff_thresh;
|
||
|
||
float best_alt_wpsnr = 0.0f;
|
||
bool found_alternative = false;
|
||
|
||
// Pass: 0 consider full config+part ID endpoint reuse
|
||
// Pass: 1 fall back to just full config+part ID reuse (no endpoints)
|
||
for (uint32_t pass = 0; pass < 2; pass++)
|
||
{
|
||
// Iterate through all available alternative candidates
|
||
for (uint32_t out_block_iter = 0; out_block_iter < blk_info.m_out_blocks.size(); out_block_iter++)
|
||
{
|
||
if (out_block_iter == blk_info.m_packed_out_block_index)
|
||
continue;
|
||
|
||
const float trial_wmse = (float)blk_info.m_out_blocks[out_block_iter].m_sse / (total_comp_weights * (float)total_block_pixels);
|
||
const float trial_wpsnr = (trial_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(trial_wmse)) : 10000.0f;
|
||
|
||
// Reject if PSNR too low
|
||
if (trial_wpsnr < (ref_wpsnr - psnr_thresh))
|
||
continue;
|
||
|
||
// Reject if inferior to the best alternative found so far
|
||
if (trial_wpsnr < best_alt_wpsnr)
|
||
continue;
|
||
|
||
const astc_helpers::log_astc_block& trial_log_blk = blk_info.m_out_blocks[out_block_iter].m_log_blk;
|
||
|
||
if (trial_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
// Examine nearby neighbors
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++)
|
||
{
|
||
int dx = 0, dy = 0;
|
||
switch (i)
|
||
{
|
||
case 0: dx = -1; break;
|
||
case 1: dy = -1; break;
|
||
case 2: dx = -1; dy = -1; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
const int n_bx = bx + dx, n_by = by + dy;
|
||
if ((n_bx < 0) || (n_by < 0))
|
||
continue;
|
||
|
||
astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by);
|
||
|
||
if (neighbor_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
bool accept_flag = false;
|
||
if (pass == 0)
|
||
{
|
||
// prefer full config+endpoint equality first
|
||
accept_flag = compare_log_block_configs_and_endpoints(trial_log_blk, neighbor_log_blk);
|
||
}
|
||
else
|
||
{
|
||
// next check for just config equality
|
||
accept_flag = compare_log_block_configs(trial_log_blk, neighbor_log_blk);
|
||
}
|
||
|
||
if (accept_flag)
|
||
{
|
||
best_alt_wpsnr = trial_wpsnr;
|
||
best_packed_out_block_index = out_block_iter;
|
||
found_alternative = true;
|
||
break;
|
||
}
|
||
|
||
} // i
|
||
|
||
} // out_block_iter
|
||
|
||
if (found_alternative)
|
||
break;
|
||
|
||
} // pass
|
||
|
||
if (best_packed_out_block_index != blk_info.m_packed_out_block_index)
|
||
total_lossy_replacements++;
|
||
|
||
} // global_cfg.m_lossy_supercompression
|
||
|
||
const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index];
|
||
|
||
astc_helpers::log_astc_block& cur_log_blk = coded_blocks(bx, by);
|
||
|
||
cur_log_blk = blk_out.m_log_blk;
|
||
|
||
// Solid color/void extent
|
||
if (blk_out.m_trial_mode_index < 0)
|
||
{
|
||
assert(cur_log_blk.m_solid_color_flag_ldr);
|
||
|
||
total_solid_blocks++;
|
||
|
||
mode_bytes.push_back((uint8_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_SOLID);
|
||
|
||
uint32_t cur_solid_color[4];
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
cur_solid_color[i] = blk_out.m_log_blk.m_solid_color[i] >> 8;
|
||
|
||
uint32_t prev_solid_color[4] = { 0 };
|
||
|
||
const uint32_t num_comps = has_alpha ? 4 : 3;
|
||
|
||
astc_helpers::log_astc_block* pPrev_log_blk = bx ? &coded_blocks(bx - 1, by) : (by ? &coded_blocks(bx, by - 1) : nullptr);
|
||
if (pPrev_log_blk)
|
||
{
|
||
if (pPrev_log_blk->m_solid_color_flag_ldr)
|
||
{
|
||
prev_solid_color[0] = pPrev_log_blk->m_solid_color[0] >> 8;
|
||
prev_solid_color[1] = pPrev_log_blk->m_solid_color[1] >> 8;
|
||
prev_solid_color[2] = pPrev_log_blk->m_solid_color[2] >> 8;
|
||
prev_solid_color[3] = pPrev_log_blk->m_solid_color[3] >> 8;
|
||
}
|
||
else
|
||
{
|
||
// Decode previous block's first CEM, use the halfway point as the predictor.
|
||
color_rgba prev_l, prev_h;
|
||
decode_endpoints(pPrev_log_blk->m_color_endpoint_modes[0], pPrev_log_blk->m_endpoints, pPrev_log_blk->m_endpoint_ise_range, prev_l, prev_h);
|
||
|
||
prev_solid_color[0] = (prev_l[0] + prev_h[0] + 1) >> 1;
|
||
prev_solid_color[1] = (prev_l[1] + prev_h[1] + 1) >> 1;
|
||
prev_solid_color[2] = (prev_l[2] + prev_h[2] + 1) >> 1;
|
||
prev_solid_color[3] = (prev_l[3] + prev_h[3] + 1) >> 1;
|
||
}
|
||
}
|
||
|
||
for (uint32_t i = 0; i < num_comps; i++)
|
||
{
|
||
const uint32_t delta = (cur_solid_color[i] - prev_solid_color[i]) & 0xFF;
|
||
solid_dpcm_bytes.push_back((uint8_t)delta);
|
||
}
|
||
|
||
//prev_state.m_was_solid_color = true;
|
||
prev_state.m_tm_index = -1;
|
||
//prev_state.m_base_cem_index = astc_helpers::CEM_LDR_RGB_DIRECT;
|
||
|
||
continue;
|
||
}
|
||
|
||
assert(!cur_log_blk.m_solid_color_flag_ldr);
|
||
|
||
int full_cfg_endpoint_reuse_index = -1;
|
||
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++)
|
||
{
|
||
int dx = 0, dy = 0;
|
||
switch (i)
|
||
{
|
||
case 0: dx = -1; break;
|
||
case 1: dy = -1; break;
|
||
case 2: dx = -1; dy = -1; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
const int n_bx = bx + dx, n_by = by + dy;
|
||
if ((n_bx < 0) || (n_by < 0))
|
||
continue;
|
||
|
||
astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by);
|
||
|
||
if (neighbor_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
if (compare_log_block_configs_and_endpoints(cur_log_blk, neighbor_log_blk))
|
||
{
|
||
full_cfg_endpoint_reuse_index = i;
|
||
break;
|
||
}
|
||
} // i
|
||
|
||
if (full_cfg_endpoint_reuse_index >= 0)
|
||
{
|
||
// Reused full config, part ID and endpoint values from an immediate neighbor
|
||
mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_REUSE_CFG_ENDPOINTS_LEFT + (full_cfg_endpoint_reuse_index << 2)));
|
||
|
||
total_full_reuse_commands++;
|
||
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pReused_cfg_state = nullptr;
|
||
|
||
switch (full_cfg_endpoint_reuse_index)
|
||
{
|
||
case 0: pReused_cfg_state = pLeft_state; break;
|
||
case 1: pReused_cfg_state = pUpper_state; break;
|
||
case 2: pReused_cfg_state = pDiag_state; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
if (!pReused_cfg_state)
|
||
{
|
||
assert(0);
|
||
fmt_error_printf("encoding internal failure\n");
|
||
return false;
|
||
}
|
||
|
||
assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index);
|
||
|
||
prev_state.m_tm_index = blk_out.m_trial_mode_index;
|
||
}
|
||
else
|
||
{
|
||
// No nearby full config+part ID+endpoint reuse, so send raw command
|
||
// Must send endpoints too.
|
||
total_raw_commands++;
|
||
|
||
// Format of mode byte (UD bit used in modes other than raw)
|
||
// 7 6 5 4 3 2 1 0
|
||
// UD C ED HH BO I I M
|
||
|
||
// MMM=mode
|
||
// II=neighbor reuse index [0,3], 3=no reuse
|
||
// BO=base offset flag
|
||
// HH=partition hash hit flag
|
||
// ED=endpoint DPCM flag
|
||
// C=config hash table hit
|
||
// UD=use DCT flag
|
||
|
||
mode_bytes.push_back((uint8_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RAW);
|
||
|
||
const uint32_t cur_actual_cem = cur_log_blk.m_color_endpoint_modes[0];
|
||
const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cur_actual_cem);
|
||
|
||
// DO NOT use tm.m_cem because the encoder may have selected a base+ofs variant instead. Use cur_actual_cem.
|
||
const basist::astc_ldr_t::trial_mode& tm = enc_out.m_encoder_trial_modes[blk_out.m_trial_mode_index];
|
||
|
||
// Check for config+part ID neighbor reuse (partial reuse)
|
||
int neighbor_cfg_match_index = -1;
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++)
|
||
{
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pNeighbor_state = nullptr;
|
||
|
||
int dx = 0, dy = 0;
|
||
switch (i)
|
||
{
|
||
case 0: dx = -1; pNeighbor_state = pLeft_state; break;
|
||
case 1: dy = -1; pNeighbor_state = pUpper_state; break;
|
||
case 2: dx = -1; dy = -1; pNeighbor_state = pDiag_state; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
if (!pNeighbor_state)
|
||
continue;
|
||
|
||
const int n_bx = bx + dx, n_by = by + dy;
|
||
assert((n_bx >= 0) && (n_by >= 0));
|
||
|
||
astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by);
|
||
|
||
if (pNeighbor_state->m_tm_index != blk_out.m_trial_mode_index)
|
||
continue;
|
||
|
||
if (neighbor_log_blk.m_color_endpoint_modes[0] != cur_log_blk.m_color_endpoint_modes[0])
|
||
continue;
|
||
|
||
if (neighbor_log_blk.m_partition_id != cur_log_blk.m_partition_id)
|
||
continue;
|
||
|
||
assert(neighbor_log_blk.m_dual_plane == cur_log_blk.m_dual_plane);
|
||
assert(neighbor_log_blk.m_color_component_selector == cur_log_blk.m_color_component_selector);
|
||
assert(neighbor_log_blk.m_num_partitions == cur_log_blk.m_num_partitions);
|
||
assert(neighbor_log_blk.m_grid_width == cur_log_blk.m_grid_width);
|
||
assert(neighbor_log_blk.m_grid_height == cur_log_blk.m_grid_height);
|
||
assert(neighbor_log_blk.m_endpoint_ise_range == cur_log_blk.m_endpoint_ise_range);
|
||
assert(neighbor_log_blk.m_weight_ise_range == cur_log_blk.m_weight_ise_range);
|
||
|
||
neighbor_cfg_match_index = i;
|
||
break;
|
||
}
|
||
|
||
if (neighbor_cfg_match_index >= 0)
|
||
{
|
||
// Partial reuse (config+partition ID, but not endpoints).
|
||
// OR 2-bits into the mode byte
|
||
mode_bytes.back() |= (uint8_t)(neighbor_cfg_match_index << 1);
|
||
|
||
const basist::astc_ldr_t::prev_block_state_full_zstd* pReused_cfg_state = nullptr;
|
||
|
||
switch (neighbor_cfg_match_index)
|
||
{
|
||
case 0: pReused_cfg_state = pLeft_state; break;
|
||
case 1: pReused_cfg_state = pUpper_state; break;
|
||
case 2: pReused_cfg_state = pDiag_state; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
if (!pReused_cfg_state)
|
||
{
|
||
assert(0);
|
||
fmt_error_printf("encoding internal failure\n");
|
||
return false;
|
||
}
|
||
|
||
assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index);
|
||
|
||
prev_state.m_tm_index = blk_out.m_trial_mode_index;
|
||
|
||
total_reuse_full_cfg_emitted++;
|
||
}
|
||
else
|
||
{
|
||
// No reuse - must send config, so pack it. Then send endpoints.
|
||
total_full_cfg_emitted++;
|
||
|
||
// OR 2-bits into the mode byte (so now 5 bits total)
|
||
mode_bytes.back() |= (uint8_t)(((uint32_t)basist::astc_ldr_t::cMaxConfigReuseNeighbors) << 1);
|
||
|
||
// Pack tm index (ASTC base config)
|
||
{
|
||
num_tm_hash_probes++;
|
||
|
||
uint32_t tm_h = basist::astc_ldr_t::tm_hash_index(blk_out.m_trial_mode_index);
|
||
|
||
if (tm_hash[tm_h] == blk_out.m_trial_mode_index)
|
||
{
|
||
num_tm_hash_hits++;
|
||
|
||
mode_bytes.back() |= (uint8_t)basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_TM_HASH_HIT_FLAG; // tm hash hit flag
|
||
|
||
raw_bits.put_bits(tm_h, basist::astc_ldr_t::TM_HASH_BITS);
|
||
}
|
||
else
|
||
{
|
||
raw_bits.put_truncated_binary(blk_out.m_trial_mode_index, (uint32_t)enc_out.m_encoder_trial_modes.size());
|
||
|
||
tm_hash[tm_h] = blk_out.m_trial_mode_index;
|
||
}
|
||
}
|
||
|
||
prev_state.m_tm_index = blk_out.m_trial_mode_index;
|
||
|
||
// Send base_ofs bit if the tm is direct
|
||
if ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
{
|
||
const bool is_base_ofs = (cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) ||
|
||
(cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET);
|
||
|
||
if (is_base_ofs)
|
||
mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_IS_BASE_OFS_FLAG; // base_ofs bit
|
||
}
|
||
|
||
if (tm.m_num_parts > 1)
|
||
{
|
||
// Send unique part pattern ID
|
||
const astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? &enc_out.m_part_data_p2 : &enc_out.m_part_data_p3;
|
||
|
||
const uint32_t astc_pat_index = cur_log_blk.m_partition_id;
|
||
const uint32_t unique_pat_index = pPart_data->m_part_seed_to_unique_index[astc_pat_index];
|
||
const uint32_t total_unique_indices = pPart_data->m_total_unique_patterns;
|
||
assert(unique_pat_index < total_unique_indices);
|
||
|
||
num_part_hash_probes++;
|
||
|
||
int* pPart_hash = (tm.m_num_parts == 2) ? part2_hash : part3_hash;
|
||
|
||
const uint32_t h = basist::astc_ldr_t::part_hash_index(unique_pat_index);
|
||
|
||
if (pPart_hash[h] != (int)unique_pat_index)
|
||
{
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
// sanity
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::PART_HASH_SIZE; i++)
|
||
{
|
||
assert(pPart_hash[i] != (int)unique_pat_index);
|
||
}
|
||
#endif
|
||
|
||
raw_bits.put_truncated_binary(unique_pat_index, total_unique_indices);
|
||
}
|
||
else
|
||
{
|
||
num_part_hash_hits++;
|
||
|
||
mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_PART_HASH_HIT; // hash pat_index hit bit
|
||
raw_bits.put_bits(h, basist::astc_ldr_t::PART_HASH_BITS);
|
||
}
|
||
|
||
pPart_hash[basist::astc_ldr_t::part_hash_index(unique_pat_index)] = unique_pat_index;
|
||
}
|
||
}
|
||
|
||
// Send endpoints
|
||
const int num_endpoint_levels = astc_helpers::get_ise_levels(cur_log_blk.m_endpoint_ise_range);
|
||
const auto& endpoint_ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(cur_log_blk.m_endpoint_ise_range).m_ISE_to_rank;
|
||
|
||
bool endpoints_use_bc[astc_helpers::MAX_PARTITIONS] = { false };
|
||
|
||
if (astc_helpers::cem_supports_bc(cur_actual_cem))
|
||
{
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
const bool cur_uses_bc = astc_helpers::used_blue_contraction(cur_actual_cem, cur_log_blk.m_endpoints + part_iter * total_endpoint_vals, cur_log_blk.m_endpoint_ise_range);
|
||
|
||
endpoints_use_bc[part_iter] = cur_uses_bc;
|
||
|
||
} // part_iter
|
||
}
|
||
|
||
int best_reuse_bx = -1, best_reuse_by = -1;
|
||
uint32_t best_reuse_index = 0;
|
||
const astc_helpers::log_astc_block* pEndpoint_pred_log_blk = nullptr;
|
||
|
||
if (endpoint_dpcm_global_enable)
|
||
{
|
||
int64_t best_trial_delta2 = INT64_MAX;
|
||
float best_trial_bits = BIG_FLOAT_VAL;
|
||
|
||
// TODO: Decide if DPCM is even worth it.
|
||
const float N = (float)(total_endpoint_vals * tm.m_num_parts);
|
||
|
||
for (uint32_t reuse_index = 0; reuse_index < basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS; reuse_index++)
|
||
{
|
||
const int rx = (int)bx + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_x;
|
||
const int ry = (int)by + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_y;
|
||
if ((rx < 0) || (ry < 0) || (rx >= (int)num_blocks_x) || (ry >= (int)num_blocks_y))
|
||
continue;
|
||
|
||
const astc_helpers::log_astc_block* pTrial_log_blk = &coded_blocks(rx, ry);
|
||
if (pTrial_log_blk->m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
uint8_t trial_predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { };
|
||
|
||
uint32_t part_iter;
|
||
for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
const bool always_repack_flag = false;
|
||
bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false;
|
||
|
||
bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems(
|
||
pTrial_log_blk->m_color_endpoint_modes[0], pTrial_log_blk->m_endpoint_ise_range, pTrial_log_blk->m_endpoints,
|
||
cur_actual_cem, cur_log_blk.m_endpoint_ise_range, trial_predicted_endpoints[part_iter],
|
||
always_repack_flag,
|
||
endpoints_use_bc[part_iter], false,
|
||
blue_contraction_clamped_flag, base_ofs_clamped_flag);
|
||
|
||
if (!conv_status)
|
||
break;
|
||
} // part_iter
|
||
|
||
if (part_iter < tm.m_num_parts)
|
||
continue; // failed
|
||
|
||
int64_t trial_endpoint_delta2 = 0;
|
||
for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++)
|
||
{
|
||
int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]];
|
||
int prev_e_rank = endpoint_ise_to_rank[trial_predicted_endpoints[part_iter][val_iter]];
|
||
|
||
int e_delta = cur_e_rank - prev_e_rank;
|
||
|
||
trial_endpoint_delta2 += e_delta * e_delta;
|
||
|
||
} // val_iter
|
||
|
||
} // part_iter
|
||
|
||
const float mse = (float)trial_endpoint_delta2 / N;
|
||
|
||
// Gaussian entropy estimate - precomputed 0.5 * log2(2*pi*e) = ~2.0470956f
|
||
const float k_const = 2.0470956f;
|
||
|
||
float bits_per_sym = 0.5f * log2f(basisu::maximum(mse, 1e-9f)) + k_const;
|
||
|
||
bits_per_sym = clamp(bits_per_sym, 0.05f, 8.0f);
|
||
|
||
// total est bits for this block's endpoints
|
||
float total_est_bits = bits_per_sym * N;
|
||
|
||
if (total_est_bits < best_trial_bits)
|
||
{
|
||
best_trial_delta2 = trial_endpoint_delta2;
|
||
best_trial_bits = total_est_bits;
|
||
|
||
best_reuse_bx = rx;
|
||
best_reuse_by = ry;
|
||
best_reuse_index = reuse_index;
|
||
|
||
if (!best_trial_delta2)
|
||
break;
|
||
}
|
||
|
||
} // reuse_index
|
||
|
||
if (best_reuse_bx >= 0)
|
||
{
|
||
pEndpoint_pred_log_blk = &coded_blocks(best_reuse_bx, best_reuse_by);
|
||
|
||
assert(!pEndpoint_pred_log_blk->m_solid_color_flag_ldr);
|
||
}
|
||
|
||
} // if (endpoint_dpcm_global_enable)
|
||
|
||
uint8_t predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { };
|
||
|
||
bool use_dpcm_endpoints = false;
|
||
|
||
if (pEndpoint_pred_log_blk)
|
||
{
|
||
use_dpcm_endpoints = true;
|
||
|
||
assert(cur_log_blk.m_num_partitions == tm.m_num_parts);
|
||
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
const bool always_repack_flag = false;
|
||
bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false;
|
||
|
||
bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems(
|
||
pEndpoint_pred_log_blk->m_color_endpoint_modes[0], pEndpoint_pred_log_blk->m_endpoint_ise_range, pEndpoint_pred_log_blk->m_endpoints,
|
||
cur_actual_cem, cur_log_blk.m_endpoint_ise_range, predicted_endpoints[part_iter],
|
||
always_repack_flag,
|
||
endpoints_use_bc[part_iter], false,
|
||
blue_contraction_clamped_flag, base_ofs_clamped_flag);
|
||
|
||
if (!conv_status)
|
||
{
|
||
// In practice, should never happen
|
||
use_dpcm_endpoints = false;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// TODO: Decide what is cheaper, endpoint DPCM vs. raw
|
||
|
||
if (use_dpcm_endpoints)
|
||
{
|
||
// DPCM flag bit
|
||
mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_DPCM_ENDPOINTS_FLAG;
|
||
|
||
endpoint_dpcm_reuse_indices.push_back((uint8_t)best_reuse_index);
|
||
|
||
if (astc_helpers::cem_supports_bc(cur_actual_cem))
|
||
{
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
use_bc_bits.put_bits(endpoints_use_bc[part_iter], 1);
|
||
|
||
} // part_iter
|
||
}
|
||
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++)
|
||
{
|
||
int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]];
|
||
int prev_e_rank = endpoint_ise_to_rank[predicted_endpoints[part_iter][val_iter]];
|
||
|
||
int e_val = imod(cur_e_rank - prev_e_rank, num_endpoint_levels);
|
||
|
||
if (num_endpoint_levels <= 8)
|
||
endpoint_dpcm_3bit.put_bits(e_val, 4);
|
||
else if (num_endpoint_levels <= 16)
|
||
endpoint_dpcm_4bit.put_bits(e_val, 4);
|
||
else if (num_endpoint_levels <= 32)
|
||
endpoint_dpcm_5bit.push_back((uint8_t)e_val);
|
||
else if (num_endpoint_levels <= 64)
|
||
endpoint_dpcm_6bit.push_back((uint8_t)e_val);
|
||
else if (num_endpoint_levels <= 128)
|
||
endpoint_dpcm_7bit.push_back((uint8_t)e_val);
|
||
else if (num_endpoint_levels <= 256)
|
||
endpoint_dpcm_8bit.push_back((uint8_t)e_val);
|
||
|
||
} // val_iter
|
||
|
||
} // part_iter
|
||
|
||
total_used_endpoint_dpcm++;
|
||
}
|
||
else
|
||
{
|
||
encode_values(raw_bits, tm.m_num_parts * total_endpoint_vals, cur_log_blk.m_endpoints, cur_log_blk.m_endpoint_ise_range);
|
||
|
||
total_used_endpoint_raw++;
|
||
} // if (use_dpcm_endpoints)
|
||
|
||
} // if (full_cfg_endpoint_reuse_index >= 0)
|
||
|
||
// ------------------------------------ Send weights
|
||
|
||
const uint32_t total_planes = cur_log_blk.m_dual_plane ? 2 : 1;
|
||
const uint32_t total_weights = cur_log_blk.m_grid_width * cur_log_blk.m_grid_height;
|
||
|
||
const int num_weight_levels = astc_helpers::get_ise_levels(cur_log_blk.m_weight_ise_range);
|
||
const auto& weight_ise_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(cur_log_blk.m_weight_ise_range).m_ISE_to_rank;
|
||
|
||
bool use_dct = enc_cfg.m_use_dct;
|
||
|
||
// TODO - tune this threshold
|
||
const uint32_t SWITCH_TO_DPCM_NUM_COEFF_THRESH = (cur_log_blk.m_grid_width * cur_log_blk.m_grid_height * 45 + 64) >> 7;
|
||
|
||
if (use_dct)
|
||
{
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
if (syms.m_max_coeff_mag > basist::astc_ldr_t::DCT_MAX_ARITH_COEFF_MAG)
|
||
{
|
||
use_dct = false;
|
||
break;
|
||
}
|
||
|
||
if (syms.m_coeffs.size() > SWITCH_TO_DPCM_NUM_COEFF_THRESH)
|
||
{
|
||
use_dct = false;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// MSB of mode byte=use DCT
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
assert((mode_bytes.back() & basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_USE_DCT) == 0);
|
||
|
||
if (use_dct)
|
||
mode_bytes.back() |= basist::astc_ldr_t::XUASTC_LDR_MODE_BYTE_USE_DCT;
|
||
}
|
||
|
||
if (use_dct)
|
||
{
|
||
total_used_dct++;
|
||
|
||
if (total_planes > 1)
|
||
{
|
||
assert(blk_out.m_packed_dct_plane_data[0].m_num_dc_levels == blk_out.m_packed_dct_plane_data[1].m_num_dc_levels);
|
||
}
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
|
||
if (syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1)
|
||
mean1_bytes.push_back((uint8_t)syms.m_dc_sym);
|
||
else
|
||
{
|
||
assert(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS0);
|
||
mean0_bits.put_bits(syms.m_dc_sym, 4);
|
||
}
|
||
|
||
for (uint32_t i = 0; i < syms.m_coeffs.size(); i++)
|
||
{
|
||
if (syms.m_coeffs[i].m_coeff == INT16_MAX)
|
||
{
|
||
run_bytes.push_back(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX);
|
||
}
|
||
else
|
||
{
|
||
run_bytes.push_back((uint8_t)syms.m_coeffs[i].m_num_zeros);
|
||
|
||
sign_bits.put_bits(syms.m_coeffs[i].m_coeff < 0, 1);
|
||
|
||
assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255));
|
||
|
||
coeff_bytes.push_back((uint8_t)(iabs(syms.m_coeffs[i].m_coeff) - 1));
|
||
}
|
||
}
|
||
|
||
} // plane_iter
|
||
}
|
||
else
|
||
{
|
||
total_used_weight_dpcm++;
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
int prev_w = num_weight_levels / 2;
|
||
|
||
for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++)
|
||
{
|
||
int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes];
|
||
int w = weight_ise_to_rank[ise_w];
|
||
|
||
int w_to_code = w;
|
||
w_to_code = imod(w - prev_w, num_weight_levels);
|
||
|
||
prev_w = w;
|
||
|
||
if (num_weight_levels <= 4)
|
||
weight2_bits.put_bits((uint8_t)w_to_code, 2);
|
||
else if (num_weight_levels <= 8)
|
||
weight3_bits.put_bits((uint8_t)w_to_code, 4);
|
||
else if (num_weight_levels <= 16)
|
||
weight4_bits.put_bits((uint8_t)w_to_code, 4);
|
||
else
|
||
weight8_bits.push_back((uint8_t)w_to_code);
|
||
|
||
} // weight_iter
|
||
|
||
} // plane_iter
|
||
}
|
||
|
||
} // bx
|
||
|
||
if (cur_run_len)
|
||
{
|
||
assert(cur_run_len <= FULL_ZSTD_MAX_RUN_LEN);
|
||
|
||
total_runs++;
|
||
total_run_blocks += cur_run_len;
|
||
mode_bytes.push_back((uint8_t)((uint32_t)basist::astc_ldr_t::xuastc_zstd_mode::cMODE_RUN | ((cur_run_len - 1) << 2)));
|
||
cur_run_len = 0;
|
||
}
|
||
|
||
} // by
|
||
|
||
raw_bits.put_bits(basist::astc_ldr_t::FINAL_SYNC_MARKER, basist::astc_ldr_t::FINAL_SYNC_MARKER_BITS);
|
||
|
||
raw_bits.flush();
|
||
endpoint_dpcm_3bit.flush();
|
||
endpoint_dpcm_4bit.flush();
|
||
use_bc_bits.flush();
|
||
|
||
mean0_bits.flush();
|
||
sign_bits.flush();
|
||
weight2_bits.flush();
|
||
weight3_bits.flush();
|
||
weight4_bits.flush();
|
||
|
||
const uint32_t zstd_level = 9;
|
||
|
||
uint8_vec comp_mode, comp_solid_dpcm, comp_endpoint_dpcm_reuse_indices;
|
||
uint8_vec comp_use_bc_bits, comp_endpoint_dpcm_3bit, comp_endpoint_dpcm_4bit, comp_endpoint_dpcm_5bit, comp_endpoint_dpcm_6bit, comp_endpoint_dpcm_7bit, comp_endpoint_dpcm_8bit;
|
||
|
||
// Mode
|
||
if (!zstd_compress(mode_bytes, comp_mode, zstd_level)) return false;
|
||
if (!zstd_compress(solid_dpcm_bytes, comp_solid_dpcm, zstd_level)) return false;
|
||
|
||
// Endpoints
|
||
if (!zstd_compress(endpoint_dpcm_reuse_indices, comp_endpoint_dpcm_reuse_indices, zstd_level)) return false;
|
||
if (!zstd_compress(use_bc_bits, comp_use_bc_bits, zstd_level)) return false;
|
||
if (!zstd_compress(endpoint_dpcm_3bit, comp_endpoint_dpcm_3bit, zstd_level)) return false;
|
||
if (!zstd_compress(endpoint_dpcm_4bit, comp_endpoint_dpcm_4bit, zstd_level)) return false;
|
||
if (!zstd_compress(endpoint_dpcm_5bit, comp_endpoint_dpcm_5bit, zstd_level)) return false;
|
||
if (!zstd_compress(endpoint_dpcm_6bit, comp_endpoint_dpcm_6bit, zstd_level)) return false;
|
||
if (!zstd_compress(endpoint_dpcm_7bit, comp_endpoint_dpcm_7bit, zstd_level)) return false;
|
||
if (!zstd_compress(endpoint_dpcm_8bit, comp_endpoint_dpcm_8bit, zstd_level)) return false;
|
||
|
||
// Weights
|
||
uint8_vec comp_mean0, comp_mean1, comp_run, comp_coeff, comp_weight2, comp_weight3, comp_weight4, comp_weight8;
|
||
|
||
if (!zstd_compress(mean0_bits, comp_mean0, zstd_level)) return false;
|
||
if (!zstd_compress(mean1_bytes, comp_mean1, zstd_level)) return false;
|
||
if (!zstd_compress(run_bytes, comp_run, zstd_level)) return false;
|
||
if (!zstd_compress(coeff_bytes, comp_coeff, zstd_level)) return false;
|
||
if (!zstd_compress(weight2_bits, comp_weight2, zstd_level)) return false;
|
||
if (!zstd_compress(weight3_bits, comp_weight3, zstd_level)) return false;
|
||
if (!zstd_compress(weight4_bits, comp_weight4, zstd_level)) return false;
|
||
if (!zstd_compress(weight8_bits, comp_weight8, zstd_level)) return false;
|
||
|
||
basist::astc_ldr_t::xuastc_ldr_full_zstd_header hdr;
|
||
clear_obj(hdr);
|
||
|
||
hdr.m_flags = (uint8_t)basist::astc_ldr_t::xuastc_ldr_syntax::cFullZStd;
|
||
|
||
hdr.m_raw_bits_len = (uint32_t)raw_bits.get_bytes().size();
|
||
hdr.m_mode_bytes_len = (uint32_t)comp_mode.size();
|
||
hdr.m_solid_dpcm_bytes_len = (uint32_t)comp_solid_dpcm.size();
|
||
|
||
hdr.m_endpoint_dpcm_reuse_indices_len = (uint32_t)comp_endpoint_dpcm_reuse_indices.size();
|
||
hdr.m_use_bc_bits_len = (uint32_t)comp_use_bc_bits.size();
|
||
hdr.m_endpoint_dpcm_3bit_len = (uint32_t)comp_endpoint_dpcm_3bit.size();
|
||
hdr.m_endpoint_dpcm_4bit_len = (uint32_t)comp_endpoint_dpcm_4bit.size();
|
||
hdr.m_endpoint_dpcm_5bit_len = (uint32_t)comp_endpoint_dpcm_5bit.size();
|
||
hdr.m_endpoint_dpcm_6bit_len = (uint32_t)comp_endpoint_dpcm_6bit.size();
|
||
hdr.m_endpoint_dpcm_7bit_len = (uint32_t)comp_endpoint_dpcm_7bit.size();
|
||
hdr.m_endpoint_dpcm_8bit_len = (uint32_t)comp_endpoint_dpcm_8bit.size();
|
||
|
||
hdr.m_mean0_bits_len = (uint32_t)comp_mean0.size();
|
||
hdr.m_mean1_bytes_len = (uint32_t)comp_mean1.size();
|
||
hdr.m_run_bytes_len = (uint32_t)comp_run.size();
|
||
hdr.m_coeff_bytes_len = (uint32_t)comp_coeff.size();
|
||
hdr.m_sign_bits_len = (uint32_t)sign_bits.get_bytes().size();
|
||
hdr.m_weight2_bits_len = (uint32_t)comp_weight2.size();
|
||
hdr.m_weight3_bits_len = (uint32_t)comp_weight3.size();
|
||
hdr.m_weight4_bits_len = (uint32_t)comp_weight4.size();
|
||
hdr.m_weight8_bytes_len = (uint32_t)comp_weight8.size();
|
||
|
||
comp_data.reserve(8192);
|
||
|
||
comp_data.resize(sizeof(hdr));
|
||
memcpy(comp_data.data(), &hdr, sizeof(hdr));
|
||
|
||
comp_data.append(raw_bits.get_bytes());
|
||
comp_data.append(comp_mode);
|
||
comp_data.append(comp_solid_dpcm);
|
||
|
||
comp_data.append(comp_endpoint_dpcm_reuse_indices);
|
||
comp_data.append(comp_use_bc_bits);
|
||
comp_data.append(comp_endpoint_dpcm_3bit);
|
||
comp_data.append(comp_endpoint_dpcm_4bit);
|
||
comp_data.append(comp_endpoint_dpcm_5bit);
|
||
comp_data.append(comp_endpoint_dpcm_6bit);
|
||
comp_data.append(comp_endpoint_dpcm_7bit);
|
||
comp_data.append(comp_endpoint_dpcm_8bit);
|
||
|
||
comp_data.append(comp_mean0);
|
||
comp_data.append(comp_mean1);
|
||
comp_data.append(comp_run);
|
||
comp_data.append(comp_coeff);
|
||
comp_data.append(sign_bits.get_bytes());
|
||
comp_data.append(comp_weight2);
|
||
comp_data.append(comp_weight3);
|
||
comp_data.append(comp_weight4);
|
||
comp_data.append(comp_weight8);
|
||
|
||
if (comp_data.size() > UINT32_MAX)
|
||
return false;
|
||
|
||
if ((global_cfg.m_debug_images) || (global_cfg.m_debug_output))
|
||
{
|
||
image coded_img(width, height);
|
||
|
||
vector2D<astc_helpers::astc_block> phys_blocks(num_blocks_x, num_blocks_y);
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const astc_helpers::log_astc_block& log_blk = coded_blocks(bx, by);
|
||
|
||
color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
bool status = astc_helpers::decode_block(log_blk, block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("astc_helpers::decode_block() failed\n");
|
||
return false;
|
||
}
|
||
|
||
// Be positive the logical block can be unpacked correctly as XUASTC LDR.
|
||
color_rgba block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
bool status_alt = astc_helpers::decode_block_xuastc_ldr(log_blk, block_pixels_alt, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status_alt)
|
||
{
|
||
fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() failed\n");
|
||
return false;
|
||
}
|
||
|
||
if (memcmp(block_pixels, block_pixels_alt, sizeof(color_rgba) * block_width * block_height) != 0)
|
||
{
|
||
fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() decode pixel mismatch\n");
|
||
return false;
|
||
}
|
||
|
||
coded_img.set_block_clipped(block_pixels, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
} // bx
|
||
|
||
} //by
|
||
|
||
if (global_cfg.m_debug_images)
|
||
save_png(global_cfg.m_debug_file_prefix + "coded_img.png", coded_img);
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
debug_printf("Orig image vs. coded img:\n");
|
||
print_image_metrics(orig_img, coded_img);
|
||
}
|
||
}
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("Zstd compressed sizes:\n");
|
||
|
||
fmt_debug_printf(" Raw bytes: {}\n", (uint64_t)raw_bits.get_bytes().size());
|
||
fmt_debug_printf(" Mode bytes: {}, comp size: {}\n", (uint64_t)mode_bytes.size(), (uint64_t)comp_mode.size());
|
||
fmt_debug_printf(" Solid DPCM bytes: {}, comp size: {}\n", (uint64_t)solid_dpcm_bytes.size(), (uint64_t)comp_solid_dpcm.size());
|
||
|
||
fmt_debug_printf(" \n Endpoint DPCM Reuse Bytes: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_reuse_indices.size(), (uint64_t)comp_endpoint_dpcm_reuse_indices.size());
|
||
fmt_debug_printf(" Use BC bits bytes: {}, comp_size: {}\n", (uint64_t)use_bc_bits.get_bytes().size(), (uint64_t)comp_use_bc_bits.size());
|
||
fmt_debug_printf(" Endpoint DPCM 3 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_3bit.get_bytes().size(), (uint64_t)comp_endpoint_dpcm_3bit.size());
|
||
fmt_debug_printf(" Endpoint DPCM 4 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_4bit.get_bytes().size(), (uint64_t)comp_endpoint_dpcm_4bit.size());
|
||
fmt_debug_printf(" Endpoint DPCM 5 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_5bit.size(), (uint64_t)comp_endpoint_dpcm_5bit.size());
|
||
fmt_debug_printf(" Endpoint DPCM 6 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_6bit.size(), (uint64_t)comp_endpoint_dpcm_6bit.size());
|
||
fmt_debug_printf(" Endpoint DPCM 7 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_7bit.size(), (uint64_t)comp_endpoint_dpcm_7bit.size());
|
||
fmt_debug_printf(" Endpoint DPCM 8 bits: {}, comp size: {}\n", (uint64_t)endpoint_dpcm_8bit.size(), (uint64_t)comp_endpoint_dpcm_8bit.size());
|
||
|
||
fmt_debug_printf(" \n Mean0 bytes: {} comp size: {}\n", (uint64_t)mean0_bits.get_bytes().size(), (uint64_t)comp_mean0.size());
|
||
fmt_debug_printf(" Mean1 bytes: {} comp size: {}\n", (uint64_t)mean1_bytes.size(), (uint64_t)comp_mean1.size());
|
||
fmt_debug_printf(" Run bytes: {} comp size: {}\n", (uint64_t)run_bytes.size(), (uint64_t)comp_run.size());
|
||
fmt_debug_printf(" Coeff bytes: {} comp size: {}\n", (uint64_t)coeff_bytes.size(), (uint64_t)comp_coeff.size());
|
||
fmt_debug_printf(" Sign bytes: {}\n", (uint64_t)sign_bits.get_bytes().size());
|
||
fmt_debug_printf(" Weight2 bytes: {} comp size: {}\n", (uint64_t)weight2_bits.get_bytes().size(), (uint64_t)comp_weight2.size());
|
||
fmt_debug_printf(" Weight3 bytes: {} comp size: {}\n", (uint64_t)weight3_bits.get_bytes().size(), (uint64_t)comp_weight3.size());
|
||
fmt_debug_printf(" Weight4 bytes: {} comp size: {}\n", (uint64_t)weight4_bits.get_bytes().size(), (uint64_t)comp_weight4.size());
|
||
fmt_debug_printf(" Weight8 bytes: {} comp size: {}\n", (uint64_t)weight8_bits.size(), (uint64_t)comp_weight8.size());
|
||
|
||
fmt_debug_printf("\nTotal blocks: {}\n", total_blocks);
|
||
fmt_debug_printf("Total runs: {}, run blocks: {}, non-run blocks: {}\n", total_runs, total_run_blocks, total_nonrun_blocks);
|
||
fmt_debug_printf("Total lossy replacements: {}\n", total_lossy_replacements);
|
||
fmt_debug_printf("Total solid blocks: {}\n", total_solid_blocks);
|
||
fmt_debug_printf("Total full reuse commands: {}\n", total_full_reuse_commands);
|
||
fmt_debug_printf("Total raw commands: {}\n", total_raw_commands);
|
||
fmt_debug_printf("Total reuse full cfg emitted: {}\n", total_reuse_full_cfg_emitted);
|
||
fmt_debug_printf("Total full cfg emitted: {}\n", total_full_cfg_emitted);
|
||
fmt_debug_printf("Num part hash probes: {}, num part hash hits: {}\n", num_part_hash_probes, num_part_hash_hits);
|
||
fmt_debug_printf("Total used endpoint dpcm: {}, total used endpoint raw: {}\n", total_used_endpoint_dpcm, total_used_endpoint_raw);
|
||
fmt_debug_printf("Total used weight DCT: {}, total used weight DPCM: {}\n", total_used_dct, total_used_weight_dpcm);
|
||
fmt_debug_printf("Total tm hash probes: {}, total tm hash_hits: {}\n", num_tm_hash_probes, num_tm_hash_hits);
|
||
|
||
fmt_debug_printf("\nCompressed to {} bytes, {3.3}bpp\n\n", comp_data.size_u32(), ((float)comp_data.size() * 8.0f) / (float)total_pixels);
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
bool compress_image(
|
||
const image& orig_img, uint8_vec& comp_data, vector2D<astc_helpers::log_astc_block>& coded_blocks,
|
||
const astc_ldr_encode_config& global_cfg,
|
||
job_pool& job_pool)
|
||
{
|
||
assert(g_initialized);
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("\n------------------- astc_ldr::compress_image\n");
|
||
|
||
fmt_debug_printf("\nglobal_cfg:\n");
|
||
global_cfg.debug_print();
|
||
fmt_debug_printf("\n");
|
||
}
|
||
|
||
comp_data.resize(0);
|
||
|
||
if (!g_initialized)
|
||
return false;
|
||
|
||
const uint32_t width = orig_img.get_width(), height = orig_img.get_height();
|
||
|
||
if (!is_in_range(width, 1, (int)MAX_WIDTH) || !is_in_range(height, 1, (int)MAX_HEIGHT))
|
||
return false;
|
||
|
||
if (!astc_helpers::is_valid_block_size(global_cfg.m_astc_block_width, global_cfg.m_astc_block_height))
|
||
return false;
|
||
|
||
const uint32_t block_width = global_cfg.m_astc_block_width;
|
||
const uint32_t block_height = global_cfg.m_astc_block_height;
|
||
const uint32_t total_block_pixels = block_width * block_height;
|
||
|
||
const uint32_t total_pixels = width * height;
|
||
const uint32_t num_blocks_x = (width + block_width - 1) / block_width;
|
||
const uint32_t num_blocks_y = (height + block_height - 1) / block_height;
|
||
const uint32_t total_blocks = num_blocks_x * num_blocks_y;
|
||
const bool has_alpha = orig_img.has_alpha();
|
||
|
||
if (global_cfg.m_debug_output)
|
||
fmt_debug_printf("Encoding image dimensions {}x{}, has alpha: {}\n", orig_img.get_width(), orig_img.get_height(), has_alpha);
|
||
|
||
ldr_astc_block_encode_image_high_level_config enc_cfg;
|
||
|
||
enc_cfg.m_block_width = block_width;
|
||
enc_cfg.m_block_height = block_height;
|
||
enc_cfg.m_pJob_pool = &job_pool;
|
||
|
||
enc_cfg.m_use_dct = global_cfg.m_use_dct;
|
||
|
||
if (!is_in_range(global_cfg.m_dct_quality, 1.0f, 100.0f))
|
||
return false;
|
||
|
||
const int int_q = clamp<int>((int)std::round(global_cfg.m_dct_quality * 2.0f), 0, 200);
|
||
enc_cfg.m_base_q = (float)int_q / 2.0f;
|
||
|
||
if (global_cfg.m_debug_output)
|
||
fmt_debug_printf("Use DCT: {}, base q: {}, lossy supercompression: {}\n", enc_cfg.m_use_dct, enc_cfg.m_base_q, global_cfg.m_lossy_supercompression);
|
||
|
||
const float replacement_min_psnr = has_alpha ? global_cfg.m_replacement_min_psnr_alpha : global_cfg.m_replacement_min_psnr;
|
||
const float psnr_trial_diff_thresh = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_alpha : global_cfg.m_psnr_trial_diff_thresh;
|
||
const float psnr_trial_diff_thresh_edge = has_alpha ? global_cfg.m_psnr_trial_diff_thresh_edge_alpha : global_cfg.m_psnr_trial_diff_thresh_edge;
|
||
|
||
enc_cfg.m_blurring_enabled = global_cfg.m_block_blurring_p1;
|
||
enc_cfg.m_blurring_enabled_p2 = global_cfg.m_block_blurring_p2;
|
||
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
{
|
||
enc_cfg.m_cem_enc_params.m_comp_weights[i] = global_cfg.m_comp_weights[i];
|
||
|
||
if (!is_in_range(global_cfg.m_comp_weights[i], 1, 256))
|
||
return false;
|
||
}
|
||
|
||
int cfg_effort_level = global_cfg.m_effort_level;
|
||
if (global_cfg.m_debug_output)
|
||
fmt_debug_printf("Using cfg effort level: {}\n", cfg_effort_level);
|
||
|
||
configure_encoder_effort_level(cfg_effort_level, enc_cfg);
|
||
|
||
if (global_cfg.m_force_disable_subsets)
|
||
{
|
||
enc_cfg.m_subsets_enabled = false;
|
||
enc_cfg.m_second_pass_force_subsets_enabled = false;
|
||
}
|
||
|
||
if (global_cfg.m_force_disable_rgb_dual_plane)
|
||
{
|
||
enc_cfg.m_disable_rgb_dual_plane = true;
|
||
enc_cfg.m_force_all_dp_chans_p2 = false;
|
||
}
|
||
|
||
enc_cfg.m_cem_enc_params.m_decode_mode_srgb = global_cfg.m_astc_decode_mode_srgb;
|
||
|
||
enc_cfg.m_debug_output = global_cfg.m_debug_output;
|
||
enc_cfg.m_debug_images = global_cfg.m_debug_images;
|
||
enc_cfg.m_debug_file_prefix = global_cfg.m_debug_file_prefix;
|
||
|
||
ldr_astc_block_encode_image_output enc_out;
|
||
|
||
const bool enc_status = ldr_astc_block_encode_image(orig_img, enc_cfg, enc_out);
|
||
|
||
if (global_cfg.m_debug_output)
|
||
fmt_debug_printf("ldr_astc_block_encode_image: {}\n", enc_status);
|
||
|
||
if (!enc_status)
|
||
return false;
|
||
|
||
basist::astc_ldr_t::xuastc_ldr_syntax syntax = global_cfg.m_compressed_syntax;
|
||
|
||
if (syntax >= basist::astc_ldr_t::xuastc_ldr_syntax::cTotal)
|
||
{
|
||
assert(0);
|
||
return false;
|
||
}
|
||
|
||
// Switch to full adaptive arithmetic coding on the smallest mipmaps to avoid ZStd overhead.
|
||
const uint32_t DISABLE_FASTER_FORMAT_TOTAL_BLOCKS_THRESH = 64;
|
||
if (total_blocks <= DISABLE_FASTER_FORMAT_TOTAL_BLOCKS_THRESH)
|
||
syntax = basist::astc_ldr_t::xuastc_ldr_syntax::cFullArith;
|
||
|
||
if (syntax == basist::astc_ldr_t::xuastc_ldr_syntax::cFullZStd)
|
||
{
|
||
// Full ZStd syntax is so different we'll move that to another function.
|
||
return compress_image_full_zstd(
|
||
orig_img, comp_data, coded_blocks,
|
||
global_cfg,
|
||
job_pool,
|
||
enc_cfg, enc_out);
|
||
}
|
||
|
||
const bool use_faster_format = (syntax == basist::astc_ldr_t::xuastc_ldr_syntax::cHybridArithZStd);
|
||
|
||
// Either full arithmetic, or hybrid arithmetic+ZStd for weight symbols.
|
||
basist::astc_ldr_t::xuastc_ldr_arith_header hdr;
|
||
clear_obj(hdr);
|
||
|
||
bitwise_coder mean0_bits;
|
||
uint8_vec mean1_bytes;
|
||
uint8_vec run_bytes;
|
||
uint8_vec coeff_bytes;
|
||
bitwise_coder sign_bits;
|
||
bitwise_coder weight2_bits;
|
||
bitwise_coder weight3_bits;
|
||
bitwise_coder weight4_bits;
|
||
uint8_vec weight8_bits;
|
||
|
||
if (use_faster_format)
|
||
{
|
||
mean0_bits.init(1024);
|
||
mean1_bytes.reserve(1024);
|
||
run_bytes.reserve(8192);
|
||
coeff_bytes.reserve(8192);
|
||
sign_bits.init(1024);
|
||
weight2_bits.init(1024);
|
||
weight3_bits.init(1024);
|
||
weight4_bits.init(1024);
|
||
weight8_bits.reserve(8192);
|
||
}
|
||
|
||
interval_timer itm;
|
||
itm.start();
|
||
|
||
basist::arith::arith_enc enc;
|
||
enc.init(1024 * 1024);
|
||
|
||
enc.put_bits(basist::astc_ldr_t::ARITH_HEADER_MARKER, basist::astc_ldr_t::ARITH_HEADER_MARKER_BITS);
|
||
|
||
const int block_dim_index = astc_helpers::find_astc_block_size_index(block_width, block_height);
|
||
assert((block_dim_index >= 0) && (block_dim_index < (int)astc_helpers::NUM_ASTC_BLOCK_SIZES));
|
||
|
||
enc.put_bits(block_dim_index, 4);
|
||
|
||
enc.put_bit(enc_cfg.m_cem_enc_params.m_decode_mode_srgb);
|
||
|
||
enc.put_bits(width, 16);
|
||
enc.put_bits(height, 16);
|
||
|
||
enc.put_bit(has_alpha);
|
||
|
||
enc.put_bits(enc_cfg.m_use_dct, 1);
|
||
if (enc_cfg.m_use_dct)
|
||
enc.put_bits(int_q, 8);
|
||
|
||
basist::arith::arith_data_model mode_model((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_TOTAL);
|
||
|
||
basist::arith::arith_data_model solid_color_dpcm_model[4];
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
solid_color_dpcm_model[i].init(256, true);
|
||
|
||
basist::arith::arith_data_model raw_endpoint_models[astc_helpers::TOTAL_ENDPOINT_ISE_RANGES];
|
||
for (uint32_t i = 0; i < astc_helpers::TOTAL_ENDPOINT_ISE_RANGES; i++)
|
||
raw_endpoint_models[i].init(astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + i));
|
||
|
||
basist::arith::arith_data_model dpcm_endpoint_models[astc_helpers::TOTAL_ENDPOINT_ISE_RANGES];
|
||
for (uint32_t i = 0; i < astc_helpers::TOTAL_ENDPOINT_ISE_RANGES; i++)
|
||
dpcm_endpoint_models[i].init(astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE + i));
|
||
|
||
basist::arith::arith_data_model raw_weight_models[astc_helpers::TOTAL_WEIGHT_ISE_RANGES];
|
||
for (uint32_t i = 0; i < astc_helpers::TOTAL_WEIGHT_ISE_RANGES; i++)
|
||
raw_weight_models[i].init(astc_helpers::get_ise_levels(astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE + i));
|
||
|
||
basist::arith::arith_bit_model is_base_ofs_model;
|
||
basist::arith::arith_bit_model use_dct_model[4];
|
||
basist::arith::arith_bit_model use_dpcm_endpoints_model;
|
||
|
||
basist::arith::arith_data_model cem_index_model[8];
|
||
for (uint32_t i = 0; i < 8; i++)
|
||
cem_index_model[i].init(basist::astc_ldr_t::OTM_NUM_CEMS);
|
||
|
||
basist::arith::arith_data_model subset_index_model[basist::astc_ldr_t::OTM_NUM_SUBSETS];
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_SUBSETS; i++)
|
||
subset_index_model[i].init(basist::astc_ldr_t::OTM_NUM_SUBSETS);
|
||
|
||
basist::arith::arith_data_model ccs_index_model[basist::astc_ldr_t::OTM_NUM_CCS];
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_CCS; i++)
|
||
ccs_index_model[i].init(basist::astc_ldr_t::OTM_NUM_CCS);
|
||
|
||
basist::arith::arith_data_model grid_size_model[basist::astc_ldr_t::OTM_NUM_GRID_SIZES];
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_GRID_SIZES; i++)
|
||
grid_size_model[i].init(basist::astc_ldr_t::OTM_NUM_GRID_SIZES);
|
||
|
||
basist::arith::arith_data_model grid_aniso_model[basist::astc_ldr_t::OTM_NUM_GRID_ANISOS];
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::OTM_NUM_GRID_ANISOS; i++)
|
||
grid_aniso_model[i].init(basist::astc_ldr_t::OTM_NUM_GRID_ANISOS);
|
||
|
||
basist::arith::arith_data_model dct_run_len_model(65); // [0,63] or 64=EOB
|
||
basist::arith::arith_data_model dct_coeff_mag(255); // [1,255] (blocks with larger mags go DPCM)
|
||
|
||
double total_header_bits = 0.0f, total_weight_bits = 0.0f, total_endpoint_bits = 0.0f;
|
||
|
||
uint32_t total_solid_blocks = 0, total_used_dct = 0, total_used_weight_dpcm = 0;
|
||
|
||
basist::astc_ldr_t::grid_weight_dct grid_dct;
|
||
grid_dct.init(block_width, block_height);
|
||
|
||
vector2D<basist::astc_ldr_t::prev_block_state> prev_block_states(num_blocks_x, num_blocks_y);
|
||
|
||
coded_blocks.resize(num_blocks_x, num_blocks_y);
|
||
for (uint32_t y = 0; y < num_blocks_y; y++)
|
||
for (uint32_t x = 0; x < num_blocks_x; x++)
|
||
coded_blocks(x, y).clear();
|
||
|
||
const bool endpoint_dpcm_global_enable = true;
|
||
uint32_t total_used_endpoint_dpcm = 0, total_used_endpoint_raw = 0;
|
||
|
||
basist::arith::arith_data_model submode_models[basist::astc_ldr_t::OTM_NUM_CEMS][basist::astc_ldr_t::OTM_NUM_SUBSETS][basist::astc_ldr_t::OTM_NUM_CCS][basist::astc_ldr_t::OTM_NUM_GRID_SIZES][basist::astc_ldr_t::OTM_NUM_GRID_ANISOS];
|
||
|
||
basist::arith::arith_bit_model endpoints_use_bc_models[4];
|
||
|
||
basist::arith::arith_data_model endpoint_reuse_delta_model(basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS);
|
||
|
||
basist::arith::arith_data_model weight_mean_models[2];
|
||
weight_mean_models[0].init(basist::astc_ldr_t::DCT_MEAN_LEVELS0);
|
||
weight_mean_models[1].init(basist::astc_ldr_t::DCT_MEAN_LEVELS1);
|
||
|
||
basist::arith::arith_data_model config_reuse_model[4];
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
config_reuse_model[i].init(basist::astc_ldr_t::cMaxConfigReuseNeighbors + 1);
|
||
|
||
uint32_t total_reuse_full_cfg_emitted = 0, total_full_cfg_emitted = 0;
|
||
|
||
// TODO: check weights for >= 0
|
||
const float total_comp_weights = enc_cfg.m_cem_enc_params.get_total_comp_weights();
|
||
|
||
uint32_t total_lossy_replacements = 0;
|
||
uint32_t total_full_reuse_commands = 0;
|
||
uint32_t total_raw_commands = 0;
|
||
|
||
if (global_cfg.m_debug_output)
|
||
fmt_debug_printf("Supercompressor init time: {} secs\n", itm.get_elapsed_secs());
|
||
|
||
uint32_t total_runs = 0, total_run_blocks = 0;
|
||
uint32_t cur_run_len = 0;
|
||
const bool use_run_commands = true;
|
||
uint32_t total_nonrun_blocks = 0;
|
||
|
||
int part2_hash[basist::astc_ldr_t::PART_HASH_SIZE];
|
||
std::fill(part2_hash, part2_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1);
|
||
|
||
int part3_hash[basist::astc_ldr_t::PART_HASH_SIZE];
|
||
std::fill(part3_hash, part3_hash + basist::astc_ldr_t::PART_HASH_SIZE, -1);
|
||
|
||
basist::arith::arith_bit_model use_part_hash_model[4];
|
||
basist::arith::arith_data_model part2_hash_index_model(basist::astc_ldr_t::PART_HASH_SIZE, true);
|
||
basist::arith::arith_data_model part3_hash_index_model(basist::astc_ldr_t::PART_HASH_SIZE, true);
|
||
|
||
uint32_t num_part_hash_probes = 0, num_part_hash_hits = 0;
|
||
uint32_t total_dct_syms = 0, total_dpcm_syms = 0;
|
||
|
||
basist::arith::arith_gamma_contexts m_run_len_contexts;
|
||
|
||
image vis_img;
|
||
if (global_cfg.m_debug_images)
|
||
{
|
||
vis_img.resize(width, height);
|
||
}
|
||
|
||
itm.start();
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
const uint32_t base_y = by * block_height;
|
||
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const uint32_t base_x = bx * block_width;
|
||
|
||
basist::astc_ldr_t::prev_block_state& prev_state = prev_block_states(bx, by);
|
||
const basist::astc_ldr_t::prev_block_state* pLeft_state = bx ? &prev_block_states(bx - 1, by) : nullptr;
|
||
const basist::astc_ldr_t::prev_block_state* pUpper_state = by ? &prev_block_states(bx, by - 1) : nullptr;
|
||
const basist::astc_ldr_t::prev_block_state* pDiag_state = (bx && by) ? &prev_block_states(bx - 1, by - 1) : nullptr;
|
||
const basist::astc_ldr_t::prev_block_state* pPred_state = pLeft_state ? pLeft_state : pUpper_state; // left or upper, or nullptr on first block
|
||
|
||
const ldr_astc_block_encode_image_output::block_info& blk_info = enc_out.m_image_block_info(bx, by);
|
||
|
||
uint32_t best_packed_out_block_index = blk_info.m_packed_out_block_index;
|
||
|
||
// check for run
|
||
if ((use_run_commands) && (bx || by))
|
||
{
|
||
const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index];
|
||
const astc_helpers::log_astc_block& cur_log_blk = blk_out.m_log_blk;
|
||
|
||
const astc_helpers::log_astc_block& prev_log_blk = bx ? coded_blocks(bx - 1, by) : coded_blocks(0, by - 1);
|
||
const basist::astc_ldr_t::prev_block_state* pPrev_block_state = bx ? pLeft_state : pUpper_state;
|
||
|
||
assert(pPrev_block_state);
|
||
|
||
if (compare_log_blocks_for_equality(cur_log_blk, prev_log_blk))
|
||
{
|
||
// Left or upper is exactly the same logical block, so expand the run.
|
||
cur_run_len++;
|
||
|
||
// Accept the previous block (left or upper) as if it's been coded normally.
|
||
|
||
coded_blocks(bx, by) = prev_log_blk;
|
||
|
||
prev_state.m_was_solid_color = pPrev_block_state->m_was_solid_color;
|
||
prev_state.m_used_weight_dct = pPrev_block_state->m_used_weight_dct;
|
||
prev_state.m_first_endpoint_uses_bc = pPrev_block_state->m_first_endpoint_uses_bc;
|
||
prev_state.m_reused_full_cfg = true;
|
||
prev_state.m_used_part_hash = pPrev_block_state->m_used_part_hash;
|
||
prev_state.m_tm_index = pPrev_block_state->m_tm_index;
|
||
prev_state.m_base_cem_index = pPrev_block_state->m_base_cem_index;
|
||
prev_state.m_subset_index = pPrev_block_state->m_subset_index;
|
||
prev_state.m_ccs_index = pPrev_block_state->m_ccs_index;
|
||
prev_state.m_grid_size = pPrev_block_state->m_grid_size;
|
||
prev_state.m_grid_aniso = pPrev_block_state->m_grid_aniso;
|
||
|
||
continue;
|
||
}
|
||
}
|
||
|
||
if (cur_run_len)
|
||
{
|
||
total_runs++;
|
||
total_run_blocks += cur_run_len;
|
||
|
||
total_header_bits += enc.encode_and_return_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RUN, mode_model);
|
||
total_header_bits += enc.put_gamma_and_return_price(cur_run_len, m_run_len_contexts);
|
||
cur_run_len = 0;
|
||
}
|
||
|
||
total_nonrun_blocks++;
|
||
|
||
const float ref_wmse = (float)blk_info.m_out_blocks[best_packed_out_block_index].m_sse / (total_comp_weights * (float)total_block_pixels);
|
||
const float ref_wpsnr = (ref_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(ref_wmse)) : 10000.0f;
|
||
|
||
if ((global_cfg.m_lossy_supercompression) && (ref_wpsnr >= replacement_min_psnr) &&
|
||
(!blk_info.m_out_blocks[blk_info.m_packed_out_block_index].m_log_blk.m_solid_color_flag_ldr))
|
||
{
|
||
const float psnr_thresh = blk_info.m_strong_edges ? psnr_trial_diff_thresh_edge : psnr_trial_diff_thresh;
|
||
|
||
float best_alt_wpsnr = 0.0f;
|
||
bool found_alternative = false;
|
||
|
||
// Pass: 0 consider full config+part ID endpoint reuse
|
||
// Pass: 1 fall back to just full config+part ID reuse (no endpoints)
|
||
for (uint32_t pass = 0; pass < 2; pass++)
|
||
{
|
||
// Iterate through all available alternative candidates
|
||
for (uint32_t out_block_iter = 0; out_block_iter < blk_info.m_out_blocks.size(); out_block_iter++)
|
||
{
|
||
if (out_block_iter == blk_info.m_packed_out_block_index)
|
||
continue;
|
||
|
||
const float trial_wmse = (float)blk_info.m_out_blocks[out_block_iter].m_sse / (total_comp_weights * (float)total_block_pixels);
|
||
const float trial_wpsnr = (trial_wmse > 1e-5f) ? 20.0f * log10f(255.0f / sqrtf(trial_wmse)) : 10000.0f;
|
||
|
||
// Reject if PSNR too low
|
||
if (trial_wpsnr < (ref_wpsnr - psnr_thresh))
|
||
continue;
|
||
|
||
// Reject if inferior to the best found so far
|
||
if (trial_wpsnr < best_alt_wpsnr)
|
||
continue;
|
||
|
||
const astc_helpers::log_astc_block& trial_log_blk = blk_info.m_out_blocks[out_block_iter].m_log_blk;
|
||
|
||
if (trial_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
// Examine nearby neighbors
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++)
|
||
{
|
||
int dx = 0, dy = 0;
|
||
switch (i)
|
||
{
|
||
case 0: dx = -1; break;
|
||
case 1: dy = -1; break;
|
||
case 2: dx = -1; dy = -1; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
const int n_bx = bx + dx, n_by = by + dy;
|
||
if ((n_bx < 0) || (n_by < 0))
|
||
continue;
|
||
|
||
astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by);
|
||
|
||
if (neighbor_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
bool accept_flag = false;
|
||
if (pass == 0)
|
||
{
|
||
// prefer full config+endpoint equality first
|
||
accept_flag = compare_log_block_configs_and_endpoints(trial_log_blk, neighbor_log_blk);
|
||
}
|
||
else
|
||
{
|
||
// next check for just config equality
|
||
accept_flag = compare_log_block_configs(trial_log_blk, neighbor_log_blk);
|
||
}
|
||
|
||
if (accept_flag)
|
||
{
|
||
best_alt_wpsnr = trial_wpsnr;
|
||
best_packed_out_block_index = out_block_iter;
|
||
found_alternative = true;
|
||
break;
|
||
}
|
||
|
||
} // i
|
||
|
||
} // out_block_iter
|
||
|
||
if (found_alternative)
|
||
break;
|
||
|
||
} // pass
|
||
|
||
if (best_packed_out_block_index != blk_info.m_packed_out_block_index)
|
||
total_lossy_replacements++;
|
||
|
||
} // global_cfg.m_lossy_supercompression
|
||
|
||
const encode_block_output& blk_out = blk_info.m_out_blocks[best_packed_out_block_index];
|
||
|
||
astc_helpers::log_astc_block& cur_log_blk = coded_blocks(bx, by);
|
||
|
||
cur_log_blk = blk_out.m_log_blk;
|
||
|
||
// TODO: Add mode model context
|
||
|
||
if (blk_out.m_trial_mode_index < 0)
|
||
{
|
||
assert(cur_log_blk.m_solid_color_flag_ldr);
|
||
|
||
total_solid_blocks++;
|
||
|
||
//total_header_bits += mode_model.get_price(cMODE_SOLID) + (float)(8 * (has_alpha ? 4 : 3));
|
||
total_header_bits += mode_model.get_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_SOLID);
|
||
enc.encode((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_SOLID, mode_model);
|
||
|
||
uint32_t cur_solid_color[4];
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
cur_solid_color[i] = blk_out.m_log_blk.m_solid_color[i] >> 8;
|
||
|
||
uint32_t prev_solid_color[4] = { 0 };
|
||
|
||
const uint32_t num_comps = has_alpha ? 4 : 3;
|
||
|
||
astc_helpers::log_astc_block* pPrev_log_blk = bx ? &coded_blocks(bx - 1, by) : (by ? &coded_blocks(bx, by - 1) : nullptr);
|
||
if (pPrev_log_blk)
|
||
{
|
||
if (pPrev_log_blk->m_solid_color_flag_ldr)
|
||
{
|
||
prev_solid_color[0] = pPrev_log_blk->m_solid_color[0] >> 8;
|
||
prev_solid_color[1] = pPrev_log_blk->m_solid_color[1] >> 8;
|
||
prev_solid_color[2] = pPrev_log_blk->m_solid_color[2] >> 8;
|
||
prev_solid_color[3] = pPrev_log_blk->m_solid_color[3] >> 8;
|
||
}
|
||
else
|
||
{
|
||
#if 0
|
||
color_rgba prev_block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
bool dec_status = astc_helpers::decode_block(*pPrev_log_blk, prev_block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!dec_status)
|
||
{
|
||
fmt_error_printf("decode_block() failed\n");
|
||
return false;
|
||
}
|
||
|
||
for (uint32_t i = 0; i < total_block_pixels; i++)
|
||
{
|
||
for (uint32_t j = 0; j < num_comps; j++)
|
||
prev_solid_color[j] += prev_block_pixels[i][j];
|
||
}
|
||
|
||
for (uint32_t j = 0; j < num_comps; j++)
|
||
prev_solid_color[j] = (prev_solid_color[j] + (total_block_pixels / 2)) / total_block_pixels;
|
||
#endif
|
||
// Decode previous block's first CEM, use the halfway point as the predictor.
|
||
color_rgba prev_l, prev_h;
|
||
decode_endpoints(pPrev_log_blk->m_color_endpoint_modes[0], pPrev_log_blk->m_endpoints, pPrev_log_blk->m_endpoint_ise_range, prev_l, prev_h);
|
||
|
||
prev_solid_color[0] = (prev_l[0] + prev_h[0] + 1) >> 1;
|
||
prev_solid_color[1] = (prev_l[1] + prev_h[1] + 1) >> 1;
|
||
prev_solid_color[2] = (prev_l[2] + prev_h[2] + 1) >> 1;
|
||
prev_solid_color[3] = (prev_l[3] + prev_h[3] + 1) >> 1;
|
||
}
|
||
}
|
||
|
||
for (uint32_t i = 0; i < num_comps; i++)
|
||
{
|
||
const uint32_t delta = (cur_solid_color[i] - prev_solid_color[i]) & 0xFF;
|
||
|
||
total_header_bits += enc.encode_and_return_price(delta, solid_color_dpcm_model[i]);
|
||
}
|
||
|
||
// Bias the statistics towards using DCT (most common case).
|
||
prev_state.m_was_solid_color = true;
|
||
prev_state.m_used_weight_dct = enc_cfg.m_use_dct;
|
||
prev_state.m_first_endpoint_uses_bc = true;
|
||
prev_state.m_tm_index = -1;
|
||
prev_state.m_base_cem_index = astc_helpers::CEM_LDR_RGB_DIRECT;
|
||
prev_state.m_subset_index = 0;
|
||
prev_state.m_ccs_index = 0;
|
||
prev_state.m_grid_size = 0;
|
||
prev_state.m_grid_aniso = 0;
|
||
prev_state.m_reused_full_cfg = false;
|
||
prev_state.m_used_part_hash = true; // bias to true
|
||
|
||
continue;
|
||
}
|
||
|
||
//--------------------------------------------
|
||
// for (uint32_t out_block_iter = 0; out_block_iter < blk_info.m_out_blocks.size(); out_block_iter++)
|
||
int full_cfg_endpoint_reuse_index = -1;
|
||
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++)
|
||
{
|
||
int dx = 0, dy = 0;
|
||
switch (i)
|
||
{
|
||
case 0: dx = -1; break;
|
||
case 1: dy = -1; break;
|
||
case 2: dx = -1; dy = -1; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
const int n_bx = bx + dx, n_by = by + dy;
|
||
if ((n_bx < 0) || (n_by < 0))
|
||
continue;
|
||
|
||
astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by);
|
||
|
||
if (neighbor_log_blk.m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
if (compare_log_block_configs_and_endpoints(cur_log_blk, neighbor_log_blk))
|
||
{
|
||
full_cfg_endpoint_reuse_index = i;
|
||
break;
|
||
}
|
||
} // i
|
||
//--------------------------------------------
|
||
|
||
if (full_cfg_endpoint_reuse_index >= 0)
|
||
{
|
||
// Reused full config, part ID and endpoint values from an immediate neighbor
|
||
total_header_bits += enc.encode_and_return_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_REUSE_CFG_ENDPOINTS_LEFT + full_cfg_endpoint_reuse_index, mode_model);
|
||
|
||
total_full_reuse_commands++;
|
||
|
||
const basist::astc_ldr_t::prev_block_state* pReused_cfg_state = nullptr;
|
||
|
||
switch (full_cfg_endpoint_reuse_index)
|
||
{
|
||
case 0: pReused_cfg_state = pLeft_state; break;
|
||
case 1: pReused_cfg_state = pUpper_state; break;
|
||
case 2: pReused_cfg_state = pDiag_state; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
if (!pReused_cfg_state)
|
||
{
|
||
assert(0);
|
||
fmt_error_printf("encoding internal failure\n");
|
||
return false;
|
||
}
|
||
|
||
assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index);
|
||
|
||
prev_state.m_tm_index = blk_out.m_trial_mode_index;
|
||
prev_state.m_base_cem_index = pReused_cfg_state->m_base_cem_index;
|
||
prev_state.m_subset_index = pReused_cfg_state->m_subset_index;
|
||
prev_state.m_ccs_index = pReused_cfg_state->m_ccs_index;
|
||
prev_state.m_grid_size = pReused_cfg_state->m_grid_size;
|
||
prev_state.m_grid_aniso = pReused_cfg_state->m_grid_aniso;
|
||
prev_state.m_used_part_hash = pReused_cfg_state->m_used_part_hash;
|
||
prev_state.m_reused_full_cfg = true;
|
||
|
||
const uint32_t cur_actual_cem = cur_log_blk.m_color_endpoint_modes[0];
|
||
|
||
if (astc_helpers::cem_supports_bc(cur_actual_cem))
|
||
{
|
||
prev_state.m_first_endpoint_uses_bc = astc_helpers::used_blue_contraction(cur_actual_cem, cur_log_blk.m_endpoints, cur_log_blk.m_endpoint_ise_range);
|
||
assert(prev_state.m_first_endpoint_uses_bc == pReused_cfg_state->m_first_endpoint_uses_bc);
|
||
}
|
||
}
|
||
else
|
||
{
|
||
total_raw_commands++;
|
||
|
||
// Send mode
|
||
total_header_bits += mode_model.get_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RAW);
|
||
enc.encode((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RAW, mode_model);
|
||
|
||
const uint32_t cur_actual_cem = cur_log_blk.m_color_endpoint_modes[0];
|
||
//const bool actual_cem_supports_bc = astc_helpers::cem_supports_bc(cur_actual_cem);
|
||
const uint32_t total_endpoint_vals = astc_helpers::get_num_cem_values(cur_actual_cem);
|
||
|
||
// DO NOT use tm.m_cem because the encoder may have selected a base+ofs variant instead. Use cur_actual_cem.
|
||
const basist::astc_ldr_t::trial_mode& tm = enc_out.m_encoder_trial_modes[blk_out.m_trial_mode_index];
|
||
|
||
// Check for config+part ID neighbor reuse
|
||
int neighbor_cfg_match_index = -1;
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::cMaxConfigReuseNeighbors; i++)
|
||
{
|
||
const basist::astc_ldr_t::prev_block_state* pNeighbor_state = nullptr;
|
||
|
||
int dx = 0, dy = 0;
|
||
switch (i)
|
||
{
|
||
case 0: dx = -1; pNeighbor_state = pLeft_state; break;
|
||
case 1: dy = -1; pNeighbor_state = pUpper_state; break;
|
||
case 2: dx = -1; dy = -1; pNeighbor_state = pDiag_state; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
if (!pNeighbor_state)
|
||
continue;
|
||
|
||
const int n_bx = bx + dx, n_by = by + dy;
|
||
assert((n_bx >= 0) && (n_by >= 0));
|
||
|
||
astc_helpers::log_astc_block& neighbor_log_blk = coded_blocks(n_bx, n_by);
|
||
|
||
if (pNeighbor_state->m_tm_index != blk_out.m_trial_mode_index)
|
||
continue;
|
||
|
||
if (neighbor_log_blk.m_color_endpoint_modes[0] != cur_log_blk.m_color_endpoint_modes[0])
|
||
continue;
|
||
|
||
if (neighbor_log_blk.m_partition_id != cur_log_blk.m_partition_id)
|
||
continue;
|
||
|
||
assert(neighbor_log_blk.m_dual_plane == cur_log_blk.m_dual_plane);
|
||
assert(neighbor_log_blk.m_color_component_selector == cur_log_blk.m_color_component_selector);
|
||
assert(neighbor_log_blk.m_num_partitions == cur_log_blk.m_num_partitions);
|
||
assert(neighbor_log_blk.m_grid_width == cur_log_blk.m_grid_width);
|
||
assert(neighbor_log_blk.m_grid_height == cur_log_blk.m_grid_height);
|
||
assert(neighbor_log_blk.m_endpoint_ise_range == cur_log_blk.m_endpoint_ise_range);
|
||
assert(neighbor_log_blk.m_weight_ise_range == cur_log_blk.m_weight_ise_range);
|
||
|
||
neighbor_cfg_match_index = i;
|
||
break;
|
||
}
|
||
|
||
uint32_t reuse_full_cfg_model_index = 0;
|
||
if (pLeft_state)
|
||
reuse_full_cfg_model_index = pLeft_state->m_reused_full_cfg;
|
||
else
|
||
reuse_full_cfg_model_index = 1;
|
||
|
||
if (pUpper_state)
|
||
reuse_full_cfg_model_index |= pUpper_state->m_reused_full_cfg ? 2 : 0;
|
||
else
|
||
reuse_full_cfg_model_index |= 2;
|
||
|
||
if (neighbor_cfg_match_index >= 0)
|
||
{
|
||
total_header_bits += enc.encode_and_return_price(neighbor_cfg_match_index, config_reuse_model[reuse_full_cfg_model_index]);
|
||
|
||
const basist::astc_ldr_t::prev_block_state* pReused_cfg_state = nullptr;
|
||
|
||
switch (neighbor_cfg_match_index)
|
||
{
|
||
case 0: pReused_cfg_state = pLeft_state; break;
|
||
case 1: pReused_cfg_state = pUpper_state; break;
|
||
case 2: pReused_cfg_state = pDiag_state; break;
|
||
default: assert(0); break;
|
||
}
|
||
|
||
if (!pReused_cfg_state)
|
||
{
|
||
assert(0);
|
||
fmt_error_printf("encoding internal failure\n");
|
||
return false;
|
||
}
|
||
|
||
assert(pReused_cfg_state->m_tm_index == blk_out.m_trial_mode_index);
|
||
|
||
prev_state.m_tm_index = blk_out.m_trial_mode_index;
|
||
prev_state.m_base_cem_index = pReused_cfg_state->m_base_cem_index;
|
||
prev_state.m_subset_index = pReused_cfg_state->m_subset_index;
|
||
prev_state.m_ccs_index = pReused_cfg_state->m_ccs_index;
|
||
prev_state.m_grid_size = pReused_cfg_state->m_grid_size;
|
||
prev_state.m_grid_aniso = pReused_cfg_state->m_grid_aniso;
|
||
prev_state.m_used_part_hash = pReused_cfg_state->m_used_part_hash;
|
||
prev_state.m_reused_full_cfg = true;
|
||
|
||
total_reuse_full_cfg_emitted++;
|
||
}
|
||
else
|
||
{
|
||
total_full_cfg_emitted++;
|
||
|
||
total_header_bits += enc.encode_and_return_price(basist::astc_ldr_t::cMaxConfigReuseNeighbors, config_reuse_model[reuse_full_cfg_model_index]);
|
||
|
||
// ------------------------------------------- Set TM index
|
||
{
|
||
uint32_t cem_index, subset_index, ccs_index, grid_size, grid_aniso;
|
||
|
||
const uint_vec& submodes = separate_tm_index(block_width, block_height, enc_out.m_grouped_encoder_trial_modes, tm,
|
||
cem_index, subset_index, ccs_index, grid_size, grid_aniso);
|
||
|
||
// TODO: sort this
|
||
uint32_t submode_index;
|
||
for (submode_index = 0; submode_index < submodes.size(); submode_index++)
|
||
if (submodes[submode_index] == (uint32_t)blk_out.m_trial_mode_index)
|
||
break;
|
||
|
||
if (submode_index == submodes.size_u32())
|
||
{
|
||
assert(0);
|
||
fmt_error_printf("Failed finding mode\n");
|
||
return false;
|
||
}
|
||
|
||
uint32_t prev_cem_index = astc_helpers::CEM_LDR_RGB_DIRECT;
|
||
uint32_t prev_subset_index = 0;
|
||
uint32_t prev_ccs_index = 0;
|
||
uint32_t prev_grid_size = 0;
|
||
uint32_t prev_grid_aniso = 0;
|
||
|
||
if (pPred_state)
|
||
{
|
||
prev_cem_index = pPred_state->m_base_cem_index;
|
||
prev_subset_index = pPred_state->m_subset_index;
|
||
prev_ccs_index = pPred_state->m_ccs_index;
|
||
prev_grid_size = pPred_state->m_grid_size;
|
||
prev_grid_aniso = pPred_state->m_grid_aniso;
|
||
}
|
||
|
||
const uint32_t ldrcem_index = basist::astc_ldr_t::cem_to_ldrcem_index(prev_cem_index);
|
||
|
||
total_header_bits += cem_index_model[ldrcem_index].get_price(cem_index);
|
||
enc.encode(cem_index, cem_index_model[ldrcem_index]);
|
||
|
||
total_header_bits += subset_index_model[prev_subset_index].get_price(subset_index);
|
||
enc.encode(subset_index, subset_index_model[prev_subset_index]);
|
||
|
||
total_header_bits += ccs_index_model[prev_ccs_index].get_price(ccs_index);
|
||
enc.encode(ccs_index, ccs_index_model[prev_ccs_index]);
|
||
|
||
total_header_bits += grid_size_model[prev_grid_size].get_price(grid_size);
|
||
enc.encode(grid_size, grid_size_model[prev_grid_size]);
|
||
|
||
total_header_bits += grid_aniso_model[prev_grid_aniso].get_price(grid_aniso);
|
||
enc.encode(grid_aniso, grid_aniso_model[prev_grid_aniso]);
|
||
|
||
if (submodes.size() > 1)
|
||
{
|
||
basist::arith::arith_data_model& submode_model = submode_models[cem_index][subset_index][ccs_index][grid_size][grid_aniso];
|
||
if (!submode_model.get_num_data_syms())
|
||
submode_model.init(submodes.size_u32(), true);
|
||
|
||
total_header_bits += submode_model.get_price(submode_index);
|
||
enc.encode(submode_index, submode_model);
|
||
}
|
||
|
||
prev_state.m_tm_index = blk_out.m_trial_mode_index;
|
||
prev_state.m_base_cem_index = cem_index;
|
||
prev_state.m_subset_index = subset_index;
|
||
prev_state.m_ccs_index = ccs_index;
|
||
prev_state.m_grid_size = grid_size;
|
||
prev_state.m_grid_aniso = grid_aniso;
|
||
prev_state.m_reused_full_cfg = false;
|
||
}
|
||
|
||
// Send base_ofs bit if the tm is direct
|
||
if ((tm.m_cem == astc_helpers::CEM_LDR_RGB_DIRECT) || (tm.m_cem == astc_helpers::CEM_LDR_RGBA_DIRECT))
|
||
{
|
||
const bool is_base_ofs = (cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGB_BASE_PLUS_OFFSET) ||
|
||
(cur_log_blk.m_color_endpoint_modes[0] == astc_helpers::CEM_LDR_RGBA_BASE_PLUS_OFFSET);
|
||
|
||
total_header_bits += is_base_ofs_model.get_price(is_base_ofs);
|
||
enc.encode(is_base_ofs, is_base_ofs_model);
|
||
}
|
||
|
||
if (tm.m_num_parts > 1)
|
||
{
|
||
// Send unique part pattern ID
|
||
astc_ldr::partitions_data* pPart_data = (tm.m_num_parts == 2) ? &enc_out.m_part_data_p2 : &enc_out.m_part_data_p3;
|
||
|
||
const uint32_t astc_pat_index = cur_log_blk.m_partition_id;
|
||
const uint32_t unique_pat_index = pPart_data->m_part_seed_to_unique_index[astc_pat_index];
|
||
const uint32_t total_unique_indices = pPart_data->m_total_unique_patterns;
|
||
assert(unique_pat_index < total_unique_indices);
|
||
|
||
num_part_hash_probes++;
|
||
|
||
uint32_t use_part_model_index = 0;
|
||
if (pLeft_state)
|
||
use_part_model_index = pLeft_state->m_used_part_hash;
|
||
else
|
||
use_part_model_index = 1;
|
||
if (pUpper_state)
|
||
use_part_model_index |= pUpper_state->m_used_part_hash ? 2 : 0;
|
||
else
|
||
use_part_model_index |= 2;
|
||
|
||
int* pPart_hash = (tm.m_num_parts == 2) ? part2_hash : part3_hash;
|
||
|
||
const uint32_t h = basist::astc_ldr_t::part_hash_index(unique_pat_index);
|
||
|
||
if (pPart_hash[h] != (int)unique_pat_index)
|
||
{
|
||
#if defined(_DEBUG) || defined(DEBUG)
|
||
// sanity
|
||
for (uint32_t i = 0; i < basist::astc_ldr_t::PART_HASH_SIZE; i++)
|
||
{
|
||
assert(pPart_hash[i] != (int)unique_pat_index);
|
||
}
|
||
#endif
|
||
|
||
total_header_bits += enc.encode_and_return_price(0, use_part_hash_model[use_part_model_index]);
|
||
total_header_bits += enc.put_truncated_binary(unique_pat_index, total_unique_indices);
|
||
|
||
if (global_cfg.m_debug_images)
|
||
{
|
||
vis_img.fill_box(base_x, base_y, block_width, block_height, color_rgba(0, 0, 255, 255));
|
||
}
|
||
|
||
prev_state.m_used_part_hash = false;
|
||
}
|
||
else
|
||
{
|
||
num_part_hash_hits++;
|
||
|
||
if (global_cfg.m_debug_images)
|
||
{
|
||
vis_img.fill_box(base_x, base_y, block_width, block_height, color_rgba(255, 0, 0, 255));
|
||
}
|
||
|
||
total_header_bits += enc.encode_and_return_price(1, use_part_hash_model[use_part_model_index]);
|
||
total_header_bits += enc.encode_and_return_price(h, (tm.m_num_parts == 2) ? part2_hash_index_model : part3_hash_index_model);
|
||
|
||
prev_state.m_used_part_hash = true;
|
||
}
|
||
|
||
pPart_hash[basist::astc_ldr_t::part_hash_index(unique_pat_index)] = unique_pat_index;
|
||
}
|
||
else
|
||
{
|
||
prev_state.m_used_part_hash = true; // bias to true
|
||
}
|
||
|
||
} // if (neighbor_cfg_match_index >= 0)
|
||
|
||
// ----------------------------------------- Send endpoints
|
||
const int num_endpoint_levels = astc_helpers::get_ise_levels(cur_log_blk.m_endpoint_ise_range);
|
||
const auto& endpoint_ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(cur_log_blk.m_endpoint_ise_range).m_ISE_to_rank;
|
||
|
||
uint32_t bc_model_index = 0;
|
||
if (pLeft_state)
|
||
bc_model_index = pLeft_state->m_first_endpoint_uses_bc;
|
||
else
|
||
bc_model_index = 1;
|
||
|
||
if (pUpper_state)
|
||
bc_model_index |= pUpper_state->m_first_endpoint_uses_bc ? 2 : 0;
|
||
else
|
||
bc_model_index |= 2;
|
||
|
||
bool endpoints_use_bc[astc_helpers::MAX_PARTITIONS] = { false };
|
||
|
||
if (astc_helpers::cem_supports_bc(cur_actual_cem))
|
||
{
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
const bool cur_uses_bc = astc_helpers::used_blue_contraction(cur_actual_cem, cur_log_blk.m_endpoints + part_iter * total_endpoint_vals, cur_log_blk.m_endpoint_ise_range);
|
||
|
||
endpoints_use_bc[part_iter] = cur_uses_bc;
|
||
|
||
} // part_iter
|
||
|
||
prev_state.m_first_endpoint_uses_bc = endpoints_use_bc[0];
|
||
}
|
||
|
||
int best_reuse_bx = -1, best_reuse_by = -1;
|
||
uint32_t best_reuse_index = 0;
|
||
const astc_helpers::log_astc_block* pEndpoint_pred_log_blk = nullptr;
|
||
|
||
if (endpoint_dpcm_global_enable)
|
||
{
|
||
int64_t best_trial_delta2 = INT64_MAX;
|
||
float best_trial_bits = BIG_FLOAT_VAL;
|
||
|
||
//auto& trial_dpcm_model = dpcm_endpoint_models[cur_log_blk.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE];
|
||
|
||
for (uint32_t reuse_index = 0; reuse_index < basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS; reuse_index++)
|
||
{
|
||
const int rx = (int)bx + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_x;
|
||
const int ry = (int)by + basist::astc_6x6_hdr::g_reuse_xy_deltas[reuse_index].m_y;
|
||
if ((rx < 0) || (ry < 0) || (rx >= (int)num_blocks_x) || (ry >= (int)num_blocks_y))
|
||
continue;
|
||
|
||
const astc_helpers::log_astc_block* pTrial_log_blk = &coded_blocks(rx, ry);
|
||
if (pTrial_log_blk->m_solid_color_flag_ldr)
|
||
continue;
|
||
|
||
uint8_t trial_predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { };
|
||
|
||
uint32_t part_iter;
|
||
for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
const bool always_repack_flag = false;
|
||
bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false;
|
||
|
||
bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems(
|
||
pTrial_log_blk->m_color_endpoint_modes[0], pTrial_log_blk->m_endpoint_ise_range, pTrial_log_blk->m_endpoints,
|
||
cur_actual_cem, cur_log_blk.m_endpoint_ise_range, trial_predicted_endpoints[part_iter],
|
||
always_repack_flag,
|
||
endpoints_use_bc[part_iter], false,
|
||
blue_contraction_clamped_flag, base_ofs_clamped_flag);
|
||
|
||
if (!conv_status)
|
||
break;
|
||
} // part_iter
|
||
|
||
if (part_iter < tm.m_num_parts)
|
||
continue; // failed
|
||
|
||
int64_t trial_endpoint_delta2 = 0;
|
||
for (part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++)
|
||
{
|
||
int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]];
|
||
int prev_e_rank = endpoint_ise_to_rank[trial_predicted_endpoints[part_iter][val_iter]];
|
||
|
||
int e_delta = cur_e_rank - prev_e_rank;
|
||
|
||
trial_endpoint_delta2 += e_delta * e_delta;
|
||
|
||
} // val_iter
|
||
|
||
} // part_iter
|
||
|
||
const float N = (float)(total_endpoint_vals * tm.m_num_parts);
|
||
const float mse = (float)trial_endpoint_delta2 / N;
|
||
|
||
// Gaussian entropy estimate - precomputed 0.5 * log2(2*pi*e) = ~2.0470956f
|
||
const float k_const = 2.0470956f;
|
||
|
||
float bits_per_sym = 0.5f * log2f(basisu::maximum(mse, 1e-9f)) + k_const;
|
||
|
||
bits_per_sym = clamp(bits_per_sym, 0.05f, 8.0f);
|
||
|
||
// total est bits for this block's endpoints
|
||
float total_est_bits = bits_per_sym * N;
|
||
|
||
total_est_bits += endpoint_reuse_delta_model.get_price(reuse_index);
|
||
|
||
if (total_est_bits < best_trial_bits)
|
||
{
|
||
best_trial_delta2 = trial_endpoint_delta2;
|
||
best_trial_bits = total_est_bits;
|
||
|
||
best_reuse_bx = rx;
|
||
best_reuse_by = ry;
|
||
best_reuse_index = reuse_index;
|
||
|
||
if (!best_trial_delta2)
|
||
break;
|
||
}
|
||
|
||
} // reuse_index
|
||
|
||
if (best_reuse_bx >= 0)
|
||
{
|
||
pEndpoint_pred_log_blk = &coded_blocks(best_reuse_bx, best_reuse_by);
|
||
|
||
assert(!pEndpoint_pred_log_blk->m_solid_color_flag_ldr);
|
||
}
|
||
|
||
} // if (endpoint_dpcm_global_enable)
|
||
|
||
uint8_t predicted_endpoints[astc_helpers::MAX_PARTITIONS][astc_helpers::MAX_CEM_ENDPOINT_VALS] = { };
|
||
|
||
bool use_dpcm_endpoints = false;
|
||
|
||
if (pEndpoint_pred_log_blk)
|
||
{
|
||
use_dpcm_endpoints = true;
|
||
|
||
assert(cur_log_blk.m_num_partitions == tm.m_num_parts);
|
||
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
const bool always_repack_flag = false;
|
||
bool blue_contraction_clamped_flag = false, base_ofs_clamped_flag = false;
|
||
|
||
bool conv_status = basist::astc_ldr_t::convert_endpoints_across_cems(
|
||
pEndpoint_pred_log_blk->m_color_endpoint_modes[0], pEndpoint_pred_log_blk->m_endpoint_ise_range, pEndpoint_pred_log_blk->m_endpoints,
|
||
cur_actual_cem, cur_log_blk.m_endpoint_ise_range, predicted_endpoints[part_iter],
|
||
always_repack_flag,
|
||
endpoints_use_bc[part_iter], false,
|
||
blue_contraction_clamped_flag, base_ofs_clamped_flag);
|
||
|
||
if (!conv_status)
|
||
{
|
||
// In practice, should never happen
|
||
use_dpcm_endpoints = false;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
// TODO: Decide what is cheaper, endpoint DPCM vs. raw
|
||
|
||
if (use_dpcm_endpoints)
|
||
{
|
||
total_endpoint_bits += enc.encode_and_return_price(1, use_dpcm_endpoints_model);
|
||
|
||
total_endpoint_bits += enc.encode_and_return_price(best_reuse_index, endpoint_reuse_delta_model);
|
||
|
||
if (astc_helpers::cem_supports_bc(cur_actual_cem))
|
||
{
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
total_endpoint_bits += enc.encode_and_return_price(endpoints_use_bc[part_iter], endpoints_use_bc_models[bc_model_index]);
|
||
|
||
} // part_iter
|
||
}
|
||
|
||
// TODO: Perhaps separate DPCM models by CEM, entry index
|
||
auto& dpcm_model = dpcm_endpoint_models[cur_log_blk.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE];
|
||
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++)
|
||
{
|
||
int cur_e_rank = endpoint_ise_to_rank[cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter]];
|
||
int prev_e_rank = endpoint_ise_to_rank[predicted_endpoints[part_iter][val_iter]];
|
||
|
||
int e_val = imod(cur_e_rank - prev_e_rank, num_endpoint_levels);
|
||
|
||
total_endpoint_bits += dpcm_model.get_price(e_val);
|
||
enc.encode(e_val, dpcm_model);
|
||
|
||
} // val_iter
|
||
|
||
} // part_iter
|
||
|
||
total_used_endpoint_dpcm++;
|
||
}
|
||
else
|
||
{
|
||
total_endpoint_bits += enc.encode_and_return_price(0, use_dpcm_endpoints_model);
|
||
|
||
for (uint32_t part_iter = 0; part_iter < tm.m_num_parts; part_iter++)
|
||
{
|
||
for (uint32_t val_iter = 0; val_iter < total_endpoint_vals; val_iter++)
|
||
{
|
||
auto& model = raw_endpoint_models[cur_log_blk.m_endpoint_ise_range - astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE];
|
||
uint32_t e_val = cur_log_blk.m_endpoints[part_iter * total_endpoint_vals + val_iter];
|
||
|
||
total_endpoint_bits += model.get_price(e_val);
|
||
enc.encode(e_val, model);
|
||
|
||
} // val_iter
|
||
|
||
} // part_iter
|
||
|
||
total_used_endpoint_raw++;
|
||
}
|
||
|
||
} // if (full_cfg_endpoint_reuse_index >= 0)
|
||
|
||
// ------------------------------------ Send weights
|
||
const uint32_t total_planes = cur_log_blk.m_dual_plane ? 2 : 1;
|
||
const uint32_t total_weights = cur_log_blk.m_grid_width * cur_log_blk.m_grid_height;
|
||
|
||
const int num_weight_levels = astc_helpers::get_ise_levels(cur_log_blk.m_weight_ise_range);
|
||
const auto& weight_ise_to_rank = astc_helpers::g_dequant_tables.get_weight_tab(cur_log_blk.m_weight_ise_range).m_ISE_to_rank;
|
||
|
||
uint32_t use_dct_model_index = 0;
|
||
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
if (pLeft_state)
|
||
use_dct_model_index = pLeft_state->m_used_weight_dct;
|
||
else
|
||
use_dct_model_index = 1;
|
||
|
||
if (pUpper_state)
|
||
use_dct_model_index |= pUpper_state->m_used_weight_dct ? 2 : 0;
|
||
else
|
||
use_dct_model_index |= 2;
|
||
}
|
||
|
||
if (use_faster_format)
|
||
{
|
||
bool use_dct = enc_cfg.m_use_dct;
|
||
|
||
// TODO - tune this threshold
|
||
//const uint32_t SWITCH_TO_DPCM_NUM_COEFF_THRESH = (cur_log_blk.m_grid_width * cur_log_blk.m_grid_height * 102 + 64) >> 7;
|
||
const uint32_t SWITCH_TO_DPCM_NUM_COEFF_THRESH = (cur_log_blk.m_grid_width * cur_log_blk.m_grid_height * 45 + 64) >> 7;
|
||
|
||
if (use_dct)
|
||
{
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
if (syms.m_max_coeff_mag > basist::astc_ldr_t::DCT_MAX_ARITH_COEFF_MAG)
|
||
{
|
||
use_dct = false;
|
||
break;
|
||
}
|
||
|
||
if (syms.m_coeffs.size() > SWITCH_TO_DPCM_NUM_COEFF_THRESH)
|
||
{
|
||
use_dct = false;
|
||
break;
|
||
}
|
||
}
|
||
}
|
||
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
total_weight_bits += use_dct_model[use_dct_model_index].get_price(use_dct);
|
||
enc.encode(use_dct, use_dct_model[use_dct_model_index]);
|
||
}
|
||
|
||
if (use_dct)
|
||
{
|
||
prev_state.m_used_weight_dct = true;
|
||
|
||
total_used_dct++;
|
||
|
||
if (total_planes > 1)
|
||
{
|
||
assert(blk_out.m_packed_dct_plane_data[0].m_num_dc_levels == blk_out.m_packed_dct_plane_data[1].m_num_dc_levels);
|
||
}
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
|
||
if (syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1)
|
||
mean1_bytes.push_back((uint8_t)syms.m_dc_sym);
|
||
else
|
||
{
|
||
assert(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS0);
|
||
mean0_bits.put_bits(syms.m_dc_sym, 4);
|
||
}
|
||
|
||
for (uint32_t i = 0; i < syms.m_coeffs.size(); i++)
|
||
{
|
||
if (syms.m_coeffs[i].m_coeff == INT16_MAX)
|
||
{
|
||
run_bytes.push_back(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX);
|
||
}
|
||
else
|
||
{
|
||
run_bytes.push_back((uint8_t)syms.m_coeffs[i].m_num_zeros);
|
||
|
||
sign_bits.put_bits(syms.m_coeffs[i].m_coeff < 0, 1);
|
||
|
||
assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255));
|
||
|
||
coeff_bytes.push_back((uint8_t)(iabs(syms.m_coeffs[i].m_coeff) - 1));
|
||
}
|
||
}
|
||
|
||
} // plane_iter
|
||
}
|
||
else
|
||
{
|
||
total_used_weight_dpcm++;
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
int prev_w = num_weight_levels / 2;
|
||
|
||
for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++)
|
||
{
|
||
int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes];
|
||
int w = weight_ise_to_rank[ise_w];
|
||
|
||
int w_to_code = w;
|
||
w_to_code = imod(w - prev_w, num_weight_levels);
|
||
|
||
prev_w = w;
|
||
|
||
if (num_weight_levels <= 4)
|
||
weight2_bits.put_bits((uint8_t)w_to_code, 2);
|
||
else if (num_weight_levels <= 8)
|
||
weight3_bits.put_bits((uint8_t)w_to_code, 4);
|
||
else if (num_weight_levels <= 16)
|
||
weight4_bits.put_bits((uint8_t)w_to_code, 4);
|
||
else
|
||
weight8_bits.push_back((uint8_t)w_to_code);
|
||
|
||
} // weight_iter
|
||
|
||
} // plane_iter
|
||
}
|
||
}
|
||
else
|
||
{
|
||
float total_dpcm_bits = 0.0f, total_dct_bits = 0.0f;
|
||
const float FORBID_DCT_BITS = 1e+8f;
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
int prev_w = num_weight_levels / 2;
|
||
|
||
for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++)
|
||
{
|
||
const auto& model = raw_weight_models[cur_log_blk.m_weight_ise_range - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE];
|
||
|
||
int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes];
|
||
int w = weight_ise_to_rank[ise_w];
|
||
|
||
int w_to_code = w;
|
||
w_to_code = imod(w - prev_w, num_weight_levels);
|
||
|
||
prev_w = w;
|
||
|
||
total_dpcm_bits += model.get_price(w_to_code);
|
||
|
||
} // weight_iter
|
||
|
||
} // plane_iter
|
||
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
if (syms.m_max_coeff_mag > basist::astc_ldr_t::DCT_MAX_ARITH_COEFF_MAG)
|
||
{
|
||
total_dct_bits = FORBID_DCT_BITS;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (total_dct_bits < FORBID_DCT_BITS)
|
||
{
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
|
||
assert((syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS0) || (syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1));
|
||
|
||
total_dct_bits += weight_mean_models[(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1) ? 1 : 0].get_price(syms.m_dc_sym);
|
||
|
||
for (uint32_t i = 0; i < syms.m_coeffs.size(); i++)
|
||
{
|
||
if (syms.m_coeffs[i].m_coeff == INT16_MAX)
|
||
{
|
||
total_dct_bits += dct_run_len_model.get_price(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX);
|
||
}
|
||
else
|
||
{
|
||
assert(syms.m_coeffs[i].m_num_zeros < basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX);
|
||
|
||
total_dct_bits += dct_run_len_model.get_price(syms.m_coeffs[i].m_num_zeros);
|
||
|
||
total_dct_bits += 1.0f; // sign bit
|
||
assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255));
|
||
total_dct_bits += dct_coeff_mag.get_price(iabs(syms.m_coeffs[i].m_coeff) - 1);
|
||
}
|
||
} // i
|
||
} // plane_iter
|
||
}
|
||
}
|
||
|
||
// TODO: Check if any DCT coeff overflows 8-bit mags, switch to DPCM. (In practice, not needed.)
|
||
bool use_dct = false;
|
||
if ((enc_cfg.m_use_dct) &&
|
||
(total_dct_bits < FORBID_DCT_BITS) &&
|
||
((total_dct_bits + use_dct_model[use_dct_model_index].get_price(1)) <= (total_dpcm_bits + use_dct_model[use_dct_model_index].get_price(0))))
|
||
{
|
||
use_dct = true;
|
||
}
|
||
|
||
if (enc_cfg.m_use_dct)
|
||
{
|
||
total_weight_bits += use_dct_model[use_dct_model_index].get_price(use_dct);
|
||
enc.encode(use_dct, use_dct_model[use_dct_model_index]);
|
||
}
|
||
|
||
if (use_dct)
|
||
{
|
||
prev_state.m_used_weight_dct = true;
|
||
|
||
total_used_dct++;
|
||
|
||
if (total_planes > 1)
|
||
{
|
||
assert(blk_out.m_packed_dct_plane_data[0].m_num_dc_levels == blk_out.m_packed_dct_plane_data[1].m_num_dc_levels);
|
||
}
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
const basist::astc_ldr_t::dct_syms& syms = blk_out.m_packed_dct_plane_data[plane_iter];
|
||
|
||
total_weight_bits += enc.encode_and_return_price(syms.m_dc_sym, weight_mean_models[(syms.m_num_dc_levels == basist::astc_ldr_t::DCT_MEAN_LEVELS1) ? 1 : 0]);
|
||
|
||
for (uint32_t i = 0; i < syms.m_coeffs.size(); i++)
|
||
{
|
||
if (syms.m_coeffs[i].m_coeff == INT16_MAX)
|
||
{
|
||
total_weight_bits += enc.encode_and_return_price(basist::astc_ldr_t::DCT_RUN_LEN_EOB_SYM_INDEX, dct_run_len_model);
|
||
|
||
total_dct_syms++;
|
||
}
|
||
else
|
||
{
|
||
total_weight_bits += enc.encode_and_return_price(syms.m_coeffs[i].m_num_zeros, dct_run_len_model);
|
||
|
||
total_dct_syms++;
|
||
|
||
enc.put_bit(syms.m_coeffs[i].m_coeff < 0);
|
||
total_weight_bits += 1.0f;
|
||
|
||
assert((syms.m_coeffs[i].m_coeff != 0) && (iabs(syms.m_coeffs[i].m_coeff) <= 255));
|
||
total_weight_bits += enc.encode_and_return_price(iabs(syms.m_coeffs[i].m_coeff) - 1, dct_coeff_mag);
|
||
|
||
total_dct_syms++;
|
||
}
|
||
}
|
||
|
||
} // plane_iter
|
||
}
|
||
else
|
||
{
|
||
total_used_weight_dpcm++;
|
||
auto& model = raw_weight_models[cur_log_blk.m_weight_ise_range - astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE];
|
||
|
||
for (uint32_t plane_iter = 0; plane_iter < total_planes; plane_iter++)
|
||
{
|
||
int prev_w = num_weight_levels / 2;
|
||
|
||
for (uint32_t weight_iter = 0; weight_iter < total_weights; weight_iter++)
|
||
{
|
||
int ise_w = cur_log_blk.m_weights[plane_iter + weight_iter * total_planes];
|
||
int w = weight_ise_to_rank[ise_w];
|
||
|
||
int w_to_code = w;
|
||
w_to_code = imod(w - prev_w, num_weight_levels);
|
||
|
||
prev_w = w;
|
||
|
||
total_weight_bits += model.get_price(w_to_code);
|
||
enc.encode(w_to_code, model);
|
||
|
||
total_dpcm_syms++;
|
||
|
||
} // weight_iter
|
||
|
||
} // plane_iter
|
||
}
|
||
|
||
} // use_faster_format
|
||
|
||
} // bx
|
||
|
||
if (cur_run_len)
|
||
{
|
||
total_runs++;
|
||
total_run_blocks += cur_run_len;
|
||
|
||
total_header_bits += enc.encode_and_return_price((uint32_t)basist::astc_ldr_t::xuastc_mode::cMODE_RUN, mode_model);
|
||
total_header_bits += enc.put_gamma_and_return_price(cur_run_len, m_run_len_contexts);
|
||
cur_run_len = 0;
|
||
}
|
||
|
||
} // by
|
||
|
||
enc.put_bits(basist::astc_ldr_t::FINAL_SYNC_MARKER, basist::astc_ldr_t::FINAL_SYNC_MARKER_BITS);
|
||
|
||
enc.flush();
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("Encoding time: {} secs\n", itm.get_elapsed_secs());
|
||
}
|
||
|
||
if (global_cfg.m_debug_images)
|
||
{
|
||
save_png(global_cfg.m_debug_file_prefix + "vis_img.png", vis_img);
|
||
}
|
||
|
||
if ((global_cfg.m_debug_images) || (global_cfg.m_debug_output))
|
||
{
|
||
image coded_img(width, height);
|
||
|
||
vector2D<astc_helpers::astc_block> phys_blocks(num_blocks_x, num_blocks_y);
|
||
|
||
for (uint32_t by = 0; by < num_blocks_y; by++)
|
||
{
|
||
for (uint32_t bx = 0; bx < num_blocks_x; bx++)
|
||
{
|
||
const astc_helpers::log_astc_block& log_blk = coded_blocks(bx, by);
|
||
|
||
color_rgba block_pixels[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
|
||
bool status = astc_helpers::decode_block(log_blk, block_pixels, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status)
|
||
{
|
||
fmt_error_printf("astc_helpers::decode_block() failed\n");
|
||
return false;
|
||
}
|
||
|
||
// Be positive the logical block can be unpacked correctly as XUASTC LDR.
|
||
color_rgba block_pixels_alt[astc_ldr::ASTC_LDR_MAX_BLOCK_PIXELS];
|
||
bool status_alt = astc_helpers::decode_block_xuastc_ldr(log_blk, block_pixels_alt, block_width, block_height, enc_cfg.m_cem_enc_params.m_decode_mode_srgb ? astc_helpers::cDecodeModeSRGB8 : astc_helpers::cDecodeModeLDR8);
|
||
if (!status_alt)
|
||
{
|
||
fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() failed\n");
|
||
return false;
|
||
}
|
||
|
||
if (memcmp(block_pixels, block_pixels_alt, sizeof(color_rgba) * block_width * block_height) != 0)
|
||
{
|
||
fmt_error_printf("astc_helpers::decode_block_xuastc_ldr() decode pixel mismatch\n");
|
||
return false;
|
||
}
|
||
|
||
coded_img.set_block_clipped(block_pixels, bx * block_width, by * block_height, block_width, block_height);
|
||
|
||
} // bx
|
||
|
||
} //by
|
||
|
||
if (global_cfg.m_debug_images)
|
||
save_png(global_cfg.m_debug_file_prefix + "coded_img.png", coded_img);
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
debug_printf("Orig image vs. coded img:\n");
|
||
print_image_metrics(orig_img, coded_img);
|
||
}
|
||
}
|
||
|
||
const uint64_t comp_data_size = enc.get_data_buf().size();
|
||
if (comp_data_size > UINT32_MAX)
|
||
return false;
|
||
|
||
uint8_vec suffix_bytes;
|
||
|
||
if (use_faster_format)
|
||
{
|
||
suffix_bytes.reserve(8192);
|
||
|
||
mean0_bits.flush();
|
||
sign_bits.flush();
|
||
weight2_bits.flush();
|
||
weight3_bits.flush();
|
||
weight4_bits.flush();
|
||
|
||
const uint32_t zstd_level = 9;
|
||
|
||
uint8_vec comp_mean0, comp_mean1, comp_run, comp_coeff, comp_weight2, comp_weight3, comp_weight4, comp_weight8;
|
||
|
||
if (!zstd_compress(mean0_bits.get_bytes().data(), mean0_bits.get_bytes().size(), comp_mean0, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(mean1_bytes.data(), mean1_bytes.size(), comp_mean1, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(run_bytes.data(), run_bytes.size(), comp_run, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(coeff_bytes.data(), coeff_bytes.size(), comp_coeff, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(weight2_bits.get_bytes().data(), weight2_bits.get_bytes().size(), comp_weight2, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(weight3_bits.get_bytes().data(), weight3_bits.get_bytes().size(), comp_weight3, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(weight4_bits.get_bytes().data(), weight4_bits.get_bytes().size(), comp_weight4, zstd_level))
|
||
return false;
|
||
if (!zstd_compress(weight8_bits.data(), weight8_bits.size(), comp_weight8, zstd_level))
|
||
return false;
|
||
|
||
hdr.m_flags = (uint8_t)basist::astc_ldr_t::xuastc_ldr_syntax::cHybridArithZStd;
|
||
|
||
hdr.m_arith_bytes_len = (uint32_t)comp_data_size;
|
||
hdr.m_mean0_bits_len = (uint32_t)comp_mean0.size();
|
||
hdr.m_mean1_bytes_len = (uint32_t)comp_mean1.size();
|
||
hdr.m_run_bytes_len = (uint32_t)comp_run.size();
|
||
hdr.m_coeff_bytes_len = (uint32_t)comp_coeff.size();
|
||
hdr.m_sign_bits_len = (uint32_t)sign_bits.get_bytes().size();
|
||
hdr.m_weight2_bits_len = (uint32_t)comp_weight2.size();
|
||
hdr.m_weight3_bits_len = (uint32_t)comp_weight3.size();
|
||
hdr.m_weight4_bits_len = (uint32_t)comp_weight4.size();
|
||
hdr.m_weight8_bytes_len = (uint32_t)comp_weight8.size();
|
||
|
||
suffix_bytes.append(comp_mean0);
|
||
suffix_bytes.append(comp_mean1);
|
||
suffix_bytes.append(comp_run);
|
||
suffix_bytes.append(comp_coeff);
|
||
suffix_bytes.append(sign_bits.get_bytes());
|
||
suffix_bytes.append(comp_weight2);
|
||
suffix_bytes.append(comp_weight3);
|
||
suffix_bytes.append(comp_weight4);
|
||
suffix_bytes.append(comp_weight8);
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("Zstd compressed sizes:\n");
|
||
fmt_debug_printf(" Mean0 bytes: {} comp size: {}\n", (uint64_t)mean0_bits.get_bytes().size(), (uint64_t)comp_mean0.size());
|
||
fmt_debug_printf(" Mean1 bytes: {} comp size: {}\n", (uint64_t)mean1_bytes.size(), (uint64_t)comp_mean1.size());
|
||
fmt_debug_printf(" Run bytes: {} comp size: {}\n", (uint64_t)run_bytes.size(), (uint64_t)comp_run.size());
|
||
fmt_debug_printf(" Coeff bytes: {} comp size: {}\n", (uint64_t)coeff_bytes.size(), (uint64_t)comp_coeff.size());
|
||
fmt_debug_printf(" Sign bytes: {}\n", (uint64_t)sign_bits.get_bytes().size());
|
||
fmt_debug_printf(" Weight2 bytes: {} comp size: {}\n", (uint64_t)weight2_bits.get_bytes().size(), (uint64_t)comp_weight2.size());
|
||
fmt_debug_printf(" Weight3 bytes: {} comp size: {}\n", (uint64_t)weight3_bits.get_bytes().size(), (uint64_t)comp_weight3.size());
|
||
fmt_debug_printf(" Weight4 bytes: {} comp size: {}\n", (uint64_t)weight4_bits.get_bytes().size(), (uint64_t)comp_weight4.size());
|
||
fmt_debug_printf(" Weight8 bytes: {} comp size: {}\n", (uint64_t)weight8_bits.size(), (uint64_t)comp_weight8.size());
|
||
}
|
||
}
|
||
|
||
assert(comp_data.size() == 0);
|
||
if (use_faster_format)
|
||
{
|
||
comp_data.resize(sizeof(hdr));
|
||
memcpy(comp_data.data(), &hdr, sizeof(hdr));
|
||
}
|
||
else
|
||
{
|
||
comp_data.push_back((uint8_t)basist::astc_ldr_t::xuastc_ldr_syntax::cFullArith);
|
||
}
|
||
|
||
comp_data.append(enc.get_data_buf());
|
||
|
||
comp_data.append(suffix_bytes);
|
||
|
||
if (comp_data.size() > UINT32_MAX)
|
||
return false;
|
||
|
||
if (global_cfg.m_debug_output)
|
||
{
|
||
fmt_debug_printf("Total blocks: {}\n", total_blocks);
|
||
fmt_debug_printf("Total lossy replacements made by supercompression layer: {} {3.2}%\n", total_lossy_replacements, (float)total_lossy_replacements * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total runs: {}, total run blocks: {} {3.2}%\n", total_runs, total_run_blocks, (float)total_run_blocks * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("Total blocks coded (not inside runs): {} {3.2}%\n", total_nonrun_blocks, (float)total_nonrun_blocks * 100.0f / (float)total_blocks);
|
||
fmt_debug_printf("num_part_hash_probes: {}, num_part_hash_hits: {} {3.2}%\n", num_part_hash_probes, num_part_hash_hits, num_part_hash_probes ? ((float)num_part_hash_hits * 100.0f / (float)num_part_hash_probes) : 0);
|
||
fmt_debug_printf("Total DCT syms: {}, DPCM syms: {}\n", total_dct_syms, total_dpcm_syms);
|
||
|
||
const uint32_t total_non_void_extent_blocks = total_blocks - total_solid_blocks;
|
||
|
||
fmt_debug_printf("Total blocks using void extent: {} {3.2}%\n",
|
||
total_solid_blocks, (float)total_solid_blocks * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("Total non void-extent blocks: {} {3.2}%\n",
|
||
total_non_void_extent_blocks, (float)total_non_void_extent_blocks * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("Total full cfg+part ID+endpoint reuse commands: {} {3.2}%\n",
|
||
total_full_reuse_commands, (float)total_full_reuse_commands * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("Total raw commands: {} {3.2}%\n",
|
||
total_raw_commands, (float)total_raw_commands * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("Total reuse cfg+part ID emitted: {} {3.2}%, Total full cfg emitted: {} {3.2}%\n",
|
||
total_reuse_full_cfg_emitted, (float)total_reuse_full_cfg_emitted * 100.0f / (float)total_blocks,
|
||
total_full_cfg_emitted, (float)total_full_cfg_emitted * 100.0f / (float)total_blocks);
|
||
|
||
fmt_debug_printf("Total coded endpoints using DPCM: {} {3.2}%\n",
|
||
total_used_endpoint_dpcm, (float)total_used_endpoint_dpcm * 100.0f / (float)total_non_void_extent_blocks);
|
||
|
||
fmt_debug_printf("Total coded endpoints using RAW: {} {3.2}%\n",
|
||
total_used_endpoint_raw, (float)total_used_endpoint_raw * 100.0f / (float)total_non_void_extent_blocks);
|
||
|
||
fmt_debug_printf("Total coded blocks using weight DCT: {} {3.2}%, total blocks using weight DPCM: {} {3.2}%\n",
|
||
total_used_dct, (float)total_used_dct * 100.0f / total_non_void_extent_blocks,
|
||
total_used_weight_dpcm, (float)total_used_weight_dpcm * 100.0f / (float)total_non_void_extent_blocks);
|
||
|
||
fmt_debug_printf("Total header bits: {} bytes: {}, bpp: {}, bits per non-void extent block: {}\nTotal endpoint bits: {}, bytes: {}, bpp: {}, bits per non-void extent block: {}\nTotal weight bits: {}, bytes: {}, bpp: {}, bits per non-void extent block: {}\nTotal_bits: {} bytes: {}, bpp {}, bits per non-void extent block: {}\n",
|
||
total_header_bits, total_header_bits / 8.0f, total_header_bits / (double)total_pixels, total_header_bits / (double)total_non_void_extent_blocks,
|
||
total_endpoint_bits, total_endpoint_bits / 8.0f, total_endpoint_bits / (double)total_pixels, total_endpoint_bits / (double)total_non_void_extent_blocks,
|
||
total_weight_bits, total_weight_bits / 8.0f, total_weight_bits / (double)total_pixels, total_weight_bits / (double)total_non_void_extent_blocks,
|
||
total_header_bits + total_endpoint_bits + total_weight_bits,
|
||
(total_header_bits + total_endpoint_bits + total_weight_bits) / 8.0f,
|
||
(total_header_bits + total_endpoint_bits + total_weight_bits) / (double)total_pixels,
|
||
(total_header_bits + total_endpoint_bits + total_weight_bits) / (double)total_non_void_extent_blocks);
|
||
|
||
fmt_debug_printf("Compressed to {} bytes, {3.3}bpp\n\n", comp_data.size_u32(), ((float)comp_data.size() * 8.0f) / (float)total_pixels);
|
||
|
||
#if 0
|
||
for (uint32_t i = 0; i < 4; i++)
|
||
{
|
||
solid_color_dpcm_model[i].print_prices(fmt_string("solid_color_dpcm_model[{}]:\n\n", i).c_str());
|
||
}
|
||
#endif
|
||
}
|
||
|
||
return true;
|
||
}
|
||
|
||
void encoder_init()
|
||
{
|
||
if (g_initialized)
|
||
return;
|
||
|
||
g_initialized = true;
|
||
}
|
||
|
||
void deblock_filter(uint32_t filter_block_width, uint32_t filter_block_height, const image& src_img, image& dst_img, bool stronger_filtering, int SKIP_THRESH)
|
||
{
|
||
image temp_img(src_img);
|
||
|
||
for (int y = 0; y < (int)src_img.get_height(); y++)
|
||
{
|
||
for (int x = filter_block_width; x < (int)src_img.get_width(); x += filter_block_width)
|
||
{
|
||
color_rgba ll(src_img.get_clamped(x - 2, y));
|
||
color_rgba l(src_img.get_clamped(x - 1, y));
|
||
color_rgba r(src_img.get_clamped(x, y));
|
||
color_rgba rr(src_img.get_clamped(x + 1, y));
|
||
|
||
if (SKIP_THRESH < 256)
|
||
{
|
||
bool skip_flag = false;
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
int delta = iabs((int)l[c] - (int)r[c]);
|
||
if (delta > SKIP_THRESH)
|
||
{
|
||
skip_flag = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (skip_flag)
|
||
continue;
|
||
}
|
||
|
||
color_rgba ml, mr;
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
if (stronger_filtering)
|
||
{
|
||
ml[c] = (3 * l[c] + 2 * r[c] + ll[c] + 3) / 6;
|
||
mr[c] = (3 * r[c] + 2 * l[c] + rr[c] + 3) / 6;
|
||
}
|
||
else
|
||
{
|
||
ml[c] = (5 * l[c] + 2 * r[c] + ll[c] + 4) / 8;
|
||
mr[c] = (5 * r[c] + 2 * l[c] + rr[c] + 4) / 8;
|
||
}
|
||
}
|
||
|
||
temp_img.set_clipped(x - 1, y, ml);
|
||
temp_img.set_clipped(x, y, mr);
|
||
|
||
} // x
|
||
|
||
} // y
|
||
|
||
dst_img = temp_img;
|
||
|
||
for (int x = 0; x < (int)temp_img.get_width(); x++)
|
||
{
|
||
for (int y = filter_block_height; y < (int)temp_img.get_height(); y += filter_block_height)
|
||
{
|
||
color_rgba uu(temp_img.get_clamped(x, y - 2));
|
||
color_rgba u(temp_img.get_clamped(x, y - 1));
|
||
color_rgba d(temp_img.get_clamped(x, y));
|
||
color_rgba dd(temp_img.get_clamped(x, y + 1));
|
||
|
||
if (SKIP_THRESH < 256)
|
||
{
|
||
bool skip_flag = false;
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
int delta = iabs((int)u[c] - (int)d[c]);
|
||
if (delta > SKIP_THRESH)
|
||
{
|
||
skip_flag = true;
|
||
break;
|
||
}
|
||
}
|
||
|
||
if (skip_flag)
|
||
continue;
|
||
}
|
||
|
||
color_rgba mu, md;
|
||
for (uint32_t c = 0; c < 4; c++)
|
||
{
|
||
if (stronger_filtering)
|
||
{
|
||
mu[c] = (3 * u[c] + 2 * d[c] + uu[c] + 3) / 6;
|
||
md[c] = (3 * d[c] + 2 * u[c] + dd[c] + 3) / 6;
|
||
}
|
||
else
|
||
{
|
||
mu[c] = (5 * u[c] + 2 * d[c] + uu[c] + 4) / 8;
|
||
md[c] = (5 * d[c] + 2 * u[c] + dd[c] + 4) / 8;
|
||
}
|
||
}
|
||
|
||
dst_img.set_clipped(x, y - 1, mu);
|
||
dst_img.set_clipped(x, y, md);
|
||
|
||
} // x
|
||
|
||
} // y
|
||
}
|
||
|
||
} // namespace astc_ldr
|
||
} // namespace basisu
|