basis_universal/transcoder/basisu_astc_helpers.h

// basisu_astc_helpers.h
// Be sure to define BASISU_ASTC_HELPERS_IMPLEMENTATION somewhere to get the implementation, otherwise you only get the header.
#ifndef BASISU_ASTC_HELPERS_HEADER
#define BASISU_ASTC_HELPERS_HEADER

#include <stdlib.h>
#include <stdint.h>
#include <math.h>
#include <fenv.h>

namespace astc_helpers
{
	const uint32_t MIN_GRID_DIM = 2; // the minimum dimension of a block's weight grid
	const uint32_t MIN_BLOCK_DIM = 4, MAX_BLOCK_DIM = 12; // the valid block dimensions in texels
	const uint32_t MAX_BLOCK_PIXELS = MAX_BLOCK_DIM * MAX_BLOCK_DIM; // 12x12 = 144 texels in the largest block
	const uint32_t MAX_GRID_WEIGHTS = 64; // a block may have a maximum of 64 weight grid values
	const uint32_t MAX_CEM_ENDPOINT_VALS = 8; // see Table 94. ASTC LDR/HDR color endpoint modes (max 8 values to encode any CEM, minimum 2)

	// The number of BISE values needed to encode endpoints for each CEM.
	// The numeric suffix is the CEM index from the cems enum (e.g. MODE8 is CEM_LDR_RGB_DIRECT).
	const uint32_t NUM_MODE0_ENDPOINTS = 2, NUM_MODE4_ENDPOINTS = 4; // LDR luminance, LDR luminance+alpha
	const uint32_t NUM_MODE6_ENDPOINTS = 4, NUM_MODE8_ENDPOINTS = 6, NUM_MODE9_ENDPOINTS = 6; // LDR RGB
	const uint32_t NUM_MODE10_ENDPOINTS = 6, NUM_MODE12_ENDPOINTS = 8, NUM_MODE13_ENDPOINTS = 8; // LDR RGBA
	const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4; // hdr

	const uint32_t MAX_WEIGHTS = 32; // max supported # of weights (or "selectors") in any mode, i.e. the max # of colors per endpoint pair
	const uint32_t MAX_WEIGHT_INTERPOLANT_VALUE = 64; // grid texel weights must range from [0,64], i.e. the weight interpolant range is [0,64]

	// 14 unique block dimensions supported by ASTC
	static const uint32_t NUM_ASTC_BLOCK_SIZES = 14;
	// One row of two values per block size; defined in the implementation section.
	extern const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2];

	// The Color Endpoint Modes (CEM's)
	// get_num_cem_values() computes the endpoint value count as 2 + 2 * (cem >> 2), so each run of four
	// consecutive modes (0-3, 4-7, 8-11, 12-15) shares the same count (2, 4, 6 and 8 values respectively).
	enum cems
	{
		CEM_LDR_LUM_DIRECT = 0,
		CEM_LDR_LUM_BASE_PLUS_OFS = 1,
		CEM_HDR_LUM_LARGE_RANGE = 2,
		CEM_HDR_LUM_SMALL_RANGE = 3,
		CEM_LDR_LUM_ALPHA_DIRECT = 4,
		CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS = 5,
		CEM_LDR_RGB_BASE_SCALE = 6,
		CEM_HDR_RGB_BASE_SCALE = 7,
		CEM_LDR_RGB_DIRECT = 8,
		CEM_LDR_RGB_BASE_PLUS_OFFSET = 9,
		CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A = 10,
		CEM_HDR_RGB = 11,
		CEM_LDR_RGBA_DIRECT = 12,
		CEM_LDR_RGBA_BASE_PLUS_OFFSET = 13,
		CEM_HDR_RGB_LDR_ALPHA = 14,
		CEM_HDR_RGB_HDR_ALPHA = 15
	};

	// All Bounded Integer Sequence Coding (BISE or ISE) ranges.
	// Weights: Ranges [0,11] are valid.
	// Endpoints: Ranges [4,20] are valid.
	// Each enumerator's value is the ISE range index used to index g_ise_range_table;
	// the number in its name is the level count that get_ise_levels() returns for that index.
	enum bise_levels
	{
		BISE_2_LEVELS = 0,
		BISE_3_LEVELS = 1,
		BISE_4_LEVELS = 2,
		BISE_5_LEVELS = 3,
		BISE_6_LEVELS = 4,
		BISE_8_LEVELS = 5,
		BISE_10_LEVELS = 6,
		BISE_12_LEVELS = 7,
		BISE_16_LEVELS = 8,
		BISE_20_LEVELS = 9,
		BISE_24_LEVELS = 10,
		BISE_32_LEVELS = 11,
		BISE_40_LEVELS = 12,
		BISE_48_LEVELS = 13,
		BISE_64_LEVELS = 14,
		BISE_80_LEVELS = 15,
		BISE_96_LEVELS = 16,
		BISE_128_LEVELS = 17,
		BISE_160_LEVELS = 18,
		BISE_192_LEVELS = 19,
		BISE_256_LEVELS = 20
	};

	// Number of rows in g_ise_range_table (BISE_2_LEVELS through BISE_256_LEVELS).
	const uint32_t TOTAL_ISE_RANGES = 21;

	// Block size indices. The enumerators are listed in the same order as the rows of g_astc_block_sizes,
	// and cTOTAL_BLOCK_SIZES has the same value as NUM_ASTC_BLOCK_SIZES (14).
	enum
	{
		cBLOCK_SIZE_4x4 = 0,	// 16 samples
		cBLOCK_SIZE_5x4 = 1,	// 20 samples
		cBLOCK_SIZE_5x5 = 2,	// 25 samples
		cBLOCK_SIZE_6x5 = 3,	// 30 samples

		cBLOCK_SIZE_6x6 = 4,	// 36 samples
		cBLOCK_SIZE_8x5 = 5,	// 40 samples
		cBLOCK_SIZE_8x6 = 6,	// 48 samples
		cBLOCK_SIZE_10x5 = 7,	// 50 samples

		cBLOCK_SIZE_10x6 = 8,	// 60 samples
		cBLOCK_SIZE_8x8 = 9,	// 64 samples
		cBLOCK_SIZE_10x8 = 10,	// 80 samples
		cBLOCK_SIZE_10x10 = 11,	// 100 samples

		cBLOCK_SIZE_12x10 = 12,	// 120 samples
		cBLOCK_SIZE_12x12 = 13,	// 144 samples

		cTOTAL_BLOCK_SIZES = 14
	};

	// Valid endpoint ISE ranges
	const uint32_t FIRST_VALID_ENDPOINT_ISE_RANGE = BISE_6_LEVELS; // 4
	const uint32_t LAST_VALID_ENDPOINT_ISE_RANGE = BISE_256_LEVELS; // 20
	const uint32_t TOTAL_ENDPOINT_ISE_RANGES = LAST_VALID_ENDPOINT_ISE_RANGE - FIRST_VALID_ENDPOINT_ISE_RANGE + 1; // 17

	// Valid weight ISE ranges
	const uint32_t FIRST_VALID_WEIGHT_ISE_RANGE = BISE_2_LEVELS; // 0
	const uint32_t LAST_VALID_WEIGHT_ISE_RANGE = BISE_32_LEVELS; // 11
	const uint32_t TOTAL_WEIGHT_ISE_RANGES = LAST_VALID_WEIGHT_ISE_RANGE - FIRST_VALID_WEIGHT_ISE_RANGE + 1; // 12

	// The ISE range table.
	// Indexed by ISE range (see bise_levels); get_ise_levels() combines the three columns as (1 + 2*trits + 4*quints) << bits.
	extern const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3]; // 0=bits (0 to 8), 1=trits (0 or 1), 2=quints (0 or 1)

	// Possible Color Component Select values, used in dual plane mode.
	// The CCS component will be interpolated using the 2nd weight plane.
	// NOTE(review): the enumerator names appear to read <components on the 1st plane>_<component on the 2nd plane>
	// (e.g. CCS_GBA_R would put R on the 2nd plane) - inferred from the naming only, confirm against the decoder.
	enum ccs
	{
		CCS_GBA_R = 0,
		CCS_RBA_G = 1,
		CCS_RGA_B = 2,
		CCS_RGB_A = 3
	};

	// A packed ASTC block stored as four 32-bit words (16 bytes).
	// This is the "physical" block type written by pack_astc_block() and the pack_void_extent_*() helpers.
	struct astc_block
	{
		uint32_t m_vals[4];
	};

	const uint32_t MAX_PARTITIONS = 4;				// Max # of partitions or subsets for single plane mode
	const uint32_t MAX_DUAL_PLANE_PARTITIONS = 3;	// Max # of partitions or subsets for dual plane mode
	const uint32_t NUM_PARTITION_PATTERNS = 1024;	// Total # of partition pattern seeds (10-bits)
	const uint32_t MAX_ENDPOINTS = 18;				// Maximum # of endpoint values in a block

	// Logical (unpacked) description of a single ASTC block.
	// pack_astc_block() converts one of these to a physical astc_block, and unpack_block() fills one in from packed data.
	struct log_astc_block
	{
		bool m_error_flag;

		bool m_solid_color_flag_ldr, m_solid_color_flag_hdr;

		uint8_t m_user_mode;					// user defined value, not used in this module

		// Rest is only valid if !m_solid_color_flag_ldr && !m_solid_color_flag_hdr
		uint8_t m_grid_width, m_grid_height;	// weight grid dimensions, not the dimension of the block

		bool m_dual_plane;

		uint8_t m_weight_ise_range;				// 0-11
		uint8_t m_endpoint_ise_range;			// 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking

		uint8_t m_color_component_selector;	// 0-3, controls which channel uses the 2nd (odd) weights, only used in dual plane mode

		uint8_t m_num_partitions;				// or the # of subsets, 1-4 (1-3 if dual plane mode)
		uint16_t m_partition_id;				// 10-bits, must be 0 if m_num_partitions==1

		uint8_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's

		// The weights and the solid color share the same storage, so only one of them is meaningful at a time.
		// NOTE(review): presumably m_solid_color is the active member when one of the solid color flags is set - confirm against the packer/unpacker.
		union
		{
			// ISE weight grid values. In dual plane mode, the order is p0,p1,  p0,p1,  etc.
			uint8_t m_weights[MAX_GRID_WEIGHTS];
			uint16_t m_solid_color[4];
		};

		// ISE endpoint values
		// Endpoint order examples:
		// 1 subset LA : LL0 LH0 AL0 AH0
		// 1 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0
		// 1 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0
		// 2 subset LA : LL0 LH0 AL0 AH0 LL1 LH1 AL1 AH1
		// 2 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 RL1 RH1 GL1 GH1 BL1 BH1
		// 2 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 RL1 RH1 GL1 GH1 BL1 BH1 AL1 AH1
		uint8_t m_endpoints[MAX_ENDPOINTS];

		// Zeroes every byte of the struct (including padding), so all flags become false and all counts/ranges become 0.
		void clear()
		{
			memset(this, 0, sizeof(*this));
		}
	};

	// Asserts that v lies in the half-open interval [l, h) and returns v unchanged.
	// The check only exists when asserts are enabled; otherwise these are pass-through functions.
	inline int bounds_check(int v, int l, int h)
	{
		(void)v; (void)l; (void)h; // silence unused-parameter warnings when assert() compiles to nothing
		assert((l <= v) && (v < h));
		return v;
	}

	inline uint32_t bounds_check(uint32_t v, uint32_t l, uint32_t h)
	{
		(void)v; (void)l; (void)h; // silence unused-parameter warnings when assert() compiles to nothing
		assert((l <= v) && (v < h));
		return v;
	}

	// Extracts the inclusive bit field [low, high] from val and returns it right-aligned.
	// The field may be 1 to 32 bits wide; a full 32-bit field skips the mask because 1u << 32 is not a valid shift.
	inline uint32_t get_bits(uint32_t val, int low, int high)
	{
		const int field_width = (high - low) + 1;
		assert((field_width >= 1) && (field_width <= 32));

		const uint32_t shifted = val >> low;
		if (field_width == 32)
			return shifted;

		const uint32_t field_mask = (1u << field_width) - 1;
		return shifted & field_mask;
	}

	// Returns the number of levels in the given ISE range.
	// The count is (1, 3 or 5 depending on the trit/quint columns) scaled by 2^bits, all read from g_ise_range_table.
	inline uint32_t get_ise_levels(uint32_t ise_range)
	{
		assert(ise_range < TOTAL_ISE_RANGES);

		const int num_bits = g_ise_range_table[ise_range][0];
		const int has_trit = g_ise_range_table[ise_range][1];
		const int has_quint = g_ise_range_table[ise_range][2];

		return (1 + 2 * has_trit + 4 * has_quint) << num_bits;
	}

	// Returns the number of bits needed to store `count` values coded with ISE range `range`.
	// See 18.22 Data Size Determination - note this will be <= the # of bits actually written by encode_bise(). (It's magic.)
	inline int get_ise_sequence_bits(int count, int range)
	{
		const int8_t* pRow = g_ise_range_table[range];

		const int plain_bits = pRow[0] * count;				// the raw bits stored with every value
		const int trit_bits = (pRow[1] * 8 * count + 4) / 5;	// 8 bits per 5 trits, rounded up
		const int quint_bits = (pRow[2] * 7 * count + 2) / 3;	// 7 bits per 3 quints, rounded up

		return plain_bits + trit_bits + quint_bits;
	}

	// Blends l and h using weight w in [0,64]: w == 0 returns l and w == 64 returns h.
	// The sum is rounded by adding 32 before the divide by 64.
	inline uint32_t weight_interpolate(uint32_t l, uint32_t h, uint32_t w)
	{
		assert(w <= MAX_WEIGHT_INTERPOLANT_VALUE);

		const uint32_t low_term = l * (64 - w);
		const uint32_t high_term = h * w;

		return (low_term + high_term + 32) >> 6;
	}

	// Packs num_vals values from pSrc_vals using ASTC's BISE with the given ISE range, starting at bit offset bit_pos.
	// The encoded bits are OR'd into pDst[0] through pDst[3], so bits already present in pDst are preserved.
	// Ranges with trits or quints always write whole groups of 5 or 3 values, so more bits than get_ise_sequence_bits() reports may be written.
	// If pStats is not null, the number of bits written is added to *pStats.
	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats = nullptr);

	// Three bit counters (header, endpoints, weights) that start at zero.
	// pack_astc_block() and the pack_void_extent_*() helpers accept an optional pointer to one of these.
	struct pack_stats
	{
		uint32_t m_header_bits;
		uint32_t m_endpoint_bits;
		uint32_t m_weight_bits;

		inline pack_stats() :
			m_header_bits(0),
			m_endpoint_bits(0),
			m_weight_bits(0)
		{
		}

		// Resets all counters to zero.
		inline void clear()
		{
			m_header_bits = 0;
			m_endpoint_bits = 0;
			m_weight_bits = 0;
		}
	};

	// The values are distinct single bits (1 and 2), so they can be OR'd together.
	// NOTE(review): presumably these are the flags accepted by pack_astc_block()'s validate_flags parameter - confirm against its definition.
	enum
	{
		cValidateEarlyOutAtEndpointISEChecks = 1,
		cValidateSkipFinalEndpointWeightPacking = 2,
	};

	// Packs a logical to physical ASTC block. Note this does not validate the block's dimensions (use is_valid_block_size()), just the grid dimensions.
	bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr, pack_stats *pStats = nullptr, uint32_t validate_flags = 0);

	// Pack LDR void extent (really solid color) blocks. For LDR, pass in (val | (val << 8)) for each component.
	void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a, pack_stats *pStats = nullptr);

	// Pack HDR void extent (16-bit values are FP16/half floats - no NaN/Inf's)
	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats = nullptr);

	// These helpers are all quite slow, but are useful for table preparation.

	// Dequantizes ISE encoded endpoint val to [0,255]
	// NOTE(review): the trailing "4-11" disagrees with the endpoint range [4,20] documented on bise_levels and on
	// FIRST/LAST_VALID_ENDPOINT_ISE_RANGE - confirm which ranges the implementation actually accepts.
	uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range); // ISE ranges 4-11

	// Dequantizes ISE encoded weight val to [0,64]
	// NOTE(review): the trailing "0-10" disagrees with the weight range [0,11] documented on bise_levels and on
	// FIRST/LAST_VALID_WEIGHT_ISE_RANGE - confirm which ranges the implementation actually accepts.
	uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range); // ISE ranges 0-10

	uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range);
	uint32_t find_nearest_bise_weight(int v, uint32_t ise_range);

	void create_quant_tables(
		uint8_t* pVal_to_ise,	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
		uint8_t* pISE_to_val,	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
		uint8_t* pISE_to_rank,	// returns the level rank index given an ISE symbol, [levels]
		uint8_t* pRank_to_ISE,  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]
		uint32_t ise_range,		// ise range, [4,20] for endpoints, [0,11] for weights
		bool weight_flag);		// false if block endpoints, true if weights

	// True if the CEM is LDR.
	bool is_cem_ldr(uint32_t mode);

	// True if the CEM is HDR, defined as every mode for which is_cem_ldr() returns false.
	inline bool is_cem_hdr(uint32_t mode)
	{
		const bool ldr = is_cem_ldr(mode);
		return !ldr;
	}

	bool does_cem_have_alpha(uint32_t mode);

	// True if the passed in dimensions are a valid ASTC block size. There are 14 supported configs, from 4x4 (8bpp) to 12x12 (.89bpp).
	bool is_valid_block_size(uint32_t w, uint32_t h);

	// w/h must be a valid ASTC block size, or it returns cBLOCK_SIZE_4x4
	uint32_t get_block_size_index(uint32_t w, uint32_t h);

	float get_bitrate_from_block_size(uint32_t w, uint32_t h);

	uint32_t get_texel_partition_from_table(uint32_t block_width, uint32_t block_height, uint32_t seed, uint32_t subsets, uint32_t x, uint32_t y);

	bool block_has_any_hdr_cems(const log_astc_block& log_blk);
	bool block_has_any_ldr_cems(const log_astc_block& log_blk);

	// Returns the # of endpoint values for the given CEM.
	// Modes come in classes of four (cem >> 2); class 0 uses 2 values and each later class uses 2 more.
	inline uint32_t get_num_cem_values(uint32_t cem)
	{
		assert(cem <= 15);

		const uint32_t cem_class = cem >> 2;
		return 2 + 2 * cem_class;
	}

	// Quantization lookup tables for a single ISE range (either weights or endpoints).
	// init() only sizes the vectors; dequant_tables::init() then has create_quant_tables() fill them in.
	struct dequant_table
	{
		basisu::vector<uint8_t> m_val_to_ise;	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
		basisu::vector<uint8_t> m_ISE_to_val;	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
		basisu::vector<uint8_t> m_ISE_to_rank;	// returns the level rank index given an ISE symbol, [levels]
		basisu::vector<uint8_t> m_rank_to_ISE;  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]

		// Sizes the tables: m_val_to_ise gets 65 entries for weights or 256 for endpoints, the others get num_levels entries.
		void init(bool weight_flag, uint32_t num_levels)
		{
			m_val_to_ise.resize(weight_flag ? (MAX_WEIGHT_INTERPOLANT_VALUE + 1) : 256);
			m_ISE_to_val.resize(num_levels);
			m_ISE_to_rank.resize(num_levels);
			m_rank_to_ISE.resize(num_levels);
		}

		// Maps a level rank to its dequantized value by going rank -> ISE symbol -> value.
		uint32_t get_rank_to_val(uint32_t rank) const
		{
			const uint32_t ise = m_rank_to_ISE[rank];
			const uint32_t val = m_ISE_to_val[ise];
			return val;
		}

		// Maps a value to the rank of its nearest level by going value -> ISE symbol -> rank.
		// Declared const (like get_rank_to_val) so it can be used on the const references returned by dequant_tables.
		uint32_t get_val_to_rank(uint32_t val) const
		{
			const uint32_t ise = m_val_to_ise[val];
			const uint32_t rank = m_ISE_to_rank[ise];
			return rank;
		}
	};

	struct dequant_tables
	{
		dequant_table m_weights[TOTAL_WEIGHT_ISE_RANGES];
		dequant_table m_endpoints[TOTAL_ENDPOINT_ISE_RANGES];
		bool m_initialized_flag = false;

		const dequant_table& get_weight_tab(uint32_t range) const
		{
			assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE));
			return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE];
		}

		dequant_table& get_weight_tab(uint32_t range)
		{
			assert((range >= FIRST_VALID_WEIGHT_ISE_RANGE) && (range <= LAST_VALID_WEIGHT_ISE_RANGE));
			return m_weights[range - FIRST_VALID_WEIGHT_ISE_RANGE];
		}

		const dequant_table& get_endpoint_tab(uint32_t range) const
		{
			assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE));
			return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE];
		}

		dequant_table& get_endpoint_tab(uint32_t range)
		{
			assert((range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (range <= LAST_VALID_ENDPOINT_ISE_RANGE));
			return m_endpoints[range - FIRST_VALID_ENDPOINT_ISE_RANGE];
		}

		void init()
		{
			if (m_initialized_flag)
				return;

			for (uint32_t range = FIRST_VALID_WEIGHT_ISE_RANGE; range <= LAST_VALID_WEIGHT_ISE_RANGE; range++)
			{
				const uint32_t num_levels = get_ise_levels(range);
				dequant_table& tab = get_weight_tab(range);

				tab.init(true, num_levels);

				create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), tab.m_ISE_to_rank.data(), tab.m_rank_to_ISE.data(), range, true);
			}

			for (uint32_t range = FIRST_VALID_ENDPOINT_ISE_RANGE; range <= LAST_VALID_ENDPOINT_ISE_RANGE; range++)
			{
				const uint32_t num_levels = get_ise_levels(range);
				dequant_table& tab = get_endpoint_tab(range);

				tab.init(false, num_levels);

				create_quant_tables(tab.m_val_to_ise.data(), tab.m_ISE_to_val.data(), tab.m_ISE_to_rank.data(), tab.m_rank_to_ISE.data(), range, false);
			}

			m_initialized_flag = true;
		}
	};

	extern dequant_tables g_dequant_tables;
	void init_tables();

	// One bilinear sample description; compute_upsample_weights() writes block_width * block_height of these.
	// NOTE(review): m_src_x/m_src_y presumably give the weight grid coordinate of the sample's top-left tap - confirm against compute_upsample_weights().
	struct weighted_sample
	{
		uint8_t m_src_x;
		uint8_t m_src_y;
		uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8
	};

	void compute_upsample_weights(
		int block_width, int block_height,
		int weight_grid_width, int weight_grid_height,
		weighted_sample* pWeights); // there will be block_width * block_height bilinear samples

	void upsample_weight_grid(
		uint32_t bx, uint32_t by,		// destination/to dimension
		uint32_t wx, uint32_t wy,		// source/from dimension
		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
		uint8_t* pDst_weights);			// [by][bx]

	void upsample_weight_grid_xuastc_ldr(
		uint32_t bx, uint32_t by,		// destination/to dimension
		uint32_t wx, uint32_t wy,		// source/from dimension
		const uint8_t* pSrc_weights0,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
		uint8_t* pDst_weights0,			// [by][bx]
		const uint8_t* pSrc_weights1,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
		uint8_t* pDst_weights1);		// [by][bx]

	bool is_small_block(uint32_t block_width, uint32_t block_height);

	// Procedurally returns the texel partition/subset index given the block coordinate and config (very slow).
	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block);

	// Returns the texel partition/subset index given the block coordinate and config - table lookup, but currently ONLY 2-3 SUBSETS to save RAM.
	int get_precomputed_texel_partition(uint32_t block_width, uint32_t block_height, uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions);

	void blue_contract(
		int r, int g, int b, int a,
		int& dr, int& dg, int& db, int& da);

	void bit_transfer_signed(int& a, int& b);

	void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t* pE);

	typedef uint16_t half_float;
	half_float float_to_half(float val, bool toward_zero);
	float half_to_float(half_float hval);

	// Converts a 16-bit qlog value (5-bit exponent in the top bits, 11-bit mantissa below) to half float bits.
	// The mantissa is remapped by one of three linear segments and then reduced from 11 to 10 bits.
	// Notes:
	// qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless)
	// However, this is not lossless in the general sense.
	inline half_float qlog16_to_half(int k)
	{
		assert((k >= 0) && (k <= 0xFFFF));

		const int exponent = (k >> 11) & 0x1F;
		const int mantissa = k & 0x7FF;

		int remapped;
		if (mantissa < 512)
			remapped = 3 * mantissa;
		else if (mantissa < 1536)
			remapped = 4 * mantissa - 512;
		else
			remapped = 5 * mantissa - 2048;

		return static_cast<half_float>((exponent << 10) + (remapped >> 3));
	}

	const int MAX_RGB9E5 = 0xff80;
	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b);
	uint32_t pack_rgb9e5(float r, float g, float b);

	// Output pixel format selector passed as the dec_mode argument of decode_block() and decode_block_xuastc_ldr().
	enum decode_mode
	{
		cDecodeModeSRGB8 = 0,	// returns uint8_t's, not valid on HDR blocks
		cDecodeModeLDR8 = 1,	// returns uint8_t's, not valid on HDR blocks
		cDecodeModeHDR16 = 2,   // returns uint16_t's (half floats), valid on all LDR/HDR blocks
		cDecodeModeRGB9E5 = 3	// returns uint32_t's, packed as RGB 9E5 (shared exponent), see https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
	};

	// Decodes logical block to output pixels.
	// pPixels must point to either 32-bit pixel values (SRGB8/LDR8/9E5) or 64-bit pixel values (HDR16)
	bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode);

	// Assuming the ASTC logical block is valid, this checks for the extra XUASTC LDR constraints.
	bool is_block_xuastc_ldr(const log_astc_block& log_blk);

	// XUASTC LDR only - primary assumption is the logical block comes directly from our supercompressor. DO NOT call on general ASTC blocks.
	bool decode_block_xuastc_ldr(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode, const uint8_t* pUpsampled_weights_to_use = nullptr, uint32_t start_x = 0, uint32_t start_y = 0, uint32_t end_x = 0, uint32_t end_y = 0);

	void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t *pBits128, uint32_t bit_ofs);

	// Unpack a physical ASTC encoded GPU texture block to a logical block description.
	bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height);

	uint8_t& get_weight(log_astc_block& log_block, uint32_t plane_index, uint32_t idx);
	uint8_t get_weight(const log_astc_block& log_block, uint32_t plane_index, uint32_t idx);
	void extract_weights(const log_astc_block& log_block, uint8_t* pWeights, uint32_t plane_index);
	void set_weights(log_astc_block& log_block, const uint8_t* pWeights, uint32_t plane_index);
	uint32_t get_total_weights(const log_astc_block& log_block);

	uint8_t* get_endpoints(log_astc_block& log_block, uint32_t partition_index);
	const uint8_t* get_endpoints(const log_astc_block& log_block, uint32_t partition_index);

	const char* get_cem_name(uint32_t cem_index);
	bool cem_is_ldr_direct(uint32_t cem_index);
	bool cem_is_ldr_base_scale(uint32_t cem_index);
	bool cem_is_ldr_base_plus_ofs(uint32_t cem_index);

	bool cem_supports_bc(uint32_t cem);

	void bit_transfer_signed_dec(int& a, int& b);
	void bit_transfer_signed_enc(int& a, int& b);

	bool cem8_or_12_used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index);
	bool cem9_or_13_used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index);
	bool used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index);

	uint32_t get_base_cem_without_alpha(uint32_t cem);

	int apply_delta_to_bise_endpoint_val(uint32_t endpoint_ise_range, int ise_val, int delta);

	// index range: [0,NUM_ASTC_BLOCK_SIZES-1]
	void get_astc_block_size_by_index(uint32_t index, uint32_t& width, uint32_t& height);

	// -1 if invalid
	int find_astc_block_size_index(uint32_t width, uint32_t height);

	// 8-bit linear8 or sRGB8, le/he are [0,255], w is [0,64]
	// Each endpoint is widened to 16 bits before blending: the low byte is 0x80 for sRGB decode, or a copy of the
	// endpoint itself for linear decode. The top 8 bits of the blended 16-bit value are returned.
	inline int channel_interpolate(int le, int he, int w, bool astc_srgb_decode)
	{
		assert((w >= 0) && (w <= 64));
		assert((le >= 0) && (le <= 255));
		assert((he >= 0) && (he <= 255));

		const int le16 = (le << 8) | (astc_srgb_decode ? 0x80 : le);
		const int he16 = (he << 8) | (astc_srgb_decode ? 0x80 : he);

		return astc_helpers::weight_interpolate(le16, he16, w) >> 8;
	}

} // namespace astc_helpers

#endif // BASISU_ASTC_HELPERS_HEADER

//------------------------------------------------------------------

#ifdef BASISU_ASTC_HELPERS_IMPLEMENTATION

namespace astc_helpers
{
	// Returns a when a < b, otherwise b.
	template<typename T> inline T my_min(T a, T b)
	{
		if (a < b)
			return a;
		return b;
	}

	// Returns a when a > b, otherwise b.
	template<typename T> inline T my_max(T a, T b)
	{
		if (a > b)
			return a;
		return b;
	}

	// The 14 ASTC block dimensions, one pair per row, in the same order as the cBLOCK_SIZE_* enumerators
	// (row 0 is 4x4, row 13 is 12x12).
	const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2] = {
		{ 4, 4 }, { 5, 4 }, { 5, 5 }, { 6, 5 },
		{ 6, 6 }, { 8, 5 }, { 8, 6 }, { 10, 5 },
		{ 10, 6 }, { 8, 8 }, { 10, 8 }, { 10, 10 },
		{ 12, 10 }, { 12, 12 }
	};

	// For each ISE range: the number of plain bits per value, and whether each value also carries a trit or a quint.
	// get_ise_levels() turns a row into a level count with (1 + 2*trit + 4*quint) << bits.
	const int8_t g_ise_range_table[TOTAL_ISE_RANGES][3] =
	{
		//b  t  q
		//2  3  5	 // rng  ise_index	notes
		{ 1, 0, 0 }, // 0..1 0
		{ 0, 1, 0 }, // 0..2 1
		{ 2, 0, 0 }, // 0..3 2
		{ 0, 0, 1 }, // 0..4 3
		{ 1, 1, 0 }, // 0..5 4			min endpoint ISE index
		{ 3, 0, 0 }, // 0..7 5
		{ 1, 0, 1 }, // 0..9 6
		{ 2, 1, 0 }, // 0..11 7
		{ 4, 0, 0 }, // 0..15 8
		{ 2, 0, 1 }, // 0..19 9
		{ 3, 1, 0 }, // 0..23 10
		{ 5, 0, 0 }, // 0..31 11		max weight ISE index
		{ 3, 0, 1 }, // 0..39 12
		{ 4, 1, 0 }, // 0..47 13
		{ 6, 0, 0 }, // 0..63 14
		{ 4, 0, 1 }, // 0..79 15
		{ 5, 1, 0 }, // 0..95 16
		{ 7, 0, 0 }, // 0..127 17
		{ 5, 0, 1 }, // 0..159 18
		{ 6, 1, 0 }, // 0..191 19
		{ 8, 0, 0 }, // 0..255 20
	};

	// ORs a code of up to 9 bits into the byte buffer behind pDst at bit_offset, then advances bit_offset by codesize.
	// At most two consecutive bytes are touched. A codesize of 0 writes nothing and leaves bit_offset unchanged.
	// The code is not masked here, so it must already fit in codesize bits.
	static inline void astc_set_bits_1_to_9(uint32_t* pDst, uint32_t& bit_offset, uint32_t code, uint32_t codesize)
	{
		assert(codesize <= 9);
		if (!codesize)
			return;

		uint8_t* const pBytes = reinterpret_cast<uint8_t*>(pDst);

		const uint32_t shift = bit_offset & 7;
		const uint32_t byte_index = bit_offset >> 3;
		const uint32_t shifted_code = code << shift;

		pBytes[byte_index] |= (uint8_t)shifted_code;

		// Spill into the next byte only when the code does not fit in what is left of this one
		const uint32_t bits_left_in_byte = 8 - shift;
		if (codesize > bits_left_in_byte)
			pBytes[byte_index + 1] |= (uint8_t)(shifted_code >> 8);

		bit_offset += codesize;
	}

	// Extracts the inclusive bit field [low, high] from bits and returns it right-aligned.
	// The mask is built with unsigned arithmetic: the previous signed (1 << width) - 1 overflowed for 31-bit
	// fields and was an invalid shift for 32-bit fields. A full 32-bit field is handled explicitly, like get_bits().
	static inline uint32_t astc_extract_bits(uint32_t bits, int low, int high)
	{
		const int num_bits = (high - low) + 1;
		assert((num_bits >= 0) && (num_bits <= 32));

		// 1u << 32 is not a valid shift, so the all-ones mask is special-cased
		const uint32_t mask = (num_bits >= 32) ? 0xFFFFFFFFu : ((1u << num_bits) - 1u);

		return (bits >> low) & mask;
	}

	// Writes bits to output in an endian safe way
	// The value is OR'd in one byte at a time, low bits first, starting at bit_pos; bit_pos is advanced by total_bits.
	// total_bits may be at most 31 and value must fit in total_bits bits.
	static inline void astc_set_bits(uint32_t* pOutput, uint32_t& bit_pos, uint32_t value, uint32_t total_bits)
	{
		assert(total_bits <= 31);
		assert(value < (1u << total_bits));

		uint8_t* pBytes = reinterpret_cast<uint8_t*>(pOutput);

		for (uint32_t bits_remaining = total_bits; bits_remaining != 0; )
		{
			const uint32_t shift = bit_pos & 7;

			// Never write past the end of the current byte
			const uint32_t chunk = my_min<int>(bits_remaining, 8 - shift);

			pBytes[bit_pos >> 3] |= static_cast<uint8_t>(value << shift);

			value >>= chunk;
			bits_remaining -= chunk;
			bit_pos += chunk;
		}
	}

	// Maps three quints combined as q0 + 5*q1 + 25*q2 (0-124) to the 7-bit pattern that astc_encode_quints()
	// interleaves with the values' plain bits.
	static const uint8_t g_astc_quint_encode[125] =
	{
		0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57,
		58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104,
		105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54,
		126, 127, 94, 95, 62, 39, 47, 55, 63, 7 /*31 - results in the same decode as 7*/
	};

	// Encodes 3 values to output, usable for any range that uses quints and bits
	// n is the number of plain bits per value. Exactly 7 + 3*n bits are written at bit_pos, and the same amount
	// is added to *pStats when pStats is not null.
	static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t* pStats)
	{
		// Split each value into its quint (the part above the low n bits) and its low n bits
		const uint32_t low_mask = (1 << n) - 1;

		int low_bits[3];
		int combined_quints = 0;
		int place_value = 1;
		for (int i = 0; i < 3; i++)
		{
			const int quint = pValues[i] >> n;

			combined_quints += quint * place_value;
			low_bits[i] = pValues[i] & low_mask;

			place_value *= 5;
		}

		// Encode the quints, by inverting the bit manipulations done by the decoder, converting 3 quints into 7-bits.
		// See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding

		assert(combined_quints < 125);
		const int T = g_astc_quint_encode[combined_quints];

		// Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96.
		uint32_t encoded = low_bits[0];
		encoded |= astc_extract_bits(T, 0, 2) << n;
		encoded |= low_bits[1] << (3 + n);
		encoded |= astc_extract_bits(T, 3, 4) << (3 + n * 2);
		encoded |= low_bits[2] << (5 + n * 2);
		encoded |= astc_extract_bits(T, 5, 6) << (5 + n * 3);

		astc_set_bits(pOutput, bit_pos, encoded, 7 + n * 3);

		if (pStats)
			*pStats += n * 3 + 7;
	}

	// Maps five trits combined as t0 + 3*t1 + 9*t2 + 27*t3 + 81*t4 (0-242) to the 8-bit pattern that astc_encode_trits()
	// interleaves with the values' plain bits.
	static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39,
		43, 51, 55, 59, 44, 45, 46, 64, 65, 66, 68, 69, 70, 72, 73, 74, 80, 81, 82, 84, 85, 86, 88, 89, 90, 67, 71, 75, 83, 87, 91, 76, 77, 78, 128, 129, 130, 132, 133, 134, 136, 137, 138, 144, 145, 146, 148, 149, 150, 152, 153, 154,
		131, 135, 139, 147, 151, 155, 140, 141, 142, 160, 161, 162, 164, 165, 166, 168, 169, 170, 176, 177, 178, 180, 181, 182, 184, 185, 186, 163, 167, 171, 179, 183, 187, 172, 173, 174, 192, 193, 194, 196, 197, 198, 200, 201, 202,
		208, 209, 210, 212, 213, 214, 216, 217, 218, 195, 199, 203, 211, 215, 219, 204, 205, 206, 96, 97, 98, 100, 101, 102, 104, 105, 106, 112, 113, 114, 116, 117, 118, 120, 121, 122, 99, 103, 107, 115, 119, 123, 108, 109, 110, 224,
		225, 226, 228, 229, 230, 232, 233, 234, 240, 241, 242, 244, 245, 246, 248, 249, 250, 227, 231, 235, 243, 247, 251, 236, 237, 238, 28, 29, 30, 60, 61, 62, 92, 93, 94, 156, 157, 158, 188, 189, 190, 220, 221, 222, 31, 63, 95, 159,
		191, 223, 124, 125, 126 };

	// Encodes 5 values to output, usable for any range that uses trits and bits
	// n is the number of plain bits per value. Exactly 8 + 5*n bits are written at bit_pos (in two pieces), and the
	// same amount is added to *pStats when pStats is not null.
	static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t *pStats)
	{
		// Split each value into its trit (the part above the low n bits) and its low n bits
		const uint32_t low_mask = (1 << n) - 1;

		int low_bits[5];
		int combined_trits = 0;
		int place_value = 1;
		for (int i = 0; i < 5; i++)
		{
			const int trit = pValues[i] >> n;

			combined_trits += trit * place_value;
			low_bits[i] = pValues[i] & low_mask;

			place_value *= 3;
		}

		// Encode the trits, by inverting the bit manipulations done by the decoder, converting 5 trits into 8-bits.
		// See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding

		assert(combined_trits < 243);
		const int T = g_astc_trit_encode[combined_trits];

		// Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94.

		// First piece: values 0 and 1 plus trit bits 0-1
		uint32_t first_piece = low_bits[0];
		first_piece |= astc_extract_bits(T, 0, 1) << n;
		first_piece |= low_bits[1] << (2 + n);

		astc_set_bits(pOutput, bit_pos, first_piece, n * 2 + 2);

		// Second piece: values 2, 3 and 4 plus trit bits 2-7
		uint32_t second_piece = astc_extract_bits(T, 2, 3);
		second_piece |= low_bits[2] << 2;
		second_piece |= astc_extract_bits(T, 4, 4) << (2 + n);
		second_piece |= low_bits[3] << (3 + n);
		second_piece |= astc_extract_bits(T, 5, 6) << (3 + n * 2);
		second_piece |= low_bits[4] << (5 + n * 2);
		second_piece |= astc_extract_bits(T, 7, 7) << (5 + n * 3);

		astc_set_bits(pOutput, bit_pos, second_piece, n * 3 + 6);

		if (pStats)
			*pStats += n * 5 + 8;
	}

	// Packs values using ASTC's BISE to output buffer.
	// num_vals values from pSrc_vals are encoded with ISE range `range`, starting at bit offset bit_pos.
	// The bits are first written to a zeroed scratch buffer and then OR'd into pDst[0] through pDst[3],
	// so bits already present in pDst are preserved. If pStats is not null it is advanced by the bits written.
	void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats)
	{
		const int8_t* const pRange = g_ise_range_table[range];

		const int num_bits = pRange[0];

		// 5 values per group for trit ranges, 3 per group for quint ranges, 0 for ranges with plain bits only
		const int group_size = pRange[1] ? 5 : (pRange[2] ? 3 : 0);

#ifndef NDEBUG
		{
			// Every value must be a valid symbol for this range
			const uint32_t num_levels = get_ise_levels(range);
			for (int i = 0; i < num_vals; i++)
			{
				assert(pSrc_vals[i] < num_levels);
			}
		}
#endif

		uint32_t scratch[5] = { 0 };

		if (!group_size)
		{
			// Plain bits only - each value is written as-is
			for (int i = 0; i < num_vals; i++)
				astc_set_bits_1_to_9(scratch, bit_pos, pSrc_vals[i], num_bits);

			if (pStats)
				*pStats += num_vals * num_bits;
		}
		else
		{
			// Range has trits or quints - pack each group of 5 or 3 values
			const int total_groups = (num_vals + group_size - 1) / group_size;

			for (int group_index = 0; group_index < total_groups; group_index++)
			{
				const int first_val = group_index * group_size;
				const int vals_in_group = my_min(group_size, num_vals - first_val);

				// Missing values of an incomplete final group stay 0
				uint8_t group_vals[5] = { 0 };
				for (int i = 0; i < vals_in_group; i++)
					group_vals[i] = pSrc_vals[first_val + i];

				// Note this always writes a complete group of 5 or 3 values, even for incomplete groups. So it can write more than needed.
				// get_ise_sequence_bits() returns the # of bits that must be written for proper decoding.
				if (group_size == 5)
					astc_encode_trits(scratch, group_vals, bit_pos, num_bits, pStats);
				else
					astc_encode_quints(scratch, group_vals, bit_pos, num_bits, pStats);
			}
		}

		// Only the first four words (one 128-bit block) are merged into the destination
		for (int i = 0; i < 4; i++)
			pDst[i] |= scratch[i];
	}

	// Returns the input with the order of its 32 bits reversed (bit 0 <-> bit 31, bit 1 <-> bit 30, ...).
	inline uint32_t rev_dword(uint32_t bits)
	{
		uint32_t r = bits;

		// Swap progressively larger neighbouring groups; the swaps are independent, so their order doesn't matter.
		r = ((r >> 1) & 0x55555555) | ((r & 0x55555555) << 1);	// adjacent bits
		r = ((r >> 2) & 0x33333333) | ((r & 0x33333333) << 2);	// bit pairs
		r = ((r >> 4) & 0x0f0f0f0f) | ((r & 0x0f0f0f0f) << 4);	// nibbles
		r = ((r >> 8) & 0x00ff00ff) | ((r & 0x00ff00ff) << 8);	// bytes
		r = (r >> 16) | (r << 16);								// 16-bit halves

		return r;
	}

	// Returns true if value is non-negative and fits in an unsigned field of num_bits bits.
	static inline bool is_packable(int value, int num_bits)
	{
		assert((num_bits >= 1) && (num_bits < 31));

		if (value < 0)
			return false;

		return value < (1 << num_bits);
	}

	static bool get_config_bits(const log_astc_block &log_block, uint32_t &config_bits)
	{
		config_bits = 0;

		const int W = log_block.m_grid_width, H = log_block.m_grid_height;

		const uint32_t P = log_block.m_weight_ise_range >= 6; // high precision
		const uint32_t Dp_P = (log_block.m_dual_plane << 1) | P; // pack dual plane+high precision bits

		// See Tables 81-82
		// Compute p from weight range
		uint32_t p = 2 + log_block.m_weight_ise_range - (P ? 6 : 0);

		// Rearrange p's bits to p0 p2 p1
		p = (p >> 1) + ((p & 1) << 2);

		// Try encoding each row of table 82.

		// W+4 H+2
		if (is_packable(W - 4, 2) && is_packable(H - 2, 2))
		{
			config_bits = (Dp_P << 9) | ((W - 4) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | (p & 3);
			return true;
		}

		// W+8 H+2
		if (is_packable(W - 8, 2) && is_packable(H - 2, 2))
		{
			config_bits = (Dp_P << 9) | ((W - 8) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 4 | (p & 3);
			return true;
		}

		// W+2 H+8
		if (is_packable(W - 2, 2) && is_packable(H - 8, 2))
		{
			config_bits = (Dp_P << 9) | ((H - 8) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 8 | (p & 3);
			return true;
		}

		// W+2 H+6
		if (is_packable(W - 2, 2) && is_packable(H - 6, 1))
		{
			config_bits = (Dp_P << 9) | ((H - 6) << 7) | ((W - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3);
			return true;
		}

		// W+2 H+2
		if (is_packable(W - 2, 1) && is_packable(H - 2, 2))
		{
			config_bits = (Dp_P << 9) | ((W) << 7) | ((H - 2) << 5) | ((p & 4) << 2) | 12 | (p & 3);
			return true;
		}

		// 12 H+2
		if ((W == 12) && is_packable(H - 2, 2))
		{
			config_bits = (Dp_P << 9) | ((H - 2) << 5) | (p << 2);
			return true;
		}

		// W+2 12
		if ((H == 12) && is_packable(W - 2, 2))
		{
			config_bits = (Dp_P << 9) | (1 << 7) | ((W - 2) << 5) | (p << 2);
			return true;
		}

		// 6 10
		if ((W == 6) && (H == 10))
		{
			config_bits = (Dp_P << 9) | (3 << 7) | (p << 2);
			return true;
		}

		// 10 6
		if ((W == 10) && (H == 6))
		{
			config_bits = (Dp_P << 9) | (0b1101 << 5) | (p << 2);
			return true;
		}

		// W+6 H+6 (no dual plane or high prec)
		if ((!Dp_P) && is_packable(W - 6, 2) && is_packable(H - 6, 2))
		{
			config_bits = ((H - 6) << 9) | 256 | ((W - 6) << 5) | (p << 2);
			return true;
		}

		// Failed: unsupported weight grid dimensions or config.
		return false;
	}

	// Packs a logical ASTC block description (log_block) into a physical block (phys_block).
	//
	// phys_block is always zeroed first. Returns true on success and false if log_block fails any of the checks below;
	// on failure phys_block may have been partially written (the header fields are emitted before the later checks run).
	//
	// pExpected_endpoint_range (optional): set to -1 on entry. If the block is otherwise valid but log_block.m_endpoint_ise_range
	//   is not the range computed from the remaining bit budget, the computed range is written here and false is returned.
	// pStats (optional): receives running totals of header/endpoint/weight bits.
	// validate_flags:
	//   cValidateEarlyOutAtEndpointISEChecks - return true right before the endpoint ISE range is computed/checked.
	//   cValidateSkipFinalEndpointWeightPacking - return true after all checks, without packing the endpoints or weights.
	bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range, pack_stats *pStats, uint32_t validate_flags)
	{
		// Basic sanity checking
		if (!log_block.m_dual_plane)
		{
			assert(log_block.m_color_component_selector == 0);
		}
		else
		{
			assert(log_block.m_color_component_selector <= 3);
		}

		memset(&phys_block, 0, sizeof(phys_block));

		if (pExpected_endpoint_range)
			*pExpected_endpoint_range = -1;

		// Asserts in debug builds, fails gracefully in release builds.
		assert(!log_block.m_error_flag);
		if (log_block.m_error_flag)
			return false;

		// Solid color blocks are packed as void-extent blocks; none of the checks below apply to them.
		if (log_block.m_solid_color_flag_ldr)
		{
			pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats);
			return true;
		}
		else if (log_block.m_solid_color_flag_hdr)
		{
			pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats);
			return true;
		}

		if ((log_block.m_num_partitions < 1) || (log_block.m_num_partitions > MAX_PARTITIONS))
			return false;

		// Max usable weight range is 11
		if (log_block.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE)
			return false;

		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
		if ((log_block.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_block.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE))
			return false;

		if (log_block.m_color_component_selector > 3)
			return false;

		// TODO: sanity check grid width/height vs. block's physical width/height

		// 11-bit block mode: weight grid dimensions, weight range, dual plane flag.
		uint32_t config_bits = 0;
		if (!get_config_bits(log_block, config_bits))
			return false;

		// bit_pos tracks the forward (low to high) write position; it is passed to astc_set_bits() for every forward-written field.
		uint32_t bit_pos = 0;
		astc_set_bits(&phys_block.m_vals[0], bit_pos, config_bits, 11);
		if (pStats)
			pStats->m_header_bits += 11;

		const uint32_t total_grid_weights = (log_block.m_dual_plane ? 2 : 1) * (log_block.m_grid_width * log_block.m_grid_height);
		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);

		// 18.24 Illegal Encodings
		if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96))
			return false;

		// Number of config bits stored just below the weight data (extra CEM bits and/or the dual plane component selector).
		uint32_t total_extra_bits = 0;

		astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_num_partitions - 1, 2);
		if (pStats)
			pStats->m_header_bits += 2;

		if (log_block.m_num_partitions > 1)
		{
			if (log_block.m_partition_id >= NUM_PARTITION_PATTERNS)
				return false;

			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_partition_id, 10);
			if (pStats)
				pStats->m_header_bits += 10;

			uint32_t highest_cem = 0, lowest_cem = UINT32_MAX;
			for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
			{
				highest_cem = my_max<uint32_t>(highest_cem, log_block.m_color_endpoint_modes[j]);
				lowest_cem = my_min<uint32_t>(lowest_cem, log_block.m_color_endpoint_modes[j]);
			}

			if (highest_cem > 15)
				return false;

			// Ensure CEM range is contiguous (the CEM classes, i.e. cem >> 2, may differ by at most 1 across partitions)
			if (((highest_cem >> 2) > (1 + (lowest_cem >> 2))))
				return false;

			// See tables 79/80
			// Default: all partitions share one CEM, stored in the upper 4 bits of the 6-bit field (low 2 bits are 00).
			uint32_t encoded_cem = log_block.m_color_endpoint_modes[0] << 2;
			if (lowest_cem != highest_cem)
			{
				// Partitions use different CEM's: the low 2 bits select the base class, followed by one class bit (C) and two mode bits (M) per partition.
				encoded_cem = my_min<uint32_t>(3, 1 + (lowest_cem >> 2));

				// See tables at 23.11 Color Endpoint Mode
				for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
				{
					const int M = log_block.m_color_endpoint_modes[j] & 3;

					// C is this partition's class relative to the base class, and must be 0 or 1.
					const int C = (log_block.m_color_endpoint_modes[j] >> 2) - ((encoded_cem & 3) - 1);
					if ((C & 1) != C)
						return false;

					encoded_cem |= (C << (2 + j)) | (M << (2 + log_block.m_num_partitions + 2 * j));
				}

				// The encoded CEM's occupy 2 + 3*num_partitions bits; whatever doesn't fit into the 6-bit field goes below the weights.
				total_extra_bits = 3 * log_block.m_num_partitions - 4;

				if ((total_weight_bits + total_extra_bits) > 128)
					return false;

				uint32_t cem_bit_pos = 128 - total_weight_bits - total_extra_bits;
				astc_set_bits(&phys_block.m_vals[0], cem_bit_pos, encoded_cem >> 6, total_extra_bits);
				if (pStats)
					pStats->m_header_bits += total_extra_bits;
			}

			astc_set_bits(&phys_block.m_vals[0], bit_pos, encoded_cem & 0x3f, 6);
			if (pStats)
				pStats->m_header_bits += 6;
		}
		else
		{
			// Single partition: the partition ID must be 0 and the CEM is stored directly in 4 bits.
			if (log_block.m_partition_id)
				return false;
			if (log_block.m_color_endpoint_modes[0] > 15)
				return false;

			astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_color_endpoint_modes[0], 4);
			if (pStats)
				pStats->m_header_bits += 4;
		}

		if (log_block.m_dual_plane)
		{
			// Dual plane is rejected with more than 3 partitions.
			if (log_block.m_num_partitions > 3)
				return false;

			// The 2-bit color component selector is stored below the weights and below any extra CEM bits.
			total_extra_bits += 2;

			uint32_t ccs_bit_pos = 128 - (int)total_weight_bits - (int)total_extra_bits;
			astc_set_bits(&phys_block.m_vals[0], ccs_bit_pos, log_block.m_color_component_selector, 2);
			if (pStats)
				pStats->m_header_bits += 2;
		}

		// Whatever is left between the forward-written header and the weights/extra bits is the endpoint bit budget.
		const uint32_t total_config_bits = bit_pos + total_extra_bits;
		const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits;
		if (num_remaining_bits < 0)
			return false;

		// Each CEM needs 2 + 2*class endpoint values.
		uint32_t total_cem_vals = 0;
		for (uint32_t j = 0; j < log_block.m_num_partitions; j++)
			total_cem_vals += 2 + 2 * (log_block.m_color_endpoint_modes[j] >> 2);

		if (total_cem_vals > MAX_ENDPOINTS)
			return false;

		if (validate_flags & cValidateEarlyOutAtEndpointISEChecks)
			return true;

		// Find the largest endpoint ISE range whose encoded size fits in the remaining bits.
		int endpoint_ise_range = -1;
		for (int k = 20; k > 0; k--)
		{
			int bits = get_ise_sequence_bits(total_cem_vals, k);
			if (bits <= num_remaining_bits)
			{
				endpoint_ise_range = k;
				break;
			}
		}

		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
			return false;

		// Ensure the caller utilized the right endpoint ISE range.
		if ((int)log_block.m_endpoint_ise_range != endpoint_ise_range)
		{
			if (pExpected_endpoint_range)
				*pExpected_endpoint_range = endpoint_ise_range;
			return false;
		}

		if (pStats)
		{
			pStats->m_endpoint_bits += get_ise_sequence_bits(total_cem_vals, endpoint_ise_range);
			pStats->m_weight_bits += get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range);
		}

		if (validate_flags & cValidateSkipFinalEndpointWeightPacking)
			return true;

		// Pack endpoints forwards
		encode_bise(&phys_block.m_vals[0], log_block.m_endpoints, bit_pos, total_cem_vals, endpoint_ise_range);

		// Pack weights backwards
		// (encoded forwards into a temp buffer, then each dword is bit-reversed and the dword order is flipped, so the weights fill the block from the top bit downwards)
		uint32_t weight_data[4] = { 0 };
		encode_bise(weight_data, log_block.m_weights, 0, total_grid_weights, log_block.m_weight_ise_range);

		for (uint32_t i = 0; i < 4; i++)
			phys_block.m_vals[i] |= rev_dword(weight_data[3 - i]);

		return true;
	}

	// Expands a num_src_bits wide value to num_dst_bits by repeating its bit pattern from the top of the destination
	// downwards; the final (lowest) copy is truncated to its high bits when it doesn't fit entirely.
	static inline uint32_t bit_replication_scale(uint32_t src, int num_src_bits, int num_dst_bits)
	{
		assert(num_src_bits <= num_dst_bits);
		assert((src & ((1 << num_src_bits) - 1)) == src);

		uint32_t result = 0;

		// pos is the bit position of the current copy's LSB; a negative pos means the copy hangs off the bottom.
		for (int pos = num_dst_bits - num_src_bits; pos > -num_src_bits; pos -= num_src_bits)
		{
			if (pos < 0)
				result |= src >> -pos;
			else
				result |= src << pos;
		}

		return result;
	}

	// Dequantizes a BISE endpoint symbol (val) of the given endpoint ISE range to its 8-bit value.
	// Bit-only ranges are expanded by bit replication; ranges with a trit or quint use the A/B/C/D formula below.
	// Returns 0 for an ISE range outside the handled set (which also asserts in debug builds).
	uint32_t dequant_bise_endpoint(uint32_t val, uint32_t ise_range)
	{
		assert((ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE));
		assert(val < get_ise_levels(ise_range));

		switch (ise_range)
		{
		// Bit-only ranges
		case 5:  return bit_replication_scale(val, 3, 8);
		case 8:  return bit_replication_scale(val, 4, 8);
		case 11: return bit_replication_scale(val, 5, 8);
		case 14: return bit_replication_scale(val, 6, 8);
		case 17: return bit_replication_scale(val, 7, 8);
		case 20: return val; // already 8 bits

		// Ranges with a trit or quint: handled after the switch
		case 4: case 6: case 7: case 9: case 10: case 12:
		case 13: case 15: case 16: case 18: case 19:
			break;

		default:
			assert(0);
			return 0;
		}

		const uint32_t num_bits = g_ise_range_table[ise_range][0];
		const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits);
		const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints);

		// compute Table 103 row index
		const int row = (num_bits * 2 + (num_quints ? 1 : 0)) - 2;
		assert(row >= 0 && row <= 10);

		// Split the symbol into its plain bits and its trit/quint (D)
		const uint32_t low = val & ((1 << num_bits) - 1);
		const uint32_t D = val >> num_bits;
		assert(D < (num_trits ? 3U : 5U));

		const uint32_t a = low & 1;
		const uint32_t b = (low >> 1) & 1;
		const uint32_t c = (low >> 2) & 1;
		const uint32_t d = (low >> 3) & 1;
		const uint32_t e = (low >> 4) & 1;
		const uint32_t f = (low >> 5) & 1;

		// A: the LSB replicated across 9 bits
		const uint32_t A = a ? 511 : 0;

		// B: the remaining plain bits scattered per row (bit pattern shown for positions 876543210). Rows 0 and 1 have B=0.
		uint32_t B = 0;
		switch (row)
		{
		case 2:  B = (b << 1) | (b << 2) | (b << 4) | (b << 8); break;						// b000b0bb0
		case 3:  B = (b << 2) | (b << 3) | (b << 8); break;								// b0000bb00
		case 4:  B = b | (c << 1) | (b << 2) | (c << 3) | (b << 7) | (c << 8); break;		// cb000cbcb
		case 5:  B = c | (b << 1) | (c << 2) | (b << 7) | (c << 8); break;					// cb0000cbc
		case 6:  B = b | (c << 1) | (d << 2) | (b << 6) | (c << 7) | (d << 8); break;		// dcb000dcb
		case 7:  B = c | (d << 1) | (b << 6) | (c << 7) | (d << 8); break;					// dcb0000dc
		case 8:  B = d | (e << 1) | (b << 5) | (c << 6) | (d << 7) | (e << 8); break;		// edcb000ed
		case 9:  B = e | (b << 5) | (c << 6) | (d << 7) | (e << 8); break;					// edcb0000e
		case 10: B = f | (b << 4) | (c << 5) | (d << 6) | (e << 7) | (f << 8); break;		// fedcb000f
		default: break;
		}

		// C: per-row multiplier applied to the trit/quint
		static const uint8_t s_C[11] = { 204, 113, 93, 54, 44, 26, 22, 13, 11, 6, 5 };
		const uint32_t C = s_C[row];

		uint32_t t = D * C + B;
		t ^= A;

		return (A & 0x80) | (t >> 2);
	}

	// Dequantizes a BISE weight symbol (val) of the given weight ISE range to a [0,64] weight.
	// Each symbol is first mapped to [0,63]; results above 32 are then bumped by one so the top of the range lands on 64.
	uint32_t dequant_bise_weight(uint32_t val, uint32_t ise_range)
	{
		assert(val < get_ise_levels(ise_range));

		uint32_t w = 0;

		switch (ise_range)
		{
		case 0: // 0-1
			w = val ? 63 : 0;
			break;

		case 1: // 0-2 (single trit)
		{
			const uint8_t trit_levels[3] = { 0, 32, 63 };
			w = trit_levels[val];
			break;
		}

		case 3: // 0-4 (single quint)
		{
			const uint8_t quint_levels[5] = { 0, 16, 32, 47, 63 };
			w = quint_levels[val];
			break;
		}

		// Bit-only ranges: replicate up to 6 bits
		case 2:  w = bit_replication_scale(val, 2, 6); break; // 0-3
		case 5:  w = bit_replication_scale(val, 3, 6); break; // 0-7
		case 8:  w = bit_replication_scale(val, 4, 6); break; // 0-15
		case 11: w = bit_replication_scale(val, 5, 6); break; // 0-31

		case 4: // 0-5
		case 6: // 0-9
		case 7: // 0-11
		case 9: // 0-19
		case 10: // 0-23
		{
			const uint32_t num_bits = g_ise_range_table[ise_range][0];
			const uint32_t num_trits = g_ise_range_table[ise_range][1]; BASISU_NOTE_UNUSED(num_trits);
			const uint32_t num_quints = g_ise_range_table[ise_range][2]; BASISU_NOTE_UNUSED(num_quints);

			// compute Table 103 row index
			const int row = num_bits * 2 + (num_quints ? 1 : 0);

			// Split the symbol into its plain bits and its trit/quint (D)
			const uint32_t low = val & ((1u << num_bits) - 1);
			const uint32_t D = val >> num_bits;

			assert(D < (num_trits ? 3U : 5U));

			// See Table 103. ASTC weight unquantization parameters
			static const uint32_t s_C[5] = { 50, 28, 23, 13, 11 };

			const uint32_t a = low & 1;
			const uint32_t b = (low >> 1) & 1;
			const uint32_t c = (low >> 2) & 1;

			// A: the LSB replicated across 7 bits
			const uint32_t A = a ? 0x7F : 0;

			// B: the remaining plain bits scattered per row (rows 2 and 3 have B=0)
			uint32_t B = 0;
			switch (row)
			{
			case 4: B = (b << 6) | (b << 2) | (b << 0); break;
			case 5: B = (b << 6) | (b << 1); break;
			case 6: B = (c << 6) | (b << 5) | (c << 1) | (b << 0); break;
			default: break;
			}

			const uint32_t t = (D * s_C[row - 2] + B) ^ A;
			w = (A & 0x20) | (t >> 2);
			break;
		}

		default:
			assert(0);
			break;
		}

		// Expand [0,63] to [0,64]
		if (w > 32)
			w++;

		return w;
	}

	// Returns the ISE symbol whose dequantized endpoint value is closest to v (a [0,255] endpoint value).
	// On ties the lowest symbol index wins. Linear scan over all levels of the range.
	uint32_t find_nearest_bise_endpoint(int v, uint32_t ise_range)
	{
		assert(ise_range >= FIRST_VALID_ENDPOINT_ISE_RANGE && ise_range <= LAST_VALID_ENDPOINT_ISE_RANGE);

		const uint32_t num_levels = get_ise_levels(ise_range);

		int nearest_sym = 0;
		int nearest_err = INT_MAX;

		// Stop as soon as an exact match has been found.
		for (uint32_t sym = 0; (sym < num_levels) && (nearest_err != 0); sym++)
		{
			const int dequantized = dequant_bise_endpoint(sym, ise_range);
			const int err = (int)labs(v - dequantized);

			if (err < nearest_err)
			{
				nearest_sym = sym;
				nearest_err = err;
			}
		}

		return nearest_sym;
	}

	// Returns the ISE symbol whose dequantized weight is closest to v (a [0,64] weight value).
	// On ties the lowest symbol index wins. Linear scan over all levels of the range.
	uint32_t find_nearest_bise_weight(int v, uint32_t ise_range)
	{
		assert(ise_range >= FIRST_VALID_WEIGHT_ISE_RANGE && ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);
		assert(v <= (int)MAX_WEIGHT_INTERPOLANT_VALUE);

		const uint32_t num_levels = get_ise_levels(ise_range);

		int nearest_sym = 0;
		int nearest_err = INT_MAX;

		// Stop as soon as an exact match has been found.
		for (uint32_t sym = 0; (sym < num_levels) && (nearest_err != 0); sym++)
		{
			const int dequantized = dequant_bise_weight(sym, ise_range);
			const int err = (int)labs(v - dequantized);

			if (err < nearest_err)
			{
				nearest_sym = sym;
				nearest_err = err;
			}
		}

		return nearest_sym;
	}

	// Builds the lookup tables used to convert between values, ISE symbols and rank order for one ISE range.
	// Any of the output pointers may be null, in which case that table is skipped.
	void create_quant_tables(
		uint8_t* pVal_to_ise,	// [0-255] or [0-64] value to nearest ISE symbol, array size is [256] or [65]
		uint8_t* pISE_to_val,	// ASTC encoded ISE symbol to [0,255] or [0,64] value, [levels]
		uint8_t* pISE_to_rank,	// returns the level rank index given an ISE symbol, [levels]
		uint8_t* pRank_to_ISE,  // returns the ISE symbol given a level rank, inverse of pISE_to_rank, [levels]
		uint32_t ise_range,		// ise range, [4,20] for endpoints, [0,11] for weights
		bool weight_flag)		// false if block endpoints, true if weights
	{
		// Value <-> symbol tables: walk every possible value and record its nearest symbol.
		// pISE_to_val is filled through the symbols found this way.
		const uint32_t value_count = weight_flag ? (MAX_WEIGHT_INTERPOLANT_VALUE + 1) : 256;

		for (uint32_t v = 0; v < value_count; v++)
		{
			uint32_t sym;
			if (weight_flag)
				sym = astc_helpers::find_nearest_bise_weight(v, ise_range);
			else
				sym = astc_helpers::find_nearest_bise_endpoint(v, ise_range);

			if (pVal_to_ise)
				pVal_to_ise[v] = (uint8_t)sym;

			if (pISE_to_val)
			{
				if (weight_flag)
					pISE_to_val[sym] = (uint8_t)astc_helpers::dequant_bise_weight(sym, ise_range);
				else
					pISE_to_val[sym] = (uint8_t)astc_helpers::dequant_bise_endpoint(sym, ise_range);
			}
		}

		// Rank tables
		if (!pISE_to_rank && !pRank_to_ISE)
			return;

		const uint32_t num_levels = get_ise_levels(ise_range);

		const bool bits_only = !g_ise_range_table[ise_range][1] && !g_ise_range_table[ise_range][2];
		if (bits_only)
		{
			// Only bits: the rank tables are the identity mapping.
			for (uint32_t sym = 0; sym < num_levels; sym++)
			{
				if (pISE_to_rank)
					pISE_to_rank[sym] = (uint8_t)sym;

				if (pRank_to_ISE)
					pRank_to_ISE[sym] = (uint8_t)sym;
			}
			return;
		}

		// Range has trits or quints: order the symbols by their dequantized value.
		// Each key packs the dequantized value in the high bits and the ISE symbol in the low bits.
		uint32_t keys[256];
		for (uint32_t sym = 0; sym < num_levels; sym++)
		{
			const uint32_t dequantized = weight_flag ? astc_helpers::dequant_bise_weight(sym, ise_range) : astc_helpers::dequant_bise_endpoint(sym, ise_range);

			keys[sym] = (dequantized << 16) | sym;
		}

		// Sorts by dequantized value
		std::sort(keys, keys + num_levels);

		for (uint32_t rank = 0; rank < num_levels; rank++)
		{
			const uint32_t sym = (uint8_t)keys[rank];

			if (pISE_to_rank)
				pISE_to_rank[sym] = (uint8_t)rank;

			if (pRank_to_ISE)
				pRank_to_ISE[rank] = (uint8_t)sym;
		}
	}

	// Writes a 16-byte LDR void-extent (solid color) block.
	// Bytes 0-1 hold the header, bytes 2-7 are all 1's, and bytes 8-15 hold the four 16-bit components in R,G,B,A order, low byte first.
	void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats)
	{
		uint8_t* pBytes = (uint8_t*)&blk.m_vals[0];

		// Header
		pBytes[0] = 0b11111100;
		pBytes[1] = 0b11111101;

		// Remainder of the first 8 bytes is all 1's
		memset(pBytes + 2, 0xFF, 6);

		// Color components, little endian
		const uint16_t comps[4] = { rh, gh, bh, ah };
		for (uint32_t c = 0; c < 4; c++)
		{
			pBytes[8 + c * 2] = (uint8_t)comps[c];
			pBytes[9 + c * 2] = (uint8_t)(comps[c] >> 8);
		}

		if (pStats)
			pStats->m_header_bits += 16 + 64;
	}

	// Writes a 16-byte HDR void-extent (solid color) block. rh-ah are half-floats.
	// Byte 0 holds the header, bytes 1-7 are all 1's, and bytes 8-15 hold the four half-float components in R,G,B,A order, low byte first.
	void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats *pStats)
	{
		uint8_t* pBytes = (uint8_t*)&blk.m_vals[0];

		// Header
		pBytes[0] = 0b11111100;

		// Remainder of the first 8 bytes is all 1's
		memset(pBytes + 1, 0xFF, 7);

		// Color components, little endian
		const uint16_t comps[4] = { rh, gh, bh, ah };
		for (uint32_t c = 0; c < 4; c++)
		{
			pBytes[8 + c * 2] = (uint8_t)comps[c];
			pBytes[9 + c * 2] = (uint8_t)(comps[c] >> 8);
		}

		if (pStats)
			pStats->m_header_bits += 8  + 64;
	}

	// Returns true if mode is one of the LDR color endpoint modes.
	bool is_cem_ldr(uint32_t mode)
	{
		return
			(mode == CEM_LDR_LUM_DIRECT) ||
			(mode == CEM_LDR_LUM_BASE_PLUS_OFS) ||
			(mode == CEM_LDR_LUM_ALPHA_DIRECT) ||
			(mode == CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS) ||
			(mode == CEM_LDR_RGB_BASE_SCALE) ||
			(mode == CEM_LDR_RGB_DIRECT) ||
			(mode == CEM_LDR_RGB_BASE_PLUS_OFFSET) ||
			(mode == CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A) ||
			(mode == CEM_LDR_RGBA_DIRECT) ||
			(mode == CEM_LDR_RGBA_BASE_PLUS_OFFSET);
	}

	// Returns true if mode is one of the color endpoint modes (LDR or HDR) that carries alpha.
	bool does_cem_have_alpha(uint32_t mode)
	{
		return
			(mode == CEM_LDR_LUM_ALPHA_DIRECT) ||
			(mode == CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS) ||
			(mode == CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A) ||
			(mode == CEM_LDR_RGBA_DIRECT) ||
			(mode == CEM_LDR_RGBA_BASE_PLUS_OFFSET) ||
			(mode == CEM_HDR_RGB_LDR_ALPHA) ||
			(mode == CEM_HDR_RGB_HDR_ALPHA);
	}

	// Returns true if w x h is one of the 14 block sizes accepted here:
	// 4x4, 5x4, 5x5, 6x5, 6x6, 8x5, 8x6, 10x5, 10x6, 8x8, 10x8, 10x10, 12x10, 12x12.
	bool is_valid_block_size(uint32_t w, uint32_t h)
	{
		// Each width only pairs with a handful of heights.
		switch (w)
		{
		case 4:  return (h == 4);
		case 5:  return (h == 4) || (h == 5);
		case 6:  return (h == 5) || (h == 6);
		case 8:  return (h == 5) || (h == 6) || (h == 8);
		case 10: return (h == 5) || (h == 6) || (h == 8) || (h == 10);
		case 12: return (h == 10) || (h == 12);
		default: break;
		}

		return false;
	}

	// Maps a valid block size to its cBLOCK_SIZE_* index.
	// Every valid block size has a unique texel count, so the lookup is keyed on w*h alone.
	// Asserts and falls back to cBLOCK_SIZE_4x4 if the texel count isn't recognized.
	uint32_t get_block_size_index(uint32_t w, uint32_t h)
	{
		assert(is_valid_block_size(w, h));

		switch (w * h)
		{
		case 16:  return cBLOCK_SIZE_4x4;
		case 20:  return cBLOCK_SIZE_5x4;
		case 25:  return cBLOCK_SIZE_5x5;
		case 30:  return cBLOCK_SIZE_6x5;
		case 36:  return cBLOCK_SIZE_6x6;
		case 40:  return cBLOCK_SIZE_8x5;
		case 48:  return cBLOCK_SIZE_8x6;
		case 50:  return cBLOCK_SIZE_10x5;
		case 60:  return cBLOCK_SIZE_10x6;
		case 64:  return cBLOCK_SIZE_8x8;
		case 80:  return cBLOCK_SIZE_10x8;
		case 100: return cBLOCK_SIZE_10x10;
		case 120: return cBLOCK_SIZE_12x10;
		case 144: return cBLOCK_SIZE_12x12;
		default:  break;
		}

		assert(0);
		return cBLOCK_SIZE_4x4;
	}

	// Returns the standard ASTC bitrate (bits per texel) for a valid block size from the ASTC spec.
	// Returns 0 for an invalid block size.
	float get_bitrate_from_block_size(uint32_t w, uint32_t h)
	{
		struct size_rate
		{
			uint8_t m_w, m_h;
			float m_bpp;
		};

		static const size_rate s_rates[] =
		{
			{ 4, 4, 8.0f },
			{ 5, 4, 6.4f },

			{ 5, 5, 5.12f },

			{ 6, 5, 4.27f },
			{ 6, 6, 3.56f },

			{ 8, 5, 3.20f },
			{ 8, 6, 2.67f },
			{ 10, 5, 2.56f },
			{ 10, 6, 2.13f },

			{ 8, 8, 2.00f },
			{ 10, 8, 1.60f },
			{ 10, 10, 1.28f },

			{ 12, 10, 1.07f },
			{ 12, 12, .89f }
		};

		for (const size_rate& entry : s_rates)
		{
			if ((w == entry.m_w) && (h == entry.m_h))
				return entry.m_bpp;
		}

		return 0.0f;
	}

	// Returns true if at least one of the block's partitions uses an HDR color endpoint mode.
	bool block_has_any_hdr_cems(const log_astc_block& log_blk)
	{
		assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS));

		bool found_hdr = false;

		for (uint32_t part = 0; (part < log_blk.m_num_partitions) && !found_hdr; part++)
		{
			if (is_cem_hdr(log_blk.m_color_endpoint_modes[part]))
				found_hdr = true;
		}

		return found_hdr;
	}

	// Returns true if at least one of the block's partitions uses a color endpoint mode that is not HDR.
	bool block_has_any_ldr_cems(const log_astc_block& log_blk)
	{
		assert((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= MAX_PARTITIONS));

		bool found_ldr = false;

		for (uint32_t part = 0; (part < log_blk.m_num_partitions) && !found_ldr; part++)
		{
			if (!is_cem_hdr(log_blk.m_color_endpoint_modes[part]))
				found_ldr = true;
		}

		return found_ldr;
	}

	// Global dequantization tables; populated by init_tables().
	dequant_tables g_dequant_tables;

	// Forward declaration: init_tables() calls this before any definition of it has been seen in this file.
	void precompute_texel_partitions();

	// One-time library initialization: initializes the global dequantization tables, then precomputes the texel partition table.
	// TODO: this is called twice when using the encoder, first init_rank_tabs=false then init_rank_tabs=true.
	// NOTE(review): the TODO above mentions an init_rank_tabs parameter that this signature doesn't have - it looks stale, confirm and update.
	void init_tables()
	{
		g_dequant_tables.init();

		precompute_texel_partitions();
	}

	void compute_upsample_weights(
		int block_width, int block_height,
		int weight_grid_width, int weight_grid_height,
		weighted_sample* pWeights) // there will be block_width * block_height bilinear samples
	{
		const uint32_t scaleX = (1024 + block_width / 2) / (block_width - 1);
		const uint32_t scaleY = (1024 + block_height / 2) / (block_height - 1);

		for (int texelY = 0; texelY < block_height; texelY++)
		{
			for (int texelX = 0; texelX < block_width; texelX++)
			{
				const uint32_t gX = (scaleX * texelX * (weight_grid_width - 1) + 32) >> 6;
				const uint32_t gY = (scaleY * texelY * (weight_grid_height - 1) + 32) >> 6;
				const uint32_t jX = gX >> 4;
				const uint32_t jY = gY >> 4;
				const uint32_t fX = gX & 0xf;
				const uint32_t fY = gY & 0xf;
				const uint32_t w11 = (fX * fY + 8) >> 4;
				const uint32_t w10 = fY - w11;
				const uint32_t w01 = fX - w11;
				const uint32_t w00 = 16 - fX - fY + w11;

				weighted_sample& s = pWeights[texelX + texelY * block_width];
				s.m_src_x = (uint8_t)jX;
				s.m_src_y = (uint8_t)jY;
				s.m_weights[0][0] = (uint8_t)w00;
				s.m_weights[0][1] = (uint8_t)w01;
				s.m_weights[1][0] = (uint8_t)w10;
				s.m_weights[1][1] = (uint8_t)w11;
			}
		}
	}

	// Should be dequantized [0,64] weights
	void upsample_weight_grid(
		uint32_t bx, uint32_t by,		// destination/to dimension
		uint32_t wx, uint32_t wy,		// source/from dimension
		const uint8_t* pSrc_weights,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
		uint8_t* pDst_weights)			// [by][bx]
	{
		assert((bx >= 2) && (by >= 2) && (bx <= 12) && (by <= 12));
		assert((wx >= 2) && (wy >= 2) && (wx <= bx) && (wy <= by));

		const uint32_t total_src_weights = wx * wy;
		const uint32_t total_dst_weights = bx * by;

		if (total_src_weights == total_dst_weights)
		{
			assert((bx == wx) && (by == wy));

			memcpy(pDst_weights, pSrc_weights, total_src_weights);
			return;
		}

		weighted_sample weights[12 * 12];
		compute_upsample_weights(bx, by, wx, wy, weights);

		const weighted_sample* pS = weights;

		for (uint32_t y = 0; y < by; y++)
		{
			for (uint32_t x = 0; x < bx; x++, ++pS)
			{
				const uint32_t w00 = pS->m_weights[0][0];
				const uint32_t w01 = pS->m_weights[0][1];
				const uint32_t w10 = pS->m_weights[1][0];
				const uint32_t w11 = pS->m_weights[1][1];

				assert(w00 || w01 || w10 || w11);

				const uint32_t sx = pS->m_src_x, sy = pS->m_src_y;

				uint32_t total = 8;
				if (w00) total += pSrc_weights[bounds_check(sx + sy * wx, 0U, total_src_weights)] * w00;
				if (w01) total += pSrc_weights[bounds_check(sx + 1 + sy * wx, 0U, total_src_weights)] * w01;
				if (w10) total += pSrc_weights[bounds_check(sx + (sy + 1) * wx, 0U, total_src_weights)] * w10;
				if (w11) total += pSrc_weights[bounds_check(sx + 1 + (sy + 1) * wx, 0U, total_src_weights)] * w11;

				pDst_weights[x + y * bx] = (uint8_t)(total >> 4);
			}
		}
	}

	// Bilinearly upsamples one or two wx x wy grids of dequantized [0,64] weights to bx x by per-texel weights.
	// Plane 0 is always processed; plane 1 is only processed when pDst_weights1 is non-null.
	// The source and destination grids must differ in size (asserted).
	void upsample_weight_grid_xuastc_ldr(
		uint32_t bx, uint32_t by,		// destination/to dimension
		uint32_t wx, uint32_t wy,		// source/from dimension
		const uint8_t* pSrc_weights0,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
		uint8_t* pDst_weights0,			// [by][bx]
		const uint8_t* pSrc_weights1,	// these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx]
		uint8_t* pDst_weights1)			// [by][bx]
	{
		assert((bx >= 2) && (by >= 2) && (bx <= 12) && (by <= 12));
		assert((wx >= 2) && (wy >= 2) && (wx <= bx) && (wy <= by));

		assert((bx != wx) || (by != wy));

		// Per-texel step through the weight grid along each axis (before the >> 6 normalization)
		const uint32_t step_x = ((1024 + bx / 2) / (bx - 1)) * (wx - 1);
		const uint32_t step_y = ((1024 + by / 2) / (by - 1)) * (wy - 1);

		for (uint32_t texel_y = 0; texel_y < by; texel_y++)
		{
			// Vertical grid coordinate in 4.4 fixed point: integer cell and fraction
			const uint32_t grid_y = (32 + texel_y * step_y) >> 6;
			const uint32_t cell_y = grid_y >> 4;
			const uint32_t frac_y = grid_y & 0xf;

			for (uint32_t texel_x = 0; texel_x < bx; texel_x++)
			{
				// Horizontal grid coordinate in 4.4 fixed point
				const uint32_t grid_x = (32 + texel_x * step_x) >> 6;
				const uint32_t cell_x = grid_x >> 4;
				const uint32_t frac_x = grid_x & 0xf;

				// Bilinear weights in 1/16ths
				const uint32_t w11 = (frac_x * frac_y + 8) >> 4;
				const uint32_t w10 = frac_y - w11;
				const uint32_t w01 = frac_x - w11;
				const uint32_t w00 = 16 - frac_x - frac_y + w11;

				assert(w00 || w01 || w10 || w11);

				const uint32_t src_ofs = cell_x + cell_y * wx;
				const uint32_t dst_ofs = texel_x + texel_y * bx;

				// Blends the four neighbouring source weights. Taps with a zero weight are skipped,
				// so neighbours past the grid's edge are never read.
				auto blend = [&](const uint8_t* pSrc) -> uint8_t
				{
					uint32_t acc = 8;

					if (w00) acc += pSrc[src_ofs] * w00;
					if (w01) acc += pSrc[src_ofs + 1] * w01;
					if (w10) acc += pSrc[src_ofs + wx] * w10;
					if (w11) acc += pSrc[src_ofs + wx + 1] * w11;

					return (uint8_t)(acc >> 4);
				};

				pDst_weights0[dst_ofs] = blend(pSrc_weights0);

				if (pDst_weights1)
					pDst_weights1[dst_ofs] = blend(pSrc_weights1);
			} // texel_x
		} // texel_y
	}

	// 32-bit integer mixing function; compute_texel_partition() uses it to turn a partition seed into pseudo-random bits.
	// The exact sequence of operations matters, since the result selects the partition pattern.
	inline uint32_t hash52(uint32_t v)
	{
		uint32_t h = v;

		h ^= h >> 15;
		h -= h << 17;
		h += h << 7;
		h += h << 4;
		h ^= h >> 5;
		h += h << 16;
		h ^= h >> 7;
		h ^= h >> 3;
		h ^= h << 6;
		h ^= h >> 17;

		return h;
	}

	// Returns true if the block has fewer than 31 texels. This is the "small block" criterion passed to compute_texel_partition().
	bool is_small_block(uint32_t block_width, uint32_t block_height)
	{
		assert((block_width >= MIN_BLOCK_DIM) && (block_width <= MAX_BLOCK_DIM));
		assert((block_height >= MIN_BLOCK_DIM) && (block_height <= MAX_BLOCK_DIM));

		return (block_width * block_height) < 31;
	}

	// Returns the partition index (0 to num_partitions-1) of the texel at (xIn, yIn, zIn) for the given partition seed.
	// Only 2D blocks are supported (zIn must be 0).
	// small_block = num_blk_pixels < 31
	int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block)
	{
		assert(zIn == 0);

		// Small blocks use doubled texel coordinates.
		const uint32_t coord_shift = small_block ? 1 : 0;
		const uint32_t x = xIn << coord_shift;
		const uint32_t y = yIn << coord_shift;
		const uint32_t z = zIn << coord_shift;

		const uint32_t seed = seedIn + 1024 * (num_partitions - 1);
		const uint32_t rnum = hash52(seed);

		// Twelve 4-bit sub-seeds pulled from the hash; s[i] corresponds to "seed(i+1)".
		// The first eleven are plain nibbles at the bit offsets below, the last wraps around the top of the dword.
		static const uint8_t s_nibble_ofs[11] = { 0, 4, 8, 12, 16, 20, 24, 28, 18, 22, 26 };

		uint8_t s[12];
		for (int i = 0; i < 11; i++)
			s[i] = (uint8_t)((rnum >> s_nibble_ofs[i]) & 0xf);
		s[11] = (uint8_t)(((rnum >> 30) | (rnum << 2)) & 0xf);

		// Seed-dependent shift amounts
		const int shift_a = (seed & 2) != 0 ? 4 : 5;
		const int shift_b = (num_partitions == 3) ? 6 : 5;
		const bool odd_seed = (seed & 1) != 0;
		const int sh1 = odd_seed ? shift_a : shift_b;
		const int sh2 = odd_seed ? shift_b : shift_a;
		const int sh3 = (seed & 0x10) != 0 ? sh1 : sh2;

		// Square each sub-seed, then shift it down: the first eight alternate between sh1 and sh2, the last four use sh3.
		for (int i = 0; i < 12; i++)
		{
			const int sh = (i >= 8) ? sh3 : ((i & 1) ? sh2 : sh1);

			const uint8_t squared = (uint8_t)(s[i] * s[i]);
			s[i] = (uint8_t)(squared >> sh);
		}

		// One 6-bit score per candidate partition; partitions that aren't in use score 0.
		const int a = 0x3f & (s[0] * x + s[1] * y + s[10] * z + (rnum >> 14));
		const int b = 0x3f & (s[2] * x + s[3] * y + s[11] * z + (rnum >> 10));
		const int c = (num_partitions >= 3) ? 0x3f & (s[4] * x + s[5] * y + s[8] * z + (rnum >> 6)) : 0;
		const int d = (num_partitions >= 4) ? 0x3f & (s[6] * x + s[7] * y + s[9] * z + (rnum >> 2)) : 0;

		// The highest score wins; ties go to the lower partition index.
		if ((a >= b) && (a >= c) && (a >= d))
			return 0;

		if ((b >= c) && (b >= d))
			return 1;

		if (c >= d)
			return 2;

		return 3;
	}

	// Precomputed partition patterns for each 10-bit seed and small/large block sizes for 2-3 subsets.
	// This costs 144KB of RAM and some init, but considering the sheer complexity of compute_texel_partition() and how hotly it's called in the compressors and transcoders that's worth it.
	// Byte packing:
	//  low 4 bits=small blocks (only valid up to 6x5)
	//  high 4 bits=large blocks (6x6 or larger)
	//  Within each nibble: bits [1:0]=partition index for 2 subsets, bits [3:2]=partition index for 3 subsets.

	static uint8_t g_texel_partitions[NUM_PARTITION_PATTERNS][12][12]; // [seed][y][x]

	// Debug aid: cross-checks get_precomputed_texel_partition() against compute_texel_partition() for every block size,
	// seed and texel, for both 2 and 3 subsets.
	// The body is currently compiled out (#if 0), so this function is a no-op in every build configuration.
	void sanity_check_texel_partition_tables()
	{
#if 0
#if defined(_DEBUG) || defined(DEBUG)
		// sanity checking
		for (uint32_t i = 0; i < cTOTAL_BLOCK_SIZES; i++)
		{
			const uint32_t bw = g_astc_block_sizes[i][0], bh = g_astc_block_sizes[i][1];
			const bool is_small_block_flag = is_small_block(bw, bh);

			// The block size table's order must agree with get_block_size_index().
			assert(get_block_size_index(bw, bh) == i);

			for (uint32_t s = 0; s < NUM_PARTITION_PATTERNS; s++)
			{
				for (uint32_t y = 0; y < bh; y++)
				{
					for (uint32_t x = 0; x < bw; x++)
					{
						const uint32_t k2 = compute_texel_partition(s, x, y, 0, 2, is_small_block_flag);
						const uint32_t k3 = compute_texel_partition(s, x, y, 0, 3, is_small_block_flag);

						assert(get_precomputed_texel_partition(bw, bh, s, x, y, 2) == (int)k2);
						assert(get_precomputed_texel_partition(bw, bh, s, x, y, 3) == (int)k3);
					} // x
				} // y
			} // s
		}
		printf("precompute_texel_partitions: Sanity check OK\n");
#endif
#endif
	}

	// Fills g_texel_partitions[seed][y][x] for every seed and every texel of the largest (12x12) block.
	// Each byte holds four 2-bit partition indices:
	//  bits 0-1: small block, 2 subsets    bits 2-3: small block, 3 subsets
	//  bits 4-5: large block, 2 subsets    bits 6-7: large block, 3 subsets
	// The small block bits are only filled in for texels with x <= 6 and y <= 5, and are left 0 elsewhere.
	void precompute_texel_partition()
	{
		for (uint32_t seed = 0; seed < NUM_PARTITION_PATTERNS; seed++)
		{
			for (uint32_t y = 0; y < MAX_BLOCK_DIM; y++)
			{
				for (uint32_t x = 0; x < MAX_BLOCK_DIM; x++)
				{
					// Large (not small) block patterns are always computed and live in the high nibble.
					const uint32_t large2 = compute_texel_partition(seed, x, y, 0, 2, false);
					assert(large2 <= 1);
					const uint32_t large3 = compute_texel_partition(seed, x, y, 0, 3, false);
					assert(large3 <= 2);

					uint32_t packed = (large2 | (large3 << 2)) << 4;

					// Small block (width*height<31) patterns live in the low nibble.
					const bool inside_small_region = (x <= 6) && (y <= 5);
					if (inside_small_region)
					{
						const uint32_t small2 = compute_texel_partition(seed, x, y, 0, 2, true);
						assert(small2 <= 1);
						const uint32_t small3 = compute_texel_partition(seed, x, y, 0, 3, true);
						assert(small3 <= 2);

						packed |= small2 | (small3 << 2);
					}

					assert(packed <= 255);

					g_texel_partitions[seed][y][x] = (uint8_t)packed;
				} // x
			} // y
		} // seed
	}

	// Looks up the partition (subset) index of texel (x,y) from the precomputed table.
	// block_width/block_height select the small (area < 31) or large nibble, subsets (2 or 3) selects the 2-bit field within it.
	// In debug builds the result is cross-checked against compute_texel_partition().
	int get_precomputed_texel_partition(uint32_t block_width, uint32_t block_height, uint32_t seed, uint32_t x, uint32_t y, uint32_t subsets)
	{
		assert(seed < NUM_PARTITION_PATTERNS);
		assert((subsets >= 2) && (subsets <= 3));
		assert((x < block_width) && (y < block_height));

		// Large blocks use the high nibble, 3 subset patterns use the upper 2 bits of the nibble.
		const bool uses_large_nibble = (block_width * block_height) >= 31;
		const uint32_t nibble_ofs = uses_large_nibble ? 4u : 0u;
		const uint32_t field_ofs = (subsets == 3) ? 2u : 0u;

		const uint32_t table_byte = g_texel_partitions[seed][y][x];
		const uint32_t partition = (table_byte >> (field_ofs + nibble_ofs)) & 3;

		// sanity checking
		assert(partition == (uint32_t)compute_texel_partition(seed, x, y, 0, subsets, is_small_block(block_width, block_height)));

		return partition;
	}

	// Builds the texel partition table if it doesn't look built yet, then runs the (currently compiled out) sanity check.
	// The very first table entry doubles as the "already built" marker: the table is (re)built whenever that byte is 0.
	void precompute_texel_partitions()
	{
		const bool first_entry_is_zero = (g_texel_partitions[0][0][0] == 0);

		if (first_entry_is_zero)
			precompute_texel_partition();

		sanity_check_texel_partition_tables();
	}

	// ASTC "blue contraction": red and green are averaged with blue (rounding down), blue and alpha pass through unchanged.
	// Inputs are (r,g,b,a), results are written to (dr,dg,db,da).
	void blue_contract(
		int r, int g, int b, int a,
		int &dr, int &dg, int &db, int &da)
	{
		const int red_plus_blue = r + b;
		const int green_plus_blue = g + b;

		dr = red_plus_blue >> 1;
		dg = green_plus_blue >> 1;
		db = b;
		da = a;
	}

	// Moves the top bit (0x80) of a into the top bit of b (after shifting b right by one),
	// then turns the remaining bits of a into a signed 6-bit offset in [-32,31].
	inline void bit_transfer_signed(int& a, int& b)
	{
		// b: drop its low bit and receive a's bit 7
		b >>= 1;
		b |= a & 0x80;

		// a: keep bits 6..1 as a 6-bit field
		a = (a >> 1) & 0x3F;

		// sign extend the 6-bit field (bit 5 is the sign)
		a = (a ^ 0x20) - 0x20;
	}

	// Limits a to the range [l,h]. The lower bound is tested first.
	static inline int clamp(int a, int l, int h)
	{
		return (a < l) ? l : ((a > h) ? h : a);
	}

	// Limits a to the range [l,h]. The lower bound is tested first.
	// Both comparisons are false for a NaN, so a NaN input is returned unchanged.
	static inline float clampf(float a, float l, float h)
	{
		return (a < l) ? l : ((a > h) ? h : a);
	}

	// Sign extends the low num_src_bits bits of src to a full int.
	// num_src_bits must be in [2,31]. Bit (num_src_bits - 1) is treated as the sign bit:
	//  - if it's set, all higher bits are forced to 1 (the result is negative),
	//  - otherwise all higher bits are cleared.
	inline int sign_extend(int src, int num_src_bits)
	{
		assert((num_src_bits >= 2) && (num_src_bits <= 31));

		// Build the masks with unsigned math: with num_src_bits == 31 the signed expression (1 << 31) - 1 overflows an int.
		const uint32_t value_mask = (1u << num_src_bits) - 1u;
		const uint32_t sign_mask = 1u << (num_src_bits - 1);
		const uint32_t bits = (uint32_t)src;

		if ((bits & sign_mask) != 0)
			return (int)(bits | ~value_mask);

		return (int)(bits & value_mask);
	}

	// Decodes the dequantized endpoint values of one subset into a low (e0) and high (e1) RGBA endpoint pair.
	//  cem_index: the Color Endpoint Mode (one of the CEM_* values, asserted to be <= CEM_HDR_RGB_HDR_ALPHA).
	//  pEndpoints: output, int [4][2] indexed as [component: 0=R,1=G,2=B,3=A][0=low endpoint e0, 1=high endpoint e1].
	//  pE: input endpoint values. pE[0] and pE[1] are always read; each CEM additionally reads up to pE[7], depending on how many values it uses.
	// The LDR CEM's write 8-bit values (0xFF for alpha when the mode has no alpha). The HDR CEM's write values no larger than 0xFFF,
	// using 0x780 for alpha when the mode carries no alpha of its own (CEM_HDR_RGB_LDR_ALPHA writes pE[6]/pE[7] unmodified as alpha).
	// An unhandled cem_index asserts, and zeros all outputs.
	void decode_endpoint(uint32_t cem_index, int (*pEndpoints)[2], const uint8_t *pE)
	{
		assert(cem_index <= CEM_HDR_RGB_HDR_ALPHA);

		int v0 = pE[0], v1 = pE[1];

		// Named references into the output array: e0_* is the low endpoint, e1_* the high endpoint.
		int& e0_r = pEndpoints[0][0], &e0_g = pEndpoints[1][0], &e0_b = pEndpoints[2][0], &e0_a = pEndpoints[3][0];
		int& e1_r = pEndpoints[0][1], &e1_g = pEndpoints[1][1], &e1_b = pEndpoints[2][1], &e1_a = pEndpoints[3][1];

		switch (cem_index)
		{
		case CEM_LDR_LUM_DIRECT:
		{
			// Two luminance values, replicated to RGB. Alpha is opaque.
			e0_r = v0; e1_r = v1;
			e0_g = v0; e1_g = v1;
			e0_b = v0; e1_b = v1;
			e0_a = 0xFF; e1_a = 0xFF;
			break;
		}
		case CEM_LDR_LUM_BASE_PLUS_OFS:
		{
			// Base luminance is built from the top 6 bits of v0 and the top 2 bits of v1; the low 6 bits of v1 are an unsigned offset.
			int l0 = (v0 >> 2) | (v1 & 0xc0);
			int l1 = l0 + (v1 & 0x3f);

			if (l1 > 0xFF)
				l1 = 0xFF;

			e0_r = l0; e1_r = l1;
			e0_g = l0; e1_g = l1;
			e0_b = l0; e1_b = l1;
			e0_a = 0xFF; e1_a = 0xFF;
			break;
		}
		case CEM_LDR_LUM_ALPHA_DIRECT:
		{
			// (v0,v1) are the two luminances, (v2,v3) the two alphas.
			int v2 = pE[2], v3 = pE[3];

			e0_r = v0; e1_r = v1;
			e0_g = v0; e1_g = v1;
			e0_b = v0; e1_b = v1;
			e0_a = v2; e1_a = v3;
			break;
		}
		case CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS:
		{
			int v2 = pE[2], v3 = pE[3];

			// After the bit transfers v0/v2 are the bases and v1/v3 are signed offsets.
			bit_transfer_signed(v1, v0);
			bit_transfer_signed(v3, v2);

			e0_r = v0; e1_r = v0 + v1;
			e0_g = v0; e1_g = v0 + v1;
			e0_b = v0; e1_b = v0 + v1;
			e0_a = v2; e1_a = v2 + v3;

			// base+offset can leave [0,255]
			for (uint32_t c = 0; c < 4; c++)
			{
				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
			}

			break;
		}
		case CEM_LDR_RGB_BASE_SCALE:
		{
			// (v0,v1,v2) is the high RGB endpoint, the low endpoint is that color scaled by v3/256.
			int v2 = pE[2], v3 = pE[3];

			e0_r = (v0 * v3) >> 8; e1_r = v0;
			e0_g = (v1 * v3) >> 8; e1_g = v1;
			e0_b = (v2 * v3) >> 8; e1_b = v2;
			e0_a = 0xFF; e1_a = 0xFF;

			break;
		}
		case CEM_LDR_RGB_DIRECT:
		{
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];

			// Even indices form one RGB color, odd indices the other. If the odd color's component sum is smaller,
			// the endpoints are swapped and blue contracted.
			if ((v1 + v3 + v5) >= (v0 + v2 + v4))
			{
				e0_r = v0; e1_r = v1;
				e0_g = v2; e1_g = v3;
				e0_b = v4; e1_b = v5;
				e0_a = 0xFF; e1_a = 0xFF;
			}
			else
			{
				blue_contract(v1, v3, v5, 0xFF, e0_r, e0_g, e0_b, e0_a);
				blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a);
			}

			break;
		}
		case CEM_LDR_RGB_BASE_PLUS_OFFSET:
		{
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];

			// After the bit transfers (v0,v2,v4) is the base color and (v1,v3,v5) are signed offsets.
			bit_transfer_signed(v1, v0);
			bit_transfer_signed(v3, v2);
			bit_transfer_signed(v5, v4);

			// A negative offset sum selects the swapped, blue contracted form.
			if ((v1 + v3 + v5) >= 0)
			{
				e0_r = v0; e1_r = v0 + v1;
				e0_g = v2; e1_g = v2 + v3;
				e0_b = v4; e1_b = v4 + v5;
				e0_a = 0xFF; e1_a = 0xFF;
			}
			else
			{
				blue_contract(v0 + v1, v2 + v3, v4 + v5, 0xFF, e0_r, e0_g, e0_b, e0_a);
				blue_contract(v0, v2, v4, 0xFF, e1_r, e1_g, e1_b, e1_a);
			}

			for (uint32_t c = 0; c < 4; c++)
			{
				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
			}

			break;
		}
		case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
		{
			// Same as CEM_LDR_RGB_BASE_SCALE, plus two direct alpha values (v4,v5).
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];

			e0_r = (v0 * v3) >> 8; e1_r = v0;
			e0_g = (v1 * v3) >> 8; e1_g = v1;
			e0_b = (v2 * v3) >> 8; e1_b = v2;
			e0_a = v4; e1_a = v5;

			break;
		}
		case CEM_LDR_RGBA_DIRECT:
		{
			// Same as CEM_LDR_RGB_DIRECT, plus two direct alpha values (v6,v7) which are swapped along with the colors.
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7];

			if ((v1 + v3 + v5) >= (v0 + v2 + v4))
			{
				e0_r = v0; e1_r = v1;
				e0_g = v2; e1_g = v3;
				e0_b = v4; e1_b = v5;
				e0_a = v6; e1_a = v7;
			}
			else
			{
				blue_contract(v1, v3, v5, v7, e0_r, e0_g, e0_b, e0_a);
				blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a);
			}

			break;
		}
		case CEM_LDR_RGBA_BASE_PLUS_OFFSET:
		{
			// Same as CEM_LDR_RGB_BASE_PLUS_OFFSET, with an extra base (v6) + signed offset (v7) pair for alpha.
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5], v6 = pE[6], v7 = pE[7];

			bit_transfer_signed(v1, v0);
			bit_transfer_signed(v3, v2);
			bit_transfer_signed(v5, v4);
			bit_transfer_signed(v7, v6);

			// Only the RGB offsets take part in the swap decision.
			if ((v1 + v3 + v5) >= 0)
			{
				e0_r = v0; e1_r = v0 + v1;
				e0_g = v2; e1_g = v2 + v3;
				e0_b = v4; e1_b = v4 + v5;
				e0_a = v6; e1_a = v6 + v7;
			}
			else
			{
				blue_contract(v0 + v1, v2 + v3, v4 + v5, v6 + v7, e0_r, e0_g, e0_b, e0_a);
				blue_contract(v0, v2, v4, v6, e1_r, e1_g, e1_b, e1_a);
			}

			for (uint32_t c = 0; c < 4; c++)
			{
				pEndpoints[c][0] = clamp(pEndpoints[c][0], 0, 255);
				pEndpoints[c][1] = clamp(pEndpoints[c][1], 0, 255);
			}

			break;
		}
		case CEM_HDR_LUM_LARGE_RANGE:
		{
			// Two 8-bit luminances expanded to 12 bits. If they're given in descending order they're swapped and nudged half a step (8) inwards.
			int y0, y1;
			if (v1 >= v0)
			{
				y0 = (v0 << 4);
				y1 = (v1 << 4);
			}
			else
			{
				y0 = (v1 << 4) + 8;
				y1 = (v0 << 4) - 8;
			}

			e0_r = y0; e1_r = y1;
			e0_g = y0; e1_g = y1;
			e0_b = y0; e1_b = y1;
			e0_a = 0x780; e1_a = 0x780;

			break;
		}
		case CEM_HDR_LUM_SMALL_RANGE:
		{
			// A 12-bit base luminance (y0) plus a small unsigned delta (d). The top bit of v0 selects between two bit layouts.
			int y0, y1, d;

			if ((v0 & 0x80) != 0)
			{
				y0 = ((v1 & 0xE0) << 4) | ((v0 & 0x7F) << 2);
				d = (v1 & 0x1F) << 2;
			}
			else
			{
				y0 = ((v1 & 0xF0) << 4) | ((v0 & 0x7F) << 1);
				d = (v1 & 0x0F) << 1;
			}

			y1 = y0 + d;
			if (y1 > 0xFFF)
				y1 = 0xFFF;

			e0_r = y0; e1_r = y1;
			e0_g = y0; e1_g = y1;
			e0_b = y0; e1_b = y1;
			e0_a = 0x780; e1_a = 0x780;

			break;
		}
		case CEM_HDR_RGB_BASE_SCALE:
		{
			int v2 = pE[2], v3 = pE[3];

			// The 4-bit mode value is gathered from the top bits of v0, v1 and v2.
			int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4);

			// Split it into the major component (which channel ends up holding "red") and the bit layout mode (0-5).
			int majcomp, mode;
			if ((modeval & 0xC) != 0xC)
			{
				majcomp = modeval >> 2;
				mode = modeval & 3;
			}
			else if (modeval != 0xF)
			{
				majcomp = modeval & 3;
				mode = 4;
			}
			else
			{
				majcomp = 0;
				mode = 5;
			}

			// The low bits of each field are always in the same place.
			int red = v0 & 0x3f;
			int green = v1 & 0x1f;
			int blue = v2 & 0x1f;
			int scale = v3 & 0x1f;

			// The remaining 7 bits are assigned to different fields depending on the mode.
			int x0 = (v1 >> 6) & 1;
			int x1 = (v1 >> 5) & 1;
			int x2 = (v2 >> 6) & 1;
			int x3 = (v2 >> 5) & 1;
			int x4 = (v3 >> 7) & 1;
			int x5 = (v3 >> 6) & 1;
			int x6 = (v3 >> 5) & 1;

			// ohm is a one-hot mode mask: each test below checks whether the current mode is in the set encoded by the constant.
			int ohm = 1 << mode;
			if (ohm & 0x30) green |= x0 << 6;
			if (ohm & 0x3A) green |= x1 << 5;
			if (ohm & 0x30) blue |= x2 << 6;
			if (ohm & 0x3A) blue |= x3 << 5;
			if (ohm & 0x3D) scale |= x6 << 5;
			if (ohm & 0x2D) scale |= x5 << 6;
			if (ohm & 0x04) scale |= x4 << 7;
			if (ohm & 0x3B) red |= x4 << 6;
			if (ohm & 0x04) red |= x3 << 6;
			if (ohm & 0x10) red |= x5 << 7;
			if (ohm & 0x0F) red |= x2 << 7;
			if (ohm & 0x05) red |= x1 << 8;
			if (ohm & 0x0A) red |= x0 << 8;
			if (ohm & 0x05) red |= x0 << 9;
			if (ohm & 0x02) red |= x6 << 9;
			if (ohm & 0x01) red |= x3 << 10;
			if (ohm & 0x02) red |= x5 << 10;

			// Per-mode left shift applied to all four fields.
			static const int s_shamts[6] = { 1,1,2,3,4,5 };

			const int shamt = s_shamts[mode];
			red <<= shamt;
			green <<= shamt;
			blue <<= shamt;
			scale <<= shamt;

			// In all modes but 5, green and blue are stored as differences from red.
			if (mode != 5)
			{
				green = red - green;
				blue = red - blue;
			}

			// Move the major component into place.
			if (majcomp == 1)
				std::swap(red, green);

			if (majcomp == 2)
				std::swap(red, blue);

			e1_r = clamp(red, 0, 0xFFF);
			e1_g = clamp(green, 0, 0xFFF);
			e1_b = clamp(blue, 0, 0xFFF);
			e1_a = 0x780;

			// The low endpoint is the high endpoint minus the scale.
			e0_r = clamp(red - scale, 0, 0xFFF);
			e0_g = clamp(green - scale, 0, 0xFFF);
			e0_b = clamp(blue - scale, 0, 0xFFF);
			e0_a = 0x780;

			break;
		}
		case CEM_HDR_RGB_HDR_ALPHA:
		case CEM_HDR_RGB_LDR_ALPHA:
		case CEM_HDR_RGB:
		{
			// All three CEM's share the same HDR RGB decode (v0-v5); the two alpha variants then decode v6/v7 at the end.
			int v2 = pE[2], v3 = pE[3], v4 = pE[4], v5 = pE[5];

			int majcomp = ((v4 & 0x80) >> 7) | ((v5 & 0x80) >> 6);

			// Default alpha, overwritten below by the two alpha variants.
			e0_a = 0x780;
			e1_a = 0x780;

			if (majcomp == 3)
			{
				// Direct mode: the values are just shifted up to 12 bits (blue only has 7 bits, so it's shifted by 5).
				e0_r = v0 << 4;
				e0_g = v2 << 4;
				e0_b = (v4 & 0x7f) << 5;

				e1_r = v1 << 4;
				e1_g = v3 << 4;
				e1_b = (v5 & 0x7f) << 5;
			}
			else
			{
				// The 3-bit mode comes from the top bits of v1, v2 and v3.
				int mode = ((v1 & 0x80) >> 7) | ((v2 & 0x80) >> 6) | ((v3 & 0x80) >> 5);
				int va = v0 | ((v1 & 0x40) << 2);
				int vb0 = v2 & 0x3f;
				int vb1 = v3 & 0x3f;
				int vc = v1 & 0x3f;
				int vd0 = v4 & 0x7f;
				int vd1 = v5 & 0x7f;

				// vd0/vd1 are signed, with a mode dependent bit width.
				static const int s_dbitstab[8] = { 7,6,7,6,5,6,5,6 };
				vd0 = sign_extend(vd0, s_dbitstab[mode]);
				vd1 = sign_extend(vd1, s_dbitstab[mode]);

				// Extra bits which are assigned to different fields depending on the mode.
				int x0 = (v2 >> 6) & 1;
				int x1 = (v3 >> 6) & 1;
				int x2 = (v4 >> 6) & 1;
				int x3 = (v5 >> 6) & 1;
				int x4 = (v4 >> 5) & 1;
				int x5 = (v5 >> 5) & 1;

				// ohm is a one-hot mode mask: each test below checks whether the current mode is in the set encoded by the constant.
				int ohm = 1 << mode;
				if (ohm & 0xA4) va |= x0 << 9;
				if (ohm & 0x08) va |= x2 << 9;
				if (ohm & 0x50) va |= x4 << 9;
				if (ohm & 0x50) va |= x5 << 10;
				if (ohm & 0xA0) va |= x1 << 10;
				if (ohm & 0xC0) va |= x2 << 11;
				if (ohm & 0x04) vc |= x1 << 6;
				if (ohm & 0xE8) vc |= x3 << 6;
				if (ohm & 0x20) vc |= x2 << 7;
				if (ohm & 0x5B) vb0 |= x0 << 6;
				if (ohm & 0x5B) vb1 |= x1 << 6;
				if (ohm & 0x12) vb0 |= x2 << 7;
				if (ohm & 0x12) vb1 |= x3 << 7;

				// The shifts are done on uint32_t's because vd0/vd1 may be negative here.
				int shamt = (mode >> 1) ^ 3;
				va  = (uint32_t)va  << shamt;
				vb0 = (uint32_t)vb0 << shamt;
				vb1 = (uint32_t)vb1 << shamt;
				vc  = (uint32_t)vc  << shamt;
				vd0 = (uint32_t)vd0 << shamt;
				vd1 = (uint32_t)vd1 << shamt;

				// High endpoint: va is the major component, vb0/vb1 are subtracted to get the other two.
				e1_r = clamp(va, 0, 0xFFF);
				e1_g = clamp(va - vb0, 0, 0xFFF);
				e1_b = clamp(va - vb1, 0, 0xFFF);

				// Low endpoint: additionally subtracts vc (all components) and vd0/vd1 (the two non-major components).
				e0_r = clamp(va - vc, 0, 0xFFF);
				e0_g = clamp(va - vb0 - vc - vd0, 0, 0xFFF);
				e0_b = clamp(va - vb1 - vc - vd1, 0, 0xFFF);

				// Move the major component into place.
				if (majcomp == 1)
				{
					std::swap(e0_r, e0_g);
					std::swap(e1_r, e1_g);
				}
				else if (majcomp == 2)
				{
					std::swap(e0_r, e0_b);
					std::swap(e1_r, e1_b);
				}
			}

			if (cem_index == CEM_HDR_RGB_LDR_ALPHA)
			{
				// LDR alpha: the two values are written as-is.
				int v6 = pE[6], v7 = pE[7];

				e0_a = v6;
				e1_a = v7;
			}
			else if (cem_index == CEM_HDR_RGB_HDR_ALPHA)
			{
				int v6 = pE[6], v7 = pE[7];

				// Extract mode bits
				int mode = ((v6 >> 7) & 1) | ((v7 >> 6) & 2);
				v6 &= 0x7F;
				v7 &= 0x7F;

				if (mode == 3)
				{
					// Direct mode: two 7-bit values shifted up to 12 bits.
					e0_a = v6 << 5;
					e1_a = v7 << 5;
				}
				else
				{
					// Base (v6) + signed offset (v7): the top bits of v7 extend v6, the remaining low bits of v7 are sign extended via the xor/subtract pair.
					v6 |= (v7 << (mode + 1)) & 0x780;
					v7 &= (0x3F >> mode);
					v7 ^= (0x20 >> mode);
					v7 -= (0x20 >> mode);

					//v6 <<= (4 - mode);  // undefined behavior if neg
					v6 = ((uint32_t)v6) << (4 - mode);

					//v7 <<= (4 - mode); // undefined behavior if neg
					v7 = ((uint32_t)v7) << (4 - mode);

					v7 += v6;
					v7 = clamp(v7, 0, 0xFFF);
					e0_a = v6;
					e1_a = v7;
				}
			}

			break;
		}
		default:
		{
			// Unknown CEM: assert in debug builds, otherwise return all-zero endpoints.
			assert(0);
			for (uint32_t c = 0; c < 4; c++)
			{
				pEndpoints[c][0] = 0;
				pEndpoints[c][1] = 0;
			}
			break;
		}
		}
	}

	// Returns true if the field get_bits(v, 10, 14) extracts from the half float equals 31, which is how this file encodes Inf/NaN.
	static inline bool is_half_inf_or_nan(half_float v)
	{
		const bool exponent_is_all_ones = (get_bits(v, 10, 14) == 31);
		return exponent_is_all_ones;
	}

	// This float->half conversion matches how "F32TO16" works on Intel GPU's.
	// val is the float to convert. If toward_zero is true the mantissa is truncated (truncf), otherwise it's rounded with lrintf().
	// Float zeros and denormals become (signed) zero, Inf and too-large values become Inf, and any NaN becomes a NaN with mantissa 1.
	half_float float_to_half(float val, bool toward_zero)
	{
		union { float f; int32_t i; uint32_t u; } src = { val };

		const int src_mantissa = src.i & 0x7FFFFF;
		const int src_exponent = (src.i >> 23) & 0xFF;
		const int s = (src.i >> 31) & 0x1;

		int e = 0, m = 0;

		if (src_exponent == 0xff)
		{
			// inf/NaN
			e = 31;
			m = (src_mantissa != 0) ? 1 : 0;
		}
		else if (src_exponent != 0)
		{
			// not zero or denormal
			const int unbiased_exp = src_exponent - 127;

			if (unbiased_exp > 15)
			{
				// too large for a half: becomes inf
				e = 31;
			}
			else
			{
				float scaled_mantissa;

				if (unbiased_exp < -14)
				{
					// too small for a normalized half: the result is a half denormal (e stays 0)
					scaled_mantissa = (1 << 24) * fabsf(src.f);
				}
				else
				{
					// normalized half: rebias the exponent and drop the low 13 mantissa bits
					e = unbiased_exp + 15;
					scaled_mantissa = (float)src_mantissa * (1.0f / (float)(1 << 13));
				}

				m = toward_zero ? (int)truncf(scaled_mantissa) : (int)lrintf(scaled_mantissa);
			}
		}

		// Rounding may carry out of the 10-bit mantissa, which bumps the exponent.
		assert((0 <= m) && (m <= 1024));
		if (m == 1024)
		{
			m = 0;
			e++;
		}

		assert((s >= 0) && (s <= 1));
		assert((e >= 0) && (e <= 31));
		assert((m >= 0) && (m <= 1023));

		return (half_float)((s << 15) | (e << 10) | m);
	}

	// Converts a half float bit pattern to a float. Zeros, denormals, Inf's and NaN's are all handled;
	// the sign is always preserved and a NaN keeps its mantissa bits (shifted up into the float mantissa).
	float half_to_float(half_float hval)
	{
		union { float f; uint32_t u; } result = { 0 };

		const uint32_t half_bits = (uint32_t)hval;
		const uint32_t s = (half_bits >> 15) & 1;
		uint32_t e = (half_bits >> 10) & 0x1F;
		uint32_t m = half_bits & 0x3FF;

		if (e == 31)
		{
			// +/- INF (m == 0) or +/- NaN (m != 0)
			result.u = (s << 31) | 0x7f800000 | (m << 13);
			return result.f;
		}

		if (e == 0)
		{
			if (m == 0)
			{
				// +- 0
				result.u = s << 31;
				return result.f;
			}

			// denormalized: shift the mantissa up until the implicit bit (0x400) appears, lowering the exponent once per shift.
			// e is unsigned and intentionally wraps below zero here; adding the bias below brings it back into range.
			e = 1;
			while ((m & 0x00000400) == 0)
			{
				m <<= 1;
				--e;
			}

			// remove the implicit bit
			m &= ~0x00000400;
		}

		// rebias the exponent and widen the mantissa from 10 to 23 bits
		e += (127 - 15);
		m <<= 13;

		assert(s <= 1);
		assert(m <= 0x7FFFFF);
		assert(e <= 255);

		result.u = (s << 31) | (e << 23) | m;
		return result.f;
	}

	// RGB9E5 shared exponent format constants.
	// See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt
	// 5 exponent bits, 9 mantissa bits per component, exponent bias of 15, and the largest valid biased exponent is 31.
	const int RGB9E5_EXPONENT_BITS = 5, RGB9E5_MANTISSA_BITS = 9, RGB9E5_EXP_BIAS = 15, RGB9E5_MAX_VALID_BIASED_EXP = 31;
	// Largest unbiased exponent (31 - 15 = 16).
	const int MAX_RGB9E5_EXP = (RGB9E5_MAX_VALID_BIASED_EXP - RGB9E5_EXP_BIAS);
	// Number of distinct mantissa values (512) and the largest mantissa (511).
	const int RGB9E5_MANTISSA_VALUES = (1 << RGB9E5_MANTISSA_BITS);
	const int MAX_RGB9E5_MANTISSA = (RGB9E5_MANTISSA_VALUES - 1);
	// MAX_RGB9E5 is not defined here (the line below is commented out), yet pack_rgb9e5() uses it - presumably it's declared elsewhere in this file.
	//const int MAX_RGB9E5 = (int)(((float)MAX_RGB9E5_MANTISSA) / RGB9E5_MANTISSA_VALUES * (1 << MAX_RGB9E5_EXP));
	// NOTE(review): the expression below is (1/512)/32768 = 2^-24, which the (int) cast truncates to 0, so EPSILON_RGB9E5 is always 0.
	const int EPSILON_RGB9E5 = (int)((1.0f / (float)RGB9E5_MANTISSA_VALUES) / (float)(1 << RGB9E5_EXP_BIAS));

	// Unpacks an RGB9E5 texel to three floats.
	// Layout of packed: bits 0-8=red mantissa, bits 9-17=green mantissa, bits 18-26=blue mantissa, bits 27-31=shared biased exponent.
	// Each component is mantissa * 2^(exponent - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS).
	void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b)
	{
		const int shared_exp = (packed >> 27) & 31;
		const float texel_scale = powf(2.0f, static_cast<float>(shared_exp - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS));

		const int red_mantissa = packed & 511;
		const int green_mantissa = (packed >> 9) & 511;
		const int blue_mantissa = (packed >> 18) & 511;

		r = red_mantissa * texel_scale;
		g = green_mantissa * texel_scale;
		b = blue_mantissa * texel_scale;
	}

	// Returns the unbiased exponent field of x, i.e. floor(log2(x)) for normalized positive floats.
	// floor_log2 is not correct for the denorm and zero values (it returns -127), but we are going to do a max of this value
	// with the minimum rgb9e5 exponent that will hide these problem cases.
	static inline int floor_log2(float x)
	{
		union
		{
			unsigned int raw;
			float value;
		} bits;

		bits.value = x;

		// Extract float exponent
		const unsigned int biased_exponent = (bits.raw >> 23) & 0xFF;
		return biased_exponent - 127;
	}

	// Returns the larger of two ints (b when they're equal).
	static inline int maximumi(int a, int b)
	{
		if (a > b)
			return a;
		return b;
	}

	// Returns the larger of two floats. b is returned whenever (a > b) is false, which includes the case where either one is a NaN.
	static inline float maximumf(float a, float b)
	{
		if (a > b)
			return a;
		return b;
	}

	// Packs three floats into an RGB9E5 shared exponent texel, following the reference encoder in the
	// EXT_texture_shared_exponent spec.
	// Each component is first clamped to [0, MAX_RGB9E5]; negative values and NaN's become 0.
	// Returns: bits 0-8=red mantissa, bits 9-17=green mantissa, bits 18-26=blue mantissa, bits 27-31=shared biased exponent.
	uint32_t pack_rgb9e5(float r, float g, float b)
	{
		// NaN's compare false against everything, so clampf() alone would pass them through unchanged. That would
		// produce an out of range shared exponent (or a NaN->int conversion) below. The reference encoder sends
		// anything that isn't > 0 (including NaN) to 0, so do the same here before clamping the upper end.
		if (!(r >= 0.0f)) r = 0.0f;
		if (!(g >= 0.0f)) g = 0.0f;
		if (!(b >= 0.0f)) b = 0.0f;

		r = clampf(r, 0.0f, MAX_RGB9E5);
		g = clampf(g, 0.0f, MAX_RGB9E5);
		b = clampf(b, 0.0f, MAX_RGB9E5);

		// The shared exponent is chosen from the largest component. The maximumi() hides floor_log2()'s
		// wrong results for zero/denormal inputs.
		float maxrgb = maximumf(maximumf(r, g), b);
		int exp_shared = maximumi(-RGB9E5_EXP_BIAS - 1, floor_log2(maxrgb)) + 1 + RGB9E5_EXP_BIAS;
		assert((exp_shared >= 0) && (exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP));

		float denom = powf(2.0f, (float)(exp_shared - RGB9E5_EXP_BIAS - RGB9E5_MANTISSA_BITS));

		// If rounding pushes the largest mantissa to 512, bump the exponent so it fits in 9 bits again.
		int maxm = (int)floorf((maxrgb / denom) + 0.5f);
		if (maxm == (MAX_RGB9E5_MANTISSA + 1))
		{
			denom *= 2;
			exp_shared += 1;
			assert(exp_shared <= RGB9E5_MAX_VALID_BIASED_EXP);
		}
		else
		{
			assert(maxm <= MAX_RGB9E5_MANTISSA);
		}

		int rm = (int)floorf((r / denom) + 0.5f);
		int gm = (int)floorf((g / denom) + 0.5f);
		int bm = (int)floorf((b / denom) + 0.5f);

		assert((rm >= 0) && (rm <= MAX_RGB9E5_MANTISSA));
		assert((gm >= 0) && (gm <= MAX_RGB9E5_MANTISSA));
		assert((bm >= 0) && (bm <= MAX_RGB9E5_MANTISSA));

		// Pack with unsigned math: exp_shared can be up to 31, and (exp_shared << 27) doesn't fit in a signed int once exp_shared >= 16.
		return (uint32_t)rm | ((uint32_t)gm << 9) | ((uint32_t)bm << 18) | ((uint32_t)exp_shared << 27);
	}

	// Counts the leading zero bits of x, treating it as a 17-bit value (bit 16 is the most significant bit).
	// Returns 17 when x is 0. x must be <= 0x1FFFF.
	static inline int clz17(uint32_t x)
	{
		assert(x <= 0x1FFFF);
		x &= 0x1FFFF;

		// Walk a single probe bit down from bit 16 until it hits a set bit (or runs off the bottom).
		int leading_zeros = 0;
		for (uint32_t probe = 0x10000; probe != 0; probe >>= 1)
		{
			if ((x & probe) != 0)
				break;

			leading_zeros++;
		}

		return leading_zeros;
	}

	// Packs three 16-bit LDR components into an RGB9E5 texel.
	// Any component equal to 65535 is bumped to 65536 and forces a shift of 0; otherwise all components are shifted left by the
	// number of leading zeros (as a 17-bit value) of their OR, and bits 8-16 of each become the 9-bit mantissa.
	// Returns: bits 0-8=red, bits 9-17=green, bits 18-26=blue, bits 27-31=exponent (16 - shift).
	static inline uint32_t pack_rgb9e5_ldr_astc(int Cr, int Cg, int Cb)
	{
		int lz = clz17(Cr | Cg | Cb | 1);

		int chan[3] = { Cr, Cg, Cb };

		for (int c = 0; c < 3; c++)
		{
			if (chan[c] == 65535)
			{
				chan[c] = 65536;
				lz = 0;
			}
		}

		for (int c = 0; c < 3; c++)
			chan[c] = ((chan[c] << lz) >> 8) & 0x1FF;

		const uint32_t exponent = 16 - lz;

		return (exponent << 27) | (chan[2] << 18) | (chan[1] << 9) | chan[0];
	}

	// Packs three half float bit patterns into an RGB9E5 texel.
	// Values above 0x7c00 are replaced with 0, and 0x7c00 itself with 0x7bff.
	// The shared exponent is the largest component exponent + 1, and each component's 11-bit mantissa (with the implicit
	// bit for non-denormals) is shifted right to line up with it. If all three exponents are 0, the exponent/shift is
	// just bit 9 of the OR of the three values.
	// Returns: bits 0-8=red, bits 9-17=green, bits 18-26=blue, bits 27-31=exponent.
	static inline uint32_t pack_rgb9e5_hdr_astc(int Cr, int Cg, int Cb)
	{
		int half_bits[3] = { Cr, Cg, Cb };
		int raw_exp[3], eff_exp[3];

		int ored_bits = 0, ored_exp = 0, largest_eff_exp = 1;

		for (int c = 0; c < 3; c++)
		{
			if (half_bits[c] > 0x7c00)
				half_bits[c] = 0;
			else if (half_bits[c] == 0x7c00)
				half_bits[c] = 0x7bff;

			raw_exp[c] = (half_bits[c] >> 10) & 0x1F;

			// denormals (exponent 0) are treated as having an exponent of 1
			eff_exp[c] = (raw_exp[c] == 0) ? 1 : raw_exp[c];

			ored_bits |= half_bits[c];
			ored_exp |= raw_exp[c];

			if (eff_exp[c] > largest_eff_exp)
				largest_eff_exp = eff_exp[c];
		}

		uint32_t expo;
		uint32_t shifts[3];

		if (ored_exp == 0)
		{
			// all three components are zero or denormal
			const uint32_t top_mantissa_bit = (ored_bits & 0x200) >> 9;

			expo = top_mantissa_bit;
			for (int c = 0; c < 3; c++)
				shifts[c] = top_mantissa_bit;
		}
		else
		{
			// the component(s) with the largest exponent get a shift of 2, the others are shifted further by the exponent difference
			expo = largest_eff_exp + 1;
			for (int c = 0; c < 3; c++)
				shifts[c] = largest_eff_exp - eff_exp[c] + 2;
		}

		int mantissas[3];
		for (int c = 0; c < 3; c++)
		{
			const int implicit_bit = (raw_exp[c] == 0) ? 0 : 0x400;
			const int full_mantissa = (half_bits[c] & 0x3FF) | implicit_bit;

			mantissas[c] = (full_mantissa >> shifts[c]) & 0x1FF;
		}

		return (expo << 27) | (mantissas[2] << 18) | (mantissas[1] << 9) | (mantissas[0] << 0);
	}

	// Fills a block's output pixels with the decode error color.
	//  cDecodeModeHDR16: every byte of the num_blk_pixels*4 half floats is set to 0xFF (NaN's).
	//  cDecodeModeRGB9E5: every 32-bit texel is set to pack_rgb9e5(1, 0, 1) (purple).
	//  any other mode: every 32-bit texel is set to 0xFFFF00FF.
	static void write_error_block(void* pPixels, uint32_t num_blk_pixels, decode_mode dec_mode)
	{
		if (dec_mode == cDecodeModeHDR16)
		{
			// NaN's
			memset(pPixels, 0xFF, num_blk_pixels * sizeof(half_float) * 4);
			return;
		}

		// Both remaining cases write one uint32_t per texel, only the value differs.
		const uint32_t error_texel = (dec_mode == cDecodeModeRGB9E5) ? pack_rgb9e5(1.0f, 0.0f, 1.0f) : 0xFFFF00FF;

		uint32_t* pDst = (uint32_t*)pPixels;
		for (uint32_t i = 0; i < num_blk_pixels; i++)
			pDst[i] = error_texel;
	}

	// Important: pPixels is either 32-bit/texel or 64-bit/texel.
	bool decode_block(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode)
	{
		assert(is_valid_block_size(blk_width, blk_height));

		// Basic sanity checking
		if (!log_blk.m_dual_plane)
		{
			assert(log_blk.m_color_component_selector == 0);
		}
		else
		{
			assert(log_blk.m_color_component_selector <= 3);
		}

		assert(g_dequant_tables.m_endpoints[0].m_ISE_to_val.size());
		if (!g_dequant_tables.m_endpoints[0].m_ISE_to_val.size())
			return false;

		const uint32_t num_blk_pixels = blk_width * blk_height;

		if (log_blk.m_error_flag)
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			// Should this return false? It's not an invalid logical block config, though.
			return false;
		}

		// Handle solid color blocks
		if (log_blk.m_solid_color_flag_ldr)
		{
			// LDR solid block
			if (dec_mode == cDecodeModeHDR16)
			{
				// Convert LDR pixels to half-float
				half_float h[4];
				for (uint32_t c = 0; c < 4; c++)
					h[c] = (log_blk.m_solid_color[c] == 0xFFFF) ? 0x3C00 : float_to_half((float)log_blk.m_solid_color[c] * (1.0f / 65536.0f), true);

				for (uint32_t i = 0; i < num_blk_pixels; i++)
					memcpy((uint16_t*)pPixels + i * 4, h, sizeof(half_float) * 4);
			}
			else if (dec_mode == cDecodeModeRGB9E5)
			{
				float r = (log_blk.m_solid_color[0] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[0] * (1.0f / 65536.0f));
				float g = (log_blk.m_solid_color[1] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[1] * (1.0f / 65536.0f));
				float b = (log_blk.m_solid_color[2] == 0xFFFF) ? 1.0f : ((float)log_blk.m_solid_color[2] * (1.0f / 65536.0f));

				const uint32_t packed = pack_rgb9e5(r, g, b);

				for (uint32_t i = 0; i < num_blk_pixels; i++)
					((uint32_t*)pPixels)[i] = packed;
			}
			else
			{
				// Convert LDR pixels to 8-bits
				for (uint32_t i = 0; i < num_blk_pixels; i++)
					for (uint32_t c = 0; c < 4; c++)
						((uint8_t*)pPixels)[i * 4 + c] = (log_blk.m_solid_color[c] >> 8);
			}

			return true;
		}
		else if (log_blk.m_solid_color_flag_hdr)
		{
			// HDR solid block, decode mode must be half-float or RGB9E5
			if (dec_mode == cDecodeModeHDR16)
			{
				for (uint32_t i = 0; i < num_blk_pixels; i++)
					memcpy((uint16_t*)pPixels + i * 4, log_blk.m_solid_color, sizeof(half_float) * 4);
			}
			else if (dec_mode == cDecodeModeRGB9E5)
			{
				float r = half_to_float(log_blk.m_solid_color[0]);
				float g = half_to_float(log_blk.m_solid_color[1]);
				float b = half_to_float(log_blk.m_solid_color[2]);

				const uint32_t packed = pack_rgb9e5(r, g, b);

				for (uint32_t i = 0; i < num_blk_pixels; i++)
					((uint32_t*)pPixels)[i] = packed;
			}
			else
			{
				write_error_block(pPixels, num_blk_pixels, dec_mode);
				return false;
			}

			return true;
		}

		// Sanity check block's config
		if ((log_blk.m_grid_width < 2) || (log_blk.m_grid_height < 2))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if ((log_blk.m_endpoint_ise_range < FIRST_VALID_ENDPOINT_ISE_RANGE) || (log_blk.m_endpoint_ise_range > LAST_VALID_ENDPOINT_ISE_RANGE))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if ((log_blk.m_weight_ise_range < FIRST_VALID_WEIGHT_ISE_RANGE) || (log_blk.m_weight_ise_range > LAST_VALID_WEIGHT_ISE_RANGE))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if ((log_blk.m_num_partitions < 1) || (log_blk.m_num_partitions > MAX_PARTITIONS))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if ((log_blk.m_dual_plane) && (log_blk.m_num_partitions > MAX_DUAL_PLANE_PARTITIONS))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if (log_blk.m_partition_id >= NUM_PARTITION_PATTERNS)
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if ((log_blk.m_num_partitions == 1) && (log_blk.m_partition_id > 0))
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		if (log_blk.m_color_component_selector > 3)
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		const uint32_t total_endpoint_levels = get_ise_levels(log_blk.m_endpoint_ise_range);
		const uint32_t total_weight_levels = get_ise_levels(log_blk.m_weight_ise_range);

		bool is_ldr_endpoints[MAX_PARTITIONS];

		// Check CEM's
		uint32_t total_cem_vals = 0;
		for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
		{
			if (log_blk.m_color_endpoint_modes[i] > 15)
			{
				write_error_block(pPixels, num_blk_pixels, dec_mode);
				return false;
			}

			total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[i]);

			is_ldr_endpoints[i] = is_cem_ldr(log_blk.m_color_endpoint_modes[i]);
		}

		if (total_cem_vals > MAX_ENDPOINTS)
		{
			write_error_block(pPixels, num_blk_pixels, dec_mode);
			return false;
		}

		const dequant_table& endpoint_dequant_tab = g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range);
		const uint8_t* pEndpoint_dequant = endpoint_dequant_tab.m_ISE_to_val.data();

		// Dequantized endpoints to [0,255]
		uint8_t dequantized_endpoints[MAX_ENDPOINTS];
		for (uint32_t i = 0; i < total_cem_vals; i++)
		{
			if (log_blk.m_endpoints[i] >= total_endpoint_levels)
			{
				write_error_block(pPixels, num_blk_pixels, dec_mode);
				return false;
			}

			dequantized_endpoints[i] = pEndpoint_dequant[log_blk.m_endpoints[i]];
		}

		// Dequantize weights to [0,64]
		uint8_t dequantized_weights[2][12 * 12];

		const dequant_table& weight_dequant_tab = g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range);
		const uint8_t* pWeight_dequant = weight_dequant_tab.m_ISE_to_val.data();

		const uint32_t total_weight_vals = (log_blk.m_dual_plane ? 2 : 1) * log_blk.m_grid_width * log_blk.m_grid_height;
		for (uint32_t i = 0; i < total_weight_vals; i++)
		{
			if (log_blk.m_weights[i] >= total_weight_levels)
			{
				write_error_block(pPixels, num_blk_pixels, dec_mode);
				return false;
			}

			const uint32_t plane_index = log_blk.m_dual_plane ? (i & 1) : 0;
			const uint32_t grid_index = log_blk.m_dual_plane ? (i >> 1) : i;

			dequantized_weights[plane_index][grid_index] = pWeight_dequant[log_blk.m_weights[i]];
		}

		// Upsample weight grid. [0,64] weights
		uint8_t upsampled_weights[2][12 * 12];

		upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[0][0], &upsampled_weights[0][0]);
		if (log_blk.m_dual_plane)
			upsample_weight_grid(blk_width, blk_height, log_blk.m_grid_width, log_blk.m_grid_height, &dequantized_weights[1][0], &upsampled_weights[1][0]);

		// Decode CEM's
		int endpoints[4][4][2]; // [subset][comp][l/h]

		uint32_t endpoint_val_index = 0;
		for (uint32_t subset = 0; subset < log_blk.m_num_partitions; subset++)
		{
			const uint32_t cem_index = log_blk.m_color_endpoint_modes[subset];

			decode_endpoint(cem_index, &endpoints[subset][0], &dequantized_endpoints[endpoint_val_index]);

			endpoint_val_index += get_num_cem_values(cem_index);
		}

		// Decode texels
		const bool small_block = num_blk_pixels < 31;
		const bool use_precomputed_texel_partitions = (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3);
		const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX;

		bool success = true;

		if (dec_mode == cDecodeModeRGB9E5)
		{
			// returns uint32_t's
			for (uint32_t y = 0; y < blk_height; y++)
			{
				for (uint32_t x = 0; x < blk_width; x++)
				{
					const uint32_t pixel_index = x + y * blk_width;

					uint32_t subset = 0;
					if (log_blk.m_num_partitions > 1)
					{
						if (use_precomputed_texel_partitions)
						{
							subset = get_precomputed_texel_partition(blk_width, blk_height, log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
							//assert((int)subset == compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)); // extra paranoia
						}
						else
							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
					}

					int comp[3];

					for (uint32_t c = 0; c < 3; c++)
					{
						const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];

						if (is_ldr_endpoints[subset])
						{
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF));
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF));

							int le = endpoints[subset][c][0];
							int he = endpoints[subset][c][1];

							le = (le << 8) | le;
							he = (he << 8) | he;

							int k = weight_interpolate(le, he, w);
							assert((k >= 0) && (k <= 0xFFFF));

							comp[c] = k; // 1.0
						}
						else
						{
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF));
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF));

							int le = endpoints[subset][c][0] << 4;
							int he = endpoints[subset][c][1] << 4;

							int qlog16 = weight_interpolate(le, he, w);

							comp[c] = qlog16_to_half(qlog16);

							if (is_half_inf_or_nan((half_float)comp[c]))
								comp[c] = 0x7BFF;
						}

					} // c

					uint32_t packed;
					if (is_ldr_endpoints[subset])
						packed = pack_rgb9e5_ldr_astc(comp[0], comp[1], comp[2]);
					else
						packed = pack_rgb9e5_hdr_astc(comp[0], comp[1], comp[2]);

					((uint32_t*)pPixels)[pixel_index] = packed;

				} // x
			} // y
		}
		else if (dec_mode == cDecodeModeHDR16)
		{
			// Note: must round towards zero when converting float to half for ASTC (18.19 Weight Application)

			// returns half floats
			for (uint32_t y = 0; y < blk_height; y++)
			{
				for (uint32_t x = 0; x < blk_width; x++)
				{
					const uint32_t pixel_index = x + y * blk_width;

					uint32_t subset = 0;
					if (log_blk.m_num_partitions > 1)
					{
						if (use_precomputed_texel_partitions)
						{
							subset = get_precomputed_texel_partition(blk_width, blk_height, log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
							//assert((int)subset == compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)); // extra paranoia
						}
						else
							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
					}

					for (uint32_t c = 0; c < 4; c++)
					{
						const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];

						half_float o;

						if ( (is_ldr_endpoints[subset]) ||
							 ((log_blk.m_color_endpoint_modes[subset] == CEM_HDR_RGB_LDR_ALPHA) && (c == 3)) )
						{
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFF));
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFF));

							int le = endpoints[subset][c][0];
							int he = endpoints[subset][c][1];

							le = (le << 8) | le;
							he = (he << 8) | he;

							int k = weight_interpolate(le, he, w);
							assert((k >= 0) && (k <= 0xFFFF));

							if (k == 0xFFFF)
								o = 0x3C00; // 1.0
							else
								o = float_to_half((float)k * (1.0f / 65536.0f), true);
						}
						else
						{
							assert((endpoints[subset][c][0] >= 0) && (endpoints[subset][c][0] <= 0xFFF));
							assert((endpoints[subset][c][1] >= 0) && (endpoints[subset][c][1] <= 0xFFF));

							int le = endpoints[subset][c][0] << 4;
							int he = endpoints[subset][c][1] << 4;

							int qlog16 = weight_interpolate(le, he, w);

							o = qlog16_to_half(qlog16);

							if (is_half_inf_or_nan(o))
								o = 0x7BFF;
						}

						((half_float*)pPixels)[pixel_index * 4 + c] = o;
					}

				} // x
			} // y
		}
		else
		{
			// returns uint8_t's
			for (uint32_t y = 0; y < blk_height; y++)
			{
				for (uint32_t x = 0; x < blk_width; x++)
				{
					const uint32_t pixel_index = x + y * blk_width;

					uint32_t subset = 0;
					if (log_blk.m_num_partitions > 1)
					{
						if (use_precomputed_texel_partitions)
						{
							subset = get_precomputed_texel_partition(blk_width, blk_height, log_blk.m_partition_id, x, y, log_blk.m_num_partitions);
							//assert((int)subset == compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)); // extra paranoia
						}
						else
							subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block);
					}

					if (!is_ldr_endpoints[subset])
					{
						((uint32_t*)pPixels)[pixel_index] = 0xFFFF00FF;
						success = false;
					}
					else
					{
						for (uint32_t c = 0; c < 4; c++)
						{
							const uint32_t w = upsampled_weights[(c == ccs) ? 1 : 0][pixel_index];

							int le = endpoints[subset][c][0];
							int he = endpoints[subset][c][1];

							// FIXME: the spec is apparently wrong? this matches ARM's and Google's decoder
							//if ((dec_mode == cDecodeModeSRGB8) && (c <= 2))
							// See https://github.com/ARM-software/astc-encoder/issues/447
							// See latest spec with recent (2023-2024) fixes:
							// https://raw.githubusercontent.com/KhronosGroup/DataFormat/refs/heads/main/astc.txt
							// "For _LDR endpoint modes_, each color component C is calculated from the corresponding 8 - bit endpoint components C~0~and C~1~as follows" - does this mean alpha too? I guess so. (8/15/2025.)

							// 2/22/2026: See ARM errata 3922301 "ASTC decompression incorrectly rounds linear color endpoints when using unorm8 decode mode". (We currently always assume unorm8 decode mode.)
							// Our ASTC/XUASTC encoders default to the sRGB decode profile, not linear, so at least our default behavior isn't impacted by this.
							// https://documentation-service.arm.com/static/67ca1a5ece2747241fced502?utm_source=chatgpt.com
							if (dec_mode == cDecodeModeSRGB8)
							{
								le = (le << 8) | 0x80;
								he = (he << 8) | 0x80;
							}
							else
							{
								le = (le << 8) | le;
								he = (he << 8) | he;
							}

							uint32_t k = weight_interpolate(le, he, w);

							// FIXME (old comment - before 2023/2024 ARM etc. spec fixes): This is what the spec says to do in LDR mode, but this is not what ARM's decoder does
							// See decompress_symbolic_block(), decode_texel() and unorm16_to_sf16.
							// It seems to effectively divide by 65535.0 and convert to FP16, then back to float, mul by 255.0, add .5 and then convert to 8-bit.
							((uint8_t*)pPixels)[pixel_index * 4 + c] = (uint8_t)(k >> 8);
						}
					}

				} // x
			} // y
		}

		return success;
	}

	// Returns true if log_blk is a block the XUASTC LDR fast path accepts:
	// - not flagged as an error,
	// - either an LDR solid color block, or a non-HDR-solid block with at most 3 partitions,
	//   no dual plane combined with multiple partitions, the same CEM in every partition,
	//   and that CEM being one of the LDR modes listed in the switch below.
	bool is_block_xuastc_ldr(const log_astc_block& log_blk)
	{
		if (log_blk.m_error_flag)
			return false;

		// The LDR solid flag is tested before the HDR one, so it wins if both happen to be set.
		if (log_blk.m_solid_color_flag_ldr)
			return true;

		const uint32_t total_parts = log_blk.m_num_partitions;

		const bool unsupported_layout =
			log_blk.m_solid_color_flag_hdr ||
			(total_parts > 3) ||
			(log_blk.m_dual_plane && (total_parts > 1));

		if (unsupported_layout)
			return false;

		// TODO: Check partition pattern ID against unique set.

		// Every partition must share the first partition's CEM.
		const auto first_cem = log_blk.m_color_endpoint_modes[0];

		for (uint32_t part = 1; part < total_parts; part++)
		{
			if (log_blk.m_color_endpoint_modes[part] != first_cem)
				return false;
		}

		// Only this set of LDR CEM's is accepted.
		switch (first_cem)
		{
			case CEM_LDR_LUM_DIRECT:
			case CEM_LDR_LUM_ALPHA_DIRECT:
			case CEM_LDR_RGB_BASE_SCALE:
			case CEM_LDR_RGB_DIRECT:
			case CEM_LDR_RGB_BASE_PLUS_OFFSET:
			case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
			case CEM_LDR_RGBA_DIRECT:
			case CEM_LDR_RGBA_BASE_PLUS_OFFSET:
				return true;
			default:
				break;
		}

		return false;
	}

	// ~2x faster than decode_block(), but XUASTC LDR only.
	// pUpsampled_weights_to_use must be at block res, [0,64], single plane blocks ONLY
	bool decode_block_xuastc_ldr(const log_astc_block& log_blk, void* pPixels, uint32_t blk_width, uint32_t blk_height, decode_mode dec_mode,
		const uint8_t* pUpsampled_weights_to_use, uint32_t start_x, uint32_t start_y, uint32_t end_x, uint32_t end_y)
	{
		if (!end_x)
			end_x = blk_width;

		if (!end_y)
			end_y = blk_height;

		assert(start_x < end_x);
		assert(start_y < end_y);
		assert(end_x <= blk_width);
		assert(end_y <= blk_height);

		assert(g_dequant_tables.m_endpoints[0].m_ISE_to_val.size());
		assert((dec_mode == cDecodeModeSRGB8) || (dec_mode == cDecodeModeLDR8));
		assert(is_valid_block_size(blk_width, blk_height));
		assert(!log_blk.m_error_flag && !log_blk.m_solid_color_flag_hdr);

		if (!log_blk.m_solid_color_flag_ldr)
		{
			assert(((log_blk.m_num_partitions >= 1) && (log_blk.m_num_partitions <= 3)));
			assert((log_blk.m_grid_width >= 2) & (log_blk.m_grid_height >= 2));
			assert((log_blk.m_grid_width <= blk_width) && (log_blk.m_grid_height <= blk_height));
			assert((log_blk.m_grid_width * log_blk.m_grid_height) <= MAX_GRID_WEIGHTS);
			assert((log_blk.m_num_partitions > 1) || (log_blk.m_partition_id == 0));
		}

		assert(is_block_xuastc_ldr(log_blk));

		const uint32_t num_blk_pixels = blk_width * blk_height;

		// Handle solid color blocks
		if (log_blk.m_solid_color_flag_ldr)
		{
			// Convert LDR pixels to 8-bits
			uint32_t x;

			((uint8_t*)&x)[0] = (uint8_t)(log_blk.m_solid_color[0] >> 8);
			((uint8_t*)&x)[1] = (uint8_t)(log_blk.m_solid_color[1] >> 8);
			((uint8_t*)&x)[2] = (uint8_t)(log_blk.m_solid_color[2] >> 8);
			((uint8_t*)&x)[3] = (uint8_t)(log_blk.m_solid_color[3] >> 8);

			uint32_t* pDst = (uint32_t*)pPixels;

			uint32_t i = 0;
			while ((i + 3) < num_blk_pixels)
			{
				pDst[i] = x;
				pDst[i + 1] = x;
				pDst[i + 2] = x;
				pDst[i + 3] = x;

				i += 4;
			}

			while (i < num_blk_pixels)
				pDst[i++] = x;

			return true;
		}

		const dequant_table& endpoint_dequant_tab = g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range);
		const uint8_t* pEndpoint_dequant = endpoint_dequant_tab.m_ISE_to_val.data();

		const dequant_table& weight_dequant_tab = g_dequant_tables.get_weight_tab(log_blk.m_weight_ise_range);
		const uint8_t* pWeight_dequant = weight_dequant_tab.m_ISE_to_val.data();

		// Check CEM's
		const uint32_t num_cem_vals = get_num_cem_values(log_blk.m_color_endpoint_modes[0]);
		const uint32_t total_cem_vals = num_cem_vals * log_blk.m_num_partitions;

		assert(total_cem_vals <= MAX_ENDPOINTS);

		// Dequantized endpoints to [0,255]
		uint8_t dequantized_endpoints[MAX_ENDPOINTS];

		for (uint32_t i = 0; i < total_cem_vals; i++)
		{
			assert(log_blk.m_endpoints[i] < endpoint_dequant_tab.m_ISE_to_val.size_u32());
			dequantized_endpoints[i] = pEndpoint_dequant[log_blk.m_endpoints[i]];
		}

		// Decode CEM's
		int endpoints[4][4][2]; // [subset][comp][l/h]

		uint32_t endpoint_val_index = 0;
		const uint32_t cem_index = log_blk.m_color_endpoint_modes[0];

		uint32_t alpha_mask = 0xFF;

		for (uint32_t subset = 0; subset < log_blk.m_num_partitions; subset++)
		{
			assert(log_blk.m_color_endpoint_modes[subset] == cem_index);

			decode_endpoint(cem_index, &endpoints[subset][0], &dequantized_endpoints[endpoint_val_index]);

			alpha_mask &= endpoints[subset][3][0];
			alpha_mask &= endpoints[subset][3][1];

			endpoint_val_index += num_cem_vals;
		}

		const bool any_alpha = alpha_mask != 255;

		// Dequantize weights to [0,64]
		uint8_t upsampled_weights[2][12 * 12];

		const uint32_t total_weight_vals = (log_blk.m_dual_plane ? 2 : 1) * log_blk.m_grid_width * log_blk.m_grid_height;

		// Upsample weight grid. [0,64] weights
		const uint8_t(*pUpsampled_weights)[12 * 12];

		uint8_t dequantized_weights[2][12 * 12];

		// For simplicity, ignore any passed in weights if dual plane
		if ((pUpsampled_weights_to_use) && (!log_blk.m_dual_plane))
		{
			// Caller is jamming in already unpacked weights for the first plane to save time
			pUpsampled_weights = reinterpret_cast<const uint8_t(*)[12 * 12]>(pUpsampled_weights_to_use);
		}
		else
		{
			if (log_blk.m_dual_plane)
			{
				for (uint32_t i = 0; i < total_weight_vals; i++)
				{
					const uint32_t plane_index = i & 1;
					const uint32_t grid_index = i >> 1;

					assert(log_blk.m_weights[i] < weight_dequant_tab.m_ISE_to_val.size_u32());
					dequantized_weights[plane_index][grid_index] = pWeight_dequant[log_blk.m_weights[i]];
				}
			}
			else
			{
				for (uint32_t i = 0; i < total_weight_vals; i++)
				{
					assert(log_blk.m_weights[i] < weight_dequant_tab.m_ISE_to_val.size_u32());
					dequantized_weights[0][i] = pWeight_dequant[log_blk.m_weights[i]];
				}
			}

			pUpsampled_weights = &dequantized_weights[0];

			if ((log_blk.m_grid_width < blk_width) || (log_blk.m_grid_height < blk_height))
			{
				upsample_weight_grid_xuastc_ldr(blk_width, blk_height,
					log_blk.m_grid_width, log_blk.m_grid_height,
					&dequantized_weights[0][0], &upsampled_weights[0][0],
					log_blk.m_dual_plane ? &dequantized_weights[1][0] : nullptr, log_blk.m_dual_plane ? &upsampled_weights[1][0] : nullptr);

				pUpsampled_weights = &upsampled_weights[0];
			}
		}

		// Decode texels
		const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX;

		const uint8_t *pPart = &g_texel_partitions[log_blk.m_partition_id][0][0]; // [seed][y][x]

		const bool large_block = (num_blk_pixels >= 31);
		uint32_t part_shift = (log_blk.m_num_partitions == 3) ? 2 : 0;
		part_shift += large_block * 4;

		//uint32_t pixel_index = 0;

		if (log_blk.m_num_partitions == 1)
		{
			// alpha, 1 subset
			int le0 = endpoints[0][0][0], he0 = endpoints[0][0][1];
			int le1 = endpoints[0][1][0], he1 = endpoints[0][1][1];
			int le2 = endpoints[0][2][0], he2 = endpoints[0][2][1];
			int le3 = endpoints[0][3][0], he3 = endpoints[0][3][1];

			if (dec_mode == cDecodeModeSRGB8)
			{
				le0 = (le0 << 8) | 0x80; he0 = (he0 << 8) | 0x80;
				le1 = (le1 << 8) | 0x80; he1 = (he1 << 8) | 0x80;
				le2 = (le2 << 8) | 0x80; he2 = (he2 << 8) | 0x80;
				le3 = (le3 << 8) | 0x80; he3 = (he3 << 8) | 0x80;
			}
			else
			{
				le0 = (le0 << 8) | le0; he0 = (he0 << 8) | he0;
				le1 = (le1 << 8) | le1; he1 = (he1 << 8) | he1;
				le2 = (le2 << 8) | le2; he2 = (he2 << 8) | he2;
				le3 = (le3 << 8) | le3; he3 = (he3 << 8) | he3;
			}

			// no subsets
			if (!any_alpha)
			{
				if (!log_blk.m_dual_plane)
				{
					for (uint32_t y = start_y; y < end_y; y++)
					{
						for (uint32_t x = start_x; x < end_x; x++)
						{
							const uint32_t pixel_index = x + y * blk_width;

							const uint32_t w0 = pUpsampled_weights[0][pixel_index];
							const uint32_t w1 = pUpsampled_weights[0][pixel_index];
							const uint32_t w2 = pUpsampled_weights[0][pixel_index];

							const uint32_t k0 = weight_interpolate(le0, he0, w0);
							const uint32_t k1 = weight_interpolate(le1, he1, w1);
							const uint32_t k2 = weight_interpolate(le2, he2, w2);

							((uint8_t*)pPixels)[pixel_index * 4 + 0] = (uint8_t)(k0 >> 8);
							((uint8_t*)pPixels)[pixel_index * 4 + 1] = (uint8_t)(k1 >> 8);
							((uint8_t*)pPixels)[pixel_index * 4 + 2] = (uint8_t)(k2 >> 8);
							((uint8_t*)pPixels)[pixel_index * 4 + 3] = 255;
						} // x
					} // y
				}
				else
				{
					for (uint32_t y = start_y; y < end_y; y++)
					{
						for (uint32_t x = start_x; x < end_x; x++)
						{
							const uint32_t pixel_index = x + y * blk_width;

							const uint32_t w0 = pUpsampled_weights[(0 == ccs) ? 1 : 0][pixel_index];
							const uint32_t w1 = pUpsampled_weights[(1 == ccs) ? 1 : 0][pixel_index];
							const uint32_t w2 = pUpsampled_weights[(2 == ccs) ? 1 : 0][pixel_index];

							const uint32_t k0 = weight_interpolate(le0, he0, w0);
							const uint32_t k1 = weight_interpolate(le1, he1, w1);
							const uint32_t k2 = weight_interpolate(le2, he2, w2);

							((uint8_t*)pPixels)[pixel_index * 4 + 0] = (uint8_t)(k0 >> 8);
							((uint8_t*)pPixels)[pixel_index * 4 + 1] = (uint8_t)(k1 >> 8);
							((uint8_t*)pPixels)[pixel_index * 4 + 2] = (uint8_t)(k2 >> 8);
							((uint8_t*)pPixels)[pixel_index * 4 + 3] = 255;
						} // x
					} // y
				}
			}
			else // (!any_alpha)
			{
				for (uint32_t y = start_y; y < end_y; y++)
				{
					for (uint32_t x = start_x; x < end_x; x++)
					{
						const uint32_t pixel_index = x + y * blk_width;

						const uint32_t w0 = pUpsampled_weights[(0 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w1 = pUpsampled_weights[(1 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w2 = pUpsampled_weights[(2 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w3 = pUpsampled_weights[(3 == ccs) ? 1 : 0][pixel_index];

						const uint32_t k0 = weight_interpolate(le0, he0, w0);
						const uint32_t k1 = weight_interpolate(le1, he1, w1);
						const uint32_t k2 = weight_interpolate(le2, he2, w2);
						const uint32_t k3 = weight_interpolate(le3, he3, w3);

						((uint8_t*)pPixels)[pixel_index * 4 + 0] = (uint8_t)(k0 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 1] = (uint8_t)(k1 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 2] = (uint8_t)(k2 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 3] = (uint8_t)(k3 >> 8);

					} // x
				}  // y
			}
		}
		else
		{
			for (uint32_t subset = 0; subset < log_blk.m_num_partitions; subset++)
			{
				int le0 = endpoints[subset][0][0], he0 = endpoints[subset][0][1];
				int le1 = endpoints[subset][1][0], he1 = endpoints[subset][1][1];
				int le2 = endpoints[subset][2][0], he2 = endpoints[subset][2][1];
				int le3 = endpoints[subset][3][0], he3 = endpoints[subset][3][1];

				if (dec_mode == cDecodeModeSRGB8)
				{
					le0 = (le0 << 8) | 0x80; he0 = (he0 << 8) | 0x80;
					le1 = (le1 << 8) | 0x80; he1 = (he1 << 8) | 0x80;
					le2 = (le2 << 8) | 0x80; he2 = (he2 << 8) | 0x80;
					le3 = (le3 << 8) | 0x80; he3 = (he3 << 8) | 0x80;
				}
				else
				{
					le0 = (le0 << 8) | le0; he0 = (he0 << 8) | he0;
					le1 = (le1 << 8) | le1; he1 = (he1 << 8) | he1;
					le2 = (le2 << 8) | le2; he2 = (he2 << 8) | he2;
					le3 = (le3 << 8) | le3; he3 = (he3 << 8) | he3;
				}

				endpoints[subset][0][0] = le0, endpoints[subset][0][1] = he0;
				endpoints[subset][1][0] = le1, endpoints[subset][1][1] = he1;
				endpoints[subset][2][0] = le2, endpoints[subset][2][1] = he2;
				endpoints[subset][3][0] = le3, endpoints[subset][3][1] = he3;
			}

			// subsets
			if (!any_alpha)
			{
				// no alpha, sRGB
				for (uint32_t y = start_y; y < end_y; y++)
				{
					for (uint32_t x = start_x; x < end_x; x++)
					{
						const uint32_t pixel_index = x + y * blk_width;

						const uint32_t v = pPart[y * 12 + x];
						const uint32_t subset = (v >> part_shift) & 3;

						const uint32_t w0 = pUpsampled_weights[(0 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w1 = pUpsampled_weights[(1 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w2 = pUpsampled_weights[(2 == ccs) ? 1 : 0][pixel_index];

						int le0 = endpoints[subset][0][0], he0 = endpoints[subset][0][1];
						int le1 = endpoints[subset][1][0], he1 = endpoints[subset][1][1];
						int le2 = endpoints[subset][2][0], he2 = endpoints[subset][2][1];

						const uint32_t k0 = weight_interpolate(le0, he0, w0);
						const uint32_t k1 = weight_interpolate(le1, he1, w1);
						const uint32_t k2 = weight_interpolate(le2, he2, w2);

						((uint8_t*)pPixels)[pixel_index * 4 + 0] = (uint8_t)(k0 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 1] = (uint8_t)(k1 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 2] = (uint8_t)(k2 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 3] = 255;
					} // x
				} // y
			}
			else
			{
				// alpha
				for (uint32_t y = start_y; y < end_y; y++)
				{
					for (uint32_t x = start_x; x < end_x; x++)
					{
						const uint32_t pixel_index = x + y * blk_width;

						const uint32_t v = pPart[y * 12 + x];
						const uint32_t subset = (v >> part_shift) & 3;

						const uint32_t w0 = pUpsampled_weights[(0 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w1 = pUpsampled_weights[(1 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w2 = pUpsampled_weights[(2 == ccs) ? 1 : 0][pixel_index];
						const uint32_t w3 = pUpsampled_weights[(3 == ccs) ? 1 : 0][pixel_index];

						int le0 = endpoints[subset][0][0], he0 = endpoints[subset][0][1];
						int le1 = endpoints[subset][1][0], he1 = endpoints[subset][1][1];
						int le2 = endpoints[subset][2][0], he2 = endpoints[subset][2][1];
						int le3 = endpoints[subset][3][0], he3 = endpoints[subset][3][1];

						const uint32_t k0 = weight_interpolate(le0, he0, w0);
						const uint32_t k1 = weight_interpolate(le1, he1, w1);
						const uint32_t k2 = weight_interpolate(le2, he2, w2);
						const uint32_t k3 = weight_interpolate(le3, he3, w3);

						((uint8_t*)pPixels)[pixel_index * 4 + 0] = (uint8_t)(k0 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 1] = (uint8_t)(k1 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 2] = (uint8_t)(k2 >> 8);
						((uint8_t*)pPixels)[pixel_index * 4 + 3] = (uint8_t)(k3 >> 8);

					} // x
				}  // y

			}

		} // if (log_blk.m_num_partitions == 1)

		return true;
	}

	//------------------------------------------------
	// Physical to logical block decoding

	// unsigned 128-bit int, with some signed helpers
	// The value is stored as two 64-bit halves (m_lo = bits [0,63], m_hi = bits [64,127]).
	// The "signed" helpers interpret the same bits as a two's complement number whose sign bit is bit 127.
	// Note the default constructor leaves the value uninitialized.
	class uint128
	{
		uint64_t m_lo, m_hi;

	public:
		uint128() = default;
		inline uint128(uint64_t lo) : m_lo(lo), m_hi(0) { }
		inline uint128(uint64_t lo, uint64_t hi) : m_lo(lo), m_hi(hi) { }
		inline uint128(const uint128& other) : m_lo(other.m_lo), m_hi(other.m_hi) { }

		// Sets from a signed 64-bit value, sign extending into the high half.
		inline uint128& set_signed(int64_t lo) { m_lo = (uint64_t)lo; m_hi = (lo < 0) ? UINT64_MAX : 0; return *this; }
		// Sets from an unsigned 64-bit value, zeroing the high half.
		inline uint128& set(uint64_t lo) { m_lo = lo; m_hi = 0; return *this; }

		// Explicit narrowing conversions: all of them truncate to the low bits.
		inline explicit operator uint8_t () const { return (uint8_t)m_lo; }
		inline explicit operator uint16_t () const { return (uint16_t)m_lo; }
		inline explicit operator uint32_t () const { return (uint32_t)m_lo; }
		inline explicit operator uint64_t () const { return m_lo; }

		inline uint128& operator= (const uint128& rhs) { m_lo = rhs.m_lo; m_hi = rhs.m_hi; return *this; }
		inline uint128& operator= (const uint64_t val) { m_lo = val; m_hi = 0; return *this; }

		inline uint64_t get_low() const { return m_lo; }
		inline uint64_t& get_low() { return m_lo; }

		inline uint64_t get_high() const { return m_hi; }
		inline uint64_t& get_high() { return m_hi; }

		inline bool operator== (const uint128& rhs) const { return (m_lo == rhs.m_lo) && (m_hi == rhs.m_hi); }
		inline bool operator!= (const uint128& rhs) const { return (m_lo != rhs.m_lo) || (m_hi != rhs.m_hi); }

		// Unsigned comparison: high halves first, low halves break ties.
		inline bool operator< (const uint128& rhs) const
		{
			if (m_hi < rhs.m_hi)
				return true;

			if (m_hi == rhs.m_hi)
			{
				if (m_lo < rhs.m_lo)
					return true;
			}

			return false;
		}

		inline bool operator> (const uint128& rhs) const { return (rhs < *this); }

		inline bool operator<= (const uint128& rhs) const { return (*this == rhs) || (*this < rhs); }
		inline bool operator>= (const uint128& rhs) const { return (*this == rhs) || (*this > rhs); }

		inline bool is_zero() const { return (m_lo == 0) && (m_hi == 0); }
		inline bool is_all_ones() const { return (m_lo == UINT64_MAX) && (m_hi == UINT64_MAX); }
		inline bool is_non_zero() const { return (m_lo != 0) || (m_hi != 0); }
		inline explicit operator bool() const { return is_non_zero(); }

		// True if bit 127 is set, i.e. the value is negative when interpreted as two's complement.
		inline bool is_signed() const { return ((int64_t)m_hi) < 0; }

		// Two's complement comparison.
		inline bool signed_less(const uint128& rhs) const
		{
			const bool l_signed = is_signed(), r_signed = rhs.is_signed();

			// Same sign: the unsigned ordering of the bit patterns matches the signed ordering.
			if (l_signed == r_signed)
				return *this < rhs;

			// Differing signs: the negative operand is the smaller one.
			if (l_signed && !r_signed)
				return true;

			assert(!l_signed && r_signed);
			return false;
		}

		inline bool signed_greater(const uint128& rhs) const { return rhs.signed_less(*this); }
		inline bool signed_less_equal(const uint128& rhs) const { return !rhs.signed_less(*this); }
		inline bool signed_greater_equal(const uint128& rhs) const { return !signed_less(rhs); }

		// Converts the unsigned value to double (precision is lost for large values).
		double get_double() const
		{
			double res = 0;

			if (m_hi)
				res = (double)m_hi * 18446744073709551616.0; // 2^64, exactly representable

			res += (double)m_lo;

			return res;
		}

		// Converts the two's complement value to double.
		double get_signed_double() const
		{
			if (is_signed())
				return -(abs().get_double());
			else
				return get_double();
		}

		// Two's complement absolute value (the most negative value maps to itself).
		inline uint128 abs() const
		{
			uint128 res(*this);
			if (res.is_signed())
				res = -res;
			return res;
		}

		// Logical left shift. Negative shifts assert and are otherwise a no-op; shifts >= 128 yield 0.
		inline uint128& operator<<= (int shift)
		{
			assert(shift >= 0);
			if (shift < 0)
				return *this;

			m_hi = (shift >= 64) ? ((shift >= 128) ? 0 : (m_lo << (shift - 64))) : (m_hi << shift);

			// Carry the bits that cross from the low half into the high half.
			// (shift == 0 is excluded because a 64-bit shift by 64 would be undefined.)
			if ((shift) && (shift < 64))
				m_hi |= (m_lo >> (64 - shift));

			m_lo = (shift >= 64) ? 0 : (m_lo << shift);

			return *this;
		}

		inline uint128 operator<< (int shift) const { uint128 res(*this); res <<= shift; return res; }

		// Logical right shift. Negative shifts assert and are otherwise a no-op; shifts >= 128 yield 0.
		inline uint128& operator>>= (int shift)
		{
			assert(shift >= 0);
			if (shift < 0)
				return *this;

			m_lo = (shift >= 64) ? ((shift >= 128) ? 0 : (m_hi >> (shift - 64))) : (m_lo >> shift);

			// Carry the bits that cross from the high half into the low half.
			if ((shift) && (shift < 64))
				m_lo |= (m_hi << (64 - shift));

			m_hi = (shift >= 64) ? 0 : (m_hi >> shift);

			return *this;
		}

		inline uint128 operator>> (int shift) const { uint128 res(*this); res >>= shift; return res; }

		// Arithmetic right shift: the vacated top bits are filled with the sign bit.
		inline uint128 signed_shift_right(int shift) const
		{
			uint128 res(*this);
			res >>= shift;

			if (is_signed())
			{
				// ~(all_ones >> shift) has exactly the top 'shift' bits set.
				uint128 fill(UINT64_MAX, UINT64_MAX);
				fill >>= shift;
				res |= (~fill);
			}

			return res;
		}

		inline uint128& operator |= (const uint128& rhs) { m_lo |= rhs.m_lo; m_hi |= rhs.m_hi; return *this; }
		inline uint128 operator | (const uint128& rhs) const { uint128 res(*this); res |= rhs; return res; }

		inline uint128& operator &= (const uint128& rhs) { m_lo &= rhs.m_lo; m_hi &= rhs.m_hi; return *this; }
		inline uint128 operator & (const uint128& rhs) const { uint128 res(*this); res &= rhs;	return res; }

		inline uint128& operator ^= (const uint128& rhs) { m_lo ^= rhs.m_lo; m_hi ^= rhs.m_hi; return *this; }
		inline uint128 operator ^ (const uint128& rhs) const { uint128 res(*this); res ^= rhs;	return res; }

		inline uint128 operator ~() const { return uint128(~m_lo, ~m_hi); }

		// Two's complement negation: invert, then add one (propagating the carry).
		inline uint128 operator -() const { uint128 res(~*this); if (++res.m_lo == 0) ++res.m_hi; return res; }

		// prefix (returns a copy of the updated value)
		inline uint128 operator ++()
		{
			if (++m_lo == 0)
				++m_hi;
			return *this;
		}

		// postfix
		inline uint128 operator ++(int)
		{
			uint128 res(*this);
			if (++m_lo == 0)
				++m_hi;
			return res;
		}

		// prefix (returns a copy of the updated value)
		inline uint128 operator --()
		{
			const uint64_t t = m_lo;
			// The low half wrapped around (borrow) if it became larger than it was.
			if (--m_lo > t)
				--m_hi;
			return *this;
		}

		// postfix
		inline uint128 operator --(int)
		{
			const uint64_t t = m_lo;
			uint128 res(*this);
			if (--m_lo > t)
				--m_hi;
			return res;
		}

		inline uint128& operator+= (const uint128& rhs)
		{
			const uint64_t t = m_lo + rhs.m_lo;
			// (t < m_lo) is the carry out of the low half.
			m_hi = m_hi + rhs.m_hi + (t < m_lo);
			m_lo = t;
			return *this;
		}

		inline uint128 operator+ (const uint128& rhs) const { uint128 res(*this); res += rhs; return res; }

		inline uint128& operator-= (const uint128& rhs)
		{
			const uint64_t t = m_lo - rhs.m_lo;
			// (t > m_lo) is the borrow out of the low half.
			m_hi = m_hi - rhs.m_hi - (t > m_lo);
			m_lo = t;
			return *this;
		}

		inline uint128 operator- (const uint128& rhs) const { uint128 res(*this); res -= rhs; return res; }

		// computes bit by bit, very slow
		// Shift-and-add; the result is truncated to 128 bits.
		uint128& operator*=(const uint128& rhs)
		{
			uint128 temp(*this), result(0U);

			for (uint128 bitmask(rhs); bitmask; bitmask >>= 1, temp <<= 1)
				if (bitmask.get_low() & 1)
					result += temp;

			*this = result;
			return *this;
		}

		uint128 operator*(const uint128& rhs) const { uint128 res(*this); res *= rhs; return res; }

		// computes bit by bit, very slow
		// Unsigned restoring long division. Returns the quotient and stores the remainder in 'remainder'.
		// Division by zero asserts; in release builds it returns all ones with a remainder of 0.
		// 'remainder' may alias 'dividend' or 'divisor'.
		friend uint128 divide(const uint128& dividend, const uint128& divisor, uint128& remainder)
		{
			// Copy the inputs first so writing 'remainder' can't clobber them when they alias.
			const uint128 n(dividend), d(divisor);

			remainder = 0;

			if (!d)
			{
				assert(0);
				return ~uint128(0U);
			}

			uint128 quotient(0), one(1);

			for (int i = 127; i >= 0; i--)
			{
				// Bring down the next dividend bit, then subtract the divisor if it fits.
				remainder = (remainder << 1) | ((n >> i) & one);
				if (remainder >= d)
				{
					remainder -= d;
					quotient |= (one << i);
				}
			}

			return quotient;
		}

		uint128 operator/(const uint128& rhs) const { uint128 remainder, res; res = divide(*this, rhs, remainder); return res; }
		uint128 operator/=(const uint128& rhs) { uint128 remainder; *this = divide(*this, rhs, remainder); return *this; }

		uint128 operator%(const uint128& rhs) const { uint128 remainder; divide(*this, rhs, remainder); return remainder; }
		uint128 operator%=(const uint128& rhs) { uint128 remainder; divide(*this, rhs, remainder); *this = remainder; return *this; }

		// Prints the value as "0x" followed by 32 zero-padded lowercase hex digits.
		void print_hex(FILE* pFile) const
		{
			fprintf(pFile, "0x%016llx%016llx", (unsigned long long int)m_hi, (unsigned long long int)m_lo);
		}

		// Appends the unsigned decimal representation to res.
		void format_unsigned(std::string& res) const
		{
			// 2^128 - 1 has 39 decimal digits, so a small stack buffer always suffices.
			uint8_t digits[39 + 1];
			uint32_t num_digits = 0;

			// Digits are produced least significant first.
			uint128 k(*this);
			const uint128 ten(10);
			do
			{
				uint128 r;
				k = divide(k, ten, r);

				assert(num_digits < sizeof(digits));
				digits[num_digits++] = (uint8_t)r;
			} while (k);

			while (num_digits)
				res.push_back((char)('0' + digits[--num_digits]));
		}

		// Appends the two's complement decimal representation to res (with a leading '-' if negative).
		void format_signed(std::string& res) const
		{
			uint128 val(*this);

			if (val.is_signed())
			{
				res.push_back('-');
				val = -val;
			}

			val.format_unsigned(res);
		}

		void print_unsigned(FILE* pFile) const
		{
			std::string str;
			format_unsigned(str);
			fprintf(pFile, "%s", str.c_str());
		}

		void print_signed(FILE* pFile) const
		{
			std::string str;
			format_signed(str);
			fprintf(pFile, "%s", str.c_str());
		}

		// Returns the value with all 128 bits in reverse order (bit i moves to bit 127 - i).
		uint128 get_reversed_bits() const
		{
			// Reverse each 32-bit quarter with rev_dword() and swap the quarters end-for-end.
			// Done with shifts on the 64-bit halves rather than through uint32_t pointers,
			// which would violate strict aliasing.
			const uint32_t r_lo_lo = rev_dword((uint32_t)m_lo);
			const uint32_t r_lo_hi = rev_dword((uint32_t)(m_lo >> 32));
			const uint32_t r_hi_lo = rev_dword((uint32_t)m_hi);
			const uint32_t r_hi_hi = rev_dword((uint32_t)(m_hi >> 32));

			const uint64_t new_lo = (uint64_t)r_hi_hi | ((uint64_t)r_hi_lo << 32);
			const uint64_t new_hi = (uint64_t)r_lo_hi | ((uint64_t)r_lo_lo << 32);

			return uint128(new_lo, new_hi);
		}

		// Returns the value with its 16 bytes in reverse order.
		uint128 get_byteswapped() const
		{
			uint128 res;

			// Byte-wise access through uint8_t pointers is permitted by the aliasing rules.
			const uint8_t* pSrc = (const uint8_t*)this;
			uint8_t* pDst = (uint8_t*)&res;

			for (uint32_t i = 0; i < 16; i++)
				pDst[i] = pSrc[15 - i];

			return res;
		}

		// Extracts bit_len (1..64) bits starting at bit_ofs; the field must lie within the 128 bits.
		inline uint64_t get_bits64(uint32_t bit_ofs, uint32_t bit_len) const
		{
			assert(bit_ofs < 128);
			assert(bit_len && (bit_len <= 64) && ((bit_ofs + bit_len) <= 128));

			uint128 res(*this);
			res >>= bit_ofs;

			// A 64-bit shift by 64 would be undefined, so the full-width mask is special cased.
			const uint64_t bitmask = (bit_len == 64) ? UINT64_MAX : ((1ull << bit_len) - 1);
			return res.get_low() & bitmask;
		}

		// Same as get_bits64(), limited to at most 32 bits.
		inline uint32_t get_bits(uint32_t bit_ofs, uint32_t bit_len) const
		{
			assert(bit_len <= 32);
			return (uint32_t)get_bits64(bit_ofs, bit_len);
		}

		// Reads len (1..32) bits at bit_ofs, then advances bit_ofs past them.
		inline uint32_t next_bits(uint32_t& bit_ofs, uint32_t len) const
		{
			assert(len && (len <= 32));
			uint32_t x = get_bits(bit_ofs, len);
			bit_ofs += len;
			return x;
		}

		// Overwrites the num_bits (1..64) wide field at bit_ofs with val. val must fit in num_bits (asserted).
		inline uint128& set_bits(uint64_t val, uint32_t bit_ofs, uint32_t num_bits)
		{
			assert(bit_ofs < 128);
			assert(num_bits && (num_bits <= 64) && ((bit_ofs + num_bits) <= 128));

			uint128 bitmask(1);
			bitmask = (bitmask << num_bits) - 1;
			assert(uint128(val) <= bitmask);

			// Clear the destination field, then OR in the new value.
			bitmask <<= bit_ofs;
			*this &= ~bitmask;

			*this = *this | (uint128(val) << bit_ofs);
			return *this;
		}
	};

	// Decodes a void-extent (solid color) physical block into log_blk.
	// Returns false, leaving log_blk untouched, if bits [10,11] aren't both set or the extents are invalid.
	// Also returns false (after the solid color fields have been written) if the HDR solid color flag
	// is set on log_blk and any color component is an Inf/NaN half float.
	static bool decode_void_extent(const uint128& bits, log_astc_block& log_blk)
	{
		const uint32_t marker_bits = bits.get_bits(10, 2);
		if (marker_bits != 0b11)
			return false;

		// Four consecutive 13-bit extent fields follow, starting at bit 12: min_s, max_s, min_t, max_t.
		uint32_t extents[4];
		bool every_extent_all_ones = true;

		uint32_t cur_ofs = 12;
		for (uint32_t i = 0; i < 4; i++)
		{
			extents[i] = bits.next_bits(cur_ofs, 13);
			every_extent_all_ones = every_extent_all_ones && (extents[i] == 0x1FFF);
		}
		assert(cur_ofs == 64);

		// Unless all four extents are all-ones, each min must be strictly below its max.
		if (!every_extent_all_ones)
		{
			const bool s_invalid = extents[0] >= extents[1];
			const bool t_invalid = extents[2] >= extents[3];
			if (s_invalid || t_invalid)
				return false;
		}

		// Bit 9 selects which of the two solid color flags gets set.
		if (bits.get_bits(9, 1) != 0)
			log_blk.m_solid_color_flag_hdr = true;
		else
			log_blk.m_solid_color_flag_ldr = true;

		// The four 16-bit color components occupy the upper 64 bits.
		for (uint32_t c = 0; c < 4; c++)
			log_blk.m_solid_color[c] = (uint16_t)bits.get_bits(64 + c * 16, 16);

		if (!log_blk.m_solid_color_flag_hdr)
			return true;

		// HDR colors are half floats: reject Inf/NaN.
		for (uint32_t c = 0; c < 4; c++)
		{
			if (is_half_inf_or_nan(log_blk.m_solid_color[c]))
				return false;
		}

		return true;
	}

	// One row of the block mode layout table consumed by decode_config().
	// All *_ofs fields are bit offsets into the physical block. Field order matters: s_dec_rows
	// initializes rows positionally.
	struct astc_dec_row
	{
		int8_t Dp_ofs;	// offset of the 1-bit dual plane flag, or negative if the row has none
		int8_t P_ofs;	// offset of the 1-bit P flag (adds BISE_10_LEVELS to the weight ISE range), or negative if none
		int8_t W_ofs;	// offset of the grid width field
		int8_t W_size;	// size in bits of the grid width field (0 = no field, width is W_bias)
		int8_t H_ofs;	// offset of the grid height field
		int8_t H_size;	// size in bits of the grid height field (0 = no field, height is H_bias)
		int8_t W_bias;	// value added to the width field to get the grid width
		int8_t H_bias;	// value added to the height field to get the grid height
		int8_t p0_ofs;	// offset of bit 0 of the 3-bit weight range selector p
		int8_t p1_ofs;	// offset of bit 1 of p
		int8_t p2_ofs;	// offset of bit 2 of p
	};

	// Block mode layout rows, indexed by the row_index computed in decode_config().
	// In the row notes, "N + k@o" means bias N plus the k-bit field at bit offset o.
	// Columns: Dp_ofs, P_ofs, W_ofs, W_size, H_ofs, H_size, W_bias, H_bias, p0_ofs, p1_ofs, p2_ofs
	static const astc_dec_row s_dec_rows[10] =
	{
		{ 10,  9, 7, 2, 5, 2,  4,  2, 4, 0, 1 }, // row 0: W = 4 + 2@7, H = 2 + 2@5
		{ 10,  9, 7, 2, 5, 2,  8,  2, 4, 0, 1 }, // row 1: W = 8 + 2@7, H = 2 + 2@5
		{ 10,  9, 5, 2, 7, 2,  2,  8, 4, 0, 1 }, // row 2: W = 2 + 2@5, H = 8 + 2@7
		{ 10,  9, 5, 2, 7, 1,  2,  6, 4, 0, 1 }, // row 3: W = 2 + 2@5, H = 6 + 1@7
		{ 10,  9, 7, 1, 5, 2,  2,  2, 4, 0, 1 }, // row 4: W = 2 + 1@7, H = 2 + 2@5

		{ 10,  9, 0, 0, 5, 2, 12,  2, 4, 2, 3 }, // row 5: W = 12,      H = 2 + 2@5
		{ 10,  9, 5, 2, 0, 0,  2, 12, 4, 2, 3 }, // row 6: W = 2 + 2@5, H = 12
		{ 10,  9, 0, 0, 0, 0,  6, 10, 4, 2, 3 }, // row 7: W = 6,       H = 10
		{ 10,  9, 0, 0, 0, 0, 10,  6, 4, 2, 3 }, // row 8: W = 10,      H = 6
		{ -1, -1, 5, 2, 9, 2,  6,  6, 4, 2, 3 }, // row 9: W = 6 + 2@5, H = 6 + 2@9 (no Dp/P bits)
	};

	// Decodes the block mode/config bits of a physical block into log_blk.
	// Void-extent blocks are forwarded to decode_void_extent(). Otherwise, on success, the weight grid
	// dimensions, the weight ISE range and the dual plane flag are written.
	// Returns false for reserved/invalid encodings; in that case this function itself writes nothing to log_blk.
	static bool decode_config(const uint128& bits, log_astc_block& log_blk)
	{
		const uint32_t low2 = bits.get_bits(0, 2);

		// Reserved: low 4 bits all zero
		if (bits.get_bits(0, 4) == 0)
			return false;

		// Reserved: low 2 bits zero and bits [6,8] all set, unless bits [2,5] are also all set
		const bool top3_set = (bits.get_bits(6, 3) == 0b111);
		if ((low2 == 0) && top3_set && (bits.get_bits(2, 4) != 0b1111))
			return false;

		// Void extent
		if (bits.get_bits(0, 9) == 0b111111100)
			return decode_void_extent(bits, log_blk);

		// Select the layout row
		int row_index = -1;

		if (low2 != 0)
		{
			// Rows 0-2 are chosen by bits [2,3] directly; the value 3 splits on bit 8 into rows 3 and 4.
			const uint32_t sel = bits.get_bits(2, 2);
			row_index = (sel < 3) ? (int)sel : (int)(3 + bits.get_bits(8, 1));
		}
		else
		{
			const uint32_t bits5_8 = bits.get_bits(5, 4);

			switch (bits5_8 >> 2) // i.e. bits [7,8]
			{
				case 0b00:
					row_index = 5;
					break;
				case 0b01:
					row_index = 6;
					break;
				case 0b10:
					row_index = 9;
					break;
				default:
					if (bits5_8 == 0b1100)
						row_index = 7;
					else if (bits5_8 == 0b1101)
						row_index = 8;
					break;
			}
		}

		if (row_index < 0)
			return false;

		const astc_dec_row& r = s_dec_rows[row_index];

		// Optional single-bit flags (a negative offset means the row doesn't have the bit).
		const bool P = (r.P_ofs >= 0) && (bits.get_bits(r.P_ofs, 1) != 0);
		const bool Dp = (r.Dp_ofs >= 0) && (bits.get_bits(r.Dp_ofs, 1) != 0);

		// Grid dimensions: bias plus an optional bitfield.
		uint32_t W = r.W_bias;
		if (r.W_size)
			W += bits.get_bits(r.W_ofs, r.W_size);

		uint32_t H = r.H_bias;
		if (r.H_size)
			H += bits.get_bits(r.H_ofs, r.H_size);

		assert((W >= MIN_GRID_DIM) && (W <= MAX_BLOCK_DIM));
		assert((H >= MIN_GRID_DIM) && (H <= MAX_BLOCK_DIM));

		// Gather the 3-bit weight range selector from its scattered bit positions.
		const uint32_t p =
			bits.get_bits(r.p0_ofs, 1) |
			(bits.get_bits(r.p1_ofs, 1) << 1) |
			(bits.get_bits(r.p2_ofs, 1) << 2);

		// Values 0 and 1 are not valid.
		if (p < 2)
			return false;

		log_blk.m_grid_width = (uint8_t)W;
		log_blk.m_grid_height = (uint8_t)H;

		log_blk.m_weight_ise_range = (uint8_t)((p - 2) + (P ? BISE_10_LEVELS : 0));
		assert(log_blk.m_weight_ise_range <= LAST_VALID_WEIGHT_ISE_RANGE);

		log_blk.m_dual_plane = Dp;

		return true;
	}

	// Reads an unsigned 32-bit little-endian value from the 4 bytes starting at pBytes.
	// Each byte is widened to uint32_t before shifting: without the cast the bytes are promoted to
	// (signed) int, and shifting a byte >= 0x80 left by 24 would shift into the sign bit.
	static inline uint32_t read_le_dword(const uint8_t* pBytes)
	{
		return (uint32_t)pBytes[0] | ((uint32_t)pBytes[1] << 8U) | ((uint32_t)pBytes[2] << 16U) | ((uint32_t)pBytes[3] << 24U);
	}

	// See 18.12.Integer Sequence Encoding - tables computed by executing the decoder functions with all possible 8/7-bit inputs.
	// s_trit_decode is indexed by the packed trit bits (T) that decode_trit_block() gathers for one block
	// (2+2+1+2+1 = 8 bits, hence 256 rows). Each row holds that block's 5 trits, in value order;
	// decode_trit_block() places each trit above the value's bits_per_val low bits.
	static const uint8_t s_trit_decode[256][5] =
	{
		{0,0,0,0,0},{1,0,0,0,0},{2,0,0,0,0},{0,0,2,0,0},{0,1,0,0,0},{1,1,0,0,0},{2,1,0,0,0},{1,0,2,0,0},
		{0,2,0,0,0},{1,2,0,0,0},{2,2,0,0,0},{2,0,2,0,0},{0,2,2,0,0},{1,2,2,0,0},{2,2,2,0,0},{2,0,2,0,0},
		{0,0,1,0,0},{1,0,1,0,0},{2,0,1,0,0},{0,1,2,0,0},{0,1,1,0,0},{1,1,1,0,0},{2,1,1,0,0},{1,1,2,0,0},
		{0,2,1,0,0},{1,2,1,0,0},{2,2,1,0,0},{2,1,2,0,0},{0,0,0,2,2},{1,0,0,2,2},{2,0,0,2,2},{0,0,2,2,2},
		{0,0,0,1,0},{1,0,0,1,0},{2,0,0,1,0},{0,0,2,1,0},{0,1,0,1,0},{1,1,0,1,0},{2,1,0,1,0},{1,0,2,1,0},
		{0,2,0,1,0},{1,2,0,1,0},{2,2,0,1,0},{2,0,2,1,0},{0,2,2,1,0},{1,2,2,1,0},{2,2,2,1,0},{2,0,2,1,0},
		{0,0,1,1,0},{1,0,1,1,0},{2,0,1,1,0},{0,1,2,1,0},{0,1,1,1,0},{1,1,1,1,0},{2,1,1,1,0},{1,1,2,1,0},
		{0,2,1,1,0},{1,2,1,1,0},{2,2,1,1,0},{2,1,2,1,0},{0,1,0,2,2},{1,1,0,2,2},{2,1,0,2,2},{1,0,2,2,2},
		{0,0,0,2,0},{1,0,0,2,0},{2,0,0,2,0},{0,0,2,2,0},{0,1,0,2,0},{1,1,0,2,0},{2,1,0,2,0},{1,0,2,2,0},
		{0,2,0,2,0},{1,2,0,2,0},{2,2,0,2,0},{2,0,2,2,0},{0,2,2,2,0},{1,2,2,2,0},{2,2,2,2,0},{2,0,2,2,0},
		{0,0,1,2,0},{1,0,1,2,0},{2,0,1,2,0},{0,1,2,2,0},{0,1,1,2,0},{1,1,1,2,0},{2,1,1,2,0},{1,1,2,2,0},
		{0,2,1,2,0},{1,2,1,2,0},{2,2,1,2,0},{2,1,2,2,0},{0,2,0,2,2},{1,2,0,2,2},{2,2,0,2,2},{2,0,2,2,2},
		{0,0,0,0,2},{1,0,0,0,2},{2,0,0,0,2},{0,0,2,0,2},{0,1,0,0,2},{1,1,0,0,2},{2,1,0,0,2},{1,0,2,0,2},
		{0,2,0,0,2},{1,2,0,0,2},{2,2,0,0,2},{2,0,2,0,2},{0,2,2,0,2},{1,2,2,0,2},{2,2,2,0,2},{2,0,2,0,2},
		{0,0,1,0,2},{1,0,1,0,2},{2,0,1,0,2},{0,1,2,0,2},{0,1,1,0,2},{1,1,1,0,2},{2,1,1,0,2},{1,1,2,0,2},
		{0,2,1,0,2},{1,2,1,0,2},{2,2,1,0,2},{2,1,2,0,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,0,2,2,2},
		{0,0,0,0,1},{1,0,0,0,1},{2,0,0,0,1},{0,0,2,0,1},{0,1,0,0,1},{1,1,0,0,1},{2,1,0,0,1},{1,0,2,0,1},
		{0,2,0,0,1},{1,2,0,0,1},{2,2,0,0,1},{2,0,2,0,1},{0,2,2,0,1},{1,2,2,0,1},{2,2,2,0,1},{2,0,2,0,1},
		{0,0,1,0,1},{1,0,1,0,1},{2,0,1,0,1},{0,1,2,0,1},{0,1,1,0,1},{1,1,1,0,1},{2,1,1,0,1},{1,1,2,0,1},
		{0,2,1,0,1},{1,2,1,0,1},{2,2,1,0,1},{2,1,2,0,1},{0,0,1,2,2},{1,0,1,2,2},{2,0,1,2,2},{0,1,2,2,2},
		{0,0,0,1,1},{1,0,0,1,1},{2,0,0,1,1},{0,0,2,1,1},{0,1,0,1,1},{1,1,0,1,1},{2,1,0,1,1},{1,0,2,1,1},
		{0,2,0,1,1},{1,2,0,1,1},{2,2,0,1,1},{2,0,2,1,1},{0,2,2,1,1},{1,2,2,1,1},{2,2,2,1,1},{2,0,2,1,1},
		{0,0,1,1,1},{1,0,1,1,1},{2,0,1,1,1},{0,1,2,1,1},{0,1,1,1,1},{1,1,1,1,1},{2,1,1,1,1},{1,1,2,1,1},
		{0,2,1,1,1},{1,2,1,1,1},{2,2,1,1,1},{2,1,2,1,1},{0,1,1,2,2},{1,1,1,2,2},{2,1,1,2,2},{1,1,2,2,2},
		{0,0,0,2,1},{1,0,0,2,1},{2,0,0,2,1},{0,0,2,2,1},{0,1,0,2,1},{1,1,0,2,1},{2,1,0,2,1},{1,0,2,2,1},
		{0,2,0,2,1},{1,2,0,2,1},{2,2,0,2,1},{2,0,2,2,1},{0,2,2,2,1},{1,2,2,2,1},{2,2,2,2,1},{2,0,2,2,1},
		{0,0,1,2,1},{1,0,1,2,1},{2,0,1,2,1},{0,1,2,2,1},{0,1,1,2,1},{1,1,1,2,1},{2,1,1,2,1},{1,1,2,2,1},
		{0,2,1,2,1},{1,2,1,2,1},{2,2,1,2,1},{2,1,2,2,1},{0,2,1,2,2},{1,2,1,2,2},{2,2,1,2,2},{2,1,2,2,2},
		{0,0,0,1,2},{1,0,0,1,2},{2,0,0,1,2},{0,0,2,1,2},{0,1,0,1,2},{1,1,0,1,2},{2,1,0,1,2},{1,0,2,1,2},
		{0,2,0,1,2},{1,2,0,1,2},{2,2,0,1,2},{2,0,2,1,2},{0,2,2,1,2},{1,2,2,1,2},{2,2,2,1,2},{2,0,2,1,2},
		{0,0,1,1,2},{1,0,1,1,2},{2,0,1,1,2},{0,1,2,1,2},{0,1,1,1,2},{1,1,1,1,2},{2,1,1,1,2},{1,1,2,1,2},
		{0,2,1,1,2},{1,2,1,1,2},{2,2,1,1,2},{2,1,2,1,2},{0,2,2,2,2},{1,2,2,2,2},{2,2,2,2,2},{2,1,2,2,2}
	};

	// Quint counterpart of s_trit_decode: indexed by the packed quint bits that decode_quint_block()
	// gathers for one block (3+2+2 = 7 bits, hence 128 rows). Each row holds that block's 3 quints,
	// in value order; decode_quint_block() places each quint above the value's bits_per_val low bits.
	static const uint8_t s_quint_decode[128][3] =
	{
		{0,0,0},{1,0,0},{2,0,0},{3,0,0},{4,0,0},{0,4,0},{4,4,0},{4,4,4},
		{0,1,0},{1,1,0},{2,1,0},{3,1,0},{4,1,0},{1,4,0},{4,4,1},{4,4,4},
		{0,2,0},{1,2,0},{2,2,0},{3,2,0},{4,2,0},{2,4,0},{4,4,2},{4,4,4},
		{0,3,0},{1,3,0},{2,3,0},{3,3,0},{4,3,0},{3,4,0},{4,4,3},{4,4,4},
		{0,0,1},{1,0,1},{2,0,1},{3,0,1},{4,0,1},{0,4,1},{4,0,4},{0,4,4},
		{0,1,1},{1,1,1},{2,1,1},{3,1,1},{4,1,1},{1,4,1},{4,1,4},{1,4,4},
		{0,2,1},{1,2,1},{2,2,1},{3,2,1},{4,2,1},{2,4,1},{4,2,4},{2,4,4},
		{0,3,1},{1,3,1},{2,3,1},{3,3,1},{4,3,1},{3,4,1},{4,3,4},{3,4,4},
		{0,0,2},{1,0,2},{2,0,2},{3,0,2},{4,0,2},{0,4,2},{2,0,4},{3,0,4},
		{0,1,2},{1,1,2},{2,1,2},{3,1,2},{4,1,2},{1,4,2},{2,1,4},{3,1,4},
		{0,2,2},{1,2,2},{2,2,2},{3,2,2},{4,2,2},{2,4,2},{2,2,4},{3,2,4},
		{0,3,2},{1,3,2},{2,3,2},{3,3,2},{4,3,2},{3,4,2},{2,3,4},{3,3,4},
		{0,0,3},{1,0,3},{2,0,3},{3,0,3},{4,0,3},{0,4,3},{0,0,4},{1,0,4},
		{0,1,3},{1,1,3},{2,1,3},{3,1,3},{4,1,3},{1,4,3},{0,1,4},{1,1,4},
		{0,2,3},{1,2,3},{2,2,3},{3,2,3},{4,2,3},{2,4,3},{0,2,4},{1,2,4},
		{0,3,3},{1,3,3},{2,3,3},{3,3,3},{4,3,3},{3,4,3},{0,3,4},{1,3,4}
	};

	// Decodes one BISE trit block of up to 5 values starting at bit_ofs, advancing bit_ofs past the bits read.
	// In the bitstream each value's bits_per_val low bits are followed by a slice of the packed trit bits.
	// The reassembled packed value indexes s_trit_decode, and each output is (trit << bits_per_val) | low bits.
	static void decode_trit_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val)
	{
		assert((num_vals >= 1) && (num_vals <= 5));

		// Size of the packed-trit slice that follows each value's low bits
		static const uint8_t s_trit_slice_bits[5] = { 2, 2, 1, 2, 1 };

		uint32_t low_bits[5] = { 0 };
		uint32_t packed_trits = 0;
		uint32_t packed_shift = 0;

		for (uint32_t v = 0; v < num_vals; v++)
		{
			if (bits_per_val)
				low_bits[v] = bits.next_bits(bit_ofs, bits_per_val);

			const uint32_t slice_size = s_trit_slice_bits[v];
			packed_trits |= (bits.next_bits(bit_ofs, slice_size) << packed_shift);
			packed_shift += slice_size;
		}

		const uint8_t* pTrits = s_trit_decode[packed_trits];

		for (uint32_t v = 0; v < num_vals; v++)
			pVals[v] = (uint8_t)((pTrits[v] << bits_per_val) | low_bits[v]);
	}

	// Decodes one BISE quint block of up to 3 values starting at bit_ofs, advancing bit_ofs past the bits read.
	// In the bitstream each value's bits_per_val low bits are followed by a slice of the packed quint bits.
	// The reassembled packed value indexes s_quint_decode, and each output is (quint << bits_per_val) | low bits.
	static void decode_quint_block(uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t& bit_ofs, uint32_t bits_per_val)
	{
		assert((num_vals >= 1) && (num_vals <= 3));

		// Size of the packed-quint slice that follows each value's low bits
		static const uint8_t s_quint_slice_bits[3] = { 3, 2, 2 };

		uint32_t low_bits[3] = { 0 };
		uint32_t packed_quints = 0;
		uint32_t packed_shift = 0;

		for (uint32_t v = 0; v < num_vals; v++)
		{
			if (bits_per_val)
				low_bits[v] = bits.next_bits(bit_ofs, bits_per_val);

			const uint32_t slice_size = s_quint_slice_bits[v];
			packed_quints |= (bits.next_bits(bit_ofs, slice_size) << packed_shift);
			packed_shift += slice_size;
		}

		const uint8_t* pQuints = s_quint_decode[packed_quints];

		for (uint32_t v = 0; v < num_vals; v++)
			pVals[v] = (uint8_t)((pQuints[v] << bits_per_val) | low_bits[v]);
	}

	// Decodes num_vals BISE values of the given ISE range from bits, starting at bit_ofs, into pVals.
	// g_ise_range_table[ise_range] supplies { bits per value, uses trits, uses quints }.
	// bit_ofs is taken by value, so the caller's offset is not advanced.
	static void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint128& bits, uint32_t bit_ofs)
	{
		assert(num_vals && (ise_range < TOTAL_ISE_RANGES));

		const uint32_t bits_per_val = g_ise_range_table[ise_range][0];
		const bool has_trits = g_ise_range_table[ise_range][1] != 0;
		const bool has_quints = g_ise_range_table[ise_range][2] != 0;

		if (has_trits)
		{
			// Trits+bits: 5 values per block, sharing 8 packed trit bits per full block
			for (uint32_t first = 0; first < num_vals; first += 5)
			{
				const uint32_t num_left = num_vals - first;
				decode_trit_block(pVals + first, (num_left < 5) ? num_left : 5, bits, bit_ofs, bits_per_val);
			}
		}
		else if (has_quints)
		{
			// Quints+bits: 3 values per block, sharing 7 packed quint bits per full block
			for (uint32_t first = 0; first < num_vals; first += 3)
			{
				const uint32_t num_left = num_vals - first;
				decode_quint_block(pVals + first, (num_left < 3) ? num_left : 3, bits, bit_ofs, bits_per_val);
			}
		}
		else
		{
			// Plain bits only: every value is stored as a bits_per_val wide field
			assert(bits_per_val);

			uint8_t* pDst = pVals;
			uint8_t* const pDst_end = pVals + num_vals;
			while (pDst != pDst_end)
				*pDst++ = (uint8_t)bits.next_bits(bit_ofs, bits_per_val);
		}
	}

	void decode_bise(uint32_t ise_range, uint8_t* pVals, uint32_t num_vals, const uint8_t* pBits128, uint32_t bit_ofs)
	{
		const uint128 bits(
			(uint64_t)read_le_dword(pBits128) | (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t))) << 32),
			(uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pBits128 + sizeof(uint32_t) * 3)) << 32));

		return decode_bise(ise_range, pVals, num_vals, bits, bit_ofs);
	}

	// Decodes a physical ASTC block to a logical ASTC block.
	// blk_width/blk_height are only used to validate the weight grid's dimensions.
	// pASTC_block is read as 16 bytes (four little-endian dwords).
	// log_blk is cleared first and its m_error_flag is set; the flag is only reset on a successful decode.
	// Returns true on success, false if the block uses a reserved/illegal encoding (log_blk may then be
	// partially filled in, with m_error_flag still set).
	bool unpack_block(const void* pASTC_block, log_astc_block& log_blk, uint32_t blk_width, uint32_t blk_height)
	{
		assert(is_valid_block_size(blk_width, blk_height));

		const uint8_t* pS = (uint8_t*)pASTC_block;

		// Assume failure until the whole block has been decoded
		log_blk.clear();
		log_blk.m_error_flag = true;

		// Load the 128-bit block as two 64-bit halves, each built from two little-endian dwords
		const uint128 bits(
			(uint64_t)read_le_dword(pS) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t))) << 32),
			(uint64_t)read_le_dword(pS + sizeof(uint32_t) * 2) | (((uint64_t)read_le_dword(pS + sizeof(uint32_t) * 3)) << 32));

		// Bit-reversed copy, used at the end to read the weights (which are decoded backwards in the block)
		const uint128 rev_bits(bits.get_reversed_bits());

		// Block mode: grid size, weight ISE range, dual plane flag (or void extent)
		if (!decode_config(bits, log_blk))
			return false;

		if (log_blk.m_solid_color_flag_hdr || log_blk.m_solid_color_flag_ldr)
		{
			// Void extent
			log_blk.m_error_flag = false;
			return true;
		}

		// Check grid dimensions
		if ((log_blk.m_grid_width > blk_width) || (log_blk.m_grid_height > blk_height))
			return false;

		// Now we have the grid width/height, dual plane, weight ISE range

		const uint32_t total_grid_weights = (log_blk.m_dual_plane ? 2 : 1) * (log_blk.m_grid_width * log_blk.m_grid_height);
		const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_blk.m_weight_ise_range);

		// 18.24 Illegal Encodings
		if ((!total_grid_weights) || (total_grid_weights > MAX_GRID_WEIGHTS) || (total_weight_bits < 24) || (total_weight_bits > 96))
			return false;

		// Bit offset one past the last non-weight bit (the weights occupy the top total_weight_bits bits)
		const uint32_t end_of_weight_bit_ofs = 128 - total_weight_bits;

		// Number of extra CEM bits plus CCS bits stored right before the weight bits
		uint32_t total_extra_bits = 0;

		// Right before the weight bits, there may be extra CEM bits, then the 2 CCS bits if dual plane.

		log_blk.m_num_partitions = (uint8_t)(bits.get_bits(11, 2) + 1);
		if (log_blk.m_num_partitions == 1)
			log_blk.m_color_endpoint_modes[0] = (uint8_t)(bits.get_bits(13, 4)); // read CEM bits
		else
		{
			// 2 or more partitions
			// Dual plane combined with 4 partitions is rejected
			if (log_blk.m_dual_plane && (log_blk.m_num_partitions == 4))
				return false;

			log_blk.m_partition_id = (uint16_t)bits.get_bits(13, 10);

			// 6 CEM bits: the low 2 select "all the same" (0) or the base class of the per-partition CEMs
			uint32_t cem_bits = bits.get_bits(23, 6);

			if ((cem_bits & 3) == 0)
			{
				// All CEM's the same
				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
					log_blk.m_color_endpoint_modes[i] = (uint8_t)(cem_bits >> 2);
			}
			else
			{
				// CEM's different, but within up to 2 adjacent classes
				const uint32_t first_cem_index = ((cem_bits & 3) - 1) * 4;

				// 2, 5 or 8 extra CEM bits for 2, 3 or 4 partitions
				total_extra_bits = 3 * log_blk.m_num_partitions - 4;

				if ((total_weight_bits + total_extra_bits) > 128)
					return false;

				// The extra CEM bits sit immediately below the weight bits
				uint32_t cem_bit_pos = end_of_weight_bit_ofs - total_extra_bits;

				// Per partition: c[i] is the class select bit (adds 4 to the CEM index), m[i] is the 2-bit mode within the class
				uint32_t c[4] = { 0 }, m[4] = { 0 };

				cem_bits >>= 2;
				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++, cem_bits >>= 1)
					c[i] = cem_bits & 1;

				// Whatever remains in cem_bits holds the first mode bits; the rest come from the extra CEM bits
				switch (log_blk.m_num_partitions)
				{
				case 2:
				{
					m[0] = cem_bits & 3;
					m[1] = bits.next_bits(cem_bit_pos, 2);
					break;
				}
				case 3:
				{
					m[0] = cem_bits & 1;
					m[0] |= (bits.next_bits(cem_bit_pos, 1) << 1);
					m[1] = bits.next_bits(cem_bit_pos, 2);
					m[2] = bits.next_bits(cem_bit_pos, 2);
					break;
				}
				case 4:
				{
					for (uint32_t i = 0; i < 4; i++)
						m[i] = bits.next_bits(cem_bit_pos, 2);
					break;
				}
				default:
				{
					assert(0);
					break;
				}
				}

				// All of the extra CEM bits must have been consumed
				assert(cem_bit_pos == end_of_weight_bit_ofs);

				for (uint32_t i = 0; i < log_blk.m_num_partitions; i++)
				{
					log_blk.m_color_endpoint_modes[i] = (uint8_t)(first_cem_index + (c[i] * 4) + m[i]);
					assert(log_blk.m_color_endpoint_modes[i] <= 15);
				}
			}
		}

		// Now we have all the CEM indices.

		if (log_blk.m_dual_plane)
		{
			// Read CCS bits, beneath any CEM bits
			total_extra_bits += 2;

			if (total_extra_bits > end_of_weight_bit_ofs)
				return false;

			uint32_t ccs_bit_pos = end_of_weight_bit_ofs - total_extra_bits;
			log_blk.m_color_component_selector = (uint8_t)(bits.get_bits(ccs_bit_pos, 2));
		}

		// Bit offset where the endpoint values start: just past the fixed config fields
		uint32_t config_bit_pos = 11 + 2; // config+num_parts
		if (log_blk.m_num_partitions == 1)
			config_bit_pos += 4; // CEM bits
		else
			config_bit_pos += 10 + 6; // part_id+CEM bits

		// config+num_parts+total_extra_bits (CEM extra+CCS)
		uint32_t total_config_bits = config_bit_pos + total_extra_bits;

		// Compute number of remaining bits in block
		const int num_remaining_bits = 128 - (int)total_config_bits - (int)total_weight_bits;
		if (num_remaining_bits < 0)
			return false;

		// Compute total number of ISE encoded color endpoint mode values
		uint32_t total_cem_vals = 0;
		for (uint32_t j = 0; j < log_blk.m_num_partitions; j++)
			total_cem_vals += get_num_cem_values(log_blk.m_color_endpoint_modes[j]);

		if (total_cem_vals > MAX_ENDPOINTS)
			return false;

		// Infer endpoint ISE range based off the # of values we need to encode, and the # of remaining bits in the block
		// (the search runs from range 20 downwards and stops at the first range whose encoding fits)
		// TODO: Optimize
		int endpoint_ise_range = -1;
		for (int k = 20; k > 0; k--)
		{
			int b = get_ise_sequence_bits(total_cem_vals, k);
			if (b <= num_remaining_bits)
			{
				endpoint_ise_range = k;
				break;
			}
		}

		// See 23.24 Illegal Encodings, [0,5] is the minimum ISE encoding for endpoints
		// (this also rejects the "nothing fits" case, where endpoint_ise_range is still -1)
		if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE)
			return false;

		log_blk.m_endpoint_ise_range = (uint8_t)endpoint_ise_range;

		// Decode endpoints forwards in block
		decode_bise(log_blk.m_endpoint_ise_range, log_blk.m_endpoints, total_cem_vals, bits, config_bit_pos);

		// Decode grid weights backwards in block
		// (i.e. forwards from bit 0 of the bit-reversed copy)
		decode_bise(log_blk.m_weight_ise_range, log_blk.m_weights, total_grid_weights, rev_bits, 0);

		log_blk.m_error_flag = false;

		return true;
	}

	// Misc. helpers

	// Returns weight i of the given plane. With dual plane enabled the two planes' weights are
	// interleaved in m_weights, so the stored index is i * (number of planes) + plane_index.
	uint8_t get_weight(const log_astc_block& log_block, uint32_t plane_index, uint32_t i)
	{
		const uint32_t plane_count = log_block.m_dual_plane ? 2 : 1;
		const uint32_t weight_index = plane_index + (plane_count * i);

		assert(plane_index < plane_count);
		assert(i < (uint32_t)(log_block.m_grid_width * log_block.m_grid_height));
		assert(weight_index < MAX_GRID_WEIGHTS);

		return log_block.m_weights[weight_index];
	}

	// Returns a writable reference to weight i of the given plane. With dual plane enabled the two planes'
	// weights are interleaved in m_weights, so the stored index is i * (number of planes) + plane_index.
	uint8_t &get_weight(log_astc_block& log_block, uint32_t plane_index, uint32_t i)
	{
		const uint32_t plane_count = log_block.m_dual_plane ? 2 : 1;
		const uint32_t weight_index = plane_index + (plane_count * i);

		assert(plane_index < plane_count);
		assert(i < (uint32_t)(log_block.m_grid_width * log_block.m_grid_height));
		assert(weight_index < MAX_GRID_WEIGHTS);

		return log_block.m_weights[weight_index];
	}

	// Copies one plane's weights out of the (possibly interleaved) m_weights array into pWeights.
	// pWeights must have room for m_grid_width * m_grid_height values.
	void extract_weights(const log_astc_block& log_block, uint8_t* pWeights, uint32_t plane_index)
	{
		const uint32_t stride = log_block.m_dual_plane ? 2 : 1;
		assert(plane_index < stride);

		const uint32_t weights_per_plane = log_block.m_grid_width * log_block.m_grid_height;

		// src walks this plane's slots in the interleaved array, dst walks the packed output
		for (uint32_t dst = 0, src = plane_index; dst < weights_per_plane; dst++, src += stride)
			pWeights[dst] = log_block.m_weights[src];
	}

	// Writes one plane's weights from pWeights into the (possibly interleaved) m_weights array.
	// pWeights must supply m_grid_width * m_grid_height values.
	void set_weights(log_astc_block& log_block, const uint8_t* pWeights, uint32_t plane_index)
	{
		const uint32_t stride = log_block.m_dual_plane ? 2 : 1;
		assert(plane_index < stride);

		const uint32_t weights_per_plane = log_block.m_grid_width * log_block.m_grid_height;

		// src walks the packed input, dst walks this plane's slots in the interleaved array
		for (uint32_t src = 0, dst = plane_index; src < weights_per_plane; src++, dst += stride)
			log_block.m_weights[dst] = pWeights[src];
	}

	// Returns the total number of weights stored in the block: grid width * grid height, doubled for dual plane.
	uint32_t get_total_weights(const log_astc_block& log_block)
	{
		const uint32_t weights_per_plane = log_block.m_grid_width * log_block.m_grid_height;

		return log_block.m_dual_plane ? (2 * weights_per_plane) : weights_per_plane;
	}

	// Returns a pointer to the first endpoint value of the given partition/subset.
	// Endpoint values are stored back to back in m_endpoints, so the start is found by skipping the
	// value counts (per get_num_cem_values()) of every preceding partition's CEM.
	uint8_t *get_endpoints(log_astc_block& log_block, uint32_t partition_index)
	{
		assert(partition_index < log_block.m_num_partitions);

		uint32_t value_ofs = 0;
		for (uint32_t part = 0; part < partition_index; part++)
			value_ofs += get_num_cem_values(log_block.m_color_endpoint_modes[part]);

		assert(value_ofs < MAX_ENDPOINTS);

		return &log_block.m_endpoints[value_ofs];
	}

	// Read-only variant: returns a pointer to the first endpoint value of the given partition/subset.
	// Endpoint values are stored back to back in m_endpoints, so the start is found by skipping the
	// value counts (per get_num_cem_values()) of every preceding partition's CEM.
	const uint8_t* get_endpoints(const log_astc_block& log_block, uint32_t partition_index)
	{
		assert(partition_index < log_block.m_num_partitions);

		uint32_t value_ofs = 0;
		for (uint32_t part = 0; part < partition_index; part++)
			value_ofs += get_num_cem_values(log_block.m_color_endpoint_modes[part]);

		assert(value_ofs < MAX_ENDPOINTS);

		return &log_block.m_endpoints[value_ofs];
	}

	// Returns a statically allocated, human-readable name (with its numeric index) for a color endpoint mode.
	// cem_index must be in [0,15]; this is only checked by an assert.
	const char* get_cem_name(uint32_t cem_index)
	{
		static const char* const s_names_by_cem[16] =
		{
			"CEM_LDR_LUM_DIRECT (0)",
			"CEM_LDR_LUM_BASE_PLUS_OFS (1)",
			"CEM_HDR_LUM_LARGE_RANGE (2)",
			"CEM_HDR_LUM_SMALL_RANGE (3)",
			"CEM_LDR_LUM_ALPHA_DIRECT (4)",
			"CEM_LDR_LUM_ALPHA_BASE_PLUS_OFS (5)",
			"CEM_LDR_RGB_BASE_SCALE (6)",
			"CEM_HDR_RGB_BASE_SCALE (7)",
			"CEM_LDR_RGB_DIRECT (8)",
			"CEM_LDR_RGB_BASE_PLUS_OFFSET (9)",
			"CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A (10)",
			"CEM_HDR_RGB (11)",
			"CEM_LDR_RGBA_DIRECT (12)",
			"CEM_LDR_RGBA_BASE_PLUS_OFFSET (13)",
			"CEM_HDR_RGB_LDR_ALPHA (14)",
			"CEM_HDR_RGB_HDR_ALPHA (15)"
		};

		assert(cem_index < (sizeof(s_names_by_cem) / sizeof(s_names_by_cem[0])));

		const char* const pName = s_names_by_cem[cem_index];
		assert(pName);

		return pName;
	}

	// Returns true for the LDR RGB direct and LDR RGBA direct color endpoint modes.
	bool cem_is_ldr_direct(uint32_t cem_index)
	{
		switch (cem_index)
		{
		case CEM_LDR_RGB_DIRECT:
		case CEM_LDR_RGBA_DIRECT:
			return true;
		default:
			return false;
		}
	}

	// Returns true for the LDR RGB base+scale color endpoint modes (with or without the two alpha values).
	bool cem_is_ldr_base_scale(uint32_t cem_index)
	{
		switch (cem_index)
		{
		case CEM_LDR_RGB_BASE_SCALE:
		case CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A:
			return true;
		default:
			return false;
		}
	}

	// Returns true for the LDR RGB and LDR RGBA base+offset color endpoint modes.
	bool cem_is_ldr_base_plus_ofs(uint32_t cem_index)
	{
		switch (cem_index)
		{
		case CEM_LDR_RGB_BASE_PLUS_OFFSET:
		case CEM_LDR_RGBA_BASE_PLUS_OFFSET:
			return true;
		default:
			return false;
		}
	}

	// Returns true for the four LDR RGB/RGBA direct and base+offset color endpoint modes.
	// NOTE(review): "bc" presumably means blue contraction - these are the same four modes
	// that used_blue_contraction() inspects; confirm.
	bool cem_supports_bc(uint32_t cem)
	{
		const bool is_direct = (cem == CEM_LDR_RGB_DIRECT) || (cem == CEM_LDR_RGBA_DIRECT);
		const bool is_base_plus_ofs = (cem == CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem == CEM_LDR_RGBA_BASE_PLUS_OFFSET);

		return is_direct || is_base_plus_ofs;
	}

	// Decoder-side signed bit transfer: moves the MSB of a into the MSB of b, and turns the
	// next 6 bits of a (bits 6..1) into a signed value.
	// input:
	//  a=[0,255]
	//  b=[0,255]
	// output:
	//  a=bits 6..1 of the input a, sign extended from 6 bits, so [-32,31]
	//  b=input b shifted right by 1, with a's MSB placed in bit 7, so [0,255]
	void bit_transfer_signed_dec(int& a, int& b)
	{
		assert((a >= 0) && (a <= 255));
		assert((b >= 0) && (b <= 255));

		// b drops its LSB and receives a's MSB
		const int transferred_msb = a & 0x80;
		b = (b >> 1) | transferred_msb;

		// a keeps bits 6..1, interpreted as 6-bit two's complement
		const int six_bits = (a >> 1) & 0x3F;
		a = (six_bits & 0x20) ? (six_bits - 0x40) : six_bits;
	}

	// Encoder-side signed bit transfer: moves the MSB of b into the MSB of a, preparing a for encoding.
	// input:
	//  a=[-32,31] (6-bits, 2's complement)
	//  b=[0,255] (8-bits)
	// output:
	//  a=[0,255]: bit 7 is b's old MSB, bits 6..1 are the 6-bit input a, bit 0 is 0
	//  b=[0,255]: the low 7 bits of the input b shifted left by 1
	void bit_transfer_signed_enc(int& a, int& b)
	{
		assert((a >= -32) && (a <= 31));
		assert((b >= 0) && (b <= 255));

		// Capture b's MSB before it gets shifted out
		const int transferred_msb = b & 0x80;
		b = (b << 1) & 0xFF;

		// Keep a's 6 bits, make room at bit 0, then drop the transferred bit into bit 7
		a = ((a & 0x3F) << 1) | transferred_msb;
	}

	// RGB or RGBA direct
	// Dequantizes the first 6 endpoint values and compares the sum of the even-indexed values (0,2,4)
	// against the sum of the odd-indexed values (1,3,5). Returns true when the odd sum is the smaller one.
	bool cem8_or_12_used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index)
	{
		assert((cem_index == CEM_LDR_RGB_DIRECT) || (cem_index == CEM_LDR_RGBA_DIRECT));
		(void)(cem_index);

		const auto& ise_to_val = g_dequant_tables.get_endpoint_tab(endpoint_ise_index).m_ISE_to_val;

		uint32_t even_sum = 0, odd_sum = 0;
		for (uint32_t pair = 0; pair < 3; pair++)
		{
			even_sum += (uint8_t)ise_to_val[pEndpoint_vals[pair * 2]];
			odd_sum += (uint8_t)ise_to_val[pEndpoint_vals[pair * 2 + 1]];
		}

		return odd_sum < even_sum;
	}

	// RGB or RGBA base plus offset
	// Dequantizes the first 6 endpoint values, runs bit_transfer_signed_dec() on each
	// (even-indexed, odd-indexed) pair and sums the three resulting signed odd-indexed values.
	// Returns true when that sum is negative.
	bool cem9_or_13_used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index)
	{
		assert((cem_index == CEM_LDR_RGB_BASE_PLUS_OFFSET) || (cem_index == CEM_LDR_RGBA_BASE_PLUS_OFFSET));
		(void)(cem_index);

		const auto& ise_to_val = g_dequant_tables.get_endpoint_tab(endpoint_ise_index).m_ISE_to_val;

		int signed_sum = 0;
		for (uint32_t pair = 0; pair < 3; pair++)
		{
			int even_val = ise_to_val[pEndpoint_vals[pair * 2]];
			int odd_val = ise_to_val[pEndpoint_vals[pair * 2 + 1]];

			// The odd-indexed value is the one converted to a signed quantity
			bit_transfer_signed_dec(odd_val, even_val);

			signed_sum += odd_val;
		}

		return signed_sum < 0;
	}

	// Reports whether the given LDR endpoint values were encoded with blue contraction.
	// Only CEMs 8/12 (direct) and 9/13 (base plus offset) are inspected; every other CEM yields false.
	bool used_blue_contraction(uint32_t cem_index, const uint8_t* pEndpoint_vals, uint32_t endpoint_ise_index)
	{
		assert(is_cem_ldr(cem_index));

		switch (cem_index)
		{
		case 8:  // RGB direct
		case 12: // RGBA direct
			return cem8_or_12_used_blue_contraction(cem_index, pEndpoint_vals, endpoint_ise_index);
		case 9:  // RGB base plus offset
		case 13: // RGBA base plus offset
			return cem9_or_13_used_blue_contraction(cem_index, pEndpoint_vals, endpoint_ise_index);
		default:
			break;
		}

		return false;
	}

	// Maps an LDR CEM that carries alpha to its alpha-less counterpart; any other CEM is returned unchanged.
	uint32_t get_base_cem_without_alpha(uint32_t cem)
	{
		assert(is_cem_ldr(cem));

		if (cem == CEM_LDR_LUM_ALPHA_DIRECT)
			return CEM_LDR_LUM_DIRECT;

		if (cem == CEM_LDR_RGBA_DIRECT)
			return CEM_LDR_RGB_DIRECT;

		if (cem == CEM_LDR_RGB_BASE_SCALE_PLUS_TWO_A)
			return CEM_LDR_RGB_BASE_SCALE;

		if (cem == CEM_LDR_RGBA_BASE_PLUS_OFFSET)
			return CEM_LDR_RGB_BASE_PLUS_OFFSET;

		return cem;
	}

	// Moves a BISE endpoint value by delta steps in rank order: the ISE value is converted to its rank,
	// the rank is offset by delta and clamped to [0, levels-1], then converted back to an ISE value.
	// A delta of 0 returns ise_val untouched.
	int apply_delta_to_bise_endpoint_val(uint32_t endpoint_ise_range, int ise_val, int delta)
	{
		if (!delta)
			return ise_val;

		const auto& to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_ISE_to_rank;
		const auto& from_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(endpoint_ise_range).m_rank_to_ISE;

		const int last_rank = (int)astc_helpers::get_ise_levels(endpoint_ise_range) - 1;
		const int shifted_rank = basisu::clamp<int>((int)to_rank[ise_val] + delta, 0, last_rank);

		return from_rank[shifted_rank];
	}

	// Looks up entry "index" of g_astc_block_sizes and returns its block width and height.
	// index must be less than NUM_ASTC_BLOCK_SIZES; this is only checked by an assert.
	void get_astc_block_size_by_index(uint32_t index, uint32_t& width, uint32_t& height)
	{
		assert(index < NUM_ASTC_BLOCK_SIZES);

		const uint8_t (&dims)[2] = g_astc_block_sizes[index];

		width = dims[0];
		height = dims[1];
	}

	// Returns the index of the g_astc_block_sizes entry matching width x height, or -1 if there is none.
	int find_astc_block_size_index(uint32_t width, uint32_t height)
	{
		for (int idx = 0; idx < (int)NUM_ASTC_BLOCK_SIZES; idx++)
		{
			const uint8_t* pDims = g_astc_block_sizes[idx];

			if ((pDims[0] == width) && (pDims[1] == height))
				return idx;
		}

		return -1;
	}

} // namespace astc_helpers

#endif //BASISU_ASTC_HELPERS_IMPLEMENTATION