From c86a40c7e6e53a4bd0ad5e88a3be2e4ae4b55787 Mon Sep 17 00:00:00 2001 From: Richard Geldreich Date: Fri, 7 Oct 2022 19:33:34 -0400 Subject: [PATCH] Adding -clbench command line option, and a new API basis_benchmark_etc1s_opencl() to determine if OpenCL encoding is worthwhile on the current machine/driver/GPU. --- basisu_tool.cpp | 38 +++++++-- encoder/basisu_comp.cpp | 178 +++++++++++++++++++++++++++++++++++----- encoder/basisu_comp.h | 21 ++++- 3 files changed, 207 insertions(+), 30 deletions(-) diff --git a/basisu_tool.cpp b/basisu_tool.cpp index 08f84e1..9b2a43a 100644 --- a/basisu_tool.cpp +++ b/basisu_tool.cpp @@ -54,6 +54,7 @@ enum tool_mode cBench, cCompSize, cTest, + cCLBench, cSplitImage, cCombineImages }; @@ -365,6 +366,8 @@ public: m_mode = cCompSize; else if (strcasecmp(pArg, "-test") == 0) m_mode = cTest; + else if (strcasecmp(pArg, "-clbench") == 0) + m_mode = cCLBench; else if (strcasecmp(pArg, "-test_dir") == 0) { REMAINING_ARGS_CHECK(1); @@ -4266,7 +4269,7 @@ static bool test_mode(command_line_params& opts) size_t data_size = 0; // Test ETC1S - flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0); + flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagPrintStats | cFlagPrintStatus; void* pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats); if (!pData) @@ -4293,7 +4296,7 @@ static bool test_mode(command_line_params& opts) if (opencl_is_available()) { - flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUseOpenCL; + flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUseOpenCL | cFlagPrintStats | cFlagPrintStatus; pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats); if (!pData) @@ -4329,7 +4332,7 @@ static bool test_mode(command_line_params& opts) } // Test UASTC - flags_and_quality = (opts.m_comp_params.m_multithreading ? 
cFlagThreaded : 0) | cFlagUASTC; + flags_and_quality = (opts.m_comp_params.m_multithreading ? cFlagThreaded : 0) | cFlagUASTC | cFlagPrintStats | cFlagPrintStatus; pData = basis_compress(source_images, flags_and_quality, uastc_rdo_quality, &data_size, &stats); if (!pData) @@ -4362,6 +4365,24 @@ static bool test_mode(command_line_params& opts) return result; } +static bool clbench_mode(command_line_params& opts) +{ + BASISU_NOTE_UNUSED(opts); + + bool opencl_failed = false; + bool use_cl = basis_benchmark_etc1s_opencl(&opencl_failed); + if (use_cl) + printf("OpenCL ETC1S encoding is faster on this machine\n"); + else + { + if (opencl_failed) + printf("OpenCL failed!\n"); + printf("CPU ETC1S encoding is faster on this machine\n"); + } + + return true; +} + static int main_internal(int argc, const char **argv) { printf("Basis Universal GPU Texture Compressor v" BASISU_TOOL_VERSION "\nCopyright (C) 2019-2022 Binomial LLC, All rights reserved\n"); @@ -4374,7 +4395,7 @@ static int main_internal(int argc, const char **argv) bool opencl_force_serialization = false; for (int i = 1; i < argc; i++) { - if (strcmp(argv[i], "-opencl") == 0) + if ((strcmp(argv[i], "-opencl") == 0) || (strcmp(argv[i], "-clbench") == 0)) use_opencl = true; if (strcmp(argv[i], "-opencl_serialize") == 0) opencl_force_serialization = true; @@ -4394,13 +4415,13 @@ static int main_internal(int argc, const char **argv) #if defined(DEBUG) || defined(_DEBUG) printf("DEBUG build\n"); #endif - + if (argc == 1) { print_usage(); return EXIT_FAILURE; } - + command_line_params opts; if (!opts.parse(argc, argv)) { @@ -4413,7 +4434,7 @@ static int main_internal(int argc, const char **argv) #else printf("Multithreading: %u, Zstandard support: %u, OpenCL: %u\n", (uint32_t)opts.m_comp_params.m_multithreading, basist::basisu_transcoder_supports_ktx2_zstd(), opencl_is_available()); #endif - + if (!opts.process_listing_files()) return EXIT_FAILURE; @@ -4459,6 +4480,9 @@ static int main_internal(int argc, const char 
**argv) case cTest: status = test_mode(opts); break; + case cCLBench: + status = clbench_mode(opts); + break; case cSplitImage: status = split_image_mode(opts); break; diff --git a/encoder/basisu_comp.cpp b/encoder/basisu_comp.cpp index 166a1c4..41eae2b 100644 --- a/encoder/basisu_comp.cpp +++ b/encoder/basisu_comp.cpp @@ -1501,7 +1501,8 @@ namespace basisu if (m_params.m_compute_stats) { - printf("Slice: %u\n", slice_index); + if (m_params.m_print_stats) + printf("Slice: %u\n", slice_index); image_stats& s = m_stats[slice_index]; @@ -1511,81 +1512,100 @@ namespace basisu // ---- .basis stats em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 3); - em.print(".basis RGB Avg: "); + if (m_params.m_print_stats) + em.print(".basis RGB Avg: "); s.m_basis_rgb_avg_psnr = em.m_psnr; em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 4); - em.print(".basis RGBA Avg: "); + if (m_params.m_print_stats) + em.print(".basis RGBA Avg: "); s.m_basis_rgba_avg_psnr = em.m_psnr; em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 1); - em.print(".basis R Avg: "); + if (m_params.m_print_stats) + em.print(".basis R Avg: "); em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 1, 1); - em.print(".basis G Avg: "); + if (m_params.m_print_stats) + em.print(".basis G Avg: "); em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 2, 1); - em.print(".basis B Avg: "); + if (m_params.m_print_stats) + em.print(".basis B Avg: "); if (m_params.m_uastc) { em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 3, 1); - em.print(".basis A Avg: "); + if (m_params.m_print_stats) + em.print(".basis A Avg: "); s.m_basis_a_avg_psnr = em.m_psnr; } em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0); - em.print(".basis 709 Luma: "); + if (m_params.m_print_stats) + 
em.print(".basis 709 Luma: "); s.m_basis_luma_709_psnr = static_cast(em.m_psnr); s.m_basis_luma_709_ssim = static_cast(em.m_ssim); em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked[slice_index], 0, 0, true, true); - em.print(".basis 601 Luma: "); + if (m_params.m_print_stats) + em.print(".basis 601 Luma: "); s.m_basis_luma_601_psnr = static_cast(em.m_psnr); if (m_slice_descs.size() == 1) { const uint32_t output_size = comp_size ? (uint32_t)comp_size : (uint32_t)comp_data.size(); - debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); - debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + if (m_params.m_print_stats) + { + debug_printf(".basis RGB PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_rgb_avg_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + debug_printf(".basis Luma 709 PSNR per bit/texel*10000: %3.3f\n", 10000.0f * s.m_basis_luma_709_psnr / ((output_size * 8.0f) / (slice_desc.m_orig_width * slice_desc.m_orig_height))); + } } if (m_decoded_output_textures_unpacked_bc7[slice_index].get_width()) { // ---- BC7 stats em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 3); - em.print("BC7 RGB Avg: "); + if (m_params.m_print_stats) + em.print("BC7 RGB Avg: "); s.m_bc7_rgb_avg_psnr = em.m_psnr; em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 4); - em.print("BC7 RGBA Avg: "); + if (m_params.m_print_stats) + em.print("BC7 RGBA Avg: "); s.m_bc7_rgba_avg_psnr = em.m_psnr; em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 1); - em.print("BC7 R Avg: "); + if (m_params.m_print_stats) + em.print("BC7 R Avg: "); 
em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 1, 1); - em.print("BC7 G Avg: "); + if (m_params.m_print_stats) + em.print("BC7 G Avg: "); em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 2, 1); - em.print("BC7 B Avg: "); + if (m_params.m_print_stats) + em.print("BC7 B Avg: "); if (m_params.m_uastc) { em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 3, 1); - em.print("BC7 A Avg: "); + if (m_params.m_print_stats) + em.print("BC7 A Avg: "); s.m_bc7_a_avg_psnr = em.m_psnr; } em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0); - em.print("BC7 709 Luma: "); + if (m_params.m_print_stats) + em.print("BC7 709 Luma: "); s.m_bc7_luma_709_psnr = static_cast(em.m_psnr); s.m_bc7_luma_709_ssim = static_cast(em.m_ssim); em.calc(m_slice_images[slice_index], m_decoded_output_textures_unpacked_bc7[slice_index], 0, 0, true, true); - em.print("BC7 601 Luma: "); + if (m_params.m_print_stats) + em.print("BC7 601 Luma: "); s.m_bc7_luma_601_psnr = static_cast(em.m_psnr); } @@ -1593,16 +1613,19 @@ namespace basisu { // ---- Nearly best possible ETC1S stats em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 3); - em.print("Unquantized ETC1S RGB Avg: "); + if (m_params.m_print_stats) + em.print("Unquantized ETC1S RGB Avg: "); s.m_best_etc1s_rgb_avg_psnr = static_cast(em.m_psnr); em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0); - em.print("Unquantized ETC1S 709 Luma: "); + if (m_params.m_print_stats) + em.print("Unquantized ETC1S 709 Luma: "); s.m_best_etc1s_luma_709_psnr = static_cast(em.m_psnr); s.m_best_etc1s_luma_709_ssim = static_cast(em.m_ssim); em.calc(m_slice_images[slice_index], m_best_etc1s_images_unpacked[slice_index], 0, 0, true, true); - em.print("Unquantized ETC1S 601 Luma: "); + if (m_params.m_print_stats) + em.print("Unquantized ETC1S 601 
Luma: "); s.m_best_etc1s_luma_601_psnr = static_cast(em.m_psnr); } } @@ -2311,6 +2334,8 @@ namespace basisu } comp_params.m_compute_stats = (pStats != nullptr); + comp_params.m_print_stats = (flags_and_quality & cFlagPrintStats) != 0; + comp_params.m_status_output = (flags_and_quality & cFlagPrintStatus) != 0; // Create the compressor, initialize it, and process the input basis_compressor comp; @@ -2328,6 +2353,11 @@ namespace basisu return nullptr; } + if ((pStats) && (comp.get_opencl_failed())) + { + pStats->m_opencl_failed = true; + } + // Get the output file data and return it to the caller void* pFile_data = nullptr; const uint8_vec* pFile_data_vec = comp_params.m_create_ktx2_file ? &comp.get_output_ktx2_file() : &comp.get_output_basis_file(); @@ -2388,4 +2418,108 @@ namespace basisu free(p); } + bool basis_benchmark_etc1s_opencl(bool* pOpenCL_failed) + { + if (pOpenCL_failed) + *pOpenCL_failed = false; + + if (!opencl_is_available()) + { + error_printf("basis_benchmark_etc1s_opencl: OpenCL support must be enabled first!\n"); + return false; + } + + const uint32_t W = 1024, H = 1024; + basisu::vector images; + image& img = images.enlarge(1)->resize(W, H); + + const uint32_t NUM_RAND_LETTERS = 6000;// 40000; + + rand r; + r.seed(200); + + for (uint32_t i = 0; i < NUM_RAND_LETTERS; i++) + { + uint32_t x = r.irand(0, W - 1), y = r.irand(0, H - 1); + uint32_t sx = r.irand(1, 4), sy = r.irand(1, 4); + color_rgba c(r.byte(), r.byte(), r.byte(), 255); + + img.debug_text(x, y, sx, sy, c, nullptr, false, "%c", static_cast(r.irand(32, 127))); + } + + //save_png("test.png", img); + + image_stats stats; + + uint32_t flags_and_quality = cFlagSRGB | cFlagThreaded | 255; + size_t comp_size = 0; + + double best_cpu_time = 1e+9f, best_gpu_time = 1e+9f; + + const uint32_t TIMES_TO_ENCODE = 2; + interval_timer tm; + + for (uint32_t i = 0; i < TIMES_TO_ENCODE; i++) + { + tm.start(); + void* pComp_data = basis_compress( + images, + flags_and_quality, 1.0f, + &comp_size, + &stats); 
+ double cpu_time = tm.get_elapsed_secs(); + if (!pComp_data) + { + error_printf("basis_benchmark_etc1s_opencl: basis_compress() failed (CPU)!\n"); + return false; + } + + best_cpu_time = minimum(best_cpu_time, cpu_time); + + basis_free_data(pComp_data); + } + + printf("Best CPU time: %3.3f\n", best_cpu_time); + + for (uint32_t i = 0; i < TIMES_TO_ENCODE; i++) + { + tm.start(); + void* pComp_data = basis_compress( + images, + flags_and_quality | cFlagUseOpenCL, 1.0f, + &comp_size, + &stats); + + if (stats.m_opencl_failed) + { + error_printf("basis_benchmark_etc1s_opencl: OpenCL failed!\n"); + + basis_free_data(pComp_data); + + if (pOpenCL_failed) + *pOpenCL_failed = true; + + return false; + } + + double gpu_time = tm.get_elapsed_secs(); + if (!pComp_data) + { + error_printf("basis_benchmark_etc1s_opencl: basis_compress() failed (GPU)!\n"); + return false; + } + + best_gpu_time = minimum(best_gpu_time, gpu_time); + + basis_free_data(pComp_data); + } + + printf("Best GPU time: %3.3f\n", best_gpu_time); + + return best_gpu_time < best_cpu_time; + } + } // namespace basisu + + + diff --git a/encoder/basisu_comp.h b/encoder/basisu_comp.h index aa5ea6f..b6c9fef 100644 --- a/encoder/basisu_comp.h +++ b/encoder/basisu_comp.h @@ -92,6 +92,8 @@ namespace basisu m_best_etc1s_luma_709_psnr = 0.0f; m_best_etc1s_luma_601_psnr = 0.0f; m_best_etc1s_luma_709_ssim = 0.0f; + + m_opencl_failed = false; } std::string m_filename; @@ -119,6 +121,8 @@ namespace basisu float m_best_etc1s_luma_709_psnr; float m_best_etc1s_luma_601_psnr; float m_best_etc1s_luma_709_ssim; + + bool m_opencl_failed; }; template @@ -255,6 +259,7 @@ namespace basisu m_write_output_basis_files.clear(); m_compression_level.clear(); m_compute_stats.clear(); + m_print_stats.clear(); m_check_for_alpha.clear(); m_force_alpha.clear(); m_multithreading.clear(); @@ -373,6 +378,9 @@ namespace basisu // Compute and display image metrics bool_param m_compute_stats; + + // Print stats to stdout, if m_compute_stats is true. 
+ bool_param m_print_stats; // Check to see if any input image has an alpha channel, if so then the output basis file will have alpha channels bool_param m_check_for_alpha; @@ -583,11 +591,16 @@ namespace basisu cFlagYFlip = 1 << 16, // flip source image on Y axis before compression cFlagUASTC = 1 << 17, // use UASTC compression vs. ETC1S - cFlagUASTCRDO = 1 << 18 // use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar) + cFlagUASTCRDO = 1 << 18, // use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar) + + cFlagPrintStats = 1 << 19, // print image stats to stdout + cFlagPrintStatus = 1 << 20 // print status to stdout }; // This function accepts an array of source images. // If more than one image is provided, it's assumed the images form a mipmap pyramid and automatic mipmap generation is disabled. + // Returns a pointer to the compressed .basis or .ktx2 file data. *pSize is the size of the compressed data. The returned block must be freed using basis_free_data(). + // basisu_encoder_init() MUST be called first! void* basis_compress( const basisu::vector &source_images, uint32_t flags_and_quality, float uastc_rdo_quality, @@ -604,6 +617,12 @@ namespace basisu // Frees the dynamically allocated file data returned by basis_compress(). void basis_free_data(void* p); + // Runs a short benchmark using synthetic image data to time OpenCL encoding vs. CPU encoding, with multithreading enabled. + // Returns true if OpenCL is worth using on this system, otherwise false. + // If pOpenCL_failed is not null, it will be set to true if OpenCL encoding failed *on this particular machine/driver/BasisU version* and the encoder fell back to CPU encoding. + // basisu_encoder_init() MUST be called first. If OpenCL support wasn't enabled this always returns false. + bool basis_benchmark_etc1s_opencl(bool *pOpenCL_failed = nullptr); + // Parallel compression API struct parallel_results {