1 // SPDX-License-Identifier: Apache-2.0
2 // ----------------------------------------------------------------------------
3 // Copyright 2011-2022 Arm Limited
4 //
5 // Licensed under the Apache License, Version 2.0 (the "License"); you may not
6 // use this file except in compliance with the License. You may obtain a copy
7 // of the License at:
8 //
9 // http://www.apache.org/licenses/LICENSE-2.0
10 //
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13 // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14 // License for the specific language governing permissions and limitations
15 // under the License.
16 // ----------------------------------------------------------------------------
17
18 #if !defined(ASTCENC_DECOMPRESS_ONLY)
19
20 /**
21 * @brief Functions for finding best endpoint format.
22 *
23 * We assume there are two independent sources of error in any given partition:
24 *
25 * - Encoding choice errors
26 * - Quantization errors
27 *
28 * Encoding choice errors are caused by encoder decisions. For example:
29 *
30 * - Using luminance instead of separate RGB components.
31 * - Using a constant 1.0 alpha instead of storing an alpha component.
32 * - Using RGB+scale instead of storing two full RGB endpoints.
33 *
34 * Quantization errors occur due to the limited precision we use for storage. These errors generally
35 * scale with quantization level, but are not actually independent of color encoding. In particular:
36 *
37 * - If we can use offset encoding then quantization error is halved.
38 * - If we can use blue-contraction then quantization error for RG is halved.
39 * - If we use HDR endpoints the quantization error is higher.
40 *
41 * Apart from these effects, we assume the error is proportional to the quantization step size.
42 */
43
44
45 #include "astcenc_internal.h"
46 #include "astcenc_vecmathlib.h"
47
48 #include <assert.h>
49
50 /**
51 * @brief Compute the errors of the endpoint line options for one partition.
52 *
53 * Uncorrelated data assumes storing completely independent RGBA channels for each endpoint. Same
54 * chroma data assumes storing RGBA endpoints which pass though the origin (LDR only). RGBL data
55 * assumes storing RGB + lumashift (HDR only). Luminance error assumes storing RGB channels as a
56 * single value.
57 *
58 *
59 * @param pi The partition info data.
60 * @param partition_index The partition index to compule the error for.
61 * @param blk The image block.
62 * @param uncor_pline The endpoint line assuming uncorrelated endpoints.
63 * @param[out] uncor_err The computed error for the uncorrelated endpoint line.
64 * @param samec_pline The endpoint line assuming the same chroma for both endpoints.
65 * @param[out] samec_err The computed error for the uncorrelated endpoint line.
66 * @param rgbl_pline The endpoint line assuming RGB + lumashift data.
67 * @param[out] rgbl_err The computed error for the RGB + lumashift endpoint line.
68 * @param l_pline The endpoint line assuming luminance data.
69 * @param[out] l_err The computed error for the luminance endpoint line.
70 * @param[out] a_drop_err The computed error for dropping the alpha component.
71 */
compute_error_squared_rgb_single_partition( const partition_info& pi, int partition_index, const image_block& blk, const processed_line3& uncor_pline, float& uncor_err, const processed_line3& samec_pline, float& samec_err, const processed_line3& rgbl_pline, float& rgbl_err, const processed_line3& l_pline, float& l_err, float& a_drop_err )72 static void compute_error_squared_rgb_single_partition(
73 const partition_info& pi,
74 int partition_index,
75 const image_block& blk,
76 const processed_line3& uncor_pline,
77 float& uncor_err,
78 const processed_line3& samec_pline,
79 float& samec_err,
80 const processed_line3& rgbl_pline,
81 float& rgbl_err,
82 const processed_line3& l_pline,
83 float& l_err,
84 float& a_drop_err
85 ) {
86 vfloat4 ews = blk.channel_weight;
87
88 unsigned int texel_count = pi.partition_texel_count[partition_index];
89 const uint8_t* texel_indexes = pi.texels_of_partition[partition_index];
90 promise(texel_count > 0);
91
92 vfloatacc a_drop_errv = vfloatacc::zero();
93 vfloat default_a(blk.get_default_alpha());
94
95 vfloatacc uncor_errv = vfloatacc::zero();
96 vfloat uncor_bs0(uncor_pline.bs.lane<0>());
97 vfloat uncor_bs1(uncor_pline.bs.lane<1>());
98 vfloat uncor_bs2(uncor_pline.bs.lane<2>());
99
100 vfloat uncor_amod0(uncor_pline.amod.lane<0>());
101 vfloat uncor_amod1(uncor_pline.amod.lane<1>());
102 vfloat uncor_amod2(uncor_pline.amod.lane<2>());
103
104 vfloatacc samec_errv = vfloatacc::zero();
105 vfloat samec_bs0(samec_pline.bs.lane<0>());
106 vfloat samec_bs1(samec_pline.bs.lane<1>());
107 vfloat samec_bs2(samec_pline.bs.lane<2>());
108
109 vfloatacc rgbl_errv = vfloatacc::zero();
110 vfloat rgbl_bs0(rgbl_pline.bs.lane<0>());
111 vfloat rgbl_bs1(rgbl_pline.bs.lane<1>());
112 vfloat rgbl_bs2(rgbl_pline.bs.lane<2>());
113
114 vfloat rgbl_amod0(rgbl_pline.amod.lane<0>());
115 vfloat rgbl_amod1(rgbl_pline.amod.lane<1>());
116 vfloat rgbl_amod2(rgbl_pline.amod.lane<2>());
117
118 vfloatacc l_errv = vfloatacc::zero();
119 vfloat l_bs0(l_pline.bs.lane<0>());
120 vfloat l_bs1(l_pline.bs.lane<1>());
121 vfloat l_bs2(l_pline.bs.lane<2>());
122
123 vfloat one_third(1/3.0f, 1/3.0f, 1/3.0f, 1/3.0f);
124 vfloat uncor_errv0 = vfloat::zero();
125 vfloat uncor_errv1 = vfloat::zero();
126 vfloat uncor_errv2 = vfloat::zero();
127 vfloat samec_errv0 = vfloat::zero();
128 vfloat samec_errv1 = vfloat::zero();
129 vfloat samec_errv2 = vfloat::zero();
130 vfloat rgbl_errv0 = vfloat::zero();
131 vfloat rgbl_errv1 = vfloat::zero();
132 vfloat rgbl_errv2 = vfloat::zero();
133 vfloat l_errv0 = vfloat::zero();
134 vfloat l_errv1 = vfloat::zero();
135 vfloat l_errv2 = vfloat::zero();
136
137 unsigned int i = 0;
138 for (; i + ASTCENC_SIMD_WIDTH <= texel_count; i += ASTCENC_SIMD_WIDTH)
139 {
140 #ifdef ASTCENC_USE_COMMON_GATHERF
141 const uint8_t* tix = texel_indexes + i;
142 #else
143 vint tix(texel_indexes + i);
144 #endif
145
146 // Compute the error that arises from just ditching alpha
147 vfloat data_a = gatherf(blk.data_a, tix);
148 vfloat alpha_diff = data_a - default_a;
149 alpha_diff = alpha_diff * alpha_diff;
150
151 haccumulate(a_drop_errv, alpha_diff);
152
153 vfloat data_r = gatherf(blk.data_r, tix);
154 vfloat data_g = gatherf(blk.data_g, tix);
155 vfloat data_b = gatherf(blk.data_b, tix);
156
157 vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third;
158 vfloat data_rgb_0 = data_rgb_avg - data_r;
159 vfloat data_rgb_1 = data_rgb_avg - data_g;
160 vfloat data_rgb_2 = data_rgb_avg - data_b;
161
162 // Compute uncorrelated error
163 vfloat param = data_r * uncor_bs0
164 + data_g * uncor_bs1
165 + data_b * uncor_bs2;
166
167 vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r;
168 vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g;
169 vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b;
170
171 haccumulate(uncor_errv0, dist0 * dist0);
172 haccumulate(uncor_errv1, dist1 * dist1);
173 haccumulate(uncor_errv2, dist2 * dist2);
174
175 // Compute same chroma error - no "amod", its always zero
176 param = data_r * samec_bs0
177 + data_g * samec_bs1
178 + data_b * samec_bs2;
179
180 dist0 = (param * samec_bs0) - data_r;
181 dist1 = (param * samec_bs1) - data_g;
182 dist2 = (param * samec_bs2) - data_b;
183
184 haccumulate(uncor_errv0, dist0 * dist0);
185 haccumulate(uncor_errv1, dist1 * dist1);
186 haccumulate(uncor_errv2, dist2 * dist2);
187
188 // Compute rgbl error
189 dist0 = rgbl_amod0 + data_rgb_0;
190 dist1 = rgbl_amod1 + data_rgb_1;
191 dist2 = rgbl_amod2 + data_rgb_2;
192
193 haccumulate(rgbl_errv0, dist0 * dist0);
194 haccumulate(rgbl_errv1, dist1 * dist1);
195 haccumulate(rgbl_errv2, dist2 * dist2);
196
197 // Compute luma error - no "amod", its always zero
198 dist0 = data_rgb_0;
199 dist1 = data_rgb_1;
200 dist2 = data_rgb_2;
201
202 haccumulate(l_errv0, dist0 * dist0);
203 haccumulate(l_errv1, dist1 * dist1);
204 haccumulate(l_errv2, dist2 * dist2);
205 }
206
207 uncor_errv = uncor_errv0 * ews.lane<0>() + uncor_errv1 * ews.lane<1>() + uncor_errv2 * ews.lane<2>(); // channel 0,1,2
208 samec_errv = samec_errv0 * ews.lane<0>() + samec_errv1 * ews.lane<1>() + samec_errv2 * ews.lane<2>(); // channel 0,1,2
209 rgbl_errv = rgbl_errv0 * ews.lane<0>() + rgbl_errv1 * ews.lane<1>() + rgbl_errv2 * ews.lane<2>(); // channel 0,1,2
210 l_errv = l_errv0 * ews.lane<0>() + l_errv1 * ews.lane<1>() + l_errv2 * ews.lane<2>(); // channel 0,1,2
211
212 if (i < texel_count)
213 {
214 vint lane_ids = vint::lane_id() + i;
215 vint tix(texel_indexes + i);
216
217 vmask mask = lane_ids < vint(texel_count);
218 lane_ids += vint(ASTCENC_SIMD_WIDTH);
219
220 // Compute the error that arises from just ditching alpha
221 vfloat data_a = gatherf(blk.data_a, tix);
222 vfloat alpha_diff = data_a - default_a;
223 alpha_diff = alpha_diff * alpha_diff;
224
225 haccumulate(a_drop_errv, alpha_diff, mask);
226
227 vfloat data_r = gatherf(blk.data_r, tix);
228 vfloat data_g = gatherf(blk.data_g, tix);
229 vfloat data_b = gatherf(blk.data_b, tix);
230
231 vfloat data_rgb_avg = (data_r + data_g + data_b) * one_third;
232 vfloat data_rgb_0 = data_rgb_avg - data_r;
233 vfloat data_rgb_1 = data_rgb_avg - data_g;
234 vfloat data_rgb_2 = data_rgb_avg - data_b;
235
236 // Compute uncorrelated error
237 vfloat param = data_r * uncor_bs0
238 + data_g * uncor_bs1
239 + data_b * uncor_bs2;
240
241 vfloat dist0 = (uncor_amod0 + param * uncor_bs0) - data_r;
242 vfloat dist1 = (uncor_amod1 + param * uncor_bs1) - data_g;
243 vfloat dist2 = (uncor_amod2 + param * uncor_bs2) - data_b;
244
245 vfloat error = dist0 * dist0 * ews.lane<0>()
246 + dist1 * dist1 * ews.lane<1>()
247 + dist2 * dist2 * ews.lane<2>();
248
249 haccumulate(uncor_errv, error, mask);
250
251 // Compute same chroma error - no "amod", its always zero
252 param = data_r * samec_bs0
253 + data_g * samec_bs1
254 + data_b * samec_bs2;
255
256 dist0 = (param * samec_bs0) - data_r;
257 dist1 = (param * samec_bs1) - data_g;
258 dist2 = (param * samec_bs2) - data_b;
259
260 error = dist0 * dist0 * ews.lane<0>()
261 + dist1 * dist1 * ews.lane<1>()
262 + dist2 * dist2 * ews.lane<2>();
263
264 haccumulate(samec_errv, error, mask);
265
266 // Compute rgbl error
267 dist0 = rgbl_amod0 + data_rgb_0;
268 dist1 = rgbl_amod1 + data_rgb_1;
269 dist2 = rgbl_amod2 + data_rgb_2;
270
271 error = dist0 * dist0 * ews.lane<0>()
272 + dist1 * dist1 * ews.lane<1>()
273 + dist2 * dist2 * ews.lane<2>();
274
275 haccumulate(rgbl_errv, error, mask);
276
277 // Compute luma error - no "amod", its always zero
278 dist0 = data_rgb_0;
279 dist1 = data_rgb_1;
280 dist2 = data_rgb_2;
281
282 error = dist0 * dist0 * ews.lane<0>()
283 + dist1 * dist1 * ews.lane<1>()
284 + dist2 * dist2 * ews.lane<2>();
285
286 haccumulate(l_errv, error, mask);
287 }
288
289 a_drop_err = hadd_s(a_drop_errv) * ews.lane<3>();
290 uncor_err = hadd_s(uncor_errv);
291 samec_err = hadd_s(samec_errv);
292 rgbl_err = hadd_s(rgbl_errv);
293 l_err = hadd_s(l_errv);
294 }
295
296 /**
297 * @brief For a given set of input colors and partitioning determine endpoint encode errors.
298 *
299 * This function determines the color error that results from RGB-scale encoding (LDR only),
300 * RGB-lumashift encoding (HDR only), luminance-encoding, and alpha drop. Also determines whether
301 * the endpoints are eligible for offset encoding or blue-contraction
302 *
303 * @param blk The image block.
304 * @param pi The partition info data.
305 * @param ep The idealized endpoints.
306 * @param[out] eci The resulting encoding choice error metrics.
307 */
308 static void compute_encoding_choice_errors(
309 QualityProfile privateProfile,
310 const image_block& blk,
311 const partition_info& pi,
312 const endpoints& ep,
313 encoding_choice_errors eci[BLOCK_MAX_PARTITIONS])
314 {
315 int partition_count = pi.partition_count;
316 promise(partition_count > 0);
317
318 partition_metrics *pms = reinterpret_cast<partition_metrics *>(&blk.pms[0]);
319
320 if (!blk.is_constant_channel(3) || (partition_count != 1 && privateProfile == HIGH_QUALITY_PROFILE))
321 {
322 compute_avgs_and_dirs_3_comp_rgb(pi, blk, pms);
323 }
324
325 for (int i = 0; i < partition_count; i++)
326 {
327 partition_metrics& pm = pms[i];
328
329 line3 uncor_rgb_lines;
330 line3 samec_rgb_lines; // for LDR-RGB-scale
331 line3 rgb_luma_lines; // for HDR-RGB-scale
332
333 processed_line3 uncor_rgb_plines;
334 processed_line3 samec_rgb_plines;
335 processed_line3 rgb_luma_plines;
336 processed_line3 luminance_plines;
337
338 float uncorr_rgb_error;
339 float samechroma_rgb_error;
340 float rgb_luma_error;
341 float luminance_rgb_error;
342 float alpha_drop_error;
343
344 uncor_rgb_lines.a = pm.avg;
345 uncor_rgb_lines.b = normalize_safe(pm.dir, unit3());
346
347 samec_rgb_lines.a = vfloat4::zero();
348 samec_rgb_lines.b = normalize_safe(pm.avg, unit3());
349
350 rgb_luma_lines.a = pm.avg;
351 rgb_luma_lines.b = unit3();
352
353 uncor_rgb_plines.amod = uncor_rgb_lines.a - uncor_rgb_lines.b * dot3(uncor_rgb_lines.a, uncor_rgb_lines.b);
354 uncor_rgb_plines.bs = uncor_rgb_lines.b;
355
356 // Same chroma always goes though zero, so this is simpler than the others
357 samec_rgb_plines.amod = vfloat4::zero();
358 samec_rgb_plines.bs = samec_rgb_lines.b;
359
360 rgb_luma_plines.amod = rgb_luma_lines.a - rgb_luma_lines.b * dot3(rgb_luma_lines.a, rgb_luma_lines.b);
361 rgb_luma_plines.bs = rgb_luma_lines.b;
362
363 // Luminance always goes though zero, so this is simpler than the others
364 luminance_plines.amod = vfloat4::zero();
365 luminance_plines.bs = unit3();
366
367 compute_error_squared_rgb_single_partition(
368 pi, i, blk,
369 uncor_rgb_plines, uncorr_rgb_error,
370 samec_rgb_plines, samechroma_rgb_error,
371 rgb_luma_plines, rgb_luma_error,
372 luminance_plines, luminance_rgb_error,
373 alpha_drop_error);
374
375 // Determine if we can offset encode RGB lanes
376 vfloat4 endpt0 = ep.endpt0[i];
377 vfloat4 endpt1 = ep.endpt1[i];
378 vfloat4 endpt_diff = abs(endpt1 - endpt0);
379 vmask4 endpt_can_offset = endpt_diff < vfloat4(0.12f * 65535.0f);
380 bool can_offset_encode = (mask(endpt_can_offset) & 0x7) == 0x7;
381
382 // Store out the settings
383 eci[i].rgb_scale_error = (samechroma_rgb_error - uncorr_rgb_error) * 0.7f; // empirical
384 eci[i].rgb_luma_error = (rgb_luma_error - uncorr_rgb_error) * 1.5f; // wild guess
385 eci[i].luminance_error = (luminance_rgb_error - uncorr_rgb_error) * 3.0f; // empirical
386 eci[i].alpha_drop_error = alpha_drop_error * 3.0f;
387 eci[i].can_offset_encode = can_offset_encode;
388 eci[i].can_blue_contract = !blk.is_luminance();
389 }
390 }
391
392 /**
393 * @brief For a given partition compute the error for every endpoint integer count and quant level.
394 *
395 * @param encode_hdr_rgb @c true if using HDR for RGB, @c false for LDR.
396 * @param encode_hdr_alpha @c true if using HDR for alpha, @c false for LDR.
397 * @param partition_index The partition index.
398 * @param pi The partition info.
399 * @param eci The encoding choice error metrics.
400 * @param ep The idealized endpoints.
401 * @param error_weight The resulting encoding choice error metrics.
402 * @param[out] best_error The best error for each integer count and quant level.
403 * @param[out] format_of_choice The preferred endpoint format for each integer count and quant level.
404 */
405 static void compute_color_error_for_every_integer_count_and_quant_level(
406 bool encode_hdr_rgb,
407 bool encode_hdr_alpha,
408 int partition_index,
409 const partition_info& pi,
410 const encoding_choice_errors& eci,
411 const endpoints& ep,
412 vfloat4 error_weight,
413 float best_error[21][4],
414 uint8_t format_of_choice[21][4]
415 ) {
416 int partition_size = pi.partition_texel_count[partition_index];
417
418 static const float baseline_quant_error[21 - QUANT_6] {
419 (65536.0f * 65536.0f / 18.0f) / (5 * 5),
420 (65536.0f * 65536.0f / 18.0f) / (7 * 7),
421 (65536.0f * 65536.0f / 18.0f) / (9 * 9),
422 (65536.0f * 65536.0f / 18.0f) / (11 * 11),
423 (65536.0f * 65536.0f / 18.0f) / (15 * 15),
424 (65536.0f * 65536.0f / 18.0f) / (19 * 19),
425 (65536.0f * 65536.0f / 18.0f) / (23 * 23),
426 (65536.0f * 65536.0f / 18.0f) / (31 * 31),
427 (65536.0f * 65536.0f / 18.0f) / (39 * 39),
428 (65536.0f * 65536.0f / 18.0f) / (47 * 47),
429 (65536.0f * 65536.0f / 18.0f) / (63 * 63),
430 (65536.0f * 65536.0f / 18.0f) / (79 * 79),
431 (65536.0f * 65536.0f / 18.0f) / (95 * 95),
432 (65536.0f * 65536.0f / 18.0f) / (127 * 127),
433 (65536.0f * 65536.0f / 18.0f) / (159 * 159),
434 (65536.0f * 65536.0f / 18.0f) / (191 * 191),
435 (65536.0f * 65536.0f / 18.0f) / (255 * 255)
436 };
437
438 vfloat4 ep0 = ep.endpt0[partition_index];
439 vfloat4 ep1 = ep.endpt1[partition_index];
440
441 float ep1_min = hmin_rgb_s(ep1);
442 ep1_min = astc::max(ep1_min, 0.0f);
443
444 float error_weight_rgbsum = hadd_rgb_s(error_weight);
445
446 float range_upper_limit_rgb = encode_hdr_rgb ? 61440.0f : 65535.0f;
447 float range_upper_limit_alpha = encode_hdr_alpha ? 61440.0f : 65535.0f;
448
449 // It is possible to get endpoint colors significantly outside [0,upper-limit] even if the
450 // input data are safely contained in [0,upper-limit]; we need to add an error term for this
451 vfloat4 offset(range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_rgb, range_upper_limit_alpha);
452 vfloat4 ep0_range_error_high = max(ep0 - offset, 0.0f);
453 vfloat4 ep1_range_error_high = max(ep1 - offset, 0.0f);
454
455 vfloat4 ep0_range_error_low = min(ep0, 0.0f);
456 vfloat4 ep1_range_error_low = min(ep1, 0.0f);
457
458 vfloat4 sum_range_error =
459 (ep0_range_error_low * ep0_range_error_low) +
460 (ep1_range_error_low * ep1_range_error_low) +
461 (ep0_range_error_high * ep0_range_error_high) +
462 (ep1_range_error_high * ep1_range_error_high);
463
464 float rgb_range_error = dot3_s(sum_range_error, error_weight)
465 * 0.5f * static_cast<float>(partition_size);
466 float alpha_range_error = sum_range_error.lane<3>() * error_weight.lane<3>()
467 * 0.5f * static_cast<float>(partition_size);
468
469 if (encode_hdr_rgb)
470 {
471
472 // Collect some statistics
473 float af, cf;
474 if (ep1.lane<0>() > ep1.lane<1>() && ep1.lane<0>() > ep1.lane<2>())
475 {
476 af = ep1.lane<0>();
477 cf = ep1.lane<0>() - ep0.lane<0>();
478 }
479 else if (ep1.lane<1>() > ep1.lane<2>())
480 {
481 af = ep1.lane<1>();
482 cf = ep1.lane<1>() - ep0.lane<1>();
483 }
484 else
485 {
486 af = ep1.lane<2>();
487 cf = ep1.lane<2>() - ep0.lane<2>();
488 }
489
490 // Estimate of color-component spread in high endpoint color
491 float bf = af - ep1_min;
492 vfloat4 prd = (ep1 - vfloat4(cf)).swz<0, 1, 2>();
493 vfloat4 pdif = prd - ep0.swz<0, 1, 2>();
494 // Estimate of color-component spread in low endpoint color
495 float df = hmax_s(abs(pdif));
496
497 int b = static_cast<int>(bf);
498 int c = static_cast<int>(cf);
499 int d = static_cast<int>(df);
500
501 // Determine which one of the 6 submodes is likely to be used in case of an RGBO-mode
502 int rgbo_mode = 5; // 7 bits per component
503 // mode 4: 8 7 6
504 if (b < 32768 && c < 16384)
505 {
506 rgbo_mode = 4;
507 }
508
509 // mode 3: 9 6 7
510 if (b < 8192 && c < 16384)
511 {
512 rgbo_mode = 3;
513 }
514
515 // mode 2: 10 5 8
516 if (b < 2048 && c < 16384)
517 {
518 rgbo_mode = 2;
519 }
520
521 // mode 1: 11 6 5
522 if (b < 2048 && c < 1024)
523 {
524 rgbo_mode = 1;
525 }
526
527 // mode 0: 11 5 7
528 if (b < 1024 && c < 4096)
529 {
530 rgbo_mode = 0;
531 }
532
533 // Determine which one of the 9 submodes is likely to be used in case of an RGB-mode.
534 int rgb_mode = 8; // 8 bits per component, except 7 bits for blue
535
536 // mode 0: 9 7 6 7
537 if (b < 16384 && c < 8192 && d < 8192)
538 {
539 rgb_mode = 0;
540 }
541
542 // mode 1: 9 8 6 6
543 if (b < 32768 && c < 8192 && d < 4096)
544 {
545 rgb_mode = 1;
546 }
547
548 // mode 2: 10 6 7 7
549 if (b < 4096 && c < 8192 && d < 4096)
550 {
551 rgb_mode = 2;
552 }
553
554 // mode 3: 10 7 7 6
555 if (b < 8192 && c < 8192 && d < 2048)
556 {
557 rgb_mode = 3;
558 }
559
560 // mode 4: 11 8 6 5
561 if (b < 8192 && c < 2048 && d < 512)
562 {
563 rgb_mode = 4;
564 }
565
566 // mode 5: 11 6 8 6
567 if (b < 2048 && c < 8192 && d < 1024)
568 {
569 rgb_mode = 5;
570 }
571
572 // mode 6: 12 7 7 5
573 if (b < 2048 && c < 2048 && d < 256)
574 {
575 rgb_mode = 6;
576 }
577
578 // mode 7: 12 6 7 6
579 if (b < 1024 && c < 2048 && d < 512)
580 {
581 rgb_mode = 7;
582 }
583
584 static const float rgbo_error_scales[6] { 4.0f, 4.0f, 16.0f, 64.0f, 256.0f, 1024.0f };
585 static const float rgb_error_scales[9] { 64.0f, 64.0f, 16.0f, 16.0f, 4.0f, 4.0f, 1.0f, 1.0f, 384.0f };
586
587 float mode7mult = rgbo_error_scales[rgbo_mode] * 0.0015f; // Empirically determined ....
588 float mode11mult = rgb_error_scales[rgb_mode] * 0.010f; // Empirically determined ....
589
590
591 float lum_high = hadd_rgb_s(ep1) * (1.0f / 3.0f);
592 float lum_low = hadd_rgb_s(ep0) * (1.0f / 3.0f);
593 float lumdif = lum_high - lum_low;
594 float mode23mult = lumdif < 960 ? 4.0f : lumdif < 3968 ? 16.0f : 128.0f;
595
596 mode23mult *= 0.0005f; // Empirically determined ....
597
598 // Pick among the available HDR endpoint modes
599 for (int i = QUANT_2; i < QUANT_16; i++)
600 {
601 best_error[i][3] = ERROR_CALC_DEFAULT;
602 best_error[i][2] = ERROR_CALC_DEFAULT;
603 best_error[i][1] = ERROR_CALC_DEFAULT;
604 best_error[i][0] = ERROR_CALC_DEFAULT;
605
606 format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
607 format_of_choice[i][2] = FMT_HDR_RGB;
608 format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
609 format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
610 }
611
612 for (int i = QUANT_16; i <= QUANT_256; i++)
613 {
614 // The base_quant_error should depend on the scale-factor that would be used during
615 // actual encode of the color value
616
617 float base_quant_error = baseline_quant_error[i - QUANT_6] * static_cast<float>(partition_size);
618 float rgb_quantization_error = error_weight_rgbsum * base_quant_error * 2.0f;
619 float alpha_quantization_error = error_weight.lane<3>() * base_quant_error * 2.0f;
620 float rgba_quantization_error = rgb_quantization_error + alpha_quantization_error;
621
622 // For 8 integers, we have two encodings: one with HDR A and another one with LDR A
623
624 float full_hdr_rgba_error = rgba_quantization_error + rgb_range_error + alpha_range_error;
625 best_error[i][3] = full_hdr_rgba_error;
626 format_of_choice[i][3] = static_cast<uint8_t>(encode_hdr_alpha ? FMT_HDR_RGBA : FMT_HDR_RGB_LDR_ALPHA);
627
628 // For 6 integers, we have one HDR-RGB encoding
629 float full_hdr_rgb_error = (rgb_quantization_error * mode11mult) + rgb_range_error + eci.alpha_drop_error;
630 best_error[i][2] = full_hdr_rgb_error;
631 format_of_choice[i][2] = FMT_HDR_RGB;
632
633 // For 4 integers, we have one HDR-RGB-Scale encoding
634 float hdr_rgb_scale_error = (rgb_quantization_error * mode7mult) + rgb_range_error + eci.alpha_drop_error + eci.rgb_luma_error;
635
636 best_error[i][1] = hdr_rgb_scale_error;
637 format_of_choice[i][1] = FMT_HDR_RGB_SCALE;
638
639 // For 2 integers, we assume luminance-with-large-range
640 float hdr_luminance_error = (rgb_quantization_error * mode23mult) + rgb_range_error + eci.alpha_drop_error + eci.luminance_error;
641 best_error[i][0] = hdr_luminance_error;
642 format_of_choice[i][0] = FMT_HDR_LUMINANCE_LARGE_RANGE;
643 }
644 }
645 else
646 {
647 for (int i = QUANT_2; i < QUANT_6; i++)
648 {
649 best_error[i][3] = ERROR_CALC_DEFAULT;
650 best_error[i][2] = ERROR_CALC_DEFAULT;
651 best_error[i][1] = ERROR_CALC_DEFAULT;
652 best_error[i][0] = ERROR_CALC_DEFAULT;
653
654 format_of_choice[i][3] = FMT_RGBA;
655 format_of_choice[i][2] = FMT_RGB;
656 format_of_choice[i][1] = FMT_RGB_SCALE;
657 format_of_choice[i][0] = FMT_LUMINANCE;
658 }
659
660 float base_quant_error_rgb = error_weight_rgbsum * static_cast<float>(partition_size);
661 float base_quant_error_a = error_weight.lane<3>() * static_cast<float>(partition_size);
662 float base_quant_error_rgba = base_quant_error_rgb + base_quant_error_a;
663
664 float error_scale_bc_rgba = eci.can_blue_contract ? 0.625f : 1.0f;
665 float error_scale_oe_rgba = eci.can_offset_encode ? 0.5f : 1.0f;
666
667 float error_scale_bc_rgb = eci.can_blue_contract ? 0.5f : 1.0f;
668 float error_scale_oe_rgb = eci.can_offset_encode ? 0.25f : 1.0f;
669
670 // Pick among the available LDR endpoint modes
671 for (int i = QUANT_6; i <= QUANT_256; i++)
672 {
673 // Offset encoding not possible at higher quant levels
674 if (i >= QUANT_192)
675 {
676 error_scale_oe_rgba = 1.0f;
677 error_scale_oe_rgb = 1.0f;
678 }
679
680 float base_quant_error = baseline_quant_error[i - QUANT_6];
681 float quant_error_rgb = base_quant_error_rgb * base_quant_error;
682 float quant_error_rgba = base_quant_error_rgba * base_quant_error;
683
684 // 8 integers can encode as RGBA+RGBA
685 float full_ldr_rgba_error = quant_error_rgba
686 * error_scale_bc_rgba
687 * error_scale_oe_rgba
688 + rgb_range_error
689 + alpha_range_error;
690
691 best_error[i][3] = full_ldr_rgba_error;
692 format_of_choice[i][3] = FMT_RGBA;
693
694 // 6 integers can encode as RGB+RGB or RGBS+AA
695 float full_ldr_rgb_error = quant_error_rgb
696 * error_scale_bc_rgb
697 * error_scale_oe_rgb
698 + rgb_range_error
699 + eci.alpha_drop_error;
700
701 float rgbs_alpha_error = quant_error_rgba
702 + eci.rgb_scale_error
703 + rgb_range_error
704 + alpha_range_error;
705
706 if (rgbs_alpha_error < full_ldr_rgb_error)
707 {
708 best_error[i][2] = rgbs_alpha_error;
709 format_of_choice[i][2] = FMT_RGB_SCALE_ALPHA;
710 }
711 else
712 {
713 best_error[i][2] = full_ldr_rgb_error;
714 format_of_choice[i][2] = FMT_RGB;
715 }
716
717 // 4 integers can encode as RGBS or LA+LA
718 float ldr_rgbs_error = quant_error_rgb
719 + rgb_range_error
720 + eci.alpha_drop_error
721 + eci.rgb_scale_error;
722
723 float lum_alpha_error = quant_error_rgba
724 + rgb_range_error
725 + alpha_range_error
726 + eci.luminance_error;
727
728 if (ldr_rgbs_error < lum_alpha_error)
729 {
730 best_error[i][1] = ldr_rgbs_error;
731 format_of_choice[i][1] = FMT_RGB_SCALE;
732 }
733 else
734 {
735 best_error[i][1] = lum_alpha_error;
736 format_of_choice[i][1] = FMT_LUMINANCE_ALPHA;
737 }
738
739 // 2 integers can encode as L+L
740 float luminance_error = quant_error_rgb
741 + rgb_range_error
742 + eci.alpha_drop_error
743 + eci.luminance_error;
744
745 best_error[i][0] = luminance_error;
746 format_of_choice[i][0] = FMT_LUMINANCE;
747 }
748 }
749 }
750
751 /**
752 * @brief For one partition compute the best format and quantization for a given bit count.
753 *
754 * @param best_combined_error The best error for each quant level and integer count.
755 * @param best_combined_format The best format for each quant level and integer count.
756 * @param bits_available The number of bits available for encoding.
757 * @param[out] best_quant_level The output best color quant level.
758 * @param[out] best_format The output best color format.
759 *
760 * @return The output error for the best pairing.
761 */
762 static float one_partition_find_best_combination_for_bitcount(
763 QualityProfile privateProfile,
764 const float best_combined_error[21][4],
765 const uint8_t best_combined_format[21][4],
766 int bits_available,
767 uint8_t& best_quant_level,
768 uint8_t& best_format
769 ) {
770 int best_integer_count = 0;
771 float best_integer_count_error = ERROR_CALC_DEFAULT;
772
773 for (int integer_count = 1; integer_count <= 4; integer_count++)
774 {
775 if (privateProfile != HIGH_QUALITY_PROFILE)
776 {
777 integer_count = 4; // constant 4 bit count for HIGH_SPEED_PROFILE mode
778 }
779 // Compute the quantization level for a given number of integers and a given number of bits
780 int quant_level = quant_mode_table[integer_count][bits_available];
781
782 // Don't have enough bits to represent a given endpoint format at all!
783 if (quant_level < QUANT_6)
784 {
785 continue;
786 }
787
788 float integer_count_error = best_combined_error[quant_level][integer_count - 1];
789 if (integer_count_error < best_integer_count_error)
790 {
791 best_integer_count_error = integer_count_error;
792 best_integer_count = integer_count - 1;
793 }
794 }
795
796 int ql = quant_mode_table[best_integer_count + 1][bits_available];
797
798 best_quant_level = static_cast<uint8_t>(ql);
799 if (privateProfile != HIGH_QUALITY_PROFILE) // keep openSource code style
800 {
801 best_format = FMT_RGBA;
802 }
803 else
804 {
805 best_format = FMT_LUMINANCE;
806
807 if (ql >= QUANT_6)
808 {
809 best_format = best_combined_format[ql][best_integer_count];
810 }
811 }
812
813 return best_integer_count_error;
814 }
815
816 /**
817 * @brief For 2 partitions compute the best format combinations for every pair of quant mode and integer count.
818 *
819 * @param best_error The best error for a single endpoint quant level and integer count.
820 * @param best_format The best format for a single endpoint quant level and integer count.
821 * @param[out] best_combined_error The best combined error pairings for the 2 partitions.
822 * @param[out] best_combined_format The best combined format pairings for the 2 partitions.
823 */
824 static void two_partitions_find_best_combination_for_every_quantization_and_integer_count(
825 const float best_error[2][21][4], // indexed by (partition, quant-level, integer-pair-count-minus-1)
826 const uint8_t best_format[2][21][4],
827 float best_combined_error[21][7], // indexed by (quant-level, integer-pair-count-minus-2)
828 uint8_t best_combined_format[21][7][2]
829 ) {
830 for (int i = QUANT_2; i <= QUANT_256; i++)
831 {
832 for (int j = 0; j < 7; j++)
833 {
834 best_combined_error[i][j] = ERROR_CALC_DEFAULT;
835 }
836 }
837
838 for (int quant = QUANT_6; quant <= QUANT_256; quant++)
839 {
840 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair
841 {
842 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair
843 {
844 int low2 = astc::min(i, j);
845 int high2 = astc::max(i, j);
846 if ((high2 - low2) > 1)
847 {
848 continue;
849 }
850
851 int intcnt = i + j;
852 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j], 1e10f);
853 if (errorterm <= best_combined_error[quant][intcnt])
854 {
855 best_combined_error[quant][intcnt] = errorterm;
856 best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
857 best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
858 }
859 }
860 }
861 }
862 }
863
864 /**
865 * @brief For 2 partitions compute the best format and quantization for a given bit count.
866 *
867 * @param best_combined_error The best error for each quant level and integer count.
868 * @param best_combined_format The best format for each quant level and integer count.
869 * @param bits_available The number of bits available for encoding.
870 * @param[out] best_quant_level The output best color quant level.
871 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
872 * @param[out] best_formats The output best color formats.
873 *
874 * @return The output error for the best pairing.
875 */
876 static float two_partitions_find_best_combination_for_bitcount(
877 unsigned int privateProfile,
878 float best_combined_error[21][7],
879 uint8_t best_combined_format[21][7][2],
880 int bits_available,
881 uint8_t& best_quant_level,
882 uint8_t& best_quant_level_mod,
883 uint8_t* best_formats
884 ) {
885 int best_integer_count = 0;
886 float best_integer_count_error = ERROR_CALC_DEFAULT;
887 int integer_count = 2;
888 if (privateProfile != HIGH_QUALITY_PROFILE)
889 {
890 integer_count = 8; // constant 8 bit count
891 }
892
893 for (; integer_count <= 8; integer_count++)
894 {
895 // Compute the quantization level for a given number of integers and a given number of bits
896 int quant_level = quant_mode_table[integer_count][bits_available];
897
898 // Don't have enough bits to represent a given endpoint format at all!
899 if (quant_level < QUANT_6)
900 {
901 break;
902 }
903
904 float integer_count_error = best_combined_error[quant_level][integer_count - 2];
905 if (integer_count_error < best_integer_count_error)
906 {
907 best_integer_count_error = integer_count_error;
908 best_integer_count = integer_count;
909 }
910 }
911
912 int ql = quant_mode_table[best_integer_count][bits_available];
913 int ql_mod = quant_mode_table[best_integer_count][bits_available + 2];
914
915 best_quant_level = static_cast<uint8_t>(ql);
916 best_quant_level_mod = static_cast<uint8_t>(ql_mod);
917
918 if (ql >= QUANT_6)
919 {
920 for (int i = 0; i < 2; i++)
921 {
922 best_formats[i] = best_combined_format[ql][best_integer_count - 2][i];
923 }
924 }
925 else
926 {
927 for (int i = 0; i < 2; i++)
928 {
929 best_formats[i] = FMT_LUMINANCE;
930 }
931 }
932
933 return best_integer_count_error;
934 }
935
936 /**
937 * @brief For 3 partitions compute the best format combinations for every pair of quant mode and integer count.
938 *
939 * @param best_error The best error for a single endpoint quant level and integer count.
940 * @param best_format The best format for a single endpoint quant level and integer count.
941 * @param[out] best_combined_error The best combined error pairings for the 3 partitions.
942 * @param[out] best_combined_format The best combined format pairings for the 3 partitions.
943 */
944 static void three_partitions_find_best_combination_for_every_quantization_and_integer_count(
945 const float best_error[3][21][4], // indexed by (partition, quant-level, integer-count)
946 const uint8_t best_format[3][21][4],
947 float best_combined_error[21][10],
948 uint8_t best_combined_format[21][10][3]
949 ) {
950 for (int i = QUANT_2; i <= QUANT_256; i++)
951 {
952 for (int j = 0; j < 10; j++)
953 {
954 best_combined_error[i][j] = ERROR_CALC_DEFAULT;
955 }
956 }
957
958 for (int quant = QUANT_6; quant <= QUANT_256; quant++)
959 {
960 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair
961 {
962 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair
963 {
964 int low2 = astc::min(i, j);
965 int high2 = astc::max(i, j);
966 if ((high2 - low2) > 1)
967 {
968 continue;
969 }
970
971 for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair
972 {
973 int low3 = astc::min(k, low2);
974 int high3 = astc::max(k, high2);
975 if ((high3 - low3) > 1)
976 {
977 continue;
978 }
979
980 int intcnt = i + j + k;
981 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k], 1e10f);
982 if (errorterm <= best_combined_error[quant][intcnt])
983 {
984 best_combined_error[quant][intcnt] = errorterm;
985 best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
986 best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
987 best_combined_format[quant][intcnt][2] = best_format[2][quant][k];
988 }
989 }
990 }
991 }
992 }
993 }
994
995 /**
996 * @brief For 3 partitions compute the best format and quantization for a given bit count.
997 *
998 * @param best_combined_error The best error for each quant level and integer count.
999 * @param best_combined_format The best format for each quant level and integer count.
1000 * @param bits_available The number of bits available for encoding.
1001 * @param[out] best_quant_level The output best color quant level.
1002 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
1003 * @param[out] best_formats The output best color formats.
1004 *
1005 * @return The output error for the best pairing.
1006 */
1007 static float three_partitions_find_best_combination_for_bitcount(
1008 const float best_combined_error[21][10],
1009 const uint8_t best_combined_format[21][10][3],
1010 int bits_available,
1011 uint8_t& best_quant_level,
1012 uint8_t& best_quant_level_mod,
1013 uint8_t* best_formats
1014 ) {
1015 int best_integer_count = 0;
1016 float best_integer_count_error = ERROR_CALC_DEFAULT;
1017
1018 for (int integer_count = 3; integer_count <= 9; integer_count++)
1019 {
1020 // Compute the quantization level for a given number of integers and a given number of bits
1021 int quant_level = quant_mode_table[integer_count][bits_available];
1022
1023 // Don't have enough bits to represent a given endpoint format at all!
1024 if (quant_level < QUANT_6)
1025 {
1026 break;
1027 }
1028
1029 float integer_count_error = best_combined_error[quant_level][integer_count - 3];
1030 if (integer_count_error < best_integer_count_error)
1031 {
1032 best_integer_count_error = integer_count_error;
1033 best_integer_count = integer_count;
1034 }
1035 }
1036
1037 int ql = quant_mode_table[best_integer_count][bits_available];
1038 int ql_mod = quant_mode_table[best_integer_count][bits_available + 5];
1039
1040 best_quant_level = static_cast<uint8_t>(ql);
1041 best_quant_level_mod = static_cast<uint8_t>(ql_mod);
1042
1043 if (ql >= QUANT_6)
1044 {
1045 for (int i = 0; i < 3; i++)
1046 {
1047 best_formats[i] = best_combined_format[ql][best_integer_count - 3][i];
1048 }
1049 }
1050 else
1051 {
1052 for (int i = 0; i < 3; i++)
1053 {
1054 best_formats[i] = FMT_LUMINANCE;
1055 }
1056 }
1057
1058 return best_integer_count_error;
1059 }
1060
1061 /**
1062 * @brief For 4 partitions compute the best format combinations for every pair of quant mode and integer count.
1063 *
1064 * @param best_error The best error for a single endpoint quant level and integer count.
1065 * @param best_format The best format for a single endpoint quant level and integer count.
1066 * @param[out] best_combined_error The best combined error pairings for the 4 partitions.
1067 * @param[out] best_combined_format The best combined format pairings for the 4 partitions.
1068 */
1069 static void four_partitions_find_best_combination_for_every_quantization_and_integer_count(
1070 const float best_error[4][21][4], // indexed by (partition, quant-level, integer-count)
1071 const uint8_t best_format[4][21][4],
1072 float best_combined_error[21][13],
1073 uint8_t best_combined_format[21][13][4]
1074 ) {
1075 for (int i = QUANT_2; i <= QUANT_256; i++)
1076 {
1077 for (int j = 0; j < 13; j++)
1078 {
1079 best_combined_error[i][j] = ERROR_CALC_DEFAULT;
1080 }
1081 }
1082
1083 for (int quant = QUANT_6; quant <= QUANT_256; quant++)
1084 {
1085 for (int i = 0; i < 4; i++) // integer-count for first endpoint-pair
1086 {
1087 for (int j = 0; j < 4; j++) // integer-count for second endpoint-pair
1088 {
1089 int low2 = astc::min(i, j);
1090 int high2 = astc::max(i, j);
1091 if ((high2 - low2) > 1)
1092 {
1093 continue;
1094 }
1095
1096 for (int k = 0; k < 4; k++) // integer-count for third endpoint-pair
1097 {
1098 int low3 = astc::min(k, low2);
1099 int high3 = astc::max(k, high2);
1100 if ((high3 - low3) > 1)
1101 {
1102 continue;
1103 }
1104
1105 for (int l = 0; l < 4; l++) // integer-count for fourth endpoint-pair
1106 {
1107 int low4 = astc::min(l, low3);
1108 int high4 = astc::max(l, high3);
1109 if ((high4 - low4) > 1)
1110 {
1111 continue;
1112 }
1113
1114 int intcnt = i + j + k + l;
1115 float errorterm = astc::min(best_error[0][quant][i] + best_error[1][quant][j] + best_error[2][quant][k] + best_error[3][quant][l], 1e10f);
1116 if (errorterm <= best_combined_error[quant][intcnt])
1117 {
1118 best_combined_error[quant][intcnt] = errorterm;
1119 best_combined_format[quant][intcnt][0] = best_format[0][quant][i];
1120 best_combined_format[quant][intcnt][1] = best_format[1][quant][j];
1121 best_combined_format[quant][intcnt][2] = best_format[2][quant][k];
1122 best_combined_format[quant][intcnt][3] = best_format[3][quant][l];
1123 }
1124 }
1125 }
1126 }
1127 }
1128 }
1129 }
1130
1131 /**
1132 * @brief For 4 partitions compute the best format and quantization for a given bit count.
1133 *
1134 * @param best_combined_error The best error for each quant level and integer count.
1135 * @param best_combined_format The best format for each quant level and integer count.
1136 * @param bits_available The number of bits available for encoding.
1137 * @param[out] best_quant_level The output best color quant level.
1138 * @param[out] best_quant_level_mod The output best color quant level assuming two more bits are available.
1139 * @param[out] best_formats The output best color formats.
1140 *
1141 * @return best_error The output error for the best pairing.
1142 */
1143 static float four_partitions_find_best_combination_for_bitcount(
1144 const float best_combined_error[21][13],
1145 const uint8_t best_combined_format[21][13][4],
1146 int bits_available,
1147 uint8_t& best_quant_level,
1148 uint8_t& best_quant_level_mod,
1149 uint8_t* best_formats
1150 ) {
1151 int best_integer_count = 0;
1152 float best_integer_count_error = ERROR_CALC_DEFAULT;
1153
1154 for (int integer_count = 4; integer_count <= 9; integer_count++)
1155 {
1156 // Compute the quantization level for a given number of integers and a given number of bits
1157 int quant_level = quant_mode_table[integer_count][bits_available];
1158
1159 // Don't have enough bits to represent a given endpoint format at all!
1160 if (quant_level < QUANT_6)
1161 {
1162 break;
1163 }
1164
1165 float integer_count_error = best_combined_error[quant_level][integer_count - 4];
1166 if (integer_count_error < best_integer_count_error)
1167 {
1168 best_integer_count_error = integer_count_error;
1169 best_integer_count = integer_count;
1170 }
1171 }
1172
1173 int ql = quant_mode_table[best_integer_count][bits_available];
1174 int ql_mod = quant_mode_table[best_integer_count][bits_available + 8];
1175
1176 best_quant_level = static_cast<uint8_t>(ql);
1177 best_quant_level_mod = static_cast<uint8_t>(ql_mod);
1178
1179 if (ql >= QUANT_6)
1180 {
1181 for (int i = 0; i < 4; i++)
1182 {
1183 best_formats[i] = best_combined_format[ql][best_integer_count - 4][i];
1184 }
1185 }
1186 else
1187 {
1188 for (int i = 0; i < 4; i++)
1189 {
1190 best_formats[i] = FMT_LUMINANCE;
1191 }
1192 }
1193
1194 return best_integer_count_error;
1195 }
1196
1197 /* See header for documentation. */
1198 unsigned int compute_ideal_endpoint_formats(
1199 QualityProfile privateProfile,
1200 const partition_info& pi,
1201 const image_block& blk,
1202 const endpoints& ep,
1203 // bitcounts and errors computed for the various quantization methods
1204 const int8_t* qwt_bitcounts,
1205 const float* qwt_errors,
1206 unsigned int tune_candidate_limit,
1207 unsigned int start_block_mode,
1208 unsigned int end_block_mode,
1209 // output data
1210 uint8_t partition_format_specifiers[TUNE_MAX_TRIAL_CANDIDATES][BLOCK_MAX_PARTITIONS],
1211 int block_mode[TUNE_MAX_TRIAL_CANDIDATES],
1212 quant_method quant_level[TUNE_MAX_TRIAL_CANDIDATES],
1213 quant_method quant_level_mod[TUNE_MAX_TRIAL_CANDIDATES],
1214 compression_working_buffers& tmpbuf
1215 ) {
1216 int partition_count = pi.partition_count;
1217
1218 promise(partition_count > 0);
1219
1220 bool encode_hdr_rgb = static_cast<bool>(blk.rgb_lns[0]);
1221 bool encode_hdr_alpha = static_cast<bool>(blk.alpha_lns[0]);
1222
1223 // Compute the errors that result from various encoding choices (such as using luminance instead
1224 // of RGB, discarding Alpha, using RGB-scale in place of two separate RGB endpoints and so on)
1225 encoding_choice_errors eci[BLOCK_MAX_PARTITIONS];
1226 compute_encoding_choice_errors(privateProfile, blk, pi, ep, eci);
1227
1228 float best_error[BLOCK_MAX_PARTITIONS][21][4];
1229 uint8_t format_of_choice[BLOCK_MAX_PARTITIONS][21][4];
1230 for (int i = 0; i < partition_count; i++)
1231 {
1232 compute_color_error_for_every_integer_count_and_quant_level(
1233 encode_hdr_rgb, encode_hdr_alpha, i,
1234 pi, eci[i], ep, blk.channel_weight, best_error[i],
1235 format_of_choice[i]);
1236 }
1237
1238 float* errors_of_best_combination = tmpbuf.errors_of_best_combination;
1239 uint8_t* best_quant_levels = tmpbuf.best_quant_levels;
1240 uint8_t* best_quant_levels_mod = tmpbuf.best_quant_levels_mod;
1241 uint8_t (&best_ep_formats)[WEIGHTS_MAX_BLOCK_MODES][BLOCK_MAX_PARTITIONS] = tmpbuf.best_ep_formats;
1242
1243 // Ensure that the first iteration understep contains data that will never be picked
1244 vfloat clear_error(ERROR_CALC_DEFAULT);
1245 vint clear_quant(0);
1246
1247 unsigned int packed_start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
1248 storea(clear_error, errors_of_best_combination + packed_start_block_mode);
1249 store_nbytes(clear_quant, best_quant_levels + packed_start_block_mode);
1250 store_nbytes(clear_quant, best_quant_levels_mod + packed_start_block_mode);
1251
1252 // Ensure that last iteration overstep contains data that will never be picked
1253 unsigned int packed_end_block_mode = round_down_to_simd_multiple_vla(end_block_mode - 1);
1254 storea(clear_error, errors_of_best_combination + packed_end_block_mode);
1255 store_nbytes(clear_quant, best_quant_levels + packed_end_block_mode);
1256 store_nbytes(clear_quant, best_quant_levels_mod + packed_end_block_mode);
1257
1258 // Track a scalar best to avoid expensive search at least once ...
1259 float error_of_best_combination = ERROR_CALC_DEFAULT;
1260 int index_of_best_combination = -1;
1261
1262 // The block contains 1 partition
1263 if (partition_count == 1)
1264 {
1265 for (unsigned int i = start_block_mode; i < end_block_mode; i++)
1266 {
1267 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1268 {
1269 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1270 continue;
1271 }
1272
1273 float error_of_best = one_partition_find_best_combination_for_bitcount(
1274 privateProfile,
1275 best_error[0], format_of_choice[0], qwt_bitcounts[i],
1276 best_quant_levels[i], best_ep_formats[i][0]);
1277
1278 float total_error = error_of_best + qwt_errors[i];
1279 errors_of_best_combination[i] = total_error;
1280 best_quant_levels_mod[i] = best_quant_levels[i];
1281
1282 if (total_error < error_of_best_combination)
1283 {
1284 error_of_best_combination = total_error;
1285 index_of_best_combination = i;
1286 }
1287 }
1288 }
1289 // The block contains 2 partitions
1290 else if (partition_count == 2)
1291 {
1292 float combined_best_error[21][7];
1293 uint8_t formats_of_choice[21][7][2];
1294
1295 two_partitions_find_best_combination_for_every_quantization_and_integer_count(
1296 best_error, format_of_choice, combined_best_error, formats_of_choice);
1297
1298 assert(start_block_mode == 0);
1299 for (unsigned int i = 0; i < end_block_mode; i++)
1300 {
1301 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1302 {
1303 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1304 continue;
1305 }
1306
1307 float error_of_best = two_partitions_find_best_combination_for_bitcount(
1308 privateProfile,
1309 combined_best_error, formats_of_choice, qwt_bitcounts[i],
1310 best_quant_levels[i], best_quant_levels_mod[i],
1311 best_ep_formats[i]);
1312
1313 float total_error = error_of_best + qwt_errors[i];
1314 errors_of_best_combination[i] = total_error;
1315
1316 if (total_error < error_of_best_combination)
1317 {
1318 error_of_best_combination = total_error;
1319 index_of_best_combination = i;
1320 }
1321 }
1322 }
1323 // The block contains 3 partitions
1324 else if (partition_count == 3)
1325 {
1326 float combined_best_error[21][10];
1327 uint8_t formats_of_choice[21][10][3];
1328
1329 three_partitions_find_best_combination_for_every_quantization_and_integer_count(
1330 best_error, format_of_choice, combined_best_error, formats_of_choice);
1331
1332 assert(start_block_mode == 0);
1333 for (unsigned int i = 0; i < end_block_mode; i++)
1334 {
1335 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1336 {
1337 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1338 continue;
1339 }
1340
1341 float error_of_best = three_partitions_find_best_combination_for_bitcount(
1342 combined_best_error, formats_of_choice, qwt_bitcounts[i],
1343 best_quant_levels[i], best_quant_levels_mod[i],
1344 best_ep_formats[i]);
1345
1346 float total_error = error_of_best + qwt_errors[i];
1347 errors_of_best_combination[i] = total_error;
1348
1349 if (total_error < error_of_best_combination)
1350 {
1351 error_of_best_combination = total_error;
1352 index_of_best_combination = i;
1353 }
1354 }
1355 }
1356 // The block contains 4 partitions
1357 else // if (partition_count == 4)
1358 {
1359 assert(partition_count == 4);
1360 float combined_best_error[21][13];
1361 uint8_t formats_of_choice[21][13][4];
1362
1363 four_partitions_find_best_combination_for_every_quantization_and_integer_count(
1364 best_error, format_of_choice, combined_best_error, formats_of_choice);
1365
1366 assert(start_block_mode == 0);
1367 for (unsigned int i = 0; i < end_block_mode; i++)
1368 {
1369 if (qwt_errors[i] >= ERROR_CALC_DEFAULT)
1370 {
1371 errors_of_best_combination[i] = ERROR_CALC_DEFAULT;
1372 continue;
1373 }
1374
1375 float error_of_best = four_partitions_find_best_combination_for_bitcount(
1376 combined_best_error, formats_of_choice, qwt_bitcounts[i],
1377 best_quant_levels[i], best_quant_levels_mod[i],
1378 best_ep_formats[i]);
1379
1380 float total_error = error_of_best + qwt_errors[i];
1381 errors_of_best_combination[i] = total_error;
1382
1383 if (total_error < error_of_best_combination)
1384 {
1385 error_of_best_combination = total_error;
1386 index_of_best_combination = i;
1387 }
1388 }
1389 }
1390
1391 int best_error_weights[TUNE_MAX_TRIAL_CANDIDATES];
1392
1393 // Fast path the first result and avoid the list search for trial 0
1394 best_error_weights[0] = index_of_best_combination;
1395 if (index_of_best_combination >= 0)
1396 {
1397 errors_of_best_combination[index_of_best_combination] = ERROR_CALC_DEFAULT;
1398 }
1399
1400 // Search the remaining results and pick the best candidate modes for trial 1+
1401 for (unsigned int i = 1; i < tune_candidate_limit; i++)
1402 {
1403 vint vbest_error_index(-1);
1404 vfloat vbest_ep_error(ERROR_CALC_DEFAULT);
1405
1406 start_block_mode = round_down_to_simd_multiple_vla(start_block_mode);
1407 vint lane_ids = vint::lane_id() + vint(start_block_mode);
1408 for (unsigned int j = start_block_mode; j < end_block_mode; j += ASTCENC_SIMD_WIDTH)
1409 {
1410 vfloat err = vfloat(errors_of_best_combination + j);
1411 vmask mask = err < vbest_ep_error;
1412 vbest_ep_error = select(vbest_ep_error, err, mask);
1413 vbest_error_index = select(vbest_error_index, lane_ids, mask);
1414 lane_ids += vint(ASTCENC_SIMD_WIDTH);
1415 }
1416
1417 // Pick best mode from the SIMD result, using lowest matching index to ensure invariance
1418 vmask lanes_min_error = vbest_ep_error == hmin(vbest_ep_error);
1419 vbest_error_index = select(vint(0x7FFFFFFF), vbest_error_index, lanes_min_error);
1420 vbest_error_index = hmin(vbest_error_index);
1421 int best_error_index = vbest_error_index.lane<0>();
1422
1423 best_error_weights[i] = best_error_index;
1424
1425 // Max the error for this candidate so we don't pick it again
1426 if (best_error_index >= 0)
1427 {
1428 errors_of_best_combination[best_error_index] = ERROR_CALC_DEFAULT;
1429 }
1430 // Early-out if no more candidates are valid
1431 else
1432 {
1433 break;
1434 }
1435 }
1436
1437 for (unsigned int i = 0; i < tune_candidate_limit; i++)
1438 {
1439 if (best_error_weights[i] < 0)
1440 {
1441 return i;
1442 }
1443
1444 block_mode[i] = best_error_weights[i];
1445
1446 quant_level[i] = static_cast<quant_method>(best_quant_levels[best_error_weights[i]]);
1447 quant_level_mod[i] = static_cast<quant_method>(best_quant_levels_mod[best_error_weights[i]]);
1448
1449 assert(quant_level[i] >= QUANT_6 && quant_level[i] <= QUANT_256);
1450 assert(quant_level_mod[i] >= QUANT_6 && quant_level_mod[i] <= QUANT_256);
1451
1452 for (int j = 0; j < partition_count; j++)
1453 {
1454 partition_format_specifiers[i][j] = best_ep_formats[best_error_weights[i]][j];
1455 }
1456 }
1457
1458 return tune_candidate_limit;
1459 }
1460
1461 #endif
1462