/*
 * Copyright (C) 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */
23
/*
 * Included by texcompress_bptc and gallium to define the BPTC decoding
 * routines.
 */
27
28#ifndef TEXCOMPRESS_BPTC_TMP_H
29#define TEXCOMPRESS_BPTC_TMP_H
30
31#include "util/format_srgb.h"
32#include "util/half_float.h"
33#include "macros.h"
34
/* Edge length, in texels, of one square BPTC block. */
#define BLOCK_SIZE 4
/* Number of entries in each partition table and anchor-index row. */
#define N_PARTITIONS 64
/* Size, in bytes, of one compressed BPTC block. */
#define BLOCK_BYTES 16
38
/* Describes the bit layout of one BPTC UNORM block mode. */
struct bptc_unorm_mode {
   /* Number of subsets (1, 2 or 3); selects which partition table is used. */
   int n_subsets;
   /* Width of the partition-number field (0 when there is none). */
   int n_partition_bits;
   /* True if the block stores a 2-bit component rotation. */
   bool has_rotation_bits;
   /* True if the block stores a 1-bit index selector. */
   bool has_index_selection_bit;
   /* Stored bits per color endpoint component (before any p-bit). */
   int n_color_bits;
   /* Stored bits per alpha endpoint; 0 means alpha is not stored and the
    * decoder uses 255.
    */
   int n_alpha_bits;
   /* True if every endpoint has its own p-bit. */
   bool has_endpoint_pbits;
   /* True if both endpoints of a subset share a single p-bit. */
   bool has_shared_pbits;
   /* Bits per texel of the primary index. */
   int n_index_bits;
   /* Bits per texel of the secondary index; 0 if the mode has none. */
   int n_secondary_index_bits;
};
51
/* One contiguous run of bits in a BPTC float block and where it lands in the
 * decoded endpoint values.
 */
struct bptc_float_bitfield {
   /* Destination endpoint index; -1 marks the end of a bitfield list. */
   int8_t endpoint;
   /* Destination component index within the endpoint. */
   uint8_t component;
   /* Bit position within the destination value where the field is placed. */
   uint8_t offset;
   /* Width of the field in the block, in bits. */
   uint8_t n_bits;
   /* True if the field's bits are stored in reversed order. */
   bool reverse;
};
59
60struct bptc_float_mode {
61   bool reserved;
62   bool transformed_endpoints;
63   int n_partition_bits;
64   int n_endpoint_bits;
65   int n_index_bits;
66   int n_delta_bits[3];
67   struct bptc_float_bitfield bitfields[24];
68};
69
/* State for emitting a bit stream one byte at a time.
 * NOTE(review): not referenced in this part of the file; presumably used by
 * the encoder that includes this header -- confirm against the users.
 */
struct bit_writer {
   /* Byte-sized accumulator. */
   uint8_t buf;
   /* Position counter (presumably the number of bits held in buf). */
   int pos;
   /* Output pointer. */
   uint8_t *dst;
};
75
76static const struct bptc_unorm_mode
77bptc_unorm_modes[] = {
78   /* 0 */ { 3, 4, false, false, 4, 0, true,  false, 3, 0 },
79   /* 1 */ { 2, 6, false, false, 6, 0, false, true,  3, 0 },
80   /* 2 */ { 3, 6, false, false, 5, 0, false, false, 2, 0 },
81   /* 3 */ { 2, 6, false, false, 7, 0, true,  false, 2, 0 },
82   /* 4 */ { 1, 0, true,  true,  5, 6, false, false, 2, 3 },
83   /* 5 */ { 1, 0, true,  false, 7, 8, false, false, 2, 2 },
84   /* 6 */ { 1, 0, false, false, 7, 7, true,  false, 4, 0 },
85   /* 7 */ { 2, 6, false, false, 5, 5, true,  false, 2, 0 }
86};
87
88static const struct bptc_float_mode
89bptc_float_modes[] = {
90   /* 00 */
91   { false, true, 5, 10, 3, { 5, 5, 5 },
92     { { 2, 1, 4, 1, false }, { 2, 2, 4, 1, false }, { 3, 2, 4, 1, false },
93       { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
94       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
95       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
96       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
97       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
98       { 3, 2, 3, 1, false },
99       { -1 } }
100   },
101   /* 01 */
102   { false, true, 5, 7, 3, { 6, 6, 6 },
103     { { 2, 1, 5, 1, false }, { 3, 1, 4, 1, false }, { 3, 1, 5, 1, false },
104       { 0, 0, 0, 7, false }, { 3, 2, 0, 1, false }, { 3, 2, 1, 1, false },
105       { 2, 2, 4, 1, false }, { 0, 1, 0, 7, false }, { 2, 2, 5, 1, false },
106       { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false }, { 0, 2, 0, 7, false },
107       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
108       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
109       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
110       { 2, 0, 0, 6, false },
111       { 3, 0, 0, 6, false },
112       { -1 } }
113   },
114   /* 00010 */
115   { false, true, 5, 11, 3, { 5, 4, 4 },
116     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
117       { 1, 0, 0, 5, false }, { 0, 0, 10, 1, false }, { 2, 1, 0, 4, false },
118       { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false }, { 3, 2, 0, 1, false },
119       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
120       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
121       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
122       { -1 } }
123   },
124   /* 00011 */
125   { false, false, 0, 10, 4, { 10, 10, 10 },
126     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
127       { 1, 0, 0, 10, false }, { 1, 1, 0, 10, false }, { 1, 2, 0, 10, false },
128       { -1 } }
129   },
130   /* 00110 */
131   { false, true, 5, 11, 3, { 4, 5, 4 },
132     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
133       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 3, 1, 4, 1, false },
134       { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false }, { 0, 1, 10, 1, false },
135       { 3, 1, 0, 4, false }, { 1, 2, 0, 4, false }, { 0, 2, 10, 1, false },
136       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
137       { 3, 2, 0, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
138       { 2, 1, 4, 1, false }, { 3, 2, 3, 1, false },
139       { -1 } }
140   },
141   /* 00111 */
142   { false, true, 0, 11, 4, { 9, 9, 9 },
143     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
144       { 1, 0, 0, 9, false }, { 0, 0, 10, 1, false }, { 1, 1, 0, 9, false },
145       { 0, 1, 10, 1, false }, { 1, 2, 0, 9, false }, { 0, 2, 10, 1, false },
146       { -1 } }
147   },
148   /* 01010 */
149   { false, true, 5, 11, 3, { 4, 4, 5 },
150     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
151       { 1, 0, 0, 4, false }, { 0, 0, 10, 1, false }, { 2, 2, 4, 1, false },
152       { 2, 1, 0, 4, false }, { 1, 1, 0, 4, false }, { 0, 1, 10, 1, false },
153       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
154       { 0, 2, 10, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 4, false },
155       { 3, 2, 1, 1, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 4, false },
156       { 3, 2, 4, 1, false }, { 3, 2, 3, 1, false },
157       { -1 } }
158   },
159   /* 01011 */
160   { false, true, 0, 12, 4, { 8, 8, 8 },
161     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
162       { 1, 0, 0, 8, false }, { 0, 0, 10, 2, true }, { 1, 1, 0, 8, false },
163       { 0, 1, 10, 2, true }, { 1, 2, 0, 8, false }, { 0, 2, 10, 2, true },
164       { -1 } }
165   },
166   /* 01110 */
167   { false, true, 5, 9, 3, { 5, 5, 5 },
168     { { 0, 0, 0, 9, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 9, false },
169       { 2, 1, 4, 1, false }, { 0, 2, 0, 9, false }, { 3, 2, 4, 1, false },
170       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
171       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
172       { 1, 2, 0, 5, false }, { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false },
173       { 2, 0, 0, 5, false }, { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false },
174       { 3, 2, 3, 1, false },
175       { -1 } }
176   },
177   /* 01111 */
178   { false, true, 0, 16, 4, { 4, 4, 4 },
179     { { 0, 0, 0, 10, false }, { 0, 1, 0, 10, false }, { 0, 2, 0, 10, false },
180       { 1, 0, 0, 4, false }, { 0, 0, 10, 6, true }, { 1, 1, 0, 4, false },
181       { 0, 1, 10, 6, true }, { 1, 2, 0, 4, false }, { 0, 2, 10, 6, true },
182       { -1 } }
183   },
184   /* 10010 */
185   { false, true, 5, 8, 3, { 6, 5, 5 },
186     { { 0, 0, 0, 8, false }, { 3, 1, 4, 1, false }, { 2, 2, 4, 1, false },
187       { 0, 1, 0, 8, false }, { 3, 2, 2, 1, false }, { 2, 1, 4, 1, false },
188       { 0, 2, 0, 8, false }, { 3, 2, 3, 1, false }, { 3, 2, 4, 1, false },
189       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 5, false },
190       { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
191       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 6, false },
192       { 3, 0, 0, 6, false },
193       { -1 } }
194   },
195   /* 10011 */
196   { true /* reserved */ },
197   /* 10110 */
198   { false, true, 5, 8, 3, { 5, 6, 5 },
199     { { 0, 0, 0, 8, false }, { 3, 2, 0, 1, false }, { 2, 2, 4, 1, false },
200       { 0, 1, 0, 8, false }, { 2, 1, 5, 1, false }, { 2, 1, 4, 1, false },
201       { 0, 2, 0, 8, false }, { 3, 1, 5, 1, false }, { 3, 2, 4, 1, false },
202       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
203       { 1, 1, 0, 6, false }, { 3, 1, 0, 4, false }, { 1, 2, 0, 5, false },
204       { 3, 2, 1, 1, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
205       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
206       { -1 } }
207   },
208   /* 10111 */
209   { true /* reserved */ },
210   /* 11010 */
211   { false, true, 5, 8, 3, { 5, 5, 6 },
212     { { 0, 0, 0, 8, false }, { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false },
213       { 0, 1, 0, 8, false }, { 2, 2, 5, 1, false }, { 2, 1, 4, 1, false },
214       { 0, 2, 0, 8, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
215       { 1, 0, 0, 5, false }, { 3, 1, 4, 1, false }, { 2, 1, 0, 4, false },
216       { 1, 1, 0, 5, false }, { 3, 2, 0, 1, false }, { 3, 1, 0, 4, false },
217       { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false }, { 2, 0, 0, 5, false },
218       { 3, 2, 2, 1, false }, { 3, 0, 0, 5, false }, { 3, 2, 3, 1, false },
219       { -1 } }
220   },
221   /* 11011 */
222   { true /* reserved */ },
223   /* 11110 */
224   { false, false, 5, 6, 3, { 6, 6, 6 },
225     { { 0, 0, 0, 6, false }, { 3, 1, 4, 1, false }, { 3, 2, 0, 1, false },
226       { 3, 2, 1, 1, false }, { 2, 2, 4, 1, false }, { 0, 1, 0, 6, false },
227       { 2, 1, 5, 1, false }, { 2, 2, 5, 1, false }, { 3, 2, 2, 1, false },
228       { 2, 1, 4, 1, false }, { 0, 2, 0, 6, false }, { 3, 1, 5, 1, false },
229       { 3, 2, 3, 1, false }, { 3, 2, 5, 1, false }, { 3, 2, 4, 1, false },
230       { 1, 0, 0, 6, false }, { 2, 1, 0, 4, false }, { 1, 1, 0, 6, false },
231       { 3, 1, 0, 4, false }, { 1, 2, 0, 6, false }, { 2, 2, 0, 4, false },
232       { 2, 0, 0, 6, false }, { 3, 0, 0, 6, false },
233       { -1 } }
234   },
235   /* 11111 */
236   { true /* reserved */ },
237};
238
239/* This partition table is used when the mode has two subsets. Each
240 * partition is represented by a 32-bit value which gives 2 bits per texel
241 * within the block. The value of the two bits represents which subset to use
242 * (0 or 1).
243 */
244static const uint32_t
245partition_table1[N_PARTITIONS] = {
246   0x50505050U, 0x40404040U, 0x54545454U, 0x54505040U,
247   0x50404000U, 0x55545450U, 0x55545040U, 0x54504000U,
248   0x50400000U, 0x55555450U, 0x55544000U, 0x54400000U,
249   0x55555440U, 0x55550000U, 0x55555500U, 0x55000000U,
250   0x55150100U, 0x00004054U, 0x15010000U, 0x00405054U,
251   0x00004050U, 0x15050100U, 0x05010000U, 0x40505054U,
252   0x00404050U, 0x05010100U, 0x14141414U, 0x05141450U,
253   0x01155440U, 0x00555500U, 0x15014054U, 0x05414150U,
254   0x44444444U, 0x55005500U, 0x11441144U, 0x05055050U,
255   0x05500550U, 0x11114444U, 0x41144114U, 0x44111144U,
256   0x15055054U, 0x01055040U, 0x05041050U, 0x05455150U,
257   0x14414114U, 0x50050550U, 0x41411414U, 0x00141400U,
258   0x00041504U, 0x00105410U, 0x10541000U, 0x04150400U,
259   0x50410514U, 0x41051450U, 0x05415014U, 0x14054150U,
260   0x41050514U, 0x41505014U, 0x40011554U, 0x54150140U,
261   0x50505500U, 0x00555050U, 0x15151010U, 0x54540404U,
262};
263
264/* This partition table is used when the mode has three subsets. In this case
265 * the values can be 0, 1 or 2.
266 */
267static const uint32_t
268partition_table2[N_PARTITIONS] = {
269   0xaa685050U, 0x6a5a5040U, 0x5a5a4200U, 0x5450a0a8U,
270   0xa5a50000U, 0xa0a05050U, 0x5555a0a0U, 0x5a5a5050U,
271   0xaa550000U, 0xaa555500U, 0xaaaa5500U, 0x90909090U,
272   0x94949494U, 0xa4a4a4a4U, 0xa9a59450U, 0x2a0a4250U,
273   0xa5945040U, 0x0a425054U, 0xa5a5a500U, 0x55a0a0a0U,
274   0xa8a85454U, 0x6a6a4040U, 0xa4a45000U, 0x1a1a0500U,
275   0x0050a4a4U, 0xaaa59090U, 0x14696914U, 0x69691400U,
276   0xa08585a0U, 0xaa821414U, 0x50a4a450U, 0x6a5a0200U,
277   0xa9a58000U, 0x5090a0a8U, 0xa8a09050U, 0x24242424U,
278   0x00aa5500U, 0x24924924U, 0x24499224U, 0x50a50a50U,
279   0x500aa550U, 0xaaaa4444U, 0x66660000U, 0xa5a0a5a0U,
280   0x50a050a0U, 0x69286928U, 0x44aaaa44U, 0x66666600U,
281   0xaa444444U, 0x54a854a8U, 0x95809580U, 0x96969600U,
282   0xa85454a8U, 0x80959580U, 0xaa141414U, 0x96960000U,
283   0xaaaa1414U, 0xa05050a0U, 0xa0a5a5a0U, 0x96000000U,
284   0x40804080U, 0xa9a8a9a8U, 0xaaaaaa44U, 0x2a4a5254U
285};
286
287static const uint8_t
288anchor_indices[][N_PARTITIONS] = {
289   /* Anchor index values for the second subset of two-subset partitioning */
290   {
291      0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,
292      0xf,0x2,0x8,0x2,0x2,0x8,0x8,0xf,0x2,0x8,0x2,0x2,0x8,0x8,0x2,0x2,
293      0xf,0xf,0x6,0x8,0x2,0x8,0xf,0xf,0x2,0x8,0x2,0x2,0x2,0xf,0xf,0x6,
294      0x6,0x2,0x6,0x8,0xf,0xf,0x2,0x2,0xf,0xf,0xf,0xf,0xf,0x2,0x2,0xf
295   },
296
297   /* Anchor index values for the second subset of three-subset partitioning */
298   {
299      0x3,0x3,0xf,0xf,0x8,0x3,0xf,0xf,0x8,0x8,0x6,0x6,0x6,0x5,0x3,0x3,
300      0x3,0x3,0x8,0xf,0x3,0x3,0x6,0xa,0x5,0x8,0x8,0x6,0x8,0x5,0xf,0xf,
301      0x8,0xf,0x3,0x5,0x6,0xa,0x8,0xf,0xf,0x3,0xf,0x5,0xf,0xf,0xf,0xf,
302      0x3,0xf,0x5,0x5,0x5,0x8,0x5,0xa,0x5,0xa,0x8,0xd,0xf,0xc,0x3,0x3
303   },
304
305   /* Anchor index values for the third subset of three-subset
306    * partitioning
307    */
308   {
309      0xf,0x8,0x8,0x3,0xf,0xf,0x3,0x8,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x8,
310      0xf,0x8,0xf,0x3,0xf,0x8,0xf,0x8,0x3,0xf,0x6,0xa,0xf,0xf,0xa,0x8,
311      0xf,0x3,0xf,0xa,0xa,0x8,0x9,0xa,0x6,0xf,0x8,0xf,0x3,0x6,0x6,0x8,
312      0xf,0x3,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0xf,0x3,0xf,0xf,0x8
313   }
314};
315
/* Reads an n_bits wide field that starts at bit position `offset` of the
 * byte array `block`. Bit 0 is the least-significant bit of block[0]; the
 * first bit read becomes the least-significant bit of the result and the
 * field may span several bytes.
 *
 * Returns the field value. When n_bits is zero or negative the result is 0
 * and `block` is not read at all.
 */
static int
extract_bits(const uint8_t *block,
             int offset,
             int n_bits)
{
   int byte_index = offset / 8;
   int bit_index = offset % 8;
   int result = 0;
   int bit = 0;

   while (n_bits > 0) {
      /* Take whatever is left of the current byte, capped at what we need */
      int n_bits_in_byte = 8 - bit_index;

      if (n_bits_in_byte > n_bits)
         n_bits_in_byte = n_bits;

      result |= ((block[byte_index] >> bit_index) &
                 ((1 << n_bits_in_byte) - 1)) << bit;

      n_bits -= n_bits_in_byte;
      bit += n_bits_in_byte;

      /* Every byte after the first is consumed from its bit 0 */
      byte_index++;
      bit_index = 0;
   }

   return result;
}
342
/* Expands an n-bit quantity into a byte by placing it in the
 * most-significant bits and repeating its bit pattern into the unused
 * least-significant bits.
 *
 * `byte` is expected to hold the value in its low n_bits bits. Widths from
 * 1 to 8 are handled; a width of 8 or more returns `byte` unchanged and a
 * width of 0 or less returns 0.
 */
static uint8_t
expand_component(uint8_t byte,
                 int n_bits)
{
   unsigned result = 0;
   int shift;

   if (n_bits <= 0)
      return 0;
   if (n_bits >= 8)
      return byte;

   /* Lay copies of the value end to end from the top of the byte down. The
    * last copy may only partially fit, in which case it is shifted right so
    * that only its most-significant bits are kept.
    */
   for (shift = 8 - n_bits; shift > -n_bits; shift -= n_bits) {
      if (shift >= 0)
         result |= (unsigned) byte << shift;
      else
         result |= (unsigned) byte >> -shift;
   }

   return (uint8_t) result;
}
352
353static int
354extract_unorm_endpoints(const struct bptc_unorm_mode *mode,
355                        const uint8_t *block,
356                        int bit_offset,
357                        uint8_t endpoints[][4])
358{
359   int component;
360   int subset;
361   int endpoint;
362   int pbit;
363   int n_components;
364
365   /* Extract each color component */
366   for (component = 0; component < 3; component++) {
367      for (subset = 0; subset < mode->n_subsets; subset++) {
368         for (endpoint = 0; endpoint < 2; endpoint++) {
369            endpoints[subset * 2 + endpoint][component] =
370               extract_bits(block, bit_offset, mode->n_color_bits);
371            bit_offset += mode->n_color_bits;
372         }
373      }
374   }
375
376   /* Extract the alpha values */
377   if (mode->n_alpha_bits > 0) {
378      for (subset = 0; subset < mode->n_subsets; subset++) {
379         for (endpoint = 0; endpoint < 2; endpoint++) {
380            endpoints[subset * 2 + endpoint][3] =
381               extract_bits(block, bit_offset, mode->n_alpha_bits);
382            bit_offset += mode->n_alpha_bits;
383         }
384      }
385
386      n_components = 4;
387   } else {
388      for (subset = 0; subset < mode->n_subsets; subset++)
389         for (endpoint = 0; endpoint < 2; endpoint++)
390            endpoints[subset * 2 + endpoint][3] = 255;
391
392      n_components = 3;
393   }
394
395   /* Add in the p-bits */
396   if (mode->has_endpoint_pbits) {
397      for (subset = 0; subset < mode->n_subsets; subset++) {
398         for (endpoint = 0; endpoint < 2; endpoint++) {
399            pbit = extract_bits(block, bit_offset, 1);
400            bit_offset += 1;
401
402            for (component = 0; component < n_components; component++) {
403               endpoints[subset * 2 + endpoint][component] <<= 1;
404               endpoints[subset * 2 + endpoint][component] |= pbit;
405            }
406         }
407      }
408   } else if (mode->has_shared_pbits) {
409      for (subset = 0; subset < mode->n_subsets; subset++) {
410         pbit = extract_bits(block, bit_offset, 1);
411         bit_offset += 1;
412
413         for (endpoint = 0; endpoint < 2; endpoint++) {
414            for (component = 0; component < n_components; component++) {
415               endpoints[subset * 2 + endpoint][component] <<= 1;
416               endpoints[subset * 2 + endpoint][component] |= pbit;
417            }
418         }
419      }
420   }
421
422   /* Expand the n-bit values to a byte */
423   for (subset = 0; subset < mode->n_subsets; subset++) {
424      for (endpoint = 0; endpoint < 2; endpoint++) {
425         for (component = 0; component < 3; component++) {
426            endpoints[subset * 2 + endpoint][component] =
427               expand_component(endpoints[subset * 2 + endpoint][component],
428                                mode->n_color_bits +
429                                mode->has_endpoint_pbits +
430                                mode->has_shared_pbits);
431         }
432
433         if (mode->n_alpha_bits > 0) {
434            endpoints[subset * 2 + endpoint][3] =
435               expand_component(endpoints[subset * 2 + endpoint][3],
436                                mode->n_alpha_bits +
437                                mode->has_endpoint_pbits +
438                                mode->has_shared_pbits);
439         }
440      }
441   }
442
443   return bit_offset;
444}
445
446static bool
447is_anchor(int n_subsets,
448          int partition_num,
449          int texel)
450{
451   if (texel == 0)
452      return true;
453
454   switch (n_subsets) {
455   case 1:
456      return false;
457   case 2:
458      return anchor_indices[0][partition_num] == texel;
459   case 3:
460      return (anchor_indices[1][partition_num] == texel ||
461              anchor_indices[2][partition_num] == texel);
462   default:
463      assert(false);
464      return false;
465   }
466}
467
468static int
469count_anchors_before_texel(int n_subsets,
470                           int partition_num,
471                           int texel)
472{
473   int count = 1;
474
475   if (texel == 0)
476      return 0;
477
478   switch (n_subsets) {
479   case 1:
480      break;
481   case 2:
482      if (texel > anchor_indices[0][partition_num])
483         count++;
484      break;
485   case 3:
486      if (texel > anchor_indices[1][partition_num])
487         count++;
488      if (texel > anchor_indices[2][partition_num])
489         count++;
490      break;
491   default:
492      assert(false);
493      return 0;
494   }
495
496   return count;
497}
498
/* Blends endpoints `a` and `b` using the weight selected by `index`.
 *
 * index_bits picks the weight table and must be 2, 3 or 4; index must be
 * below 1 << index_bits. The weights run from 0 (result is a) to 64 (result
 * is b) and the blend is rounded to nearest.
 */
static int32_t
interpolate(int32_t a, int32_t b,
            int index,
            int index_bits)
{
   static const uint8_t weights2[] = { 0, 21, 43, 64 };
   static const uint8_t weights3[] = { 0, 9, 18, 27, 37, 46, 55, 64 };
   static const uint8_t weights4[] =
      { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 };
   /* Indexed by index_bits; there are no tables for 0 or 1 bit indices */
   static const uint8_t *const weights[] = {
      NULL, NULL, weights2, weights3, weights4
   };
   int weight;

   weight = weights[index_bits][index];

   /* The weights are in 1/64ths; adding 32 rounds before the shift */
   return ((64 - weight) * a + weight * b + 32) >> 6;
}
517
/* Applies a block's component rotation to a decoded four-byte texel.
 *
 * A rotation of 0 leaves `result` untouched; a rotation of 1, 2 or 3 swaps
 * component rotation - 1 with the last component (result[3]).
 */
static void
apply_rotation(int rotation,
               uint8_t *result)
{
   uint8_t t;

   if (rotation == 0)
      return;

   /* Convert the 1-based rotation into the index of the component to swap */
   rotation--;

   t = result[rotation];
   result[rotation] = result[3];
   result[3] = t;
}
533
534static void
535fetch_rgba_unorm_from_block(const uint8_t *block,
536                            uint8_t *result,
537                            int texel)
538{
539   int mode_num = ffs(block[0]);
540   const struct bptc_unorm_mode *mode;
541   int bit_offset, secondary_bit_offset;
542   int partition_num;
543   int subset_num;
544   int rotation;
545   int index_selection;
546   int index_bits;
547   int indices[2];
548   int index;
549   int anchors_before_texel;
550   bool anchor;
551   uint8_t endpoints[3 * 2][4];
552   uint32_t subsets;
553   int component;
554
555   if (mode_num == 0) {
556      /* According to the spec this mode is reserved and shouldn't be used. */
557      memset(result, 0, 4);
558      return;
559   }
560
561   mode = bptc_unorm_modes + mode_num - 1;
562   bit_offset = mode_num;
563
564   partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
565   bit_offset += mode->n_partition_bits;
566
567   switch (mode->n_subsets) {
568   case 1:
569      subsets = 0;
570      break;
571   case 2:
572      subsets = partition_table1[partition_num];
573      break;
574   case 3:
575      subsets = partition_table2[partition_num];
576      break;
577   default:
578      assert(false);
579      return;
580   }
581
582   if (mode->has_rotation_bits) {
583      rotation = extract_bits(block, bit_offset, 2);
584      bit_offset += 2;
585   } else {
586      rotation = 0;
587   }
588
589   if (mode->has_index_selection_bit) {
590      index_selection = extract_bits(block, bit_offset, 1);
591      bit_offset++;
592   } else {
593      index_selection = 0;
594   }
595
596   bit_offset = extract_unorm_endpoints(mode, block, bit_offset, endpoints);
597
598   anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
599                                                     partition_num, texel);
600
601   /* Calculate the offset to the secondary index */
602   secondary_bit_offset = (bit_offset +
603                           BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
604                           mode->n_subsets +
605                           mode->n_secondary_index_bits * texel -
606                           anchors_before_texel);
607
608   /* Calculate the offset to the primary index for this texel */
609   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
610
611   subset_num = (subsets >> (texel * 2)) & 3;
612
613   anchor = is_anchor(mode->n_subsets, partition_num, texel);
614
615   index_bits = mode->n_index_bits;
616   if (anchor)
617      index_bits--;
618   indices[0] = extract_bits(block, bit_offset, index_bits);
619
620   if (mode->n_secondary_index_bits) {
621      index_bits = mode->n_secondary_index_bits;
622      if (anchor)
623         index_bits--;
624      indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
625   }
626
627   index = indices[index_selection];
628   index_bits = (index_selection ?
629                 mode->n_secondary_index_bits :
630                 mode->n_index_bits);
631
632   for (component = 0; component < 3; component++)
633      result[component] = interpolate(endpoints[subset_num * 2][component],
634                                      endpoints[subset_num * 2 + 1][component],
635                                      index,
636                                      index_bits);
637
638   /* Alpha uses the opposite index from the color components */
639   if (mode->n_secondary_index_bits && !index_selection) {
640      index = indices[1];
641      index_bits = mode->n_secondary_index_bits;
642   } else {
643      index = indices[0];
644      index_bits = mode->n_index_bits;
645   }
646
647   result[3] = interpolate(endpoints[subset_num * 2][3],
648                           endpoints[subset_num * 2 + 1][3],
649                           index,
650                           index_bits);
651
652   apply_rotation(rotation, result);
653}
654
655#ifdef BPTC_BLOCK_DECODE
656static void
657decompress_rgba_unorm_block(int src_width, int src_height,
658                            const uint8_t *block,
659                            uint8_t *dst_row, int dst_rowstride)
660{
661   int mode_num = ffs(block[0]);
662   const struct bptc_unorm_mode *mode;
663   int bit_offset_head, bit_offset, secondary_bit_offset;
664   int partition_num;
665   int subset_num;
666   int rotation;
667   int index_selection;
668   int index_bits;
669   int indices[2];
670   int index;
671   int anchors_before_texel;
672   bool anchor;
673   uint8_t endpoints[3 * 2][4];
674   uint32_t subsets;
675   int component;
676   unsigned x, y;
677
678   if (mode_num == 0) {
679      /* According to the spec this mode is reserved and shouldn't be used. */
680      for(y = 0; y < src_height; y += 1) {
681         uint8_t *result = dst_row;
682         memset(result, 0, 4 * src_width);
683         dst_row += dst_rowstride;
684      }
685      return;
686   }
687
688   mode = bptc_unorm_modes + mode_num - 1;
689   bit_offset_head = mode_num;
690
691   partition_num = extract_bits(block, bit_offset_head, mode->n_partition_bits);
692   bit_offset_head += mode->n_partition_bits;
693
694   switch (mode->n_subsets) {
695   case 1:
696      subsets = 0;
697      break;
698   case 2:
699      subsets = partition_table1[partition_num];
700      break;
701   case 3:
702      subsets = partition_table2[partition_num];
703      break;
704   default:
705      assert(false);
706      return;
707   }
708
709   if (mode->has_rotation_bits) {
710      rotation = extract_bits(block, bit_offset_head, 2);
711      bit_offset_head += 2;
712   } else {
713      rotation = 0;
714   }
715
716   if (mode->has_index_selection_bit) {
717      index_selection = extract_bits(block, bit_offset_head, 1);
718      bit_offset_head++;
719   } else {
720      index_selection = 0;
721   }
722
723   bit_offset_head = extract_unorm_endpoints(mode, block, bit_offset_head, endpoints);
724
725   for(y = 0; y < src_height; y += 1) {
726      uint8_t *result = dst_row;
727      for(x = 0; x < src_width; x += 1) {
728         int texel;
729         texel = x + y * 4;
730         bit_offset = bit_offset_head;
731
732         anchors_before_texel = count_anchors_before_texel(mode->n_subsets,
733                                                           partition_num,
734                                                           texel);
735
736         /* Calculate the offset to the secondary index */
737         secondary_bit_offset = (bit_offset +
738                                 BLOCK_SIZE * BLOCK_SIZE * mode->n_index_bits -
739                                 mode->n_subsets +
740                                 mode->n_secondary_index_bits * texel -
741                                 anchors_before_texel);
742
743         /* Calculate the offset to the primary index for this texel */
744         bit_offset += mode->n_index_bits * texel - anchors_before_texel;
745
746         subset_num = (subsets >> (texel * 2)) & 3;
747
748         anchor = is_anchor(mode->n_subsets, partition_num, texel);
749
750         index_bits = mode->n_index_bits;
751         if (anchor)
752            index_bits--;
753         indices[0] = extract_bits(block, bit_offset, index_bits);
754
755         if (mode->n_secondary_index_bits) {
756            index_bits = mode->n_secondary_index_bits;
757            if (anchor)
758               index_bits--;
759            indices[1] = extract_bits(block, secondary_bit_offset, index_bits);
760         }
761
762         index = indices[index_selection];
763         index_bits = (index_selection ?
764                       mode->n_secondary_index_bits :
765                       mode->n_index_bits);
766
767         for (component = 0; component < 3; component++)
768            result[component] = interpolate(endpoints[subset_num * 2][component],
769                                            endpoints[subset_num * 2 + 1][component],
770                                            index,
771                                            index_bits);
772
773         /* Alpha uses the opposite index from the color components */
774         if (mode->n_secondary_index_bits && !index_selection) {
775            index = indices[1];
776            index_bits = mode->n_secondary_index_bits;
777         } else {
778            index = indices[0];
779            index_bits = mode->n_index_bits;
780         }
781
782         result[3] = interpolate(endpoints[subset_num * 2][3],
783                                 endpoints[subset_num * 2 + 1][3],
784                                 index,
785                                 index_bits);
786
787         apply_rotation(rotation, result);
788         result += 4;
789      }
790      dst_row += dst_rowstride;
791   }
792}
793
794static void
795decompress_rgba_unorm(int width, int height,
796                      const uint8_t *src, int src_rowstride,
797                      uint8_t *dst, int dst_rowstride)
798{
799   int src_row_diff;
800   int y, x;
801
802   if (src_rowstride >= width * 4)
803      src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
804   else
805      src_row_diff = 0;
806
807   for (y = 0; y < height; y += BLOCK_SIZE) {
808      for (x = 0; x < width; x += BLOCK_SIZE) {
809         decompress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
810                                     MIN2(height - y, BLOCK_SIZE),
811                                     src,
812                                     dst + x * 4 + y * dst_rowstride,
813                                     dst_rowstride);
814         src += BLOCK_BYTES;
815      }
816      src += src_row_diff;
817   }
818}
819#endif // BPTC_BLOCK_DECODE
820
/* Unquantizes a sign-extended endpoint value of a signed float block.
 *
 * value:           the endpoint value, already sign-extended.
 * n_endpoint_bits: precision of the stored endpoint.
 *
 * Endpoints of 16 or more bits are returned unchanged and zero stays zero.
 * Otherwise the magnitude is scaled up to a 15-bit range, saturating at
 * 0x7fff for the largest representable magnitude, and the sign is restored.
 */
static int
signed_unquantize(int value, int n_endpoint_bits)
{
   bool sign;

   if (n_endpoint_bits >= 16)
      return value;

   if (value == 0)
      return 0;

   sign = false;

   /* Work on the magnitude and put the sign back at the end */
   if (value < 0) {
      sign = true;
      value = -value;
   }

   if (value >= (1 << (n_endpoint_bits - 1)) - 1)
      value = 0x7fff;
   else
      value = ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);

   if (sign)
      value = -value;

   return value;
}
849
/* Unquantizes an endpoint value of an unsigned float block.
 *
 * value:           the stored endpoint value.
 * n_endpoint_bits: precision of the stored endpoint.
 *
 * Endpoints of 15 or more bits are returned unchanged, zero stays zero and
 * the all-ones value maps to 0xffff. Anything else is scaled up to a 16-bit
 * range.
 */
static int
unsigned_unquantize(int value, int n_endpoint_bits)
{
   if (n_endpoint_bits >= 15)
      return value;

   if (value == 0)
      return 0;

   if (value == (1 << n_endpoint_bits) - 1)
      return 0xffff;

   return ((value << 15) + 0x4000) >> (n_endpoint_bits - 1);
}
864
865static int
866extract_float_endpoints(const struct bptc_float_mode *mode,
867                        const uint8_t *block,
868                        int bit_offset,
869                        int32_t endpoints[][3],
870                        bool is_signed)
871{
872   const struct bptc_float_bitfield *bitfield;
873   int endpoint, component;
874   int n_endpoints;
875   int value;
876   int i;
877
878   if (mode->n_partition_bits)
879      n_endpoints = 4;
880   else
881      n_endpoints = 2;
882
883   memset(endpoints, 0, sizeof endpoints[0][0] * n_endpoints * 3);
884
885   for (bitfield = mode->bitfields; bitfield->endpoint != -1; bitfield++) {
886      value = extract_bits(block, bit_offset, bitfield->n_bits);
887      bit_offset += bitfield->n_bits;
888
889      if (bitfield->reverse) {
890         for (i = 0; i < bitfield->n_bits; i++) {
891            if (value & (1 << i))
892               endpoints[bitfield->endpoint][bitfield->component] |=
893                  1 << ((bitfield->n_bits - 1 - i) + bitfield->offset);
894         }
895      } else {
896         endpoints[bitfield->endpoint][bitfield->component] |=
897            value << bitfield->offset;
898      }
899   }
900
901   if (mode->transformed_endpoints) {
902      /* The endpoints are specified as signed offsets from e0 */
903      for (endpoint = 1; endpoint < n_endpoints; endpoint++) {
904         for (component = 0; component < 3; component++) {
905            value = util_sign_extend(endpoints[endpoint][component],
906                                     mode->n_delta_bits[component]);
907            endpoints[endpoint][component] =
908               ((endpoints[0][component] + value) &
909                ((1 << mode->n_endpoint_bits) - 1));
910         }
911      }
912   }
913
914   if (is_signed) {
915      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
916         for (component = 0; component < 3; component++) {
917            value = util_sign_extend(endpoints[endpoint][component],
918                                     mode->n_endpoint_bits);
919            endpoints[endpoint][component] =
920               signed_unquantize(value, mode->n_endpoint_bits);
921         }
922      }
923   } else {
924      for (endpoint = 0; endpoint < n_endpoints; endpoint++) {
925         for (component = 0; component < 3; component++) {
926            endpoints[endpoint][component] =
927               unsigned_unquantize(endpoints[endpoint][component],
928                                   mode->n_endpoint_bits);
929         }
930      }
931   }
932
933   return bit_offset;
934}
935
/*
 * Final scaling step for an interpolated unsigned value: multiply by 31/64
 * using integer arithmetic.
 */
static int32_t
finish_unsigned_unquantize(int32_t value)
{
   const int32_t scaled = value * 31;

   return scaled / 64;
}
941
/*
 * Final scaling step for an interpolated signed value: scale the magnitude
 * by 31/32 and, for negative inputs, set bit 15 of the result instead of
 * returning a negative number.
 */
static int32_t
finish_signed_unquantize(int32_t value)
{
   if (value >= 0)
      return value * 31 / 32;

   return 0x8000 | (-value * 31 / 32);
}
950
951static void
952fetch_rgb_float_from_block(const uint8_t *block,
953                           float *result,
954                           int texel,
955                           bool is_signed)
956{
957   int mode_num;
958   const struct bptc_float_mode *mode;
959   int bit_offset;
960   int partition_num;
961   int subset_num;
962   int index_bits;
963   int index;
964   int anchors_before_texel;
965   int32_t endpoints[2 * 2][3];
966   uint32_t subsets;
967   int n_subsets;
968   int component;
969   int32_t value;
970
971   if (block[0] & 0x2) {
972      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
973      bit_offset = 5;
974   } else {
975      mode_num = block[0] & 3;
976      bit_offset = 2;
977   }
978
979   mode = bptc_float_modes + mode_num;
980
981   if (mode->reserved) {
982      memset(result, 0, sizeof result[0] * 3);
983      result[3] = 1.0f;
984      return;
985   }
986
987   bit_offset = extract_float_endpoints(mode, block, bit_offset,
988                                        endpoints, is_signed);
989
990   if (mode->n_partition_bits) {
991      partition_num = extract_bits(block, bit_offset, mode->n_partition_bits);
992      bit_offset += mode->n_partition_bits;
993
994      subsets = partition_table1[partition_num];
995      n_subsets = 2;
996   } else {
997      partition_num = 0;
998      subsets = 0;
999      n_subsets = 1;
1000   }
1001
1002   anchors_before_texel =
1003      count_anchors_before_texel(n_subsets, partition_num, texel);
1004
1005   /* Calculate the offset to the primary index for this texel */
1006   bit_offset += mode->n_index_bits * texel - anchors_before_texel;
1007
1008   subset_num = (subsets >> (texel * 2)) & 3;
1009
1010   index_bits = mode->n_index_bits;
1011   if (is_anchor(n_subsets, partition_num, texel))
1012      index_bits--;
1013   index = extract_bits(block, bit_offset, index_bits);
1014
1015   for (component = 0; component < 3; component++) {
1016      value = interpolate(endpoints[subset_num * 2][component],
1017                          endpoints[subset_num * 2 + 1][component],
1018                          index,
1019                          mode->n_index_bits);
1020
1021      if (is_signed)
1022         value = finish_signed_unquantize(value);
1023      else
1024         value = finish_unsigned_unquantize(value);
1025
1026      result[component] = _mesa_half_to_float(value);
1027   }
1028
1029   result[3] = 1.0f;
1030}
1031
1032#ifdef BPTC_BLOCK_DECODE
1033static void
1034decompress_rgb_float_block(unsigned src_width, unsigned src_height,
1035                           const uint8_t *block,
1036                           float *dst_row, unsigned dst_rowstride,
1037                           bool is_signed)
1038{
1039   int mode_num;
1040   const struct bptc_float_mode *mode;
1041   int bit_offset_head, bit_offset;
1042   int partition_num;
1043   int subset_num;
1044   int index_bits;
1045   int index;
1046   int anchors_before_texel;
1047   int32_t endpoints[2 * 2][3];
1048   uint32_t subsets;
1049   int n_subsets;
1050   int component;
1051   int32_t value;
1052   unsigned x, y;
1053
1054   if (block[0] & 0x2) {
1055      mode_num = (((block[0] >> 1) & 0xe) | (block[0] & 1)) + 2;
1056      bit_offset_head = 5;
1057   } else {
1058      mode_num = block[0] & 3;
1059      bit_offset_head = 2;
1060   }
1061
1062   mode = bptc_float_modes + mode_num;
1063
1064   if (mode->reserved) {
1065      for(y = 0; y < src_height; y += 1) {
1066         float *result = dst_row;
1067         memset(result, 0, sizeof result[0] * 4 * src_width);
1068         for(x = 0; x < src_width; x += 1) {
1069            result[3] = 1.0f;
1070            result += 4;
1071         }
1072         dst_row += dst_rowstride / sizeof dst_row[0];
1073      }
1074      return;
1075   }
1076
1077   bit_offset_head = extract_float_endpoints(mode, block, bit_offset_head,
1078                                        endpoints, is_signed);
1079
1080   if (mode->n_partition_bits) {
1081      partition_num = extract_bits(block, bit_offset_head, mode->n_partition_bits);
1082      bit_offset_head += mode->n_partition_bits;
1083
1084      subsets = partition_table1[partition_num];
1085      n_subsets = 2;
1086   } else {
1087      partition_num = 0;
1088      subsets = 0;
1089      n_subsets = 1;
1090   }
1091
1092   for(y = 0; y < src_height; y += 1) {
1093      float *result = dst_row;
1094      for(x = 0; x < src_width; x += 1) {
1095         int texel;
1096
1097         bit_offset = bit_offset_head;
1098
1099         texel = x + y * 4;
1100
1101         anchors_before_texel =
1102            count_anchors_before_texel(n_subsets, partition_num, texel);
1103
1104         /* Calculate the offset to the primary index for this texel */
1105         bit_offset += mode->n_index_bits * texel - anchors_before_texel;
1106
1107         subset_num = (subsets >> (texel * 2)) & 3;
1108
1109         index_bits = mode->n_index_bits;
1110         if (is_anchor(n_subsets, partition_num, texel))
1111            index_bits--;
1112         index = extract_bits(block, bit_offset, index_bits);
1113
1114         for (component = 0; component < 3; component++) {
1115            value = interpolate(endpoints[subset_num * 2][component],
1116                                endpoints[subset_num * 2 + 1][component],
1117                                index,
1118                                mode->n_index_bits);
1119
1120            if (is_signed)
1121               value = finish_signed_unquantize(value);
1122            else
1123               value = finish_unsigned_unquantize(value);
1124
1125            result[component] = _mesa_half_to_float(value);
1126         }
1127
1128         result[3] = 1.0f;
1129         result += 4;
1130      }
1131      dst_row += dst_rowstride / sizeof dst_row[0];
1132   }
1133}
1134
1135static void
1136decompress_rgb_float(int width, int height,
1137                      const uint8_t *src, int src_rowstride,
1138                      float *dst, int dst_rowstride, bool is_signed)
1139{
1140   int src_row_diff;
1141   int y, x;
1142
1143   if (src_rowstride >= width * 4)
1144      src_row_diff = src_rowstride - ((width + 3) & ~3) * 4;
1145   else
1146      src_row_diff = 0;
1147
1148   for (y = 0; y < height; y += BLOCK_SIZE) {
1149      for (x = 0; x < width; x += BLOCK_SIZE) {
1150         decompress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
1151                                    MIN2(height - y, BLOCK_SIZE),
1152                                    src,
1153                                    (dst + x * 4 +
1154                                     (y * dst_rowstride / sizeof dst[0])),
1155                                    dst_rowstride, is_signed);
1156         src += BLOCK_BYTES;
1157      }
1158      src += src_row_diff;
1159   }
1160}
1161#endif // BPTC_BLOCK_DECODE
1162
1163static void
1164write_bits(struct bit_writer *writer, int n_bits, int value)
1165{
1166   do {
1167      if (n_bits + writer->pos >= 8) {
1168         *(writer->dst++) = writer->buf | (value << writer->pos);
1169         writer->buf = 0;
1170         value >>= (8 - writer->pos);
1171         n_bits -= (8 - writer->pos);
1172         writer->pos = 0;
1173      } else {
1174         writer->buf |= value << writer->pos;
1175         writer->pos += n_bits;
1176         break;
1177      }
1178   } while (n_bits > 0);
1179}
1180
/*
 * Compute the mean luminance and mean alpha of a width x height region of
 * RGBA8 texels.
 *
 * "Luminance" here is simply R + G + B, so *average_luminance ranges up to
 * 3 * 255. src_rowstride is the distance in bytes between rows. Both
 * averages use truncating integer division.
 */
static void
get_average_luminance_alpha_unorm(int width, int height,
                                  const uint8_t *src, int src_rowstride,
                                  int *average_luminance, int *average_alpha)
{
   const int n_texels = width * height;
   int rgb_total = 0;
   int alpha_total = 0;
   int row, col;

   for (row = 0; row < height; row++) {
      const uint8_t *texel = src + row * src_rowstride;

      for (col = 0; col < width; col++, texel += 4) {
         rgb_total += texel[0] + texel[1] + texel[2];
         alpha_total += texel[3];
      }
   }

   *average_luminance = rgb_total / n_texels;
   *average_alpha = alpha_total / n_texels;
}
1201
/*
 * Pick two RGBA endpoints for a width x height region of RGBA8 texels.
 *
 * The texels are split into a "left" and a "right" group twice: once by
 * comparing their luminance (R + G + B) with average_luminance to pick the
 * colour endpoints, and once by comparing their alpha with average_alpha to
 * pick the alpha endpoints. Each endpoint is the mean of its group. When a
 * split puts every texel in the same group, both endpoints are set to the
 * mean of the whole region.
 *
 * width, height  - size of the region in texels (must be non-zero).
 * src            - first texel of the region.
 * src_rowstride  - distance in bytes between rows of src.
 * endpoints      - receives endpoints[0] and endpoints[1], four components
 *                  each.
 *
 * The colour and alpha endpoints are swapped independently where needed so
 * that the first texel lies on the endpoints[0] side of the midpoint.
 */
static void
get_rgba_endpoints_unorm(int width, int height,
                         const uint8_t *src, int src_rowstride,
                         int average_luminance, int average_alpha,
                         uint8_t endpoints[][4])
{
   const int n_texels = width * height;
   int endpoint_luminances[2];
   int midpoint;
   int sums[2][4];
   int endpoint;
   int luminance;
   uint8_t temp[3];
   const uint8_t *p = src;
   int rgb_left_endpoint_count = 0;
   int alpha_left_endpoint_count = 0;
   int y, x, i;

   memset(sums, 0, sizeof sums);

   for (y = 0; y < height; y++) {
      for (x = 0; x < width; x++) {
         luminance = p[0] + p[1] + p[2];
         if (luminance < average_luminance) {
            endpoint = 0;
            rgb_left_endpoint_count++;
         } else {
            endpoint = 1;
         }
         for (i = 0; i < 3; i++)
            sums[endpoint][i] += p[i];

         /* The alpha split must look at the alpha channel (p[3]); testing
          * the blue channel here would group the alpha values by an
          * unrelated quantity */
         if (p[3] < average_alpha) {
            endpoint = 0;
            alpha_left_endpoint_count++;
         } else {
            endpoint = 1;
         }
         sums[endpoint][3] += p[3];

         p += 4;
      }

      p += src_rowstride - width * 4;
   }

   if (rgb_left_endpoint_count == 0 ||
       rgb_left_endpoint_count == n_texels) {
      /* One group is empty: use the overall mean for both endpoints */
      for (i = 0; i < 3; i++)
         endpoints[0][i] = endpoints[1][i] =
            (sums[0][i] + sums[1][i]) / n_texels;
   } else {
      for (i = 0; i < 3; i++) {
         endpoints[0][i] = sums[0][i] / rgb_left_endpoint_count;
         endpoints[1][i] = (sums[1][i] /
                            (n_texels - rgb_left_endpoint_count));
      }
   }

   if (alpha_left_endpoint_count == 0 ||
       alpha_left_endpoint_count == n_texels) {
      endpoints[0][3] = endpoints[1][3] =
         (sums[0][3] + sums[1][3]) / n_texels;
   } else {
      endpoints[0][3] = sums[0][3] / alpha_left_endpoint_count;
      endpoints[1][3] = (sums[1][3] /
                         (n_texels - alpha_left_endpoint_count));
   }

   /* We may need to swap the endpoints to ensure the most-significant bit of
    * the first index is zero */

   for (endpoint = 0; endpoint < 2; endpoint++) {
      endpoint_luminances[endpoint] =
         endpoints[endpoint][0] +
         endpoints[endpoint][1] +
         endpoints[endpoint][2];
   }
   midpoint = (endpoint_luminances[0] + endpoint_luminances[1]) / 2;

   if ((src[0] + src[1] + src[2] <= midpoint) !=
       (endpoint_luminances[0] <= midpoint)) {
      memcpy(temp, endpoints[0], 3);
      memcpy(endpoints[0], endpoints[1], 3);
      memcpy(endpoints[1], temp, 3);
   }

   /* Same for the alpha endpoints */

   midpoint = (endpoints[0][3] + endpoints[1][3]) / 2;

   if ((src[3] <= midpoint) != (endpoints[0][3] <= midpoint)) {
      temp[0] = endpoints[0][3];
      endpoints[0][3] = endpoints[1][3];
      endpoints[1][3] = temp[0];
   }
}
1298
1299static void
1300write_rgb_indices_unorm(struct bit_writer *writer,
1301                        int src_width, int src_height,
1302                        const uint8_t *src, int src_rowstride,
1303                        uint8_t endpoints[][4])
1304{
1305   int luminance;
1306   int endpoint_luminances[2];
1307   int endpoint;
1308   int index;
1309   int y, x;
1310
1311   for (endpoint = 0; endpoint < 2; endpoint++) {
1312      endpoint_luminances[endpoint] =
1313         endpoints[endpoint][0] +
1314         endpoints[endpoint][1] +
1315         endpoints[endpoint][2];
1316   }
1317
1318   /* If the endpoints have the same luminance then we'll just use index 0 for
1319    * all of the texels */
1320   if (endpoint_luminances[0] == endpoint_luminances[1]) {
1321      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 2 - 1, 0);
1322      return;
1323   }
1324
1325   for (y = 0; y < src_height; y++) {
1326      for (x = 0; x < src_width; x++) {
1327         luminance = src[0] + src[1] + src[2];
1328
1329         index = ((luminance - endpoint_luminances[0]) * 3 /
1330                  (endpoint_luminances[1] - endpoint_luminances[0]));
1331         if (index < 0)
1332            index = 0;
1333         else if (index > 3)
1334            index = 3;
1335
1336         assert(x != 0 || y != 0 || index < 2);
1337
1338         write_bits(writer, (x == 0 && y == 0) ? 1 : 2, index);
1339
1340         src += 4;
1341      }
1342
1343      /* Pad the indices out to the block size */
1344      if (src_width < BLOCK_SIZE)
1345         write_bits(writer, 2 * (BLOCK_SIZE - src_width), 0);
1346
1347      src += src_rowstride - src_width * 4;
1348   }
1349
1350   /* Pad the indices out to the block size */
1351   if (src_height < BLOCK_SIZE)
1352      write_bits(writer, 2 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1353}
1354
1355static void
1356write_alpha_indices_unorm(struct bit_writer *writer,
1357                          int src_width, int src_height,
1358                          const uint8_t *src, int src_rowstride,
1359                          uint8_t endpoints[][4])
1360{
1361   int index;
1362   int y, x;
1363
1364   /* If the endpoints have the same alpha then we'll just use index 0 for
1365    * all of the texels */
1366   if (endpoints[0][3] == endpoints[1][3]) {
1367      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 3 - 1, 0);
1368      return;
1369   }
1370
1371   for (y = 0; y < src_height; y++) {
1372      for (x = 0; x < src_width; x++) {
1373         index = (((int) src[3] - (int) endpoints[0][3]) * 7 /
1374                  ((int) endpoints[1][3] - endpoints[0][3]));
1375         if (index < 0)
1376            index = 0;
1377         else if (index > 7)
1378            index = 7;
1379
1380         assert(x != 0 || y != 0 || index < 4);
1381
1382         /* The first index has one less bit */
1383         write_bits(writer, (x == 0 && y == 0) ? 2 : 3, index);
1384
1385         src += 4;
1386      }
1387
1388      /* Pad the indices out to the block size */
1389      if (src_width < BLOCK_SIZE)
1390         write_bits(writer, 3 * (BLOCK_SIZE - src_width), 0);
1391
1392      src += src_rowstride - src_width * 4;
1393   }
1394
1395   /* Pad the indices out to the block size */
1396   if (src_height < BLOCK_SIZE)
1397      write_bits(writer, 3 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1398}
1399
1400static void
1401compress_rgba_unorm_block(int src_width, int src_height,
1402                          const uint8_t *src, int src_rowstride,
1403                          uint8_t *dst)
1404{
1405   int average_luminance, average_alpha;
1406   uint8_t endpoints[2][4];
1407   struct bit_writer writer;
1408   int component, endpoint;
1409
1410   get_average_luminance_alpha_unorm(src_width, src_height, src, src_rowstride,
1411                                     &average_luminance, &average_alpha);
1412   get_rgba_endpoints_unorm(src_width, src_height, src, src_rowstride,
1413                            average_luminance, average_alpha,
1414                            endpoints);
1415
1416   writer.dst = dst;
1417   writer.pos = 0;
1418   writer.buf = 0;
1419
1420   write_bits(&writer, 5, 0x10); /* mode 4 */
1421   write_bits(&writer, 2, 0); /* rotation 0 */
1422   write_bits(&writer, 1, 0); /* index selection bit */
1423
1424   /* Write the color endpoints */
1425   for (component = 0; component < 3; component++)
1426      for (endpoint = 0; endpoint < 2; endpoint++)
1427         write_bits(&writer, 5, endpoints[endpoint][component] >> 3);
1428
1429   /* Write the alpha endpoints */
1430   for (endpoint = 0; endpoint < 2; endpoint++)
1431      write_bits(&writer, 6, endpoints[endpoint][3] >> 2);
1432
1433   write_rgb_indices_unorm(&writer,
1434                           src_width, src_height,
1435                           src, src_rowstride,
1436                           endpoints);
1437   write_alpha_indices_unorm(&writer,
1438                             src_width, src_height,
1439                             src, src_rowstride,
1440                             endpoints);
1441}
1442
1443static void
1444compress_rgba_unorm(int width, int height,
1445                    const uint8_t *src, int src_rowstride,
1446                    uint8_t *dst, int dst_rowstride)
1447{
1448   int dst_row_diff;
1449   int y, x;
1450
1451   if (dst_rowstride >= width * 4)
1452      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
1453   else
1454      dst_row_diff = 0;
1455
1456   for (y = 0; y < height; y += BLOCK_SIZE) {
1457      for (x = 0; x < width; x += BLOCK_SIZE) {
1458         compress_rgba_unorm_block(MIN2(width - x, BLOCK_SIZE),
1459                                   MIN2(height - y, BLOCK_SIZE),
1460                                   src + x * 4 + y * src_rowstride,
1461                                   src_rowstride,
1462                                   dst);
1463         dst += BLOCK_BYTES;
1464      }
1465      dst += dst_row_diff;
1466   }
1467}
1468
/*
 * Return the mean luminance (R + G + B) of a width x height region of RGB
 * float texels. src_rowstride is the distance in bytes between rows.
 */
static float
get_average_luminance_float(int width, int height,
                            const float *src, int src_rowstride)
{
   /* Floats to skip at the end of each row to reach the next one */
   const size_t row_padding =
      (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
   float total = 0;
   int row, col;

   for (row = 0; row < height; row++, src += row_padding) {
      for (col = 0; col < width; col++, src += 3)
         total += src[0] + src[1] + src[2];
   }

   return total / (width * height);
}
1486
/*
 * Clamp value to [0, 65504] when is_signed is false, or to [-65504, 65504]
 * when it is true. Infinities end up on the nearest bound.
 */
static float
clamp_value(float value, bool is_signed)
{
   const float upper = 65504.0f;
   const float lower = is_signed ? -65504.0f : 0.0f;

   if (value > upper)
      return upper;

   if (value < lower)
      return lower;

   return value;
}
1505
/*
 * Pick two RGB endpoints for a width x height region of RGB float texels.
 *
 * Texels whose luminance (R + G + B) is below average_luminance contribute
 * to endpoints[0], the others to endpoints[1]; each endpoint is the mean of
 * its group. If every texel lands in the same group both endpoints are set
 * to the mean of the whole region. The result is passed through
 * clamp_value() with is_signed, and finally the endpoints are swapped when
 * needed so that the first texel lies on the endpoints[0] side of the
 * midpoint.
 *
 * src_rowstride is the distance in bytes between rows of src.
 */
static void
get_endpoints_float(int width, int height,
                    const float *src, int src_rowstride,
                    float average_luminance, float endpoints[][3],
                    bool is_signed)
{
   const int n_texels = width * height;
   /* Floats to skip at the end of each row to reach the next one */
   const size_t row_padding =
      (src_rowstride - width * 3 * sizeof (float)) / sizeof (float);
   float sums[2][3] = { { 0.0f, 0.0f, 0.0f }, { 0.0f, 0.0f, 0.0f } };
   float group_luminance[2];
   float midpoint;
   const float *p = src;
   int n_left = 0;
   int x, y, e, c;

   for (y = 0; y < height; y++, p += row_padding) {
      for (x = 0; x < width; x++, p += 3) {
         const bool is_left = (p[0] + p[1] + p[2]) < average_luminance;
         float *group = sums[is_left ? 0 : 1];

         if (is_left)
            n_left++;

         for (c = 0; c < 3; c++)
            group[c] += p[c];
      }
   }

   if (n_left == 0 || n_left == n_texels) {
      /* One group is empty: use the overall mean for both endpoints */
      for (c = 0; c < 3; c++)
         endpoints[0][c] = endpoints[1][c] =
            (sums[0][c] + sums[1][c]) / n_texels;
   } else {
      for (c = 0; c < 3; c++) {
         endpoints[0][c] = sums[0][c] / n_left;
         endpoints[1][c] = sums[1][c] / (n_texels - n_left);
      }
   }

   /* Clamp the endpoints to the range of a half float and strip out
    * infinities */
   for (e = 0; e < 2; e++) {
      for (c = 0; c < 3; c++)
         endpoints[e][c] = clamp_value(endpoints[e][c], is_signed);
   }

   /* We may need to swap the endpoints to ensure the most-significant bit of
    * the first index is zero */
   for (e = 0; e < 2; e++) {
      group_luminance[e] =
         endpoints[e][0] + endpoints[e][1] + endpoints[e][2];
   }
   midpoint = (group_luminance[0] + group_luminance[1]) / 2.0f;

   if ((src[0] + src[1] + src[2] <= midpoint) !=
       (group_luminance[0] <= midpoint)) {
      for (c = 0; c < 3; c++) {
         const float held = endpoints[0][c];

         endpoints[0][c] = endpoints[1][c];
         endpoints[1][c] = held;
      }
   }
}
1581
1582static void
1583write_rgb_indices_float(struct bit_writer *writer,
1584                        int src_width, int src_height,
1585                        const float *src, int src_rowstride,
1586                        float endpoints[][3])
1587{
1588   float luminance;
1589   float endpoint_luminances[2];
1590   int endpoint;
1591   int index;
1592   int y, x;
1593
1594   for (endpoint = 0; endpoint < 2; endpoint++) {
1595      endpoint_luminances[endpoint] =
1596         endpoints[endpoint][0] +
1597         endpoints[endpoint][1] +
1598         endpoints[endpoint][2];
1599   }
1600
1601   /* If the endpoints have the same luminance then we'll just use index 0 for
1602    * all of the texels */
1603   if (endpoint_luminances[0] == endpoint_luminances[1]) {
1604      write_bits(writer, BLOCK_SIZE * BLOCK_SIZE * 4 - 1, 0);
1605      return;
1606   }
1607
1608   for (y = 0; y < src_height; y++) {
1609      for (x = 0; x < src_width; x++) {
1610         luminance = src[0] + src[1] + src[2];
1611
1612         index = ((luminance - endpoint_luminances[0]) * 15 /
1613                  (endpoint_luminances[1] - endpoint_luminances[0]));
1614         if (index < 0)
1615            index = 0;
1616         else if (index > 15)
1617            index = 15;
1618
1619         assert(x != 0 || y != 0 || index < 8);
1620
1621         write_bits(writer, (x == 0 && y == 0) ? 3 : 4, index);
1622
1623         src += 3;
1624      }
1625
1626      /* Pad the indices out to the block size */
1627      if (src_width < BLOCK_SIZE)
1628         write_bits(writer, 4 * (BLOCK_SIZE - src_width), 0);
1629
1630      src += (src_rowstride - src_width * 3 * sizeof (float)) / sizeof (float);
1631   }
1632
1633   /* Pad the indices out to the block size */
1634   if (src_height < BLOCK_SIZE)
1635      write_bits(writer, 4 * BLOCK_SIZE * (BLOCK_SIZE - src_height), 0);
1636}
1637
/*
 * Convert a float endpoint component to the 10-bit value stored in the
 * block.
 *
 * The value is converted to a half float and its bit pattern is scaled down
 * to ten bits. For the signed format the sign bit is stripped first and the
 * result is stored as a 10-bit two's complement number. For the unsigned
 * format, values at or below zero encode as 0.
 */
static int
get_endpoint_value(float value, bool is_signed)
{
   bool negative;
   int half;

   if (!is_signed) {
      if (value <= 0.0f)
         return 0;

      half = _mesa_float_to_half(value);

      return (64 * half / 31) >> 6;
   }

   half = _mesa_float_to_half(value);

   /* Work on the magnitude and remember the sign for later */
   negative = (half & 0x8000) != 0;
   if (negative)
      half &= 0x7fff;

   half = (32 * half / 31) >> 6;

   /* Negate and wrap into ten bits */
   if (negative)
      half = -half & ((1 << 10) - 1);

   return half;
}
1667
1668static void
1669compress_rgb_float_block(int src_width, int src_height,
1670                         const float *src, int src_rowstride,
1671                         uint8_t *dst,
1672                         bool is_signed)
1673{
1674   float average_luminance;
1675   float endpoints[2][3];
1676   struct bit_writer writer;
1677   int component, endpoint;
1678   int endpoint_value;
1679
1680   average_luminance =
1681      get_average_luminance_float(src_width, src_height, src, src_rowstride);
1682   get_endpoints_float(src_width, src_height, src, src_rowstride,
1683                       average_luminance, endpoints, is_signed);
1684
1685   writer.dst = dst;
1686   writer.pos = 0;
1687   writer.buf = 0;
1688
1689   write_bits(&writer, 5, 3); /* mode 3 */
1690
1691   /* Write the endpoints */
1692   for (endpoint = 0; endpoint < 2; endpoint++) {
1693      for (component = 0; component < 3; component++) {
1694         endpoint_value =
1695            get_endpoint_value(endpoints[endpoint][component], is_signed);
1696         write_bits(&writer, 10, endpoint_value);
1697      }
1698   }
1699
1700   write_rgb_indices_float(&writer,
1701                           src_width, src_height,
1702                           src, src_rowstride,
1703                           endpoints);
1704}
1705
1706static void
1707compress_rgb_float(int width, int height,
1708                   const float *src, int src_rowstride,
1709                   uint8_t *dst, int dst_rowstride,
1710                   bool is_signed)
1711{
1712   int dst_row_diff;
1713   int y, x;
1714
1715   if (dst_rowstride >= width * 4)
1716      dst_row_diff = dst_rowstride - ((width + 3) & ~3) * 4;
1717   else
1718      dst_row_diff = 0;
1719
1720   for (y = 0; y < height; y += BLOCK_SIZE) {
1721      for (x = 0; x < width; x += BLOCK_SIZE) {
1722         compress_rgb_float_block(MIN2(width - x, BLOCK_SIZE),
1723                                  MIN2(height - y, BLOCK_SIZE),
1724                                  src + x * 3 +
1725                                  y * src_rowstride / sizeof (float),
1726                                  src_rowstride,
1727                                  dst,
1728                                  is_signed);
1729         dst += BLOCK_BYTES;
1730      }
1731      dst += dst_row_diff;
1732   }
1733}
1734
1735#endif
1736