1/*
2 * Copyright 2011 Joakim Sindholt <opensource@zhasha.com>
3 * Copyright 2013 Christoph Bumiller
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE. */
23
24#include "nine_shader.h"
25
26#include "device9.h"
27#include "nine_debug.h"
28#include "nine_state.h"
29#include "vertexdeclaration9.h"
30
31#include "util/macros.h"
32#include "util/u_memory.h"
33#include "util/u_inlines.h"
34#include "pipe/p_shader_tokens.h"
35#include "tgsi/tgsi_ureg.h"
36#include "tgsi/tgsi_dump.h"
37#include "nir/tgsi_to_nir.h"
38
39#define DBG_CHANNEL DBG_SHADER
40
41#define DUMP(args...) _nine_debug_printf(DBG_CHANNEL, NULL, args)
42
43
44struct shader_translator;
45
46typedef HRESULT (*translate_instruction_func)(struct shader_translator *);
47
48static inline const char *d3dsio_to_string(unsigned opcode);
49
50
51#define NINED3D_SM1_VS 0xfffe
52#define NINED3D_SM1_PS 0xffff
53
54#define NINE_MAX_COND_DEPTH 64
55#define NINE_MAX_LOOP_DEPTH 64
56
57#define NINED3DSP_END 0x0000ffff
58
59#define NINED3DSPTYPE_FLOAT4  0
60#define NINED3DSPTYPE_INT4    1
61#define NINED3DSPTYPE_BOOL    2
62
63#define NINED3DSPR_IMMEDIATE (D3DSPR_PREDICATE + 1)
64
65#define NINED3DSP_WRITEMASK_MASK  D3DSP_WRITEMASK_ALL
66#define NINED3DSP_WRITEMASK_SHIFT 16
67
68#define NINED3DSHADER_INST_PREDICATED (1 << 28)
69
70#define NINED3DSHADER_REL_OP_GT 1
71#define NINED3DSHADER_REL_OP_EQ 2
72#define NINED3DSHADER_REL_OP_GE 3
73#define NINED3DSHADER_REL_OP_LT 4
74#define NINED3DSHADER_REL_OP_NE 5
75#define NINED3DSHADER_REL_OP_LE 6
76
77#define NINED3DSIO_OPCODE_FLAGS_SHIFT 16
78#define NINED3DSIO_OPCODE_FLAGS_MASK  (0xff << NINED3DSIO_OPCODE_FLAGS_SHIFT)
79
80#define NINED3DSI_TEXLD_PROJECT 0x1
81#define NINED3DSI_TEXLD_BIAS    0x2
82
83#define NINED3DSP_WRITEMASK_0   0x1
84#define NINED3DSP_WRITEMASK_1   0x2
85#define NINED3DSP_WRITEMASK_2   0x4
86#define NINED3DSP_WRITEMASK_3   0x8
87#define NINED3DSP_WRITEMASK_ALL 0xf
88
89#define NINED3DSP_NOSWIZZLE ((0 << 0) | (1 << 2) | (2 << 4) | (3 << 6))
90
91#define NINE_SWIZZLE4(x,y,z,w) \
92   TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w
93
94#define NINE_APPLY_SWIZZLE(src, s) \
95   ureg_swizzle(src, NINE_SWIZZLE4(s, s, s, s))
96
97#define NINED3DSPDM_SATURATE (D3DSPDM_SATURATE >> D3DSP_DSTMOD_SHIFT)
98#define NINED3DSPDM_PARTIALP (D3DSPDM_PARTIALPRECISION >> D3DSP_DSTMOD_SHIFT)
99#define NINED3DSPDM_CENTROID (D3DSPDM_MSAMPCENTROID >> D3DSP_DSTMOD_SHIFT)
100
101/*
102 * NEG     all, not ps: m3x2, m3x3, m3x4, m4x3, m4x4
103 * BIAS    <= PS 1.4 (x-0.5)
104 * BIASNEG <= PS 1.4 (-(x-0.5))
105 * SIGN    <= PS 1.4 (2(x-0.5))
106 * SIGNNEG <= PS 1.4 (-2(x-0.5))
107 * COMP    <= PS 1.4 (1-x)
108 * X2       = PS 1.4 (2x)
109 * X2NEG    = PS 1.4 (-2x)
110 * DZ      <= PS 1.4, tex{ld,crd} (.xy/.z), z=0 => .11
111 * DW      <= PS 1.4, tex{ld,crd} (.xy/.w), w=0 => .11
112 * ABS     >= SM 3.0 (abs(x))
113 * ABSNEG  >= SM 3.0 (-abs(x))
 * NOT     >= SM 2.0 predication only
115 */
116#define NINED3DSPSM_NONE    (D3DSPSM_NONE    >> D3DSP_SRCMOD_SHIFT)
117#define NINED3DSPSM_NEG     (D3DSPSM_NEG     >> D3DSP_SRCMOD_SHIFT)
118#define NINED3DSPSM_BIAS    (D3DSPSM_BIAS    >> D3DSP_SRCMOD_SHIFT)
119#define NINED3DSPSM_BIASNEG (D3DSPSM_BIASNEG >> D3DSP_SRCMOD_SHIFT)
120#define NINED3DSPSM_SIGN    (D3DSPSM_SIGN    >> D3DSP_SRCMOD_SHIFT)
121#define NINED3DSPSM_SIGNNEG (D3DSPSM_SIGNNEG >> D3DSP_SRCMOD_SHIFT)
122#define NINED3DSPSM_COMP    (D3DSPSM_COMP    >> D3DSP_SRCMOD_SHIFT)
123#define NINED3DSPSM_X2      (D3DSPSM_X2      >> D3DSP_SRCMOD_SHIFT)
124#define NINED3DSPSM_X2NEG   (D3DSPSM_X2NEG   >> D3DSP_SRCMOD_SHIFT)
125#define NINED3DSPSM_DZ      (D3DSPSM_DZ      >> D3DSP_SRCMOD_SHIFT)
126#define NINED3DSPSM_DW      (D3DSPSM_DW      >> D3DSP_SRCMOD_SHIFT)
127#define NINED3DSPSM_ABS     (D3DSPSM_ABS     >> D3DSP_SRCMOD_SHIFT)
128#define NINED3DSPSM_ABSNEG  (D3DSPSM_ABSNEG  >> D3DSP_SRCMOD_SHIFT)
129#define NINED3DSPSM_NOT     (D3DSPSM_NOT     >> D3DSP_SRCMOD_SHIFT)
130
/* Printable names of the source modifiers, indexed by NINED3DSPSM_* value.
 * sm1_dump_src_param() prints the entry followed by "(" in front of a
 * modified source operand. */
static const char *sm1_mod_str[] =
{
    [NINED3DSPSM_NONE] = "",
    [NINED3DSPSM_NEG] = "-",
    [NINED3DSPSM_BIAS] = "bias",
    [NINED3DSPSM_BIASNEG] = "biasneg",
    [NINED3DSPSM_SIGN] = "sign",
    [NINED3DSPSM_SIGNNEG] = "signneg",
    [NINED3DSPSM_COMP] = "comp",
    [NINED3DSPSM_X2] = "x2",
    [NINED3DSPSM_X2NEG] = "x2neg",
    [NINED3DSPSM_DZ] = "dz",
    [NINED3DSPSM_DW] = "dw",
    [NINED3DSPSM_ABS] = "abs",
    [NINED3DSPSM_ABSNEG] = "-abs",
    [NINED3DSPSM_NOT] = "not"
};
148
149static void
150sm1_dump_writemask(BYTE mask)
151{
152    if (mask & 1) DUMP("x"); else DUMP("_");
153    if (mask & 2) DUMP("y"); else DUMP("_");
154    if (mask & 4) DUMP("z"); else DUMP("_");
155    if (mask & 8) DUMP("w"); else DUMP("_");
156}
157
158static void
159sm1_dump_swizzle(BYTE s)
160{
161    char c[4] = { 'x', 'y', 'z', 'w' };
162    DUMP("%c%c%c%c",
163         c[(s >> 0) & 3], c[(s >> 2) & 3], c[(s >> 4) & 3], c[(s >> 6) & 3]);
164}
165
/* One-character prefix for each register file, indexed by D3DSPR_* value.
 * Used by the dump helpers when printing a register; the last mapped entry
 * is D3DSPR_PREDICATE, so NINED3DSPR_IMMEDIATE (D3DSPR_PREDICATE + 1) lies
 * outside of this table. */
static const char sm1_file_char[] =
{
    [D3DSPR_TEMP] = 'r',
    [D3DSPR_INPUT] = 'v',
    [D3DSPR_CONST] = 'c',
    [D3DSPR_ADDR] = 'A',
    [D3DSPR_RASTOUT] = 'R',
    [D3DSPR_ATTROUT] = 'D',
    [D3DSPR_OUTPUT] = 'o',
    [D3DSPR_CONSTINT] = 'I',
    [D3DSPR_COLOROUT] = 'C',
    [D3DSPR_DEPTHOUT] = 'D',
    [D3DSPR_SAMPLER] = 's',
    [D3DSPR_CONST2] = 'c',
    [D3DSPR_CONST3] = 'c',
    [D3DSPR_CONST4] = 'c',
    [D3DSPR_CONSTBOOL] = 'B',
    [D3DSPR_LOOP] = 'L',
    [D3DSPR_TEMPFLOAT16] = 'h',
    [D3DSPR_MISCTYPE] = 'M',
    [D3DSPR_LABEL] = 'X',
    [D3DSPR_PREDICATE] = 'p'
};
189
190static void
191sm1_dump_reg(BYTE file, INT index)
192{
193    switch (file) {
194    case D3DSPR_LOOP:
195        DUMP("aL");
196        break;
197    case D3DSPR_COLOROUT:
198        DUMP("oC%i", index);
199        break;
200    case D3DSPR_DEPTHOUT:
201        DUMP("oDepth");
202        break;
203    case D3DSPR_RASTOUT:
204        DUMP("oRast%i", index);
205        break;
206    case D3DSPR_CONSTINT:
207        DUMP("iconst[%i]", index);
208        break;
209    case D3DSPR_CONSTBOOL:
210        DUMP("bconst[%i]", index);
211        break;
212    default:
213        DUMP("%c%i", sm1_file_char[file], index);
214        break;
215    }
216}
217
/* Parsed source operand of an instruction. */
struct sm1_src_param
{
    INT idx; /* register index (printed as the offset when rel is set) */
    struct sm1_src_param *rel; /* relative-addressing operand, or NULL */
    BYTE file; /* D3DSPR_* register file, or NINED3DSPR_IMMEDIATE */
    BYTE swizzle; /* two bits per component, x in the lowest pair */
    BYTE mod; /* NINED3DSPSM_* source modifier */
    BYTE type; /* NINED3DSPTYPE_*: selects the valid member of imm */
    union {
        DWORD d[4];
        float f[4];
        int i[4];
        BOOL b;
    } imm; /* immediate value, used when file is NINED3DSPR_IMMEDIATE */
};
233static void
234sm1_parse_immediate(struct shader_translator *, struct sm1_src_param *);
235
/* Parsed destination operand of an instruction. */
struct sm1_dst_param
{
    INT idx; /* register index (printed as the offset when rel is set) */
    struct sm1_src_param *rel; /* relative-addressing operand, or NULL */
    BYTE file; /* D3DSPR_* register file */
    BYTE mask; /* write mask, bit 0 = x ... bit 3 = w */
    BYTE mod; /* NINED3DSPDM_* flags (saturate, partial precision, centroid) */
    int8_t shift; /* sint4 */ /* > 0 dumps as "*(1 << shift)", < 0 as "/(1 << -shift)" */
    BYTE type;
};
246
247static inline void
248assert_replicate_swizzle(const struct ureg_src *reg)
249{
250    assert(reg->SwizzleY == reg->SwizzleX &&
251           reg->SwizzleZ == reg->SwizzleX &&
252           reg->SwizzleW == reg->SwizzleX);
253}
254
255static void
256sm1_dump_immediate(const struct sm1_src_param *param)
257{
258    switch (param->type) {
259    case NINED3DSPTYPE_FLOAT4:
260        DUMP("{ %f %f %f %f }",
261             param->imm.f[0], param->imm.f[1],
262             param->imm.f[2], param->imm.f[3]);
263        break;
264    case NINED3DSPTYPE_INT4:
265        DUMP("{ %i %i %i %i }",
266             param->imm.i[0], param->imm.i[1],
267             param->imm.i[2], param->imm.i[3]);
268        break;
269    case NINED3DSPTYPE_BOOL:
270        DUMP("%s", param->imm.b ? "TRUE" : "FALSE");
271        break;
272    default:
273        assert(0);
274        break;
275    }
276}
277
278static void
279sm1_dump_src_param(const struct sm1_src_param *param)
280{
281    if (param->file == NINED3DSPR_IMMEDIATE) {
282        assert(!param->mod &&
283               !param->rel &&
284               param->swizzle == NINED3DSP_NOSWIZZLE);
285        sm1_dump_immediate(param);
286        return;
287    }
288
289    if (param->mod)
290        DUMP("%s(", sm1_mod_str[param->mod]);
291    if (param->rel) {
292        DUMP("%c[", sm1_file_char[param->file]);
293        sm1_dump_src_param(param->rel);
294        DUMP("+%i]", param->idx);
295    } else {
296        sm1_dump_reg(param->file, param->idx);
297    }
298    if (param->mod)
299       DUMP(")");
300    if (param->swizzle != NINED3DSP_NOSWIZZLE) {
301       DUMP(".");
302       sm1_dump_swizzle(param->swizzle);
303    }
304}
305
306static void
307sm1_dump_dst_param(const struct sm1_dst_param *param)
308{
309   if (param->mod & NINED3DSPDM_SATURATE)
310      DUMP("sat ");
311   if (param->mod & NINED3DSPDM_PARTIALP)
312      DUMP("pp ");
313   if (param->mod & NINED3DSPDM_CENTROID)
314      DUMP("centroid ");
315   if (param->shift < 0)
316      DUMP("/%u ", 1 << -param->shift);
317   if (param->shift > 0)
318      DUMP("*%u ", 1 << param->shift);
319
320   if (param->rel) {
321      DUMP("%c[", sm1_file_char[param->file]);
322      sm1_dump_src_param(param->rel);
323      DUMP("+%i]", param->idx);
324   } else {
325      sm1_dump_reg(param->file, param->idx);
326   }
327   if (param->mask != NINED3DSP_WRITEMASK_ALL) {
328      DUMP(".");
329      sm1_dump_writemask(param->mask);
330   }
331}
332
/* Register declaration together with its semantic; filled in by
 * sm1_read_semantic(). */
struct sm1_semantic
{
   struct sm1_dst_param reg; /* the declared register */
   BYTE sampler_type;
   D3DDECLUSAGE usage;
   BYTE usage_idx;
};
340
/* Static description of one shader opcode: how it maps to TGSI, which
 * shader versions accept it and how many operands it takes. */
struct sm1_op_info
{
    /* NOTE: 0 is a valid TGSI opcode, but if handler is set, the opcode
     * field should be ignored completely */
    unsigned sio;
    unsigned opcode; /* TGSI_OPCODE_x */

    /* versions are still set even if handler is set */
    struct {
        unsigned min;
        unsigned max;
    } vert_version, frag_version;

    /* number of regs parsed outside of special handler */
    unsigned ndst;
    unsigned nsrc;

    /* some instructions don't map perfectly, so use a special handler */
    translate_instruction_func handler;
};
361
/* One parsed instruction: opcode, flags and operands. */
struct sm1_instruction
{
    D3DSHADER_INSTRUCTION_OPCODE_TYPE opcode;
    BYTE flags; /* opcode-specific; for D3DSIO_TEX see NINED3DSI_TEXLD_* */
    BOOL coissue; /* dumped as the "_co" suffix */
    BOOL predicated; /* when set, pred holds the predicate operand */
    BYTE ndst; /* number of valid entries in dst */
    BYTE nsrc; /* number of valid entries in src */
    struct sm1_src_param src[4];
    struct sm1_src_param src_rel[4];
    struct sm1_src_param pred;
    struct sm1_src_param dst_rel[1];
    struct sm1_dst_param dst[1];

    const struct sm1_op_info *info;
};
378
379static void
380sm1_dump_instruction(struct sm1_instruction *insn, unsigned indent)
381{
382    unsigned i;
383
384    /* no info stored for these: */
385    if (insn->opcode == D3DSIO_DCL)
386        return;
387    for (i = 0; i < indent; ++i)
388        DUMP("  ");
389
390    if (insn->predicated) {
391        DUMP("@");
392        sm1_dump_src_param(&insn->pred);
393        DUMP(" ");
394    }
395    DUMP("%s", d3dsio_to_string(insn->opcode));
396    if (insn->flags) {
397        switch (insn->opcode) {
398        case D3DSIO_TEX:
399            DUMP(insn->flags == NINED3DSI_TEXLD_PROJECT ? "p" : "b");
400            break;
401        default:
402            DUMP("_%x", insn->flags);
403            break;
404        }
405    }
406    if (insn->coissue)
407        DUMP("_co");
408    DUMP(" ");
409
410    for (i = 0; i < insn->ndst && i < ARRAY_SIZE(insn->dst); ++i) {
411        sm1_dump_dst_param(&insn->dst[i]);
412        DUMP(" ");
413    }
414
415    for (i = 0; i < insn->nsrc && i < ARRAY_SIZE(insn->src); ++i) {
416        sm1_dump_src_param(&insn->src[i]);
417        DUMP(" ");
418    }
419    if (insn->opcode == D3DSIO_DEF ||
420        insn->opcode == D3DSIO_DEFI ||
421        insn->opcode == D3DSIO_DEFB)
422        sm1_dump_immediate(&insn->src[0]);
423
424    DUMP("\n");
425}
426
/* A constant defined inside the shader itself: the constant register
 * index it overrides and the TGSI immediate holding its value. */
struct sm1_local_const
{
    INT idx; /* constant register index */
    struct ureg_src reg; /* immediate created for the value */
    float f[4]; /* for indirect addressing of float constants */
};
433
/* Complete state of one shader translation: parse position in the input
 * byte code, the ureg program being built, lazily declared registers,
 * control-flow bookkeeping and locally defined constants. */
struct shader_translator
{
    const DWORD *byte_code;
    const DWORD *parse;
    const DWORD *parse_next;

    struct ureg_program *ureg;

    /* shader version */
    struct {
        BYTE major;
        BYTE minor;
    } version;
    unsigned processor; /* PIPE_SHADER_VERTEX/FRAGMENT */
    /* upper bounds (exclusive) for constant register indices */
    unsigned num_constf_allowed;
    unsigned num_consti_allowed;
    unsigned num_constb_allowed;

    boolean native_integers;
    boolean inline_subroutines;
    boolean want_texcoord;
    boolean shift_wpos;
    boolean wpos_is_sysval;
    boolean face_is_sysval_integer;
    boolean mul_zero_wins;
    unsigned texcoord_sn;

    struct sm1_instruction insn; /* current instruction */

    /* registers; most are declared on first use and undef until then */
    struct {
        struct ureg_dst *r;
        struct ureg_dst oPos;
        struct ureg_dst oPos_out; /* the real output when doing streamout */
        struct ureg_dst oFog;
        struct ureg_dst oPts;
        struct ureg_dst oCol[4];
        struct ureg_dst o[PIPE_MAX_SHADER_OUTPUTS];
        struct ureg_dst oDepth;
        struct ureg_src v[PIPE_MAX_SHADER_INPUTS];
        struct ureg_src v_consecutive; /* copy in temp array of ps inputs for rel addressing */
        struct ureg_src vPos;
        struct ureg_src vFace;
        struct ureg_src s;
        struct ureg_dst p;
        struct ureg_dst address;
        struct ureg_dst a0;
        struct ureg_dst predicate;
        struct ureg_dst predicate_tmp;
        struct ureg_dst predicate_dst;
        struct ureg_dst tS[8]; /* texture stage registers */
        struct ureg_dst tdst; /* scratch dst if we need extra modifiers */
        struct ureg_dst t[8]; /* scratch TEMPs */
        struct ureg_src vC[2]; /* PS color in */
        struct ureg_src vT[8]; /* PS texcoord in */
        struct ureg_dst rL[NINE_MAX_LOOP_DEPTH]; /* loop ctr */
    } regs;
    unsigned num_temp; /* ARRAY_SIZE(regs.r) */
    unsigned num_scratch; /* number of regs.t entries handed out by tx_scratch() */
    unsigned loop_depth; /* current loop/rep nesting level */
    unsigned loop_depth_max; /* deepest loop nesting seen so far */
    unsigned cond_depth; /* current conditional nesting level */
    unsigned loop_labels[NINE_MAX_LOOP_DEPTH];
    unsigned cond_labels[NINE_MAX_COND_DEPTH];
    boolean loop_or_rep[NINE_MAX_LOOP_DEPTH]; /* true: loop, false: rep */
    boolean predicated_activated;

    unsigned *inst_labels; /* LABEL op */
    unsigned num_inst_labels;

    unsigned sampler_targets[NINE_MAX_SAMPLERS]; /* TGSI_TEXTURE_x */

    /* constants defined by the shader itself (float, int, bool) */
    struct sm1_local_const *lconstf;
    unsigned num_lconstf;
    struct sm1_local_const *lconsti;
    unsigned num_lconsti;
    struct sm1_local_const *lconstb;
    unsigned num_lconstb;

    boolean slots_used[NINE_MAX_CONST_ALL]; /* constant slots referenced by the shader */
    unsigned *slot_map; /* optional remapping of constant slots, may be NULL */
    unsigned num_slots; /* highest referenced constant slot + 1 */

    boolean indirect_const_access;
    boolean failure; /* set when translation has to be aborted */

    struct nine_vs_output_info output_info[16];
    int num_outputs; /* number of valid output_info entries */

    struct nine_shader_info *info;

    int16_t op_info_map[D3DSIO_BREAKP + 1];
};
526
/* Shader stage tests; both expect a `struct shader_translator *tx` to be
 * in scope at the point of use. */
#define IS_VS (tx->processor == PIPE_SHADER_VERTEX)
#define IS_PS (tx->processor == PIPE_SHADER_FRAGMENT)

/* If cond is true, flag the translation as failed and return from the
 * enclosing void function. Expects `tx` in scope. The expansion is a bare
 * if-statement and existing users do not put a semicolon after it. */
#define FAILURE_VOID(cond) if ((cond)) {tx->failure=1;return;}
531
532static void
533sm1_read_semantic(struct shader_translator *, struct sm1_semantic *);
534
535static void
536sm1_instruction_check(const struct sm1_instruction *insn)
537{
538    if (insn->opcode == D3DSIO_CRS)
539    {
540        if (insn->dst[0].mask & NINED3DSP_WRITEMASK_3)
541        {
542            DBG("CRS.mask.w\n");
543        }
544    }
545}
546
547static void
548nine_record_outputs(struct shader_translator *tx, BYTE Usage, BYTE UsageIndex,
549                    int mask, int output_index)
550{
551    tx->output_info[tx->num_outputs].output_semantic = Usage;
552    tx->output_info[tx->num_outputs].output_semantic_index = UsageIndex;
553    tx->output_info[tx->num_outputs].mask = mask;
554    tx->output_info[tx->num_outputs].output_index = output_index;
555    tx->num_outputs++;
556}
557
558static struct ureg_src nine_float_constant_src(struct shader_translator *tx, int idx)
559{
560    struct ureg_src src;
561
562    if (tx->slot_map)
563        idx = tx->slot_map[idx];
564    /* vswp constant handling: we use two buffers
565     * to fit all the float constants. The special handling
566     * doesn't need to be elsewhere, because all the instructions
567     * accessing the constants directly are VS1, and swvp
568     * is VS >= 2 */
569    if (tx->info->swvp_on && idx >= 4096) {
570        /* TODO: swvp rel is broken if many constants are used */
571        src = ureg_src_register(TGSI_FILE_CONSTANT, idx - 4096);
572        src = ureg_src_dimension(src, 1);
573    } else {
574        src = ureg_src_register(TGSI_FILE_CONSTANT, idx);
575        src = ureg_src_dimension(src, 0);
576    }
577
578    if (!tx->info->swvp_on)
579        tx->slots_used[idx] = TRUE;
580    if (tx->info->const_float_slots < (idx + 1))
581        tx->info->const_float_slots = idx + 1;
582    if (tx->num_slots < (idx + 1))
583        tx->num_slots = idx + 1;
584
585    return src;
586}
587
588static struct ureg_src nine_integer_constant_src(struct shader_translator *tx, int idx)
589{
590    struct ureg_src src;
591
592    if (tx->info->swvp_on) {
593        src = ureg_src_register(TGSI_FILE_CONSTANT, idx);
594        src = ureg_src_dimension(src, 2);
595    } else {
596        unsigned slot_idx = tx->info->const_i_base + idx;
597        if (tx->slot_map)
598            slot_idx = tx->slot_map[slot_idx];
599        src = ureg_src_register(TGSI_FILE_CONSTANT, slot_idx);
600        src = ureg_src_dimension(src, 0);
601        tx->slots_used[slot_idx] = TRUE;
602        tx->info->int_slots_used[idx] = TRUE;
603        if (tx->num_slots < (slot_idx + 1))
604            tx->num_slots = slot_idx + 1;
605    }
606
607    if (tx->info->const_int_slots < (idx + 1))
608        tx->info->const_int_slots = idx + 1;
609
610    return src;
611}
612
613static struct ureg_src nine_boolean_constant_src(struct shader_translator *tx, int idx)
614{
615    struct ureg_src src;
616
617    char r = idx / 4;
618    char s = idx & 3;
619
620    if (tx->info->swvp_on) {
621        src = ureg_src_register(TGSI_FILE_CONSTANT, r);
622        src = ureg_src_dimension(src, 3);
623    } else {
624        unsigned slot_idx = tx->info->const_b_base + r;
625        if (tx->slot_map)
626            slot_idx = tx->slot_map[slot_idx];
627        src = ureg_src_register(TGSI_FILE_CONSTANT, slot_idx);
628        src = ureg_src_dimension(src, 0);
629        tx->slots_used[slot_idx] = TRUE;
630        tx->info->bool_slots_used[idx] = TRUE;
631        if (tx->num_slots < (slot_idx + 1))
632            tx->num_slots = slot_idx + 1;
633    }
634    src = ureg_swizzle(src, s, s, s, s);
635
636    if (tx->info->const_bool_slots < (idx + 1))
637        tx->info->const_bool_slots = idx + 1;
638
639    return src;
640}
641
642static boolean
643tx_lconstf(struct shader_translator *tx, struct ureg_src *src, INT index)
644{
645   INT i;
646
647   if (index < 0 || index >= tx->num_constf_allowed) {
648       tx->failure = TRUE;
649       return FALSE;
650   }
651   for (i = 0; i < tx->num_lconstf; ++i) {
652      if (tx->lconstf[i].idx == index) {
653         *src = tx->lconstf[i].reg;
654         return TRUE;
655      }
656   }
657   return FALSE;
658}
659static boolean
660tx_lconsti(struct shader_translator *tx, struct ureg_src *src, INT index)
661{
662   int i;
663
664   if (index < 0 || index >= tx->num_consti_allowed) {
665       tx->failure = TRUE;
666       return FALSE;
667   }
668   for (i = 0; i < tx->num_lconsti; ++i) {
669      if (tx->lconsti[i].idx == index) {
670         *src = tx->lconsti[i].reg;
671         return TRUE;
672      }
673   }
674   return FALSE;
675}
676static boolean
677tx_lconstb(struct shader_translator *tx, struct ureg_src *src, INT index)
678{
679   int i;
680
681   if (index < 0 || index >= tx->num_constb_allowed) {
682       tx->failure = TRUE;
683       return FALSE;
684   }
685   for (i = 0; i < tx->num_lconstb; ++i) {
686      if (tx->lconstb[i].idx == index) {
687         *src = tx->lconstb[i].reg;
688         return TRUE;
689      }
690   }
691   return FALSE;
692}
693
694static void
695tx_set_lconstf(struct shader_translator *tx, INT index, float f[4])
696{
697    unsigned n;
698
699    FAILURE_VOID(index < 0 || index >= tx->num_constf_allowed)
700
701    for (n = 0; n < tx->num_lconstf; ++n)
702        if (tx->lconstf[n].idx == index)
703            break;
704    if (n == tx->num_lconstf) {
705       if ((n % 8) == 0) {
706          tx->lconstf = REALLOC(tx->lconstf,
707                                (n + 0) * sizeof(tx->lconstf[0]),
708                                (n + 8) * sizeof(tx->lconstf[0]));
709          assert(tx->lconstf);
710       }
711       tx->num_lconstf++;
712    }
713    tx->lconstf[n].idx = index;
714    tx->lconstf[n].reg = ureg_imm4f(tx->ureg, f[0], f[1], f[2], f[3]);
715
716    memcpy(tx->lconstf[n].f, f, sizeof(tx->lconstf[n].f));
717}
718static void
719tx_set_lconsti(struct shader_translator *tx, INT index, int i[4])
720{
721    unsigned n;
722
723    FAILURE_VOID(index < 0 || index >= tx->num_consti_allowed)
724
725    for (n = 0; n < tx->num_lconsti; ++n)
726        if (tx->lconsti[n].idx == index)
727            break;
728    if (n == tx->num_lconsti) {
729       if ((n % 8) == 0) {
730          tx->lconsti = REALLOC(tx->lconsti,
731                                (n + 0) * sizeof(tx->lconsti[0]),
732                                (n + 8) * sizeof(tx->lconsti[0]));
733          assert(tx->lconsti);
734       }
735       tx->num_lconsti++;
736    }
737
738    tx->lconsti[n].idx = index;
739    tx->lconsti[n].reg = tx->native_integers ?
740       ureg_imm4i(tx->ureg, i[0], i[1], i[2], i[3]) :
741       ureg_imm4f(tx->ureg, i[0], i[1], i[2], i[3]);
742}
743static void
744tx_set_lconstb(struct shader_translator *tx, INT index, BOOL b)
745{
746    unsigned n;
747
748    FAILURE_VOID(index < 0 || index >= tx->num_constb_allowed)
749
750    for (n = 0; n < tx->num_lconstb; ++n)
751        if (tx->lconstb[n].idx == index)
752            break;
753    if (n == tx->num_lconstb) {
754       if ((n % 8) == 0) {
755          tx->lconstb = REALLOC(tx->lconstb,
756                                (n + 0) * sizeof(tx->lconstb[0]),
757                                (n + 8) * sizeof(tx->lconstb[0]));
758          assert(tx->lconstb);
759       }
760       tx->num_lconstb++;
761    }
762
763    tx->lconstb[n].idx = index;
764    tx->lconstb[n].reg = tx->native_integers ?
765       ureg_imm1u(tx->ureg, b ? 0xffffffff : 0) :
766       ureg_imm1f(tx->ureg, b ? 1.0f : 0.0f);
767}
768
769static inline struct ureg_dst
770tx_scratch(struct shader_translator *tx)
771{
772    if (tx->num_scratch >= ARRAY_SIZE(tx->regs.t)) {
773        tx->failure = TRUE;
774        return tx->regs.t[0];
775    }
776    if (ureg_dst_is_undef(tx->regs.t[tx->num_scratch]))
777        tx->regs.t[tx->num_scratch] = ureg_DECL_local_temporary(tx->ureg);
778    return tx->regs.t[tx->num_scratch++];
779}
780
781static inline struct ureg_dst
782tx_scratch_scalar(struct shader_translator *tx)
783{
784    return ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
785}
786
787static inline struct ureg_src
788tx_src_scalar(struct ureg_dst dst)
789{
790    struct ureg_src src = ureg_src(dst);
791    int c = ffs(dst.WriteMask) - 1;
792    if (dst.WriteMask == (1 << c))
793        src = ureg_scalar(src, c);
794    return src;
795}
796
797static inline void
798tx_temp_alloc(struct shader_translator *tx, INT idx)
799{
800    assert(idx >= 0);
801    if (idx >= tx->num_temp) {
802       unsigned k = tx->num_temp;
803       unsigned n = idx + 1;
804       tx->regs.r = REALLOC(tx->regs.r,
805                            k * sizeof(tx->regs.r[0]),
806                            n * sizeof(tx->regs.r[0]));
807       for (; k < n; ++k)
808          tx->regs.r[k] = ureg_dst_undef();
809       tx->num_temp = n;
810    }
811    if (ureg_dst_is_undef(tx->regs.r[idx]))
812        tx->regs.r[idx] = ureg_DECL_temporary(tx->ureg);
813}
814
815static inline void
816tx_addr_alloc(struct shader_translator *tx, INT idx)
817{
818    assert(idx == 0);
819    if (ureg_dst_is_undef(tx->regs.address))
820        tx->regs.address = ureg_DECL_address(tx->ureg);
821    if (ureg_dst_is_undef(tx->regs.a0))
822        tx->regs.a0 = ureg_DECL_temporary(tx->ureg);
823}
824
825static inline bool
826TEX_if_fetch4(struct shader_translator *tx, struct ureg_dst dst,
827              unsigned target, struct ureg_src src0,
828              struct ureg_src src1, INT idx)
829{
830    struct ureg_dst tmp;
831    struct ureg_src src_tg4[3] = {src0, ureg_imm1f(tx->ureg, 0.f), src1};
832
833    if (!(tx->info->fetch4 & (1 << idx)))
834        return false;
835
836    /* TODO: needs more tests, but this feature is not much used at all */
837
838    tmp = tx_scratch(tx);
839    ureg_tex_insn(tx->ureg, TGSI_OPCODE_TG4, &tmp, 1, target, TGSI_RETURN_TYPE_FLOAT,
840                  NULL, 0, src_tg4, 3);
841    ureg_MOV(tx->ureg, dst, ureg_swizzle(ureg_src(tmp), NINE_SWIZZLE4(Z, X, Y, W)));
842    return true;
843}
844
845/* NOTE: It's not very clear on which ps1.1-ps1.3 instructions
846 * the projection should be applied on the texture. It doesn't
847 * apply on texkill.
848 * The doc is very imprecise here (it says the projection is done
849 * before rasterization, thus in vs, which seems wrong since ps instructions
850 * are affected differently)
851 * For now we only apply to the ps TEX instruction and TEXBEM.
852 * Perhaps some other instructions would need it */
853static inline void
854apply_ps1x_projection(struct shader_translator *tx, struct ureg_dst dst,
855                      struct ureg_src src, INT idx)
856{
857    struct ureg_dst tmp;
858    unsigned dim = 1 + ((tx->info->projected >> (2 * idx)) & 3);
859
860    /* no projection */
861    if (dim == 1) {
862        ureg_MOV(tx->ureg, dst, src);
863    } else {
864        tmp = tx_scratch_scalar(tx);
865        ureg_RCP(tx->ureg, tmp, ureg_scalar(src, dim-1));
866        ureg_MUL(tx->ureg, dst, tx_src_scalar(tmp), src);
867    }
868}
869
870static inline void
871TEX_with_ps1x_projection(struct shader_translator *tx, struct ureg_dst dst,
872                         unsigned target, struct ureg_src src0,
873                         struct ureg_src src1, INT idx)
874{
875    unsigned dim = 1 + ((tx->info->projected >> (2 * idx)) & 3);
876    struct ureg_dst tmp;
877    boolean shadow = !!(tx->info->sampler_mask_shadow & (1 << idx));
878
879    /* dim == 1: no projection
880     * Looks like must be disabled when it makes no
881     * sense according the texture dimensions
882     */
883    if (dim == 1 || (dim <= target && !shadow)) {
884        ureg_TEX(tx->ureg, dst, target, src0, src1);
885    } else if (dim == 4) {
886        ureg_TXP(tx->ureg, dst, target, src0, src1);
887    } else {
888        tmp = tx_scratch(tx);
889        apply_ps1x_projection(tx, tmp, src0, idx);
890        ureg_TEX(tx->ureg, dst, target, ureg_src(tmp), src1);
891    }
892}
893
894static inline void
895tx_texcoord_alloc(struct shader_translator *tx, INT idx)
896{
897    assert(IS_PS);
898    assert(idx >= 0 && idx < ARRAY_SIZE(tx->regs.vT));
899    if (ureg_src_is_undef(tx->regs.vT[idx]))
900       tx->regs.vT[idx] = ureg_DECL_fs_input(tx->ureg, tx->texcoord_sn, idx,
901                                             TGSI_INTERPOLATE_PERSPECTIVE);
902}
903
904static inline unsigned *
905tx_bgnloop(struct shader_translator *tx)
906{
907    tx->loop_depth++;
908    if (tx->loop_depth_max < tx->loop_depth)
909        tx->loop_depth_max = tx->loop_depth;
910    assert(tx->loop_depth < NINE_MAX_LOOP_DEPTH);
911    return &tx->loop_labels[tx->loop_depth - 1];
912}
913
914static inline unsigned *
915tx_endloop(struct shader_translator *tx)
916{
917    assert(tx->loop_depth);
918    tx->loop_depth--;
919    ureg_fixup_label(tx->ureg, tx->loop_labels[tx->loop_depth],
920                     ureg_get_instruction_number(tx->ureg));
921    return &tx->loop_labels[tx->loop_depth];
922}
923
924static struct ureg_dst
925tx_get_loopctr(struct shader_translator *tx, boolean loop_or_rep)
926{
927    const unsigned l = tx->loop_depth - 1;
928
929    if (!tx->loop_depth)
930    {
931        DBG("loop counter requested outside of loop\n");
932        return ureg_dst_undef();
933    }
934
935    if (ureg_dst_is_undef(tx->regs.rL[l])) {
936        /* loop or rep ctr creation */
937        tx->regs.rL[l] = ureg_DECL_local_temporary(tx->ureg);
938        tx->loop_or_rep[l] = loop_or_rep;
939    }
940    /* loop - rep - endloop - endrep not allowed */
941    assert(tx->loop_or_rep[l] == loop_or_rep);
942
943    return tx->regs.rL[l];
944}
945
946static struct ureg_src
947tx_get_loopal(struct shader_translator *tx)
948{
949    int loop_level = tx->loop_depth - 1;
950
951    while (loop_level >= 0) {
952        /* handle loop - rep - endrep - endloop case */
953        if (tx->loop_or_rep[loop_level])
954            /* the value is in the loop counter y component (nine implementation) */
955            return ureg_scalar(ureg_src(tx->regs.rL[loop_level]), TGSI_SWIZZLE_Y);
956        loop_level--;
957    }
958
959    DBG("aL counter requested outside of loop\n");
960    return ureg_src_undef();
961}
962
963static inline unsigned *
964tx_cond(struct shader_translator *tx)
965{
966   assert(tx->cond_depth <= NINE_MAX_COND_DEPTH);
967   tx->cond_depth++;
968   return &tx->cond_labels[tx->cond_depth - 1];
969}
970
971static inline unsigned *
972tx_elsecond(struct shader_translator *tx)
973{
974   assert(tx->cond_depth);
975   return &tx->cond_labels[tx->cond_depth - 1];
976}
977
978static inline void
979tx_endcond(struct shader_translator *tx)
980{
981   assert(tx->cond_depth);
982   tx->cond_depth--;
983   ureg_fixup_label(tx->ureg, tx->cond_labels[tx->cond_depth],
984                    ureg_get_instruction_number(tx->ureg));
985}
986
987static inline struct ureg_dst
988nine_ureg_dst_register(unsigned file, int index)
989{
990    return ureg_dst(ureg_src_register(file, index));
991}
992
993static inline struct ureg_src
994nine_get_position_input(struct shader_translator *tx)
995{
996    struct ureg_program *ureg = tx->ureg;
997
998    if (tx->wpos_is_sysval)
999        return ureg_DECL_system_value(ureg, TGSI_SEMANTIC_POSITION, 0);
1000    else
1001        return ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_POSITION,
1002                                  0, TGSI_INTERPOLATE_LINEAR);
1003}
1004
1005static struct ureg_src
1006tx_src_param(struct shader_translator *tx, const struct sm1_src_param *param)
1007{
1008    struct ureg_program *ureg = tx->ureg;
1009    struct ureg_src src;
1010    struct ureg_dst tmp;
1011
1012    assert(!param->rel || (IS_VS && param->file == D3DSPR_CONST) ||
1013        (param->file == D3DSPR_INPUT && tx->version.major == 3));
1014
1015    switch (param->file)
1016    {
1017    case D3DSPR_TEMP:
1018        tx_temp_alloc(tx, param->idx);
1019        src = ureg_src(tx->regs.r[param->idx]);
1020        break;
1021 /* case D3DSPR_TEXTURE: == D3DSPR_ADDR */
1022    case D3DSPR_ADDR:
1023        if (IS_VS) {
1024            assert(param->idx == 0);
1025            /* the address register (vs only) must be
1026             * assigned before use */
1027            assert(!ureg_dst_is_undef(tx->regs.a0));
1028            /* Round to lowest for vs1.1 (contrary to the doc), else
1029             * round to nearest */
1030            if (tx->version.major < 2 && tx->version.minor < 2)
1031                ureg_ARL(ureg, tx->regs.address, ureg_src(tx->regs.a0));
1032            else
1033                ureg_ARR(ureg, tx->regs.address, ureg_src(tx->regs.a0));
1034            src = ureg_src(tx->regs.address);
1035        } else {
1036            if (tx->version.major < 2 && tx->version.minor < 4) {
1037                /* no subroutines, so should be defined */
1038                src = ureg_src(tx->regs.tS[param->idx]);
1039            } else {
1040                tx_texcoord_alloc(tx, param->idx);
1041                src = tx->regs.vT[param->idx];
1042            }
1043        }
1044        break;
1045    case D3DSPR_INPUT:
1046        if (IS_VS) {
1047            src = ureg_src_register(TGSI_FILE_INPUT, param->idx);
1048        } else {
1049            if (tx->version.major < 3) {
1050                src = ureg_DECL_fs_input_centroid(
1051                    ureg, TGSI_SEMANTIC_COLOR, param->idx,
1052                    TGSI_INTERPOLATE_COLOR,
1053                    tx->info->force_color_in_centroid ?
1054                      TGSI_INTERPOLATE_LOC_CENTROID : 0,
1055                    0, 1);
1056            } else {
1057                if(param->rel) {
1058                    /* Copy all inputs (non consecutive)
1059                     * to temp array (consecutive).
1060                     * This is not good for performance.
1061                     * A better way would be to have inputs
1062                     * consecutive (would need implement alternative
1063                     * way to match vs outputs and ps inputs).
1064                     * However even with the better way, the temp array
1065                     * copy would need to be used if some inputs
1066                     * are not GENERIC or if they have different
1067                     * interpolation flag. */
1068                    if (ureg_src_is_undef(tx->regs.v_consecutive)) {
1069                        int i;
1070                        tx->regs.v_consecutive = ureg_src(ureg_DECL_array_temporary(ureg, 10, 0));
1071                        for (i = 0; i < 10; i++) {
1072                            if (!ureg_src_is_undef(tx->regs.v[i]))
1073                                ureg_MOV(ureg, ureg_dst_array_offset(ureg_dst(tx->regs.v_consecutive), i), tx->regs.v[i]);
1074                            else
1075                                ureg_MOV(ureg, ureg_dst_array_offset(ureg_dst(tx->regs.v_consecutive), i), ureg_imm4f(ureg, 0.0f, 0.0f, 0.0f, 1.0f));
1076                        }
1077                    }
1078                    src = ureg_src_array_offset(tx->regs.v_consecutive, param->idx);
1079                } else {
1080                    assert(param->idx < ARRAY_SIZE(tx->regs.v));
1081                    src = tx->regs.v[param->idx];
1082                }
1083            }
1084        }
1085        if (param->rel)
1086            src = ureg_src_indirect(src, tx_src_param(tx, param->rel));
1087        break;
1088    case D3DSPR_PREDICATE:
1089        if (ureg_dst_is_undef(tx->regs.predicate)) {
1090            /* Forbidden to use the predicate register before being set */
1091            tx->failure = TRUE;
1092            tx->regs.predicate = ureg_DECL_temporary(tx->ureg);
1093        }
1094        src = ureg_src(tx->regs.predicate);
1095        break;
1096    case D3DSPR_SAMPLER:
1097        assert(param->mod == NINED3DSPSM_NONE);
1098        /* assert(param->swizzle == NINED3DSP_NOSWIZZLE); Passed by wine tests */
1099        src = ureg_DECL_sampler(ureg, param->idx);
1100        break;
1101    case D3DSPR_CONST:
1102        if (param->rel || !tx_lconstf(tx, &src, param->idx)) {
1103            src = nine_float_constant_src(tx, param->idx);
1104            if (param->rel) {
1105                tx->indirect_const_access = TRUE;
1106                src = ureg_src_indirect(src, tx_src_param(tx, param->rel));
1107            }
1108        }
1109        if (!IS_VS && tx->version.major < 2) {
1110            /* ps 1.X clamps constants */
1111            tmp = tx_scratch(tx);
1112            ureg_MIN(ureg, tmp, src, ureg_imm1f(ureg, 1.0f));
1113            ureg_MAX(ureg, tmp, ureg_src(tmp), ureg_imm1f(ureg, -1.0f));
1114            src = ureg_src(tmp);
1115        }
1116        break;
1117    case D3DSPR_CONST2:
1118    case D3DSPR_CONST3:
1119    case D3DSPR_CONST4:
1120        DBG("CONST2/3/4 should have been collapsed into D3DSPR_CONST !\n");
1121        assert(!"CONST2/3/4");
1122        src = ureg_imm1f(ureg, 0.0f);
1123        break;
1124    case D3DSPR_CONSTINT:
1125        /* relative adressing only possible for float constants in vs */
1126        if (!tx_lconsti(tx, &src, param->idx))
1127            src = nine_integer_constant_src(tx, param->idx);
1128        break;
1129    case D3DSPR_CONSTBOOL:
1130        if (!tx_lconstb(tx, &src, param->idx))
1131            src = nine_boolean_constant_src(tx, param->idx);
1132        break;
1133    case D3DSPR_LOOP:
1134        if (ureg_dst_is_undef(tx->regs.address))
1135            tx->regs.address = ureg_DECL_address(ureg);
1136        if (!tx->native_integers)
1137            ureg_ARR(ureg, tx->regs.address, tx_get_loopal(tx));
1138        else
1139            ureg_UARL(ureg, tx->regs.address, tx_get_loopal(tx));
1140        src = ureg_src(tx->regs.address);
1141        break;
1142    case D3DSPR_MISCTYPE:
1143        switch (param->idx) {
1144        case D3DSMO_POSITION:
1145           if (ureg_src_is_undef(tx->regs.vPos))
1146              tx->regs.vPos = nine_get_position_input(tx);
1147           if (tx->shift_wpos) {
1148               /* TODO: do this only once */
1149               struct ureg_dst wpos = tx_scratch(tx);
1150               ureg_ADD(ureg, wpos, tx->regs.vPos,
1151                        ureg_imm4f(ureg, -0.5f, -0.5f, 0.0f, 0.0f));
1152               src = ureg_src(wpos);
1153           } else {
1154               src = tx->regs.vPos;
1155           }
1156           break;
1157        case D3DSMO_FACE:
1158           if (ureg_src_is_undef(tx->regs.vFace)) {
1159               if (tx->face_is_sysval_integer) {
1160                   tmp = ureg_DECL_temporary(ureg);
1161                   tx->regs.vFace =
1162                       ureg_DECL_system_value(ureg, TGSI_SEMANTIC_FACE, 0);
1163
1164                   /* convert bool to float */
1165                   ureg_UCMP(ureg, tmp, ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X),
1166                             ureg_imm1f(ureg, 1), ureg_imm1f(ureg, -1));
1167                   tx->regs.vFace = ureg_src(tmp);
1168               } else {
1169                   tx->regs.vFace = ureg_DECL_fs_input(ureg,
1170                                                       TGSI_SEMANTIC_FACE, 0,
1171                                                       TGSI_INTERPOLATE_CONSTANT);
1172               }
1173               tx->regs.vFace = ureg_scalar(tx->regs.vFace, TGSI_SWIZZLE_X);
1174           }
1175           src = tx->regs.vFace;
1176           break;
1177        default:
1178            assert(!"invalid src D3DSMO");
1179            break;
1180        }
1181        break;
1182    case D3DSPR_TEMPFLOAT16:
1183        break;
1184    default:
1185        assert(!"invalid src D3DSPR");
1186    }
1187
1188    switch (param->mod) {
1189    case NINED3DSPSM_DW:
1190        tmp = tx_scratch(tx);
1191        /* NOTE: app is not allowed to read w with this modifier */
1192        ureg_RCP(ureg, ureg_writemask(tmp, NINED3DSP_WRITEMASK_3), ureg_scalar(src, TGSI_SWIZZLE_W));
1193        ureg_MUL(ureg, tmp, src, ureg_swizzle(ureg_src(tmp), NINE_SWIZZLE4(W,W,W,W)));
1194        src = ureg_src(tmp);
1195        break;
1196    case NINED3DSPSM_DZ:
1197        tmp = tx_scratch(tx);
1198        /* NOTE: app is not allowed to read z with this modifier */
1199        ureg_RCP(ureg, ureg_writemask(tmp, NINED3DSP_WRITEMASK_2), ureg_scalar(src, TGSI_SWIZZLE_Z));
1200        ureg_MUL(ureg, tmp, src, ureg_swizzle(ureg_src(tmp), NINE_SWIZZLE4(Z,Z,Z,Z)));
1201        src = ureg_src(tmp);
1202        break;
1203    default:
1204        break;
1205    }
1206
1207    if (param->swizzle != NINED3DSP_NOSWIZZLE && param->file != D3DSPR_SAMPLER)
1208        src = ureg_swizzle(src,
1209                           (param->swizzle >> 0) & 0x3,
1210                           (param->swizzle >> 2) & 0x3,
1211                           (param->swizzle >> 4) & 0x3,
1212                           (param->swizzle >> 6) & 0x3);
1213
1214    switch (param->mod) {
1215    case NINED3DSPSM_ABS:
1216        src = ureg_abs(src);
1217        break;
1218    case NINED3DSPSM_ABSNEG:
1219        src = ureg_negate(ureg_abs(src));
1220        break;
1221    case NINED3DSPSM_NEG:
1222        src = ureg_negate(src);
1223        break;
1224    case NINED3DSPSM_BIAS:
1225        tmp = tx_scratch(tx);
1226        ureg_ADD(ureg, tmp, src, ureg_imm1f(ureg, -0.5f));
1227        src = ureg_src(tmp);
1228        break;
1229    case NINED3DSPSM_BIASNEG:
1230        tmp = tx_scratch(tx);
1231        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 0.5f), ureg_negate(src));
1232        src = ureg_src(tmp);
1233        break;
1234    case NINED3DSPSM_NOT:
1235        if (tx->native_integers && param->file == D3DSPR_CONSTBOOL) {
1236            tmp = tx_scratch(tx);
1237            ureg_NOT(ureg, tmp, src);
1238            src = ureg_src(tmp);
1239            break;
1240        } else { /* predicate */
1241            tmp = tx_scratch(tx);
1242            ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(src));
1243            src = ureg_src(tmp);
1244        }
1245        FALLTHROUGH;
1246    case NINED3DSPSM_COMP:
1247        tmp = tx_scratch(tx);
1248        ureg_ADD(ureg, tmp, ureg_imm1f(ureg, 1.0f), ureg_negate(src));
1249        src = ureg_src(tmp);
1250        break;
1251    case NINED3DSPSM_DZ:
1252    case NINED3DSPSM_DW:
1253        /* Already handled*/
1254        break;
1255    case NINED3DSPSM_SIGN:
1256        tmp = tx_scratch(tx);
1257        ureg_MAD(ureg, tmp, src, ureg_imm1f(ureg, 2.0f), ureg_imm1f(ureg, -1.0f));
1258        src = ureg_src(tmp);
1259        break;
1260    case NINED3DSPSM_SIGNNEG:
1261        tmp = tx_scratch(tx);
1262        ureg_MAD(ureg, tmp, src, ureg_imm1f(ureg, -2.0f), ureg_imm1f(ureg, 1.0f));
1263        src = ureg_src(tmp);
1264        break;
1265    case NINED3DSPSM_X2:
1266        tmp = tx_scratch(tx);
1267        ureg_ADD(ureg, tmp, src, src);
1268        src = ureg_src(tmp);
1269        break;
1270    case NINED3DSPSM_X2NEG:
1271        tmp = tx_scratch(tx);
1272        ureg_ADD(ureg, tmp, src, src);
1273        src = ureg_negate(ureg_src(tmp));
1274        break;
1275    default:
1276        assert(param->mod == NINED3DSPSM_NONE);
1277        break;
1278    }
1279
1280    return src;
1281}
1282
1283static struct ureg_dst
1284_tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
1285{
1286    struct ureg_dst dst;
1287
1288    switch (param->file)
1289    {
1290    case D3DSPR_TEMP:
1291        assert(!param->rel);
1292        tx_temp_alloc(tx, param->idx);
1293        dst = tx->regs.r[param->idx];
1294        break;
1295 /* case D3DSPR_TEXTURE: == D3DSPR_ADDR */
1296    case D3DSPR_ADDR:
1297        assert(!param->rel);
1298        if (tx->version.major < 2 && !IS_VS) {
1299            if (ureg_dst_is_undef(tx->regs.tS[param->idx]))
1300                tx->regs.tS[param->idx] = ureg_DECL_temporary(tx->ureg);
1301            dst = tx->regs.tS[param->idx];
1302        } else
1303        if (!IS_VS && tx->insn.opcode == D3DSIO_TEXKILL) { /* maybe others, too */
1304            tx_texcoord_alloc(tx, param->idx);
1305            dst = ureg_dst(tx->regs.vT[param->idx]);
1306        } else {
1307            tx_addr_alloc(tx, param->idx);
1308            dst = tx->regs.a0;
1309        }
1310        break;
1311    case D3DSPR_RASTOUT:
1312        assert(!param->rel);
1313        switch (param->idx) {
1314        case 0:
1315            if (ureg_dst_is_undef(tx->regs.oPos))
1316                tx->regs.oPos =
1317                    ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_POSITION, 0);
1318            dst = tx->regs.oPos;
1319            break;
1320        case 1:
1321            if (ureg_dst_is_undef(tx->regs.oFog))
1322                tx->regs.oFog =
1323                    ureg_saturate(ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_GENERIC, 16));
1324            dst = tx->regs.oFog;
1325            break;
1326        case 2:
1327            if (ureg_dst_is_undef(tx->regs.oPts))
1328                tx->regs.oPts = ureg_DECL_temporary(tx->ureg);
1329            dst = tx->regs.oPts;
1330            break;
1331        default:
1332            assert(0);
1333            break;
1334        }
1335        break;
1336 /* case D3DSPR_TEXCRDOUT: == D3DSPR_OUTPUT */
1337    case D3DSPR_OUTPUT:
1338        if (tx->version.major < 3) {
1339            assert(!param->rel);
1340            dst = ureg_DECL_output(tx->ureg, tx->texcoord_sn, param->idx);
1341        } else {
1342            assert(!param->rel); /* TODO */
1343            assert(param->idx < ARRAY_SIZE(tx->regs.o));
1344            dst = tx->regs.o[param->idx];
1345        }
1346        break;
1347    case D3DSPR_ATTROUT: /* VS */
1348    case D3DSPR_COLOROUT: /* PS */
1349        assert(param->idx >= 0 && param->idx < 4);
1350        assert(!param->rel);
1351        tx->info->rt_mask |= 1 << param->idx;
1352        if (ureg_dst_is_undef(tx->regs.oCol[param->idx])) {
1353            /* ps < 3: oCol[0] will have fog blending afterward */
1354            if (!IS_VS && tx->version.major < 3 && param->idx == 0) {
1355                tx->regs.oCol[0] = ureg_DECL_temporary(tx->ureg);
1356            } else {
1357                tx->regs.oCol[param->idx] =
1358                    ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_COLOR, param->idx);
1359            }
1360        }
1361        dst = tx->regs.oCol[param->idx];
1362        if (IS_VS && tx->version.major < 3)
1363            dst = ureg_saturate(dst);
1364        break;
1365    case D3DSPR_DEPTHOUT:
1366        assert(!param->rel);
1367        if (ureg_dst_is_undef(tx->regs.oDepth))
1368           tx->regs.oDepth =
1369              ureg_DECL_output_masked(tx->ureg, TGSI_SEMANTIC_POSITION, 0,
1370                                      TGSI_WRITEMASK_Z, 0, 1);
1371        dst = tx->regs.oDepth; /* XXX: must write .z component */
1372        break;
1373    case D3DSPR_PREDICATE:
1374        if (ureg_dst_is_undef(tx->regs.predicate))
1375            tx->regs.predicate = ureg_DECL_temporary(tx->ureg);
1376        dst = tx->regs.predicate;
1377        break;
1378    case D3DSPR_TEMPFLOAT16:
1379        DBG("unhandled D3DSPR: %u\n", param->file);
1380        break;
1381    default:
1382        assert(!"invalid dst D3DSPR");
1383        break;
1384    }
1385    if (param->rel)
1386        dst = ureg_dst_indirect(dst, tx_src_param(tx, param->rel));
1387
1388    if (param->mask != NINED3DSP_WRITEMASK_ALL)
1389        dst = ureg_writemask(dst, param->mask);
1390    if (param->mod & NINED3DSPDM_SATURATE)
1391        dst = ureg_saturate(dst);
1392
1393    if (tx->predicated_activated) {
1394        tx->regs.predicate_dst = dst;
1395        dst = tx->regs.predicate_tmp;
1396    }
1397
1398    return dst;
1399}
1400
1401static struct ureg_dst
1402tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
1403{
1404    if (param->shift) {
1405        tx->regs.tdst = ureg_writemask(tx_scratch(tx), param->mask);
1406        return tx->regs.tdst;
1407    }
1408    return _tx_dst_param(tx, param);
1409}
1410
1411static void
1412tx_apply_dst0_modifiers(struct shader_translator *tx)
1413{
1414    struct ureg_dst rdst;
1415    float f;
1416
1417    if (!tx->insn.ndst || !tx->insn.dst[0].shift || tx->insn.opcode == D3DSIO_TEXKILL)
1418        return;
1419    rdst = _tx_dst_param(tx, &tx->insn.dst[0]);
1420
1421    assert(rdst.File != TGSI_FILE_ADDRESS); /* this probably isn't possible */
1422
1423    if (tx->insn.dst[0].shift < 0)
1424        f = 1.0f / (1 << -tx->insn.dst[0].shift);
1425    else
1426        f = 1 << tx->insn.dst[0].shift;
1427
1428    ureg_MUL(tx->ureg, rdst, ureg_src(tx->regs.tdst), ureg_imm1f(tx->ureg, f));
1429}
1430
1431static struct ureg_src
1432tx_dst_param_as_src(struct shader_translator *tx, const struct sm1_dst_param *param)
1433{
1434    struct ureg_src src;
1435
1436    assert(!param->shift);
1437    assert(!(param->mod & NINED3DSPDM_SATURATE));
1438
1439    switch (param->file) {
1440    case D3DSPR_INPUT:
1441        if (IS_VS) {
1442            src = ureg_src_register(TGSI_FILE_INPUT, param->idx);
1443        } else {
1444            assert(!param->rel);
1445            assert(param->idx < ARRAY_SIZE(tx->regs.v));
1446            src = tx->regs.v[param->idx];
1447        }
1448        break;
1449    default:
1450        src = ureg_src(tx_dst_param(tx, param));
1451        break;
1452    }
1453    if (param->rel)
1454        src = ureg_src_indirect(src, tx_src_param(tx, param->rel));
1455
1456    if (!param->mask)
1457        WARN("mask is 0, using identity swizzle\n");
1458
1459    if (param->mask && param->mask != NINED3DSP_WRITEMASK_ALL) {
1460        char s[4];
1461        int n;
1462        int c;
1463        for (n = 0, c = 0; c < 4; ++c)
1464            if (param->mask & (1 << c))
1465                s[n++] = c;
1466        assert(n);
1467        for (c = n; c < 4; ++c)
1468            s[c] = s[n - 1];
1469        src = ureg_swizzle(src, s[0], s[1], s[2], s[3]);
1470    }
1471    return src;
1472}
1473
1474static HRESULT
1475NineTranslateInstruction_Mkxn(struct shader_translator *tx, const unsigned k, const unsigned n)
1476{
1477    struct ureg_program *ureg = tx->ureg;
1478    struct ureg_dst dst;
1479    struct ureg_src src[2];
1480    struct sm1_src_param *src_mat = &tx->insn.src[1];
1481    unsigned i;
1482
1483    dst = tx_dst_param(tx, &tx->insn.dst[0]);
1484    src[0] = tx_src_param(tx, &tx->insn.src[0]);
1485
1486    for (i = 0; i < n; i++)
1487    {
1488        const unsigned m = (1 << i);
1489
1490        src[1] = tx_src_param(tx, src_mat);
1491        src_mat->idx++;
1492
1493        if (!(dst.WriteMask & m))
1494            continue;
1495
1496        /* XXX: src == dst case ? */
1497
1498        switch (k) {
1499        case 3:
1500            ureg_DP3(ureg, ureg_writemask(dst, m), src[0], src[1]);
1501            break;
1502        case 4:
1503            ureg_DP4(ureg, ureg_writemask(dst, m), src[0], src[1]);
1504            break;
1505        default:
1506            DBG("invalid operation: M%ux%u\n", m, n);
1507            break;
1508        }
1509    }
1510
1511    return D3D_OK;
1512}
1513
/* Expands to two zero values.
 * NOTE(review): presumably fills a (min, max) version pair in the
 * instruction table to mark an opcode as unsupported -- confirm at the
 * use sites. */
#define VNOTSUPPORTED   0, 0
/* Pack a shader version number as (major << 8) | minor. */
#define V(maj, min)     (((maj) << 8) | (min))
1516
1517static inline const char *
1518d3dsio_to_string( unsigned opcode )
1519{
1520    static const char *names[] = {
1521        "NOP",
1522        "MOV",
1523        "ADD",
1524        "SUB",
1525        "MAD",
1526        "MUL",
1527        "RCP",
1528        "RSQ",
1529        "DP3",
1530        "DP4",
1531        "MIN",
1532        "MAX",
1533        "SLT",
1534        "SGE",
1535        "EXP",
1536        "LOG",
1537        "LIT",
1538        "DST",
1539        "LRP",
1540        "FRC",
1541        "M4x4",
1542        "M4x3",
1543        "M3x4",
1544        "M3x3",
1545        "M3x2",
1546        "CALL",
1547        "CALLNZ",
1548        "LOOP",
1549        "RET",
1550        "ENDLOOP",
1551        "LABEL",
1552        "DCL",
1553        "POW",
1554        "CRS",
1555        "SGN",
1556        "ABS",
1557        "NRM",
1558        "SINCOS",
1559        "REP",
1560        "ENDREP",
1561        "IF",
1562        "IFC",
1563        "ELSE",
1564        "ENDIF",
1565        "BREAK",
1566        "BREAKC",
1567        "MOVA",
1568        "DEFB",
1569        "DEFI",
1570        NULL,
1571        NULL,
1572        NULL,
1573        NULL,
1574        NULL,
1575        NULL,
1576        NULL,
1577        NULL,
1578        NULL,
1579        NULL,
1580        NULL,
1581        NULL,
1582        NULL,
1583        NULL,
1584        NULL,
1585        "TEXCOORD",
1586        "TEXKILL",
1587        "TEX",
1588        "TEXBEM",
1589        "TEXBEML",
1590        "TEXREG2AR",
1591        "TEXREG2GB",
1592        "TEXM3x2PAD",
1593        "TEXM3x2TEX",
1594        "TEXM3x3PAD",
1595        "TEXM3x3TEX",
1596        NULL,
1597        "TEXM3x3SPEC",
1598        "TEXM3x3VSPEC",
1599        "EXPP",
1600        "LOGP",
1601        "CND",
1602        "DEF",
1603        "TEXREG2RGB",
1604        "TEXDP3TEX",
1605        "TEXM3x2DEPTH",
1606        "TEXDP3",
1607        "TEXM3x3",
1608        "TEXDEPTH",
1609        "CMP",
1610        "BEM",
1611        "DP2ADD",
1612        "DSX",
1613        "DSY",
1614        "TEXLDD",
1615        "SETP",
1616        "TEXLDL",
1617        "BREAKP"
1618    };
1619
1620    if (opcode < ARRAY_SIZE(names)) return names[opcode];
1621
1622    switch (opcode) {
1623    case D3DSIO_PHASE: return "PHASE";
1624    case D3DSIO_COMMENT: return "COMMENT";
1625    case D3DSIO_END: return "END";
1626    default:
1627        return NULL;
1628    }
1629}
1630
/* All-zero initializer for an instruction table entry.
 * NOTE(review): the field layout (one scalar, two version pairs, two more
 * scalars, a handler pointer) is inferred from the initializer shape --
 * confirm against the table's struct definition. */
#define NULL_INSTRUCTION            { 0, { 0, 0 }, { 0, 0 }, 0, 0, NULL }
/* Non-zero when at least one of the vertex/fragment min/max version fields
 * of `inst` is non-zero. */
#define IS_VALID_INSTRUCTION(inst)  ((inst).vert_version.min | \
                                     (inst).vert_version.max | \
                                     (inst).frag_version.min | \
                                     (inst).frag_version.max)

/* Name of the translation handler for opcode `name`. */
#define SPECIAL(name) \
    NineTranslateInstruction_##name

/* Opens the definition of the translation handler for opcode `name`; the
 * handler receives the translator as `tx` and returns an HRESULT. */
#define DECL_SPECIAL(name) \
    static HRESULT \
    NineTranslateInstruction_##name( struct shader_translator *tx )

/* Forward declaration; the definition is outside this part of the file. */
static HRESULT
NineTranslateInstruction_Generic(struct shader_translator *);
1646
1647DECL_SPECIAL(NOP)
1648{
1649    /* Nothing to do. NOP was used to avoid hangs
1650     * with very old d3d drivers. */
1651    return D3D_OK;
1652}
1653
1654DECL_SPECIAL(SUB)
1655{
1656    struct ureg_program *ureg = tx->ureg;
1657    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1658    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
1659    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
1660
1661    ureg_ADD(ureg, dst, src0, ureg_negate(src1));
1662    return D3D_OK;
1663}
1664
1665DECL_SPECIAL(ABS)
1666{
1667    struct ureg_program *ureg = tx->ureg;
1668    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1669    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
1670
1671    ureg_MOV(ureg, dst, ureg_abs(src));
1672    return D3D_OK;
1673}
1674
1675DECL_SPECIAL(XPD)
1676{
1677    struct ureg_program *ureg = tx->ureg;
1678    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1679    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
1680    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
1681
1682    ureg_MUL(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ),
1683             ureg_swizzle(src0, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z,
1684                          TGSI_SWIZZLE_X, 0),
1685             ureg_swizzle(src1, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
1686                          TGSI_SWIZZLE_Y, 0));
1687    ureg_MAD(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYZ),
1688             ureg_swizzle(src0, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X,
1689                          TGSI_SWIZZLE_Y, 0),
1690             ureg_negate(ureg_swizzle(src1, TGSI_SWIZZLE_Y,
1691                                      TGSI_SWIZZLE_Z, TGSI_SWIZZLE_X, 0)),
1692             ureg_src(dst));
1693    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W),
1694             ureg_imm1f(ureg, 1));
1695    return D3D_OK;
1696}
1697
1698DECL_SPECIAL(M4x4)
1699{
1700    return NineTranslateInstruction_Mkxn(tx, 4, 4);
1701}
1702
1703DECL_SPECIAL(M4x3)
1704{
1705    return NineTranslateInstruction_Mkxn(tx, 4, 3);
1706}
1707
1708DECL_SPECIAL(M3x4)
1709{
1710    return NineTranslateInstruction_Mkxn(tx, 3, 4);
1711}
1712
1713DECL_SPECIAL(M3x3)
1714{
1715    return NineTranslateInstruction_Mkxn(tx, 3, 3);
1716}
1717
1718DECL_SPECIAL(M3x2)
1719{
1720    return NineTranslateInstruction_Mkxn(tx, 3, 2);
1721}
1722
1723DECL_SPECIAL(CMP)
1724{
1725    ureg_CMP(tx->ureg, tx_dst_param(tx, &tx->insn.dst[0]),
1726             tx_src_param(tx, &tx->insn.src[0]),
1727             tx_src_param(tx, &tx->insn.src[2]),
1728             tx_src_param(tx, &tx->insn.src[1]));
1729    return D3D_OK;
1730}
1731
1732DECL_SPECIAL(CND)
1733{
1734    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1735    struct ureg_dst cgt;
1736    struct ureg_src cnd;
1737
1738    /* the coissue flag was a tip for compilers to advise to
1739     * execute two operations at the same time, in cases
1740     * the two executions had same dst with different channels.
1741     * It has no effect on current hw. However it seems CND
1742     * is affected. The handling of this very specific case
1743     * handled below mimick wine behaviour */
1744    if (tx->insn.coissue && tx->version.major == 1 && tx->version.minor < 4 && tx->insn.dst[0].mask != NINED3DSP_WRITEMASK_3) {
1745        ureg_MOV(tx->ureg,
1746                 dst, tx_src_param(tx, &tx->insn.src[1]));
1747        return D3D_OK;
1748    }
1749
1750    cnd = tx_src_param(tx, &tx->insn.src[0]);
1751    cgt = tx_scratch(tx);
1752
1753    if (tx->version.major == 1 && tx->version.minor < 4)
1754        cnd = ureg_scalar(cnd, TGSI_SWIZZLE_W);
1755
1756    ureg_SGT(tx->ureg, cgt, cnd, ureg_imm1f(tx->ureg, 0.5f));
1757
1758    ureg_CMP(tx->ureg, dst, ureg_negate(ureg_src(cgt)),
1759             tx_src_param(tx, &tx->insn.src[1]),
1760             tx_src_param(tx, &tx->insn.src[2]));
1761    return D3D_OK;
1762}
1763
1764DECL_SPECIAL(CALL)
1765{
1766    assert(tx->insn.src[0].idx < tx->num_inst_labels);
1767    ureg_CAL(tx->ureg, &tx->inst_labels[tx->insn.src[0].idx]);
1768    return D3D_OK;
1769}
1770
1771DECL_SPECIAL(CALLNZ)
1772{
1773    struct ureg_program *ureg = tx->ureg;
1774    struct ureg_src src = tx_src_param(tx, &tx->insn.src[1]);
1775
1776    if (!tx->native_integers)
1777        ureg_IF(ureg, src, tx_cond(tx));
1778    else
1779        ureg_UIF(ureg, src, tx_cond(tx));
1780    ureg_CAL(ureg, &tx->inst_labels[tx->insn.src[0].idx]);
1781    tx_endcond(tx);
1782    ureg_ENDIF(ureg);
1783    return D3D_OK;
1784}
1785
1786DECL_SPECIAL(LOOP)
1787{
1788    struct ureg_program *ureg = tx->ureg;
1789    unsigned *label;
1790    struct ureg_src src = tx_src_param(tx, &tx->insn.src[1]);
1791    struct ureg_dst ctr;
1792    struct ureg_dst tmp;
1793    struct ureg_src ctrx;
1794
1795    label = tx_bgnloop(tx);
1796    ctr = tx_get_loopctr(tx, TRUE);
1797    ctrx = ureg_scalar(ureg_src(ctr), TGSI_SWIZZLE_X);
1798
1799    /* src: num_iterations - start_value of al - step for al - 0 */
1800    ureg_MOV(ureg, ctr, src);
1801    ureg_BGNLOOP(tx->ureg, label);
1802    tmp = tx_scratch_scalar(tx);
1803    /* Initially ctr.x contains the number of iterations.
1804     * ctr.y will contain the updated value of al.
1805     * We decrease ctr.x at the end of every iteration,
1806     * and stop when it reaches 0. */
1807
1808    if (!tx->native_integers) {
1809        /* case src and ctr contain floats */
1810        /* to avoid precision issue, we stop when ctr <= 0.5 */
1811        ureg_SGE(ureg, tmp, ureg_imm1f(ureg, 0.5f), ctrx);
1812        ureg_IF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1813    } else {
1814        /* case src and ctr contain integers */
1815        ureg_ISGE(ureg, tmp, ureg_imm1i(ureg, 0), ctrx);
1816        ureg_UIF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1817    }
1818    ureg_BRK(ureg);
1819    tx_endcond(tx);
1820    ureg_ENDIF(ureg);
1821    return D3D_OK;
1822}
1823
1824DECL_SPECIAL(RET)
1825{
1826    /* RET as a last instruction could be safely ignored.
1827     * Remove it to prevent crashes/warnings in case underlying
1828     * driver doesn't implement arbitrary returns.
1829     */
1830    if (*(tx->parse_next) != NINED3DSP_END) {
1831        ureg_RET(tx->ureg);
1832    }
1833    return D3D_OK;
1834}
1835
1836DECL_SPECIAL(ENDLOOP)
1837{
1838    struct ureg_program *ureg = tx->ureg;
1839    struct ureg_dst ctr = tx_get_loopctr(tx, TRUE);
1840    struct ureg_dst dst_ctrx, dst_al;
1841    struct ureg_src src_ctr, al_counter;
1842
1843    dst_ctrx = ureg_writemask(ctr, NINED3DSP_WRITEMASK_0);
1844    dst_al = ureg_writemask(ctr, NINED3DSP_WRITEMASK_1);
1845    src_ctr = ureg_src(ctr);
1846    al_counter = ureg_scalar(src_ctr, TGSI_SWIZZLE_Z);
1847
1848    /* ctr.x -= 1
1849     * ctr.y (aL) += step */
1850    if (!tx->native_integers) {
1851        ureg_ADD(ureg, dst_ctrx, src_ctr, ureg_imm1f(ureg, -1.0f));
1852        ureg_ADD(ureg, dst_al, src_ctr, al_counter);
1853    } else {
1854        ureg_UADD(ureg, dst_ctrx, src_ctr, ureg_imm1i(ureg, -1));
1855        ureg_UADD(ureg, dst_al, src_ctr, al_counter);
1856    }
1857    ureg_ENDLOOP(tx->ureg, tx_endloop(tx));
1858    return D3D_OK;
1859}
1860
1861DECL_SPECIAL(LABEL)
1862{
1863    unsigned k = tx->num_inst_labels;
1864    unsigned n = tx->insn.src[0].idx;
1865    assert(n < 2048);
1866    if (n >= k)
1867       tx->inst_labels = REALLOC(tx->inst_labels,
1868                                 k * sizeof(tx->inst_labels[0]),
1869                                 n * sizeof(tx->inst_labels[0]));
1870
1871    tx->inst_labels[n] = ureg_get_instruction_number(tx->ureg);
1872    return D3D_OK;
1873}
1874
1875DECL_SPECIAL(SINCOS)
1876{
1877    struct ureg_program *ureg = tx->ureg;
1878    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
1879    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
1880    struct ureg_dst tmp = tx_scratch_scalar(tx);
1881
1882    assert(!(dst.WriteMask & 0xc));
1883
1884    /* Copying to a temporary register avoids src/dst aliasing.
1885     * src is supposed to have replicated swizzle. */
1886    ureg_MOV(ureg, tmp, src);
1887
1888    /* z undefined, w untouched */
1889    ureg_COS(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X),
1890             tx_src_scalar(tmp));
1891    ureg_SIN(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y),
1892             tx_src_scalar(tmp));
1893    return D3D_OK;
1894}
1895
1896DECL_SPECIAL(SGN)
1897{
1898    ureg_SSG(tx->ureg,
1899             tx_dst_param(tx, &tx->insn.dst[0]),
1900             tx_src_param(tx, &tx->insn.src[0]));
1901    return D3D_OK;
1902}
1903
1904DECL_SPECIAL(REP)
1905{
1906    struct ureg_program *ureg = tx->ureg;
1907    unsigned *label;
1908    struct ureg_src rep = tx_src_param(tx, &tx->insn.src[0]);
1909    struct ureg_dst ctr;
1910    struct ureg_dst tmp;
1911    struct ureg_src ctrx;
1912
1913    label = tx_bgnloop(tx);
1914    ctr = ureg_writemask(tx_get_loopctr(tx, FALSE), NINED3DSP_WRITEMASK_0);
1915    ctrx = ureg_scalar(ureg_src(ctr), TGSI_SWIZZLE_X);
1916
1917    /* NOTE: rep must be constant, so we don't have to save the count */
1918    assert(rep.File == TGSI_FILE_CONSTANT || rep.File == TGSI_FILE_IMMEDIATE);
1919
1920    /* rep: num_iterations - 0 - 0 - 0 */
1921    ureg_MOV(ureg, ctr, rep);
1922    ureg_BGNLOOP(ureg, label);
1923    tmp = tx_scratch_scalar(tx);
1924    /* Initially ctr.x contains the number of iterations.
1925     * We decrease ctr.x at the end of every iteration,
1926     * and stop when it reaches 0. */
1927
1928    if (!tx->native_integers) {
1929        /* case src and ctr contain floats */
1930        /* to avoid precision issue, we stop when ctr <= 0.5 */
1931        ureg_SGE(ureg, tmp, ureg_imm1f(ureg, 0.5f), ctrx);
1932        ureg_IF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1933    } else {
1934        /* case src and ctr contain integers */
1935        ureg_ISGE(ureg, tmp, ureg_imm1i(ureg, 0), ctrx);
1936        ureg_UIF(ureg, tx_src_scalar(tmp), tx_cond(tx));
1937    }
1938    ureg_BRK(ureg);
1939    tx_endcond(tx);
1940    ureg_ENDIF(ureg);
1941
1942    return D3D_OK;
1943}
1944
1945DECL_SPECIAL(ENDREP)
1946{
1947    struct ureg_program *ureg = tx->ureg;
1948    struct ureg_dst ctr = tx_get_loopctr(tx, FALSE);
1949    struct ureg_dst dst_ctrx = ureg_writemask(ctr, NINED3DSP_WRITEMASK_0);
1950    struct ureg_src src_ctr = ureg_src(ctr);
1951
1952    /* ctr.x -= 1 */
1953    if (!tx->native_integers)
1954        ureg_ADD(ureg, dst_ctrx, src_ctr, ureg_imm1f(ureg, -1.0f));
1955    else
1956        ureg_UADD(ureg, dst_ctrx, src_ctr, ureg_imm1i(ureg, -1));
1957
1958    ureg_ENDLOOP(tx->ureg, tx_endloop(tx));
1959    return D3D_OK;
1960}
1961
1962DECL_SPECIAL(ENDIF)
1963{
1964    tx_endcond(tx);
1965    ureg_ENDIF(tx->ureg);
1966    return D3D_OK;
1967}
1968
1969DECL_SPECIAL(IF)
1970{
1971    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
1972
1973    if (tx->native_integers && tx->insn.src[0].file == D3DSPR_CONSTBOOL)
1974        ureg_UIF(tx->ureg, src, tx_cond(tx));
1975    else
1976        ureg_IF(tx->ureg, src, tx_cond(tx));
1977
1978    return D3D_OK;
1979}
1980
1981static inline unsigned
1982sm1_insn_flags_to_tgsi_setop(BYTE flags)
1983{
1984    switch (flags) {
1985    case NINED3DSHADER_REL_OP_GT: return TGSI_OPCODE_SGT;
1986    case NINED3DSHADER_REL_OP_EQ: return TGSI_OPCODE_SEQ;
1987    case NINED3DSHADER_REL_OP_GE: return TGSI_OPCODE_SGE;
1988    case NINED3DSHADER_REL_OP_LT: return TGSI_OPCODE_SLT;
1989    case NINED3DSHADER_REL_OP_NE: return TGSI_OPCODE_SNE;
1990    case NINED3DSHADER_REL_OP_LE: return TGSI_OPCODE_SLE;
1991    default:
1992        assert(!"invalid comparison flags");
1993        return TGSI_OPCODE_SGT;
1994    }
1995}
1996
1997DECL_SPECIAL(IFC)
1998{
1999    const unsigned cmp_op = sm1_insn_flags_to_tgsi_setop(tx->insn.flags);
2000    struct ureg_src src[2];
2001    struct ureg_dst tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
2002    src[0] = tx_src_param(tx, &tx->insn.src[0]);
2003    src[1] = tx_src_param(tx, &tx->insn.src[1]);
2004    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2, 0);
2005    ureg_IF(tx->ureg, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), tx_cond(tx));
2006    return D3D_OK;
2007}
2008
2009DECL_SPECIAL(ELSE)
2010{
2011    ureg_ELSE(tx->ureg, tx_elsecond(tx));
2012    return D3D_OK;
2013}
2014
2015DECL_SPECIAL(BREAKC)
2016{
2017    const unsigned cmp_op = sm1_insn_flags_to_tgsi_setop(tx->insn.flags);
2018    struct ureg_src src[2];
2019    struct ureg_dst tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_X);
2020    src[0] = tx_src_param(tx, &tx->insn.src[0]);
2021    src[1] = tx_src_param(tx, &tx->insn.src[1]);
2022    ureg_insn(tx->ureg, cmp_op, &tmp, 1, src, 2, 0);
2023    ureg_IF(tx->ureg, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), tx_cond(tx));
2024    ureg_BRK(tx->ureg);
2025    tx_endcond(tx);
2026    ureg_ENDIF(tx->ureg);
2027    return D3D_OK;
2028}
2029
2030static const char *sm1_declusage_names[] =
2031{
2032    [D3DDECLUSAGE_POSITION] = "POSITION",
2033    [D3DDECLUSAGE_BLENDWEIGHT] = "BLENDWEIGHT",
2034    [D3DDECLUSAGE_BLENDINDICES] = "BLENDINDICES",
2035    [D3DDECLUSAGE_NORMAL] = "NORMAL",
2036    [D3DDECLUSAGE_PSIZE] = "PSIZE",
2037    [D3DDECLUSAGE_TEXCOORD] = "TEXCOORD",
2038    [D3DDECLUSAGE_TANGENT] = "TANGENT",
2039    [D3DDECLUSAGE_BINORMAL] = "BINORMAL",
2040    [D3DDECLUSAGE_TESSFACTOR] = "TESSFACTOR",
2041    [D3DDECLUSAGE_POSITIONT] = "POSITIONT",
2042    [D3DDECLUSAGE_COLOR] = "COLOR",
2043    [D3DDECLUSAGE_FOG] = "FOG",
2044    [D3DDECLUSAGE_DEPTH] = "DEPTH",
2045    [D3DDECLUSAGE_SAMPLE] = "SAMPLE"
2046};
2047
2048static inline unsigned
2049sm1_to_nine_declusage(struct sm1_semantic *dcl)
2050{
2051    return nine_d3d9_to_nine_declusage(dcl->usage, dcl->usage_idx);
2052}
2053
2054static void
2055sm1_declusage_to_tgsi(struct tgsi_declaration_semantic *sem,
2056                      boolean tc,
2057                      struct sm1_semantic *dcl)
2058{
2059    BYTE index = dcl->usage_idx;
2060
2061    /* For everything that is not matching to a TGSI_SEMANTIC_****,
2062     * we match to a TGSI_SEMANTIC_GENERIC with index.
2063     *
2064     * The index can be anything UINT16 and usage_idx is BYTE,
2065     * so we can fit everything. It doesn't matter if indices
2066     * are close together or low.
2067     *
2068     *
2069     * POSITION >= 1: 10 * index + 7
2070     * COLOR >= 2: 10 * (index-1) + 8
2071     * FOG: 16
2072     * TEXCOORD[0..15]: index
2073     * BLENDWEIGHT: 10 * index + 19
2074     * BLENDINDICES: 10 * index + 20
2075     * NORMAL: 10 * index + 21
2076     * TANGENT: 10 * index + 22
2077     * BINORMAL: 10 * index + 23
2078     * TESSFACTOR: 10 * index + 24
2079     */
2080
2081    switch (dcl->usage) {
2082    case D3DDECLUSAGE_POSITION:
2083    case D3DDECLUSAGE_POSITIONT:
2084    case D3DDECLUSAGE_DEPTH:
2085        if (index == 0) {
2086            sem->Name = TGSI_SEMANTIC_POSITION;
2087            sem->Index = 0;
2088        } else {
2089            sem->Name = TGSI_SEMANTIC_GENERIC;
2090            sem->Index = 10 * index + 7;
2091        }
2092        break;
2093    case D3DDECLUSAGE_COLOR:
2094        if (index < 2) {
2095            sem->Name = TGSI_SEMANTIC_COLOR;
2096            sem->Index = index;
2097        } else {
2098            sem->Name = TGSI_SEMANTIC_GENERIC;
2099            sem->Index = 10 * (index-1) + 8;
2100        }
2101        break;
2102    case D3DDECLUSAGE_FOG:
2103        assert(index == 0);
2104        sem->Name = TGSI_SEMANTIC_GENERIC;
2105        sem->Index = 16;
2106        break;
2107    case D3DDECLUSAGE_PSIZE:
2108        assert(index == 0);
2109        sem->Name = TGSI_SEMANTIC_PSIZE;
2110        sem->Index = 0;
2111        break;
2112    case D3DDECLUSAGE_TEXCOORD:
2113        assert(index < 16);
2114        if (index < 8 && tc)
2115            sem->Name = TGSI_SEMANTIC_TEXCOORD;
2116        else
2117            sem->Name = TGSI_SEMANTIC_GENERIC;
2118        sem->Index = index;
2119        break;
2120    case D3DDECLUSAGE_BLENDWEIGHT:
2121        sem->Name = TGSI_SEMANTIC_GENERIC;
2122        sem->Index = 10 * index + 19;
2123        break;
2124    case D3DDECLUSAGE_BLENDINDICES:
2125        sem->Name = TGSI_SEMANTIC_GENERIC;
2126        sem->Index = 10 * index + 20;
2127        break;
2128    case D3DDECLUSAGE_NORMAL:
2129        sem->Name = TGSI_SEMANTIC_GENERIC;
2130        sem->Index = 10 * index + 21;
2131        break;
2132    case D3DDECLUSAGE_TANGENT:
2133        sem->Name = TGSI_SEMANTIC_GENERIC;
2134        sem->Index = 10 * index + 22;
2135        break;
2136    case D3DDECLUSAGE_BINORMAL:
2137        sem->Name = TGSI_SEMANTIC_GENERIC;
2138        sem->Index = 10 * index + 23;
2139        break;
2140    case D3DDECLUSAGE_TESSFACTOR:
2141        sem->Name = TGSI_SEMANTIC_GENERIC;
2142        sem->Index = 10 * index + 24;
2143        break;
2144    case D3DDECLUSAGE_SAMPLE:
2145        sem->Name = TGSI_SEMANTIC_COUNT;
2146        sem->Index = 0;
2147        break;
2148    default:
2149        unreachable("Invalid DECLUSAGE.");
2150        break;
2151    }
2152}
2153
2154#define NINED3DSTT_1D     (D3DSTT_1D >> D3DSP_TEXTURETYPE_SHIFT)
2155#define NINED3DSTT_2D     (D3DSTT_2D >> D3DSP_TEXTURETYPE_SHIFT)
2156#define NINED3DSTT_VOLUME (D3DSTT_VOLUME >> D3DSP_TEXTURETYPE_SHIFT)
2157#define NINED3DSTT_CUBE   (D3DSTT_CUBE >> D3DSP_TEXTURETYPE_SHIFT)
2158static inline unsigned
2159d3dstt_to_tgsi_tex(BYTE sampler_type)
2160{
2161    switch (sampler_type) {
2162    case NINED3DSTT_1D:     return TGSI_TEXTURE_1D;
2163    case NINED3DSTT_2D:     return TGSI_TEXTURE_2D;
2164    case NINED3DSTT_VOLUME: return TGSI_TEXTURE_3D;
2165    case NINED3DSTT_CUBE:   return TGSI_TEXTURE_CUBE;
2166    default:
2167        assert(0);
2168        return TGSI_TEXTURE_UNKNOWN;
2169    }
2170}
2171static inline unsigned
2172d3dstt_to_tgsi_tex_shadow(BYTE sampler_type)
2173{
2174    switch (sampler_type) {
2175    case NINED3DSTT_1D: return TGSI_TEXTURE_SHADOW1D;
2176    case NINED3DSTT_2D: return TGSI_TEXTURE_SHADOW2D;
2177    case NINED3DSTT_VOLUME:
2178    case NINED3DSTT_CUBE:
2179    default:
2180        assert(0);
2181        return TGSI_TEXTURE_UNKNOWN;
2182    }
2183}
2184static inline unsigned
2185ps1x_sampler_type(const struct nine_shader_info *info, unsigned stage)
2186{
2187    boolean shadow = !!(info->sampler_mask_shadow & (1 << stage));
2188    switch ((info->sampler_ps1xtypes >> (stage * 2)) & 0x3) {
2189    case 1: return shadow ? TGSI_TEXTURE_SHADOW1D : TGSI_TEXTURE_1D;
2190    case 0: return shadow ? TGSI_TEXTURE_SHADOW2D : TGSI_TEXTURE_2D;
2191    case 3: return TGSI_TEXTURE_3D;
2192    default:
2193        return TGSI_TEXTURE_CUBE;
2194    }
2195}
2196
2197static const char *
2198sm1_sampler_type_name(BYTE sampler_type)
2199{
2200    switch (sampler_type) {
2201    case NINED3DSTT_1D:     return "1D";
2202    case NINED3DSTT_2D:     return "2D";
2203    case NINED3DSTT_VOLUME: return "VOLUME";
2204    case NINED3DSTT_CUBE:   return "CUBE";
2205    default:
2206        return "(D3DSTT_?)";
2207    }
2208}
2209
2210static inline unsigned
2211nine_tgsi_to_interp_mode(struct tgsi_declaration_semantic *sem)
2212{
2213    switch (sem->Name) {
2214    case TGSI_SEMANTIC_POSITION:
2215    case TGSI_SEMANTIC_NORMAL:
2216        return TGSI_INTERPOLATE_LINEAR;
2217    case TGSI_SEMANTIC_BCOLOR:
2218    case TGSI_SEMANTIC_COLOR:
2219        return TGSI_INTERPOLATE_COLOR;
2220    case TGSI_SEMANTIC_FOG:
2221    case TGSI_SEMANTIC_GENERIC:
2222    case TGSI_SEMANTIC_TEXCOORD:
2223    case TGSI_SEMANTIC_CLIPDIST:
2224    case TGSI_SEMANTIC_CLIPVERTEX:
2225        return TGSI_INTERPOLATE_PERSPECTIVE;
2226    case TGSI_SEMANTIC_EDGEFLAG:
2227    case TGSI_SEMANTIC_FACE:
2228    case TGSI_SEMANTIC_INSTANCEID:
2229    case TGSI_SEMANTIC_PCOORD:
2230    case TGSI_SEMANTIC_PRIMID:
2231    case TGSI_SEMANTIC_PSIZE:
2232    case TGSI_SEMANTIC_VERTEXID:
2233        return TGSI_INTERPOLATE_CONSTANT;
2234    default:
2235        assert(0);
2236        return TGSI_INTERPOLATE_CONSTANT;
2237    }
2238}
2239
2240DECL_SPECIAL(DCL)
2241{
2242    struct ureg_program *ureg = tx->ureg;
2243    boolean is_input;
2244    boolean is_sampler;
2245    struct tgsi_declaration_semantic tgsi;
2246    struct sm1_semantic sem;
2247    sm1_read_semantic(tx, &sem);
2248
2249    is_input = sem.reg.file == D3DSPR_INPUT;
2250    is_sampler =
2251        sem.usage == D3DDECLUSAGE_SAMPLE || sem.reg.file == D3DSPR_SAMPLER;
2252
2253    DUMP("DCL ");
2254    sm1_dump_dst_param(&sem.reg);
2255    if (is_sampler)
2256        DUMP(" %s\n", sm1_sampler_type_name(sem.sampler_type));
2257    else
2258    if (tx->version.major >= 3)
2259        DUMP(" %s%i\n", sm1_declusage_names[sem.usage], sem.usage_idx);
2260    else
2261    if (sem.usage | sem.usage_idx)
2262        DUMP(" %u[%u]\n", sem.usage, sem.usage_idx);
2263    else
2264        DUMP("\n");
2265
2266    if (is_sampler) {
2267        const unsigned m = 1 << sem.reg.idx;
2268        ureg_DECL_sampler(ureg, sem.reg.idx);
2269        tx->info->sampler_mask |= m;
2270        tx->sampler_targets[sem.reg.idx] = (tx->info->sampler_mask_shadow & m) ?
2271            d3dstt_to_tgsi_tex_shadow(sem.sampler_type) :
2272            d3dstt_to_tgsi_tex(sem.sampler_type);
2273        return D3D_OK;
2274    }
2275
2276    sm1_declusage_to_tgsi(&tgsi, tx->want_texcoord, &sem);
2277    if (IS_VS) {
2278        if (is_input) {
2279            /* linkage outside of shader with vertex declaration */
2280            ureg_DECL_vs_input(ureg, sem.reg.idx);
2281            assert(sem.reg.idx < ARRAY_SIZE(tx->info->input_map));
2282            tx->info->input_map[sem.reg.idx] = sm1_to_nine_declusage(&sem);
2283            tx->info->num_inputs = MAX2(tx->info->num_inputs, sem.reg.idx + 1);
2284            /* NOTE: preserving order in case of indirect access */
2285        } else
2286        if (tx->version.major >= 3) {
2287            /* SM2 output semantic determined by file */
2288            assert(sem.reg.mask != 0);
2289            if (sem.usage == D3DDECLUSAGE_POSITIONT)
2290                tx->info->position_t = TRUE;
2291            assert(sem.reg.idx < ARRAY_SIZE(tx->regs.o));
2292            assert(ureg_dst_is_undef(tx->regs.o[sem.reg.idx]) && "Nine doesn't support yet packing");
2293            tx->regs.o[sem.reg.idx] = ureg_DECL_output_masked(
2294                ureg, tgsi.Name, tgsi.Index, sem.reg.mask, 0, 1);
2295            nine_record_outputs(tx, sem.usage, sem.usage_idx, sem.reg.mask, sem.reg.idx);
2296            if (tx->info->process_vertices && sem.usage == D3DDECLUSAGE_POSITION && sem.usage_idx == 0) {
2297                tx->regs.oPos_out = tx->regs.o[sem.reg.idx];
2298                tx->regs.o[sem.reg.idx] = ureg_DECL_temporary(ureg);
2299                tx->regs.oPos = tx->regs.o[sem.reg.idx];
2300            }
2301
2302            if (tgsi.Name == TGSI_SEMANTIC_PSIZE) {
2303                tx->regs.o[sem.reg.idx] = ureg_DECL_temporary(ureg);
2304                tx->regs.oPts = tx->regs.o[sem.reg.idx];
2305            }
2306        }
2307    } else {
2308        if (is_input && tx->version.major >= 3) {
2309            unsigned interp_location = 0;
2310            /* SM3 only, SM2 input semantic determined by file */
2311            assert(sem.reg.idx < ARRAY_SIZE(tx->regs.v));
2312            assert(ureg_src_is_undef(tx->regs.v[sem.reg.idx]) && "Nine doesn't support yet packing");
2313            /* PositionT and tessfactor forbidden */
2314            if (sem.usage == D3DDECLUSAGE_POSITIONT || sem.usage == D3DDECLUSAGE_TESSFACTOR)
2315                return D3DERR_INVALIDCALL;
2316
2317            if (tgsi.Name == TGSI_SEMANTIC_POSITION) {
2318                /* Position0 is forbidden (likely because vPos already does that) */
2319                if (sem.usage == D3DDECLUSAGE_POSITION)
2320                    return D3DERR_INVALIDCALL;
2321                /* Following code is for depth */
2322                tx->regs.v[sem.reg.idx] = nine_get_position_input(tx);
2323                return D3D_OK;
2324            }
2325
2326            if (sem.reg.mod & NINED3DSPDM_CENTROID ||
2327                (tgsi.Name == TGSI_SEMANTIC_COLOR && tx->info->force_color_in_centroid))
2328                interp_location = TGSI_INTERPOLATE_LOC_CENTROID;
2329
2330            tx->regs.v[sem.reg.idx] = ureg_DECL_fs_input_centroid(
2331                ureg, tgsi.Name, tgsi.Index,
2332                nine_tgsi_to_interp_mode(&tgsi),
2333                interp_location, 0, 1);
2334        } else
2335        if (!is_input && 0) { /* declare in COLOROUT/DEPTHOUT case */
2336            /* FragColor or FragDepth */
2337            assert(sem.reg.mask != 0);
2338            ureg_DECL_output_masked(ureg, tgsi.Name, tgsi.Index, sem.reg.mask,
2339                                    0, 1);
2340        }
2341    }
2342    return D3D_OK;
2343}
2344
2345DECL_SPECIAL(DEF)
2346{
2347    tx_set_lconstf(tx, tx->insn.dst[0].idx, tx->insn.src[0].imm.f);
2348    return D3D_OK;
2349}
2350
2351DECL_SPECIAL(DEFB)
2352{
2353    tx_set_lconstb(tx, tx->insn.dst[0].idx, tx->insn.src[0].imm.b);
2354    return D3D_OK;
2355}
2356
2357DECL_SPECIAL(DEFI)
2358{
2359    tx_set_lconsti(tx, tx->insn.dst[0].idx, tx->insn.src[0].imm.i);
2360    return D3D_OK;
2361}
2362
2363DECL_SPECIAL(POW)
2364{
2365    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2366    struct ureg_src src[2] = {
2367        tx_src_param(tx, &tx->insn.src[0]),
2368        tx_src_param(tx, &tx->insn.src[1])
2369    };
2370    ureg_POW(tx->ureg, dst, ureg_abs(src[0]), src[1]);
2371    return D3D_OK;
2372}
2373
2374/* Tests results on Win 10:
2375 * NV (NVIDIA GeForce GT 635M)
2376 * AMD (AMD Radeon HD 7730M)
2377 * INTEL (Intel(R) HD Graphics 4000)
2378 * PS2 and PS3:
2379 * RCP and RSQ can generate inf on NV and AMD.
2380 * RCP and RSQ are clamped on INTEL (+- FLT_MAX),
2381 * NV: log not clamped
2382 * AMD: log(0) is -FLT_MAX (but log(inf) is inf)
2383 * INTEL: log(0) is -FLT_MAX and log(inf) is 127
2384 * All devices have 0*anything = 0
2385 *
2386 * INTEL VS2 and VS3: same behaviour.
2387 * Some differences VS2 and VS3 for constants defined with inf/NaN.
2388 * While PS3, VS3 and PS2 keep NaN and Inf shader constants without change,
2389 * VS2 seems to clamp to zero (may be test failure).
2390 * AMD VS2: unknown, VS3: very likely behaviour of PS3
2391 * NV VS2 and VS3: very likely behaviour of PS3
 * For both, Inf in VS becomes NaN in PS
2393 * "Very likely" because the test was less extensive.
2394 *
2395 * Thus all clamping can be removed for shaders 2 and 3,
2396 * as long as 0*anything = 0.
 * Else clamps to enforce 0*anything = 0 (anything being then
 * neither inf nor NaN, the user being unlikely to pass them
 * as constant).
2400 * The status for VS1 and PS1 is unknown.
2401 */
2402
2403DECL_SPECIAL(RCP)
2404{
2405    struct ureg_program *ureg = tx->ureg;
2406    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2407    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2408    struct ureg_dst tmp = tx->mul_zero_wins ? dst : tx_scratch(tx);
2409    ureg_RCP(ureg, tmp, src);
2410    if (!tx->mul_zero_wins) {
2411        /* FLT_MAX has issues with Rayman */
2412        ureg_MIN(ureg, tmp, ureg_imm1f(ureg, FLT_MAX/2.f), ureg_src(tmp));
2413        ureg_MAX(ureg, dst, ureg_imm1f(ureg, -FLT_MAX/2.f), ureg_src(tmp));
2414    }
2415    return D3D_OK;
2416}
2417
2418DECL_SPECIAL(RSQ)
2419{
2420    struct ureg_program *ureg = tx->ureg;
2421    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2422    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2423    struct ureg_dst tmp = tx->mul_zero_wins ? dst : tx_scratch(tx);
2424    ureg_RSQ(ureg, tmp, ureg_abs(src));
2425    if (!tx->mul_zero_wins)
2426        ureg_MIN(ureg, dst, ureg_imm1f(ureg, FLT_MAX), ureg_src(tmp));
2427    return D3D_OK;
2428}
2429
2430DECL_SPECIAL(LOG)
2431{
2432    struct ureg_program *ureg = tx->ureg;
2433    struct ureg_dst tmp = tx_scratch_scalar(tx);
2434    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2435    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2436    ureg_LG2(ureg, tmp, ureg_abs(src));
2437    if (tx->mul_zero_wins) {
2438        ureg_MOV(ureg, dst, tx_src_scalar(tmp));
2439    } else {
2440        ureg_MAX(ureg, dst, ureg_imm1f(ureg, -FLT_MAX), tx_src_scalar(tmp));
2441    }
2442    return D3D_OK;
2443}
2444
2445DECL_SPECIAL(LIT)
2446{
2447    struct ureg_program *ureg = tx->ureg;
2448    struct ureg_dst tmp = tx_scratch(tx);
2449    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2450    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2451    ureg_LIT(ureg, tmp, src);
2452    /* d3d9 LIT is the same than gallium LIT. One difference is that d3d9
2453     * states that dst.z is 0 when src.y <= 0. Gallium definition can assign
2454     * it 0^0 if src.w=0, which value is driver dependent. */
2455    ureg_CMP(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Z),
2456             ureg_negate(ureg_scalar(src, TGSI_SWIZZLE_Y)),
2457             ureg_src(tmp), ureg_imm1f(ureg, 0.0f));
2458    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XYW), ureg_src(tmp));
2459    return D3D_OK;
2460}
2461
2462DECL_SPECIAL(NRM)
2463{
2464    struct ureg_program *ureg = tx->ureg;
2465    struct ureg_dst tmp = tx_scratch_scalar(tx);
2466    struct ureg_src nrm = tx_src_scalar(tmp);
2467    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2468    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2469    ureg_DP3(ureg, tmp, src, src);
2470    ureg_RSQ(ureg, tmp, nrm);
2471    if (!tx->mul_zero_wins)
2472        ureg_MIN(ureg, tmp, ureg_imm1f(ureg, FLT_MAX), nrm);
2473    ureg_MUL(ureg, dst, src, nrm);
2474    return D3D_OK;
2475}
2476
2477DECL_SPECIAL(DP2ADD)
2478{
2479    struct ureg_dst tmp = tx_scratch_scalar(tx);
2480    struct ureg_src dp2 = tx_src_scalar(tmp);
2481    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2482    struct ureg_src src[3];
2483    int i;
2484    for (i = 0; i < 3; ++i)
2485        src[i] = tx_src_param(tx, &tx->insn.src[i]);
2486    assert_replicate_swizzle(&src[2]);
2487
2488    ureg_DP2(tx->ureg, tmp, src[0], src[1]);
2489    ureg_ADD(tx->ureg, dst, src[2], dp2);
2490
2491    return D3D_OK;
2492}
2493
2494DECL_SPECIAL(TEXCOORD)
2495{
2496    struct ureg_program *ureg = tx->ureg;
2497    const unsigned s = tx->insn.dst[0].idx;
2498    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2499
2500    tx_texcoord_alloc(tx, s);
2501    ureg_MOV(ureg, ureg_writemask(ureg_saturate(dst), TGSI_WRITEMASK_XYZ), tx->regs.vT[s]);
2502    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W), ureg_imm1f(tx->ureg, 1.0f));
2503
2504    return D3D_OK;
2505}
2506
2507DECL_SPECIAL(TEXCOORD_ps14)
2508{
2509    struct ureg_program *ureg = tx->ureg;
2510    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2511    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2512
2513    assert(tx->insn.src[0].file == D3DSPR_TEXTURE);
2514
2515    ureg_MOV(ureg, dst, src);
2516
2517    return D3D_OK;
2518}
2519
2520DECL_SPECIAL(TEXKILL)
2521{
2522    struct ureg_src reg;
2523
2524    if (tx->version.major > 1 || tx->version.minor > 3) {
2525        reg = tx_dst_param_as_src(tx, &tx->insn.dst[0]);
2526    } else {
2527        tx_texcoord_alloc(tx, tx->insn.dst[0].idx);
2528        reg = tx->regs.vT[tx->insn.dst[0].idx];
2529    }
2530    if (tx->version.major < 2)
2531        reg = ureg_swizzle(reg, NINE_SWIZZLE4(X,Y,Z,Z));
2532    ureg_KILL_IF(tx->ureg, reg);
2533
2534    return D3D_OK;
2535}
2536
2537DECL_SPECIAL(TEXBEM)
2538{
2539    struct ureg_program *ureg = tx->ureg;
2540    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2541    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2542    struct ureg_dst tmp, tmp2, texcoord;
2543    struct ureg_src sample, m00, m01, m10, m11, c8m, c16m2;
2544    struct ureg_src bumpenvlscale, bumpenvloffset;
2545    const int m = tx->insn.dst[0].idx;
2546
2547    assert(tx->version.major == 1);
2548
2549    sample = ureg_DECL_sampler(ureg, m);
2550    tx->info->sampler_mask |= 1 << m;
2551
2552    tx_texcoord_alloc(tx, m);
2553
2554    tmp = tx_scratch(tx);
2555    tmp2 = tx_scratch(tx);
2556    texcoord = tx_scratch(tx);
2557    /*
2558     * Bump-env-matrix:
2559     * 00 is X
2560     * 01 is Y
2561     * 10 is Z
2562     * 11 is W
2563     */
2564    c8m = nine_float_constant_src(tx, 8+m);
2565    c16m2 = nine_float_constant_src(tx, 8+8+m/2);
2566
2567    m00 = NINE_APPLY_SWIZZLE(c8m, X);
2568    m01 = NINE_APPLY_SWIZZLE(c8m, Y);
2569    m10 = NINE_APPLY_SWIZZLE(c8m, Z);
2570    m11 = NINE_APPLY_SWIZZLE(c8m, W);
2571
2572    /* These two attributes are packed as X=scale0 Y=offset0 Z=scale1 W=offset1 etc */
2573    if (m % 2 == 0) {
2574        bumpenvlscale = NINE_APPLY_SWIZZLE(c16m2, X);
2575        bumpenvloffset = NINE_APPLY_SWIZZLE(c16m2, Y);
2576    } else {
2577        bumpenvlscale = NINE_APPLY_SWIZZLE(c16m2, Z);
2578        bumpenvloffset = NINE_APPLY_SWIZZLE(c16m2, W);
2579    }
2580
2581    apply_ps1x_projection(tx, texcoord, tx->regs.vT[m], m);
2582
2583    /* u' = TextureCoordinates(stage m)u + D3DTSS_BUMPENVMAT00(stage m)*t(n)R  */
2584    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m00,
2585             NINE_APPLY_SWIZZLE(src, X), ureg_src(texcoord));
2586    /* u' = u' + D3DTSS_BUMPENVMAT10(stage m)*t(n)G */
2587    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m10,
2588             NINE_APPLY_SWIZZLE(src, Y),
2589             NINE_APPLY_SWIZZLE(ureg_src(tmp), X));
2590
2591    /* v' = TextureCoordinates(stage m)v + D3DTSS_BUMPENVMAT01(stage m)*t(n)R */
2592    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m01,
2593             NINE_APPLY_SWIZZLE(src, X), ureg_src(texcoord));
2594    /* v' = v' + D3DTSS_BUMPENVMAT11(stage m)*t(n)G*/
2595    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m11,
2596             NINE_APPLY_SWIZZLE(src, Y),
2597             NINE_APPLY_SWIZZLE(ureg_src(tmp), Y));
2598
2599    /* Now the texture coordinates are in tmp.xy */
2600
2601    if (tx->insn.opcode == D3DSIO_TEXBEM) {
2602        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
2603    } else if (tx->insn.opcode == D3DSIO_TEXBEML) {
2604        /* t(m)RGBA = t(m)RGBA * [(t(n)B * D3DTSS_BUMPENVLSCALE(stage m)) + D3DTSS_BUMPENVLOFFSET(stage m)] */
2605        ureg_TEX(ureg, tmp, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
2606        ureg_MAD(ureg, tmp2, NINE_APPLY_SWIZZLE(src, Z),
2607                 bumpenvlscale, bumpenvloffset);
2608        ureg_MUL(ureg, dst, ureg_src(tmp), ureg_src(tmp2));
2609    }
2610
2611    tx->info->bumpenvmat_needed = 1;
2612
2613    return D3D_OK;
2614}
2615
2616DECL_SPECIAL(TEXREG2AR)
2617{
2618    struct ureg_program *ureg = tx->ureg;
2619    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2620    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2621    struct ureg_src sample;
2622    const int m = tx->insn.dst[0].idx;
2623    ASSERTED const int n = tx->insn.src[0].idx;
2624    assert(m >= 0 && m > n);
2625
2626    sample = ureg_DECL_sampler(ureg, m);
2627    tx->info->sampler_mask |= 1 << m;
2628    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_swizzle(src, NINE_SWIZZLE4(W,X,X,X)), sample);
2629
2630    return D3D_OK;
2631}
2632
2633DECL_SPECIAL(TEXREG2GB)
2634{
2635    struct ureg_program *ureg = tx->ureg;
2636    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2637    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2638    struct ureg_src sample;
2639    const int m = tx->insn.dst[0].idx;
2640    ASSERTED const int n = tx->insn.src[0].idx;
2641    assert(m >= 0 && m > n);
2642
2643    sample = ureg_DECL_sampler(ureg, m);
2644    tx->info->sampler_mask |= 1 << m;
2645    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_swizzle(src, NINE_SWIZZLE4(Y,Z,Z,Z)), sample);
2646
2647    return D3D_OK;
2648}
2649
2650DECL_SPECIAL(TEXM3x2PAD)
2651{
2652    return D3D_OK; /* this is just padding */
2653}
2654
2655DECL_SPECIAL(TEXM3x2TEX)
2656{
2657    struct ureg_program *ureg = tx->ureg;
2658    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2659    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2660    struct ureg_src sample;
2661    const int m = tx->insn.dst[0].idx - 1;
2662    ASSERTED const int n = tx->insn.src[0].idx;
2663    assert(m >= 0 && m > n);
2664
2665    tx_texcoord_alloc(tx, m);
2666    tx_texcoord_alloc(tx, m+1);
2667
2668    /* performs the matrix multiplication */
2669    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2670    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2671
2672    sample = ureg_DECL_sampler(ureg, m + 1);
2673    tx->info->sampler_mask |= 1 << (m + 1);
2674    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 1), ureg_src(dst), sample);
2675
2676    return D3D_OK;
2677}
2678
2679DECL_SPECIAL(TEXM3x3PAD)
2680{
2681    return D3D_OK; /* this is just padding */
2682}
2683
2684DECL_SPECIAL(TEXM3x3SPEC)
2685{
2686    struct ureg_program *ureg = tx->ureg;
2687    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2688    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2689    struct ureg_src E = tx_src_param(tx, &tx->insn.src[1]);
2690    struct ureg_src sample;
2691    struct ureg_dst tmp;
2692    const int m = tx->insn.dst[0].idx - 2;
2693    ASSERTED const int n = tx->insn.src[0].idx;
2694    assert(m >= 0 && m > n);
2695
2696    tx_texcoord_alloc(tx, m);
2697    tx_texcoord_alloc(tx, m+1);
2698    tx_texcoord_alloc(tx, m+2);
2699
2700    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2701    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2702    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Z), tx->regs.vT[m+2], src);
2703
2704    sample = ureg_DECL_sampler(ureg, m + 2);
2705    tx->info->sampler_mask |= 1 << (m + 2);
2706    tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_XYZ);
2707
2708    /* At this step, dst = N = (u', w', z').
2709     * We want dst to be the texture sampled at (u'', w'', z''), with
2710     * (u'', w'', z'') = 2 * (N.E / N.N) * N - E */
2711    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_src(dst), ureg_src(dst));
2712    ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
2713    /* at this step tmp.x = 1/N.N */
2714    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_src(dst), E);
2715    /* at this step tmp.y = N.E */
2716    ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
2717    /* at this step tmp.x = N.E/N.N */
2718    ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 2.0f));
2719    ureg_MUL(ureg, tmp, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_src(dst));
2720    /* at this step tmp.xyz = 2 * (N.E / N.N) * N */
2721    ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_negate(E));
2722    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 2), ureg_src(tmp), sample);
2723
2724    return D3D_OK;
2725}
2726
2727DECL_SPECIAL(TEXREG2RGB)
2728{
2729    struct ureg_program *ureg = tx->ureg;
2730    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2731    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2732    struct ureg_src sample;
2733    const int m = tx->insn.dst[0].idx;
2734    ASSERTED const int n = tx->insn.src[0].idx;
2735    assert(m >= 0 && m > n);
2736
2737    sample = ureg_DECL_sampler(ureg, m);
2738    tx->info->sampler_mask |= 1 << m;
2739    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), src, sample);
2740
2741    return D3D_OK;
2742}
2743
2744DECL_SPECIAL(TEXDP3TEX)
2745{
2746    struct ureg_program *ureg = tx->ureg;
2747    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2748    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2749    struct ureg_dst tmp;
2750    struct ureg_src sample;
2751    const int m = tx->insn.dst[0].idx;
2752    ASSERTED const int n = tx->insn.src[0].idx;
2753    assert(m >= 0 && m > n);
2754
2755    tx_texcoord_alloc(tx, m);
2756
2757    tmp = tx_scratch(tx);
2758    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2759    ureg_MOV(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_YZ), ureg_imm1f(ureg, 0.0f));
2760
2761    sample = ureg_DECL_sampler(ureg, m);
2762    tx->info->sampler_mask |= 1 << m;
2763    ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m), ureg_src(tmp), sample);
2764
2765    return D3D_OK;
2766}
2767
2768DECL_SPECIAL(TEXM3x2DEPTH)
2769{
2770    struct ureg_program *ureg = tx->ureg;
2771    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2772    struct ureg_dst tmp;
2773    const int m = tx->insn.dst[0].idx - 1;
2774    ASSERTED const int n = tx->insn.src[0].idx;
2775    assert(m >= 0 && m > n);
2776
2777    tx_texcoord_alloc(tx, m);
2778    tx_texcoord_alloc(tx, m+1);
2779
2780    tmp = tx_scratch(tx);
2781
2782    /* performs the matrix multiplication */
2783    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2784    ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2785
2786    ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Z), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
2787    /* tmp.x = 'z', tmp.y = 'w', tmp.z = 1/'w'. */
2788    ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Z));
2789    /* res = 'w' == 0 ? 1.0 : z/w */
2790    ureg_CMP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_negate(ureg_abs(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y))),
2791             ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 1.0f));
2792    /* replace the depth for depth testing with the result */
2793    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0,
2794                                              TGSI_WRITEMASK_Z, 0, 1);
2795    ureg_MOV(ureg, tx->regs.oDepth, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
2796    /* note that we write nothing to the destination, since it's disallowed to use it afterward */
2797    return D3D_OK;
2798}
2799
2800DECL_SPECIAL(TEXDP3)
2801{
2802    struct ureg_program *ureg = tx->ureg;
2803    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2804    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2805    const int m = tx->insn.dst[0].idx;
2806    ASSERTED const int n = tx->insn.src[0].idx;
2807    assert(m >= 0 && m > n);
2808
2809    tx_texcoord_alloc(tx, m);
2810
2811    ureg_DP3(ureg, dst, tx->regs.vT[m], src);
2812
2813    return D3D_OK;
2814}
2815
2816DECL_SPECIAL(TEXM3x3)
2817{
2818    struct ureg_program *ureg = tx->ureg;
2819    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2820    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]); /* t[n] */
2821    struct ureg_src sample;
2822    struct ureg_dst E, tmp;
2823    const int m = tx->insn.dst[0].idx - 2;
2824    ASSERTED const int n = tx->insn.src[0].idx;
2825    assert(m >= 0 && m > n);
2826
2827    tx_texcoord_alloc(tx, m);
2828    tx_texcoord_alloc(tx, m+1);
2829    tx_texcoord_alloc(tx, m+2);
2830
2831    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_X), tx->regs.vT[m], src);
2832    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Y), tx->regs.vT[m+1], src);
2833    ureg_DP3(ureg, ureg_writemask(dst, TGSI_WRITEMASK_Z), tx->regs.vT[m+2], src);
2834
2835    switch (tx->insn.opcode) {
2836    case D3DSIO_TEXM3x3:
2837        ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_W), ureg_imm1f(ureg, 1.0f));
2838        break;
2839    case D3DSIO_TEXM3x3TEX:
2840        sample = ureg_DECL_sampler(ureg, m + 2);
2841        tx->info->sampler_mask |= 1 << (m + 2);
2842        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 2), ureg_src(dst), sample);
2843        break;
2844    case D3DSIO_TEXM3x3VSPEC:
2845        sample = ureg_DECL_sampler(ureg, m + 2);
2846        tx->info->sampler_mask |= 1 << (m + 2);
2847        E = tx_scratch(tx);
2848        tmp = ureg_writemask(tx_scratch(tx), TGSI_WRITEMASK_XYZ);
2849        ureg_MOV(ureg, ureg_writemask(E, TGSI_WRITEMASK_X), ureg_scalar(tx->regs.vT[m], TGSI_SWIZZLE_W));
2850        ureg_MOV(ureg, ureg_writemask(E, TGSI_WRITEMASK_Y), ureg_scalar(tx->regs.vT[m+1], TGSI_SWIZZLE_W));
2851        ureg_MOV(ureg, ureg_writemask(E, TGSI_WRITEMASK_Z), ureg_scalar(tx->regs.vT[m+2], TGSI_SWIZZLE_W));
2852        /* At this step, dst = N = (u', w', z').
2853         * We want dst to be the texture sampled at (u'', w'', z''), with
2854         * (u'', w'', z'') = 2 * (N.E / N.N) * N - E */
2855        ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_src(dst), ureg_src(dst));
2856        ureg_RCP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
2857        /* at this step tmp.x = 1/N.N */
2858        ureg_DP3(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_src(dst), ureg_src(E));
2859        /* at this step tmp.y = N.E */
2860        ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y));
2861        /* at this step tmp.x = N.E/N.N */
2862        ureg_MUL(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 2.0f));
2863        ureg_MUL(ureg, tmp, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_src(dst));
2864        /* at this step tmp.xyz = 2 * (N.E / N.N) * N */
2865        ureg_ADD(ureg, tmp, ureg_src(tmp), ureg_negate(ureg_src(E)));
2866        ureg_TEX(ureg, dst, ps1x_sampler_type(tx->info, m + 2), ureg_src(tmp), sample);
2867        break;
2868    default:
2869        return D3DERR_INVALIDCALL;
2870    }
2871    return D3D_OK;
2872}
2873
2874DECL_SPECIAL(TEXDEPTH)
2875{
2876    struct ureg_program *ureg = tx->ureg;
2877    struct ureg_dst r5;
2878    struct ureg_src r5r, r5g;
2879
2880    assert(tx->insn.dst[0].idx == 5); /* instruction must get r5 here */
2881
2882    /* we must replace the depth by r5.g == 0 ? 1.0f : r5.r/r5.g.
2883     * r5 won't be used afterward, thus we can use r5.ba */
2884    r5 = tx->regs.r[5];
2885    r5r = ureg_scalar(ureg_src(r5), TGSI_SWIZZLE_X);
2886    r5g = ureg_scalar(ureg_src(r5), TGSI_SWIZZLE_Y);
2887
2888    ureg_RCP(ureg, ureg_writemask(r5, TGSI_WRITEMASK_Z), r5g);
2889    ureg_MUL(ureg, ureg_writemask(r5, TGSI_WRITEMASK_X), r5r, ureg_scalar(ureg_src(r5), TGSI_SWIZZLE_Z));
2890    /* r5.r = r/g */
2891    ureg_CMP(ureg, ureg_writemask(r5, TGSI_WRITEMASK_X), ureg_negate(ureg_abs(r5g)),
2892             r5r, ureg_imm1f(ureg, 1.0f));
2893    /* replace the depth for depth testing with the result */
2894    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0,
2895                                              TGSI_WRITEMASK_Z, 0, 1);
2896    ureg_MOV(ureg, tx->regs.oDepth, r5r);
2897
2898    return D3D_OK;
2899}
2900
2901DECL_SPECIAL(BEM)
2902{
2903    struct ureg_program *ureg = tx->ureg;
2904    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2905    struct ureg_src src0 = tx_src_param(tx, &tx->insn.src[0]);
2906    struct ureg_src src1 = tx_src_param(tx, &tx->insn.src[1]);
2907    struct ureg_src m00, m01, m10, m11, c8m;
2908    const int m = tx->insn.dst[0].idx;
2909    struct ureg_dst tmp = tx_scratch(tx);
2910    /*
2911     * Bump-env-matrix:
2912     * 00 is X
2913     * 01 is Y
2914     * 10 is Z
2915     * 11 is W
2916     */
2917    c8m = nine_float_constant_src(tx, 8+m);
2918    m00 = NINE_APPLY_SWIZZLE(c8m, X);
2919    m01 = NINE_APPLY_SWIZZLE(c8m, Y);
2920    m10 = NINE_APPLY_SWIZZLE(c8m, Z);
2921    m11 = NINE_APPLY_SWIZZLE(c8m, W);
2922    /* dest.r = src0.r + D3DTSS_BUMPENVMAT00(stage n) * src1.r  */
2923    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m00,
2924             NINE_APPLY_SWIZZLE(src1, X), NINE_APPLY_SWIZZLE(src0, X));
2925    /* dest.r = dest.r + D3DTSS_BUMPENVMAT10(stage n) * src1.g; */
2926    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), m10,
2927             NINE_APPLY_SWIZZLE(src1, Y), NINE_APPLY_SWIZZLE(ureg_src(tmp), X));
2928
2929    /* dest.g = src0.g + D3DTSS_BUMPENVMAT01(stage n) * src1.r */
2930    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m01,
2931             NINE_APPLY_SWIZZLE(src1, X), src0);
2932    /* dest.g = dest.g + D3DTSS_BUMPENVMAT11(stage n) * src1.g */
2933    ureg_MAD(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_Y), m11,
2934             NINE_APPLY_SWIZZLE(src1, Y), NINE_APPLY_SWIZZLE(ureg_src(tmp), Y));
2935    ureg_MOV(ureg, ureg_writemask(dst, TGSI_WRITEMASK_XY), ureg_src(tmp));
2936
2937    tx->info->bumpenvmat_needed = 1;
2938
2939    return D3D_OK;
2940}
2941
2942DECL_SPECIAL(TEXLD)
2943{
2944    struct ureg_program *ureg = tx->ureg;
2945    unsigned target;
2946    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2947    struct ureg_src src[2] = {
2948        tx_src_param(tx, &tx->insn.src[0]),
2949        tx_src_param(tx, &tx->insn.src[1])
2950    };
2951    assert(tx->insn.src[1].idx >= 0 &&
2952           tx->insn.src[1].idx < ARRAY_SIZE(tx->sampler_targets));
2953    target = tx->sampler_targets[tx->insn.src[1].idx];
2954
2955    if (TEX_if_fetch4(tx, dst, target, src[0], src[1], tx->insn.src[1].idx))
2956        return D3D_OK;
2957
2958    switch (tx->insn.flags) {
2959    case 0:
2960        ureg_TEX(ureg, dst, target, src[0], src[1]);
2961        break;
2962    case NINED3DSI_TEXLD_PROJECT:
2963        ureg_TXP(ureg, dst, target, src[0], src[1]);
2964        break;
2965    case NINED3DSI_TEXLD_BIAS:
2966        ureg_TXB(ureg, dst, target, src[0], src[1]);
2967        break;
2968    default:
2969        assert(0);
2970        return D3DERR_INVALIDCALL;
2971    }
2972    return D3D_OK;
2973}
2974
2975DECL_SPECIAL(TEXLD_14)
2976{
2977    struct ureg_program *ureg = tx->ureg;
2978    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2979    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
2980    const unsigned s = tx->insn.dst[0].idx;
2981    const unsigned t = ps1x_sampler_type(tx->info, s);
2982
2983    tx->info->sampler_mask |= 1 << s;
2984    ureg_TEX(ureg, dst, t, src, ureg_DECL_sampler(ureg, s));
2985
2986    return D3D_OK;
2987}
2988
2989DECL_SPECIAL(TEX)
2990{
2991    struct ureg_program *ureg = tx->ureg;
2992    const unsigned s = tx->insn.dst[0].idx;
2993    const unsigned t = ps1x_sampler_type(tx->info, s);
2994    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
2995    struct ureg_src src[2];
2996
2997    tx_texcoord_alloc(tx, s);
2998
2999    src[0] = tx->regs.vT[s];
3000    src[1] = ureg_DECL_sampler(ureg, s);
3001    tx->info->sampler_mask |= 1 << s;
3002
3003    TEX_with_ps1x_projection(tx, dst, t, src[0], src[1], s);
3004
3005    return D3D_OK;
3006}
3007
3008DECL_SPECIAL(TEXLDD)
3009{
3010    unsigned target;
3011    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
3012    struct ureg_src src[4] = {
3013        tx_src_param(tx, &tx->insn.src[0]),
3014        tx_src_param(tx, &tx->insn.src[1]),
3015        tx_src_param(tx, &tx->insn.src[2]),
3016        tx_src_param(tx, &tx->insn.src[3])
3017    };
3018    assert(tx->insn.src[1].idx >= 0 &&
3019           tx->insn.src[1].idx < ARRAY_SIZE(tx->sampler_targets));
3020    target = tx->sampler_targets[tx->insn.src[1].idx];
3021
3022    if (TEX_if_fetch4(tx, dst, target, src[0], src[1], tx->insn.src[1].idx))
3023        return D3D_OK;
3024
3025    ureg_TXD(tx->ureg, dst, target, src[0], src[2], src[3], src[1]);
3026    return D3D_OK;
3027}
3028
3029DECL_SPECIAL(TEXLDL)
3030{
3031    unsigned target;
3032    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
3033    struct ureg_src src[2] = {
3034       tx_src_param(tx, &tx->insn.src[0]),
3035       tx_src_param(tx, &tx->insn.src[1])
3036    };
3037    assert(tx->insn.src[1].idx >= 0 &&
3038           tx->insn.src[1].idx < ARRAY_SIZE(tx->sampler_targets));
3039    target = tx->sampler_targets[tx->insn.src[1].idx];
3040
3041    if (TEX_if_fetch4(tx, dst, target, src[0], src[1], tx->insn.src[1].idx))
3042        return D3D_OK;
3043
3044    ureg_TXL(tx->ureg, dst, target, src[0], src[1]);
3045    return D3D_OK;
3046}
3047
3048DECL_SPECIAL(SETP)
3049{
3050    const unsigned cmp_op = sm1_insn_flags_to_tgsi_setop(tx->insn.flags);
3051    struct ureg_dst dst = tx_dst_param(tx, &tx->insn.dst[0]);
3052    struct ureg_src src[2] = {
3053       tx_src_param(tx, &tx->insn.src[0]),
3054       tx_src_param(tx, &tx->insn.src[1])
3055    };
3056    ureg_insn(tx->ureg, cmp_op, &dst, 1, src, 2, 0);
3057    return D3D_OK;
3058}
3059
3060DECL_SPECIAL(BREAKP)
3061{
3062    struct ureg_src src = tx_src_param(tx, &tx->insn.src[0]);
3063    ureg_IF(tx->ureg, src, tx_cond(tx));
3064    ureg_BRK(tx->ureg);
3065    tx_endcond(tx);
3066    ureg_ENDIF(tx->ureg);
3067    return D3D_OK;
3068}
3069
3070DECL_SPECIAL(PHASE)
3071{
3072    return D3D_OK; /* we don't care about phase */
3073}
3074
3075DECL_SPECIAL(COMMENT)
3076{
3077    return D3D_OK; /* nothing to do */
3078}
3079
3080
/* Builds one struct sm1_op_info initializer:
 *   o        D3DSIO_ opcode suffix
 *   t        TGSI_OPCODE_ suffix
 *   vv1,vv2  first / second element of the vertex shader version pair
 *   pv1,pv2  first / second element of the pixel shader version pair
 *   d, s     number of destination / source parameters
 *   h        translation handler, or NULL */
#define _OPI(o, t, vv1, vv2, pv1, pv2, d, s, h)  \
    {                                            \
        D3DSIO_##o, TGSI_OPCODE_##t,             \
        { vv1, vv2 }, { pv1, pv2, },             \
        d, s, h                                  \
    }
3083
3084static const struct sm1_op_info inst_table[] =
3085{
3086    _OPI(NOP, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(NOP)), /* 0 */
3087    _OPI(MOV, MOV, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL),
3088    _OPI(ADD, ADD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 2 */
3089    _OPI(SUB, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(SUB)), /* 3 */
3090    _OPI(MAD, MAD, V(0,0), V(3,0), V(0,0), V(3,0), 1, 3, NULL), /* 4 */
3091    _OPI(MUL, MUL, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 5 */
3092    _OPI(RCP, RCP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(RCP)), /* 6 */
3093    _OPI(RSQ, RSQ, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(RSQ)), /* 7 */
3094    _OPI(DP3, DP3, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 8 */
3095    _OPI(DP4, DP4, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 9 */
3096    _OPI(MIN, MIN, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 10 */
3097    _OPI(MAX, MAX, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 11 */
3098    _OPI(SLT, SLT, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 12 */
3099    _OPI(SGE, SGE, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 13 */
3100    _OPI(EXP, EX2, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL), /* 14 */
3101    _OPI(LOG, LG2, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(LOG)), /* 15 */
3102    _OPI(LIT, LIT, V(0,0), V(3,0), V(0,0), V(0,0), 1, 1, SPECIAL(LIT)), /* 16 */
3103    _OPI(DST, DST, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, NULL), /* 17 */
3104    _OPI(LRP, LRP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 3, NULL), /* 18 */
3105    _OPI(FRC, FRC, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, NULL), /* 19 */
3106
3107    _OPI(M4x4, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M4x4)),
3108    _OPI(M4x3, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M4x3)),
3109    _OPI(M3x4, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M3x4)),
3110    _OPI(M3x3, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M3x3)),
3111    _OPI(M3x2, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(M3x2)),
3112
3113    _OPI(CALL,    CAL,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(CALL)),
3114    _OPI(CALLNZ,  CAL,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(CALLNZ)),
3115    _OPI(LOOP,    BGNLOOP, V(2,0), V(3,0), V(3,0), V(3,0), 0, 2, SPECIAL(LOOP)),
3116    _OPI(RET,     RET,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(RET)),
3117    _OPI(ENDLOOP, ENDLOOP, V(2,0), V(3,0), V(3,0), V(3,0), 0, 0, SPECIAL(ENDLOOP)),
3118    _OPI(LABEL,   NOP,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(LABEL)),
3119
3120    _OPI(DCL, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(DCL)),
3121
3122    _OPI(POW, POW, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(POW)),
3123    _OPI(CRS, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 2, SPECIAL(XPD)), /* XXX: .w */
3124    _OPI(SGN, SSG, V(2,0), V(3,0), V(0,0), V(0,0), 1, 3, SPECIAL(SGN)), /* ignore src1,2 */
3125    _OPI(ABS, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(ABS)),
3126    _OPI(NRM, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 1, SPECIAL(NRM)), /* NRM doesn't fit */
3127
3128    _OPI(SINCOS, NOP, V(2,0), V(2,1), V(2,0), V(2,1), 1, 3, SPECIAL(SINCOS)),
3129    _OPI(SINCOS, NOP, V(3,0), V(3,0), V(3,0), V(3,0), 1, 1, SPECIAL(SINCOS)),
3130
3131    /* More flow control */
3132    _OPI(REP,    NOP,    V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(REP)),
3133    _OPI(ENDREP, NOP,    V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ENDREP)),
3134    _OPI(IF,     IF,     V(2,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(IF)),
3135    _OPI(IFC,    IF,     V(2,1), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(IFC)),
3136    _OPI(ELSE,   ELSE,   V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ELSE)),
3137    _OPI(ENDIF,  ENDIF,  V(2,0), V(3,0), V(2,1), V(3,0), 0, 0, SPECIAL(ENDIF)),
3138    _OPI(BREAK,  BRK,    V(2,1), V(3,0), V(2,1), V(3,0), 0, 0, NULL),
3139    _OPI(BREAKC, NOP,    V(2,1), V(3,0), V(2,1), V(3,0), 0, 2, SPECIAL(BREAKC)),
3140    /* we don't write to the address register, but a normal register (copied
3141     * when needed to the address register), thus we don't use ARR */
3142    _OPI(MOVA, MOV, V(2,0), V(3,0), V(0,0), V(0,0), 1, 1, NULL),
3143
3144    _OPI(DEFB, NOP, V(0,0), V(3,0) , V(0,0), V(3,0) , 1, 0, SPECIAL(DEFB)),
3145    _OPI(DEFI, NOP, V(0,0), V(3,0) , V(0,0), V(3,0) , 1, 0, SPECIAL(DEFI)),
3146
3147    _OPI(TEXCOORD,     NOP, V(0,0), V(0,0), V(0,0), V(1,3), 1, 0, SPECIAL(TEXCOORD)),
3148    _OPI(TEXCOORD,     MOV, V(0,0), V(0,0), V(1,4), V(1,4), 1, 1, SPECIAL(TEXCOORD_ps14)),
3149    _OPI(TEXKILL,      KILL_IF, V(0,0), V(0,0), V(0,0), V(3,0), 1, 0, SPECIAL(TEXKILL)),
3150    _OPI(TEX,          TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 0, SPECIAL(TEX)),
3151    _OPI(TEX,          TEX, V(0,0), V(0,0), V(1,4), V(1,4), 1, 1, SPECIAL(TEXLD_14)),
3152    _OPI(TEX,          TEX, V(0,0), V(0,0), V(2,0), V(3,0), 1, 2, SPECIAL(TEXLD)),
3153    _OPI(TEXBEM,       TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEM)),
3154    _OPI(TEXBEML,      TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXBEM)),
3155    _OPI(TEXREG2AR,    TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXREG2AR)),
3156    _OPI(TEXREG2GB,    TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXREG2GB)),
3157    _OPI(TEXM3x2PAD,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x2PAD)),
3158    _OPI(TEXM3x2TEX,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x2TEX)),
3159    _OPI(TEXM3x3PAD,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x3PAD)),
3160    _OPI(TEXM3x3TEX,   TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x3)),
3161    _OPI(TEXM3x3SPEC,  TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 2, SPECIAL(TEXM3x3SPEC)),
3162    _OPI(TEXM3x3VSPEC, TEX, V(0,0), V(0,0), V(0,0), V(1,3), 1, 1, SPECIAL(TEXM3x3)),
3163
3164    _OPI(EXPP, EXP, V(0,0), V(1,1), V(0,0), V(0,0), 1, 1, NULL),
3165    _OPI(EXPP, EX2, V(2,0), V(3,0), V(0,0), V(0,0), 1, 1, NULL),
3166    _OPI(LOGP, LG2, V(0,0), V(3,0), V(0,0), V(0,0), 1, 1, SPECIAL(LOG)),
3167    _OPI(CND,  NOP, V(0,0), V(0,0), V(0,0), V(1,4), 1, 3, SPECIAL(CND)),
3168
3169    _OPI(DEF, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 1, 0, SPECIAL(DEF)),
3170
3171    /* More tex stuff */
3172    _OPI(TEXREG2RGB,   TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXREG2RGB)),
3173    _OPI(TEXDP3TEX,    TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXDP3TEX)),
3174    _OPI(TEXM3x2DEPTH, TEX, V(0,0), V(0,0), V(1,3), V(1,3), 1, 1, SPECIAL(TEXM3x2DEPTH)),
3175    _OPI(TEXDP3,       TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXDP3)),
3176    _OPI(TEXM3x3,      TEX, V(0,0), V(0,0), V(1,2), V(1,3), 1, 1, SPECIAL(TEXM3x3)),
3177    _OPI(TEXDEPTH,     TEX, V(0,0), V(0,0), V(1,4), V(1,4), 1, 0, SPECIAL(TEXDEPTH)),
3178
3179    /* Misc */
3180    _OPI(CMP,    CMP,  V(0,0), V(0,0), V(1,2), V(3,0), 1, 3, SPECIAL(CMP)), /* reversed */
3181    _OPI(BEM,    NOP,  V(0,0), V(0,0), V(1,4), V(1,4), 1, 2, SPECIAL(BEM)),
3182    _OPI(DP2ADD, NOP,  V(0,0), V(0,0), V(2,0), V(3,0), 1, 3, SPECIAL(DP2ADD)),
3183    _OPI(DSX,    DDX,  V(0,0), V(0,0), V(2,1), V(3,0), 1, 1, NULL),
3184    _OPI(DSY,    DDY,  V(0,0), V(0,0), V(2,1), V(3,0), 1, 1, NULL),
3185    _OPI(TEXLDD, TXD,  V(0,0), V(0,0), V(2,1), V(3,0), 1, 4, SPECIAL(TEXLDD)),
3186    _OPI(SETP,   NOP,  V(0,0), V(3,0), V(2,1), V(3,0), 1, 2, SPECIAL(SETP)),
3187    _OPI(TEXLDL, TXL,  V(3,0), V(3,0), V(3,0), V(3,0), 1, 2, SPECIAL(TEXLDL)),
3188    _OPI(BREAKP, BRK,  V(0,0), V(3,0), V(2,1), V(3,0), 0, 1, SPECIAL(BREAKP))
3189};
3190
3191static const struct sm1_op_info inst_phase =
3192    _OPI(PHASE, NOP, V(0,0), V(0,0), V(1,4), V(1,4), 0, 0, SPECIAL(PHASE));
3193
3194static const struct sm1_op_info inst_comment =
3195    _OPI(COMMENT, NOP, V(0,0), V(3,0), V(0,0), V(3,0), 0, 0, SPECIAL(COMMENT));
3196
3197static void
3198create_op_info_map(struct shader_translator *tx)
3199{
3200    const unsigned version = (tx->version.major << 8) | tx->version.minor;
3201    unsigned i;
3202
3203    for (i = 0; i < ARRAY_SIZE(tx->op_info_map); ++i)
3204        tx->op_info_map[i] = -1;
3205
3206    if (tx->processor == PIPE_SHADER_VERTEX) {
3207        for (i = 0; i < ARRAY_SIZE(inst_table); ++i) {
3208            assert(inst_table[i].sio < ARRAY_SIZE(tx->op_info_map));
3209            if (inst_table[i].vert_version.min <= version &&
3210                inst_table[i].vert_version.max >= version)
3211                tx->op_info_map[inst_table[i].sio] = i;
3212        }
3213    } else {
3214        for (i = 0; i < ARRAY_SIZE(inst_table); ++i) {
3215            assert(inst_table[i].sio < ARRAY_SIZE(tx->op_info_map));
3216            if (inst_table[i].frag_version.min <= version &&
3217                inst_table[i].frag_version.max >= version)
3218                tx->op_info_map[inst_table[i].sio] = i;
3219        }
3220    }
3221}
3222
3223static inline HRESULT
3224NineTranslateInstruction_Generic(struct shader_translator *tx)
3225{
3226    struct ureg_dst dst[1];
3227    struct ureg_src src[4];
3228    unsigned i;
3229
3230    for (i = 0; i < tx->insn.ndst && i < ARRAY_SIZE(dst); ++i)
3231        dst[i] = tx_dst_param(tx, &tx->insn.dst[i]);
3232    for (i = 0; i < tx->insn.nsrc && i < ARRAY_SIZE(src); ++i)
3233        src[i] = tx_src_param(tx, &tx->insn.src[i]);
3234
3235    ureg_insn(tx->ureg, tx->insn.info->opcode,
3236              dst, tx->insn.ndst,
3237              src, tx->insn.nsrc, 0);
3238    return D3D_OK;
3239}
3240
3241static inline DWORD
3242TOKEN_PEEK(struct shader_translator *tx)
3243{
3244    return *(tx->parse);
3245}
3246
3247static inline DWORD
3248TOKEN_NEXT(struct shader_translator *tx)
3249{
3250    return *(tx->parse)++;
3251}
3252
3253static inline void
3254TOKEN_JUMP(struct shader_translator *tx)
3255{
3256    if (tx->parse_next && tx->parse != tx->parse_next) {
3257        WARN("parse(%p) != parse_next(%p) !\n", tx->parse, tx->parse_next);
3258        tx->parse = tx->parse_next;
3259    }
3260}
3261
3262static inline boolean
3263sm1_parse_eof(struct shader_translator *tx)
3264{
3265    return TOKEN_PEEK(tx) == NINED3DSP_END;
3266}
3267
3268static void
3269sm1_read_version(struct shader_translator *tx)
3270{
3271    const DWORD tok = TOKEN_NEXT(tx);
3272
3273    tx->version.major = D3DSHADER_VERSION_MAJOR(tok);
3274    tx->version.minor = D3DSHADER_VERSION_MINOR(tok);
3275
3276    switch (tok >> 16) {
3277    case NINED3D_SM1_VS: tx->processor = PIPE_SHADER_VERTEX; break;
3278    case NINED3D_SM1_PS: tx->processor = PIPE_SHADER_FRAGMENT; break;
3279    default:
3280       DBG("Invalid shader type: %x\n", tok);
3281       tx->processor = ~0;
3282       break;
3283    }
3284}
3285
3286/* This is just to check if we parsed the instruction properly. */
3287static void
3288sm1_parse_get_skip(struct shader_translator *tx)
3289{
3290    const DWORD tok = TOKEN_PEEK(tx);
3291
3292    if (tx->version.major >= 2) {
3293        tx->parse_next = tx->parse + 1 /* this */ +
3294            ((tok & D3DSI_INSTLENGTH_MASK) >> D3DSI_INSTLENGTH_SHIFT);
3295    } else {
3296        tx->parse_next = NULL; /* TODO: determine from param count */
3297    }
3298}
3299
3300static void
3301sm1_print_comment(const char *comment, UINT size)
3302{
3303    if (!size)
3304        return;
3305    /* TODO */
3306}
3307
3308static void
3309sm1_parse_comments(struct shader_translator *tx, BOOL print)
3310{
3311    DWORD tok = TOKEN_PEEK(tx);
3312
3313    while ((tok & D3DSI_OPCODE_MASK) == D3DSIO_COMMENT)
3314    {
3315        const char *comment = "";
3316        UINT size = (tok & D3DSI_COMMENTSIZE_MASK) >> D3DSI_COMMENTSIZE_SHIFT;
3317        tx->parse += size + 1;
3318
3319        if (print)
3320            sm1_print_comment(comment, size);
3321
3322        tok = TOKEN_PEEK(tx);
3323    }
3324}
3325
3326static void
3327sm1_parse_get_param(struct shader_translator *tx, DWORD *reg, DWORD *rel)
3328{
3329    *reg = TOKEN_NEXT(tx);
3330
3331    if (*reg & D3DSHADER_ADDRMODE_RELATIVE)
3332    {
3333        if (tx->version.major < 2)
3334            *rel = (1 << 31) |
3335                ((D3DSPR_ADDR << D3DSP_REGTYPE_SHIFT2) & D3DSP_REGTYPE_MASK2) |
3336                ((D3DSPR_ADDR << D3DSP_REGTYPE_SHIFT)  & D3DSP_REGTYPE_MASK) |
3337                D3DSP_NOSWIZZLE;
3338        else
3339            *rel = TOKEN_NEXT(tx);
3340    }
3341}
3342
3343static void
3344sm1_parse_dst_param(struct sm1_dst_param *dst, DWORD tok)
3345{
3346    int8_t shift;
3347    dst->file =
3348        (tok & D3DSP_REGTYPE_MASK)  >> D3DSP_REGTYPE_SHIFT |
3349        (tok & D3DSP_REGTYPE_MASK2) >> D3DSP_REGTYPE_SHIFT2;
3350    dst->type = TGSI_RETURN_TYPE_FLOAT;
3351    dst->idx = tok & D3DSP_REGNUM_MASK;
3352    dst->rel = NULL;
3353    dst->mask = (tok & NINED3DSP_WRITEMASK_MASK) >> NINED3DSP_WRITEMASK_SHIFT;
3354    dst->mod = (tok & D3DSP_DSTMOD_MASK) >> D3DSP_DSTMOD_SHIFT;
3355    shift = (tok & D3DSP_DSTSHIFT_MASK) >> D3DSP_DSTSHIFT_SHIFT;
3356    dst->shift = (shift & 0x7) - (shift & 0x8);
3357}
3358
3359static void
3360sm1_parse_src_param(struct sm1_src_param *src, DWORD tok)
3361{
3362    src->file =
3363        ((tok & D3DSP_REGTYPE_MASK)  >> D3DSP_REGTYPE_SHIFT) |
3364        ((tok & D3DSP_REGTYPE_MASK2) >> D3DSP_REGTYPE_SHIFT2);
3365    src->type = TGSI_RETURN_TYPE_FLOAT;
3366    src->idx = tok & D3DSP_REGNUM_MASK;
3367    src->rel = NULL;
3368    src->swizzle = (tok & D3DSP_SWIZZLE_MASK) >> D3DSP_SWIZZLE_SHIFT;
3369    src->mod = (tok & D3DSP_SRCMOD_MASK) >> D3DSP_SRCMOD_SHIFT;
3370
3371    switch (src->file) {
3372    case D3DSPR_CONST2: src->file = D3DSPR_CONST; src->idx += 2048; break;
3373    case D3DSPR_CONST3: src->file = D3DSPR_CONST; src->idx += 4096; break;
3374    case D3DSPR_CONST4: src->file = D3DSPR_CONST; src->idx += 6144; break;
3375    default:
3376        break;
3377    }
3378}
3379
3380static void
3381sm1_parse_immediate(struct shader_translator *tx,
3382                    struct sm1_src_param *imm)
3383{
3384    imm->file = NINED3DSPR_IMMEDIATE;
3385    imm->idx = INT_MIN;
3386    imm->rel = NULL;
3387    imm->swizzle = NINED3DSP_NOSWIZZLE;
3388    imm->mod = 0;
3389    switch (tx->insn.opcode) {
3390    case D3DSIO_DEF:
3391        imm->type = NINED3DSPTYPE_FLOAT4;
3392        memcpy(&imm->imm.d[0], tx->parse, 4 * sizeof(DWORD));
3393        tx->parse += 4;
3394        break;
3395    case D3DSIO_DEFI:
3396        imm->type = NINED3DSPTYPE_INT4;
3397        memcpy(&imm->imm.d[0], tx->parse, 4 * sizeof(DWORD));
3398        tx->parse += 4;
3399        break;
3400    case D3DSIO_DEFB:
3401        imm->type = NINED3DSPTYPE_BOOL;
3402        memcpy(&imm->imm.d[0], tx->parse, 1 * sizeof(DWORD));
3403        tx->parse += 1;
3404        break;
3405    default:
3406       assert(0);
3407       break;
3408    }
3409}
3410
3411static void
3412sm1_read_dst_param(struct shader_translator *tx,
3413                   struct sm1_dst_param *dst,
3414                   struct sm1_src_param *rel)
3415{
3416    DWORD tok_dst, tok_rel = 0;
3417
3418    sm1_parse_get_param(tx, &tok_dst, &tok_rel);
3419    sm1_parse_dst_param(dst, tok_dst);
3420    if (tok_dst & D3DSHADER_ADDRMODE_RELATIVE) {
3421        sm1_parse_src_param(rel, tok_rel);
3422        dst->rel = rel;
3423    }
3424}
3425
3426static void
3427sm1_read_src_param(struct shader_translator *tx,
3428                   struct sm1_src_param *src,
3429                   struct sm1_src_param *rel)
3430{
3431    DWORD tok_src, tok_rel = 0;
3432
3433    sm1_parse_get_param(tx, &tok_src, &tok_rel);
3434    sm1_parse_src_param(src, tok_src);
3435    if (tok_src & D3DSHADER_ADDRMODE_RELATIVE) {
3436        assert(rel);
3437        sm1_parse_src_param(rel, tok_rel);
3438        src->rel = rel;
3439    }
3440}
3441
3442static void
3443sm1_read_semantic(struct shader_translator *tx,
3444                  struct sm1_semantic *sem)
3445{
3446    const DWORD tok_usg = TOKEN_NEXT(tx);
3447    const DWORD tok_dst = TOKEN_NEXT(tx);
3448
3449    sem->sampler_type = (tok_usg & D3DSP_TEXTURETYPE_MASK) >> D3DSP_TEXTURETYPE_SHIFT;
3450    sem->usage = (tok_usg & D3DSP_DCL_USAGE_MASK) >> D3DSP_DCL_USAGE_SHIFT;
3451    sem->usage_idx = (tok_usg & D3DSP_DCL_USAGEINDEX_MASK) >> D3DSP_DCL_USAGEINDEX_SHIFT;
3452
3453    sm1_parse_dst_param(&sem->reg, tok_dst);
3454}
3455
3456static void
3457sm1_parse_instruction(struct shader_translator *tx)
3458{
3459    struct sm1_instruction *insn = &tx->insn;
3460    HRESULT hr;
3461    DWORD tok;
3462    const struct sm1_op_info *info = NULL;
3463    unsigned i;
3464
3465    sm1_parse_comments(tx, TRUE);
3466    sm1_parse_get_skip(tx);
3467
3468    tok = TOKEN_NEXT(tx);
3469
3470    insn->opcode = tok & D3DSI_OPCODE_MASK;
3471    insn->flags = (tok & NINED3DSIO_OPCODE_FLAGS_MASK) >> NINED3DSIO_OPCODE_FLAGS_SHIFT;
3472    insn->coissue = !!(tok & D3DSI_COISSUE);
3473    insn->predicated = !!(tok & NINED3DSHADER_INST_PREDICATED);
3474
3475    if (insn->opcode < ARRAY_SIZE(tx->op_info_map)) {
3476        int k = tx->op_info_map[insn->opcode];
3477        if (k >= 0) {
3478            assert(k < ARRAY_SIZE(inst_table));
3479            info = &inst_table[k];
3480        }
3481    } else {
3482       if (insn->opcode == D3DSIO_PHASE)   info = &inst_phase;
3483       if (insn->opcode == D3DSIO_COMMENT) info = &inst_comment;
3484    }
3485    if (!info) {
3486       DBG("illegal or unhandled opcode: %08x\n", insn->opcode);
3487       TOKEN_JUMP(tx);
3488       return;
3489    }
3490    insn->info = info;
3491    insn->ndst = info->ndst;
3492    insn->nsrc = info->nsrc;
3493
3494    /* check version */
3495    {
3496        unsigned min = IS_VS ? info->vert_version.min : info->frag_version.min;
3497        unsigned max = IS_VS ? info->vert_version.max : info->frag_version.max;
3498        unsigned ver = (tx->version.major << 8) | tx->version.minor;
3499        if (ver < min || ver > max) {
3500            DBG("opcode not supported in this shader version: %x <= %x <= %x\n",
3501                min, ver, max);
3502            return;
3503        }
3504    }
3505
3506    for (i = 0; i < insn->ndst; ++i)
3507        sm1_read_dst_param(tx, &insn->dst[i], &insn->dst_rel[i]);
3508    if (insn->predicated)
3509        sm1_read_src_param(tx, &insn->pred, NULL);
3510    for (i = 0; i < insn->nsrc; ++i)
3511        sm1_read_src_param(tx, &insn->src[i], &insn->src_rel[i]);
3512
3513    /* parse here so we can dump them before processing */
3514    if (insn->opcode == D3DSIO_DEF ||
3515        insn->opcode == D3DSIO_DEFI ||
3516        insn->opcode == D3DSIO_DEFB)
3517        sm1_parse_immediate(tx, &tx->insn.src[0]);
3518
3519    sm1_dump_instruction(insn, tx->cond_depth + tx->loop_depth);
3520    sm1_instruction_check(insn);
3521
3522    if (insn->predicated) {
3523        tx->predicated_activated = true;
3524        if (ureg_dst_is_undef(tx->regs.predicate_tmp)) {
3525            tx->regs.predicate_tmp = ureg_DECL_temporary(tx->ureg);
3526            tx->regs.predicate_dst = ureg_DECL_temporary(tx->ureg);
3527        }
3528    }
3529
3530    if (info->handler)
3531        hr = info->handler(tx);
3532    else
3533        hr = NineTranslateInstruction_Generic(tx);
3534    tx_apply_dst0_modifiers(tx);
3535
3536    if (insn->predicated) {
3537        tx->predicated_activated = false;
3538        /* TODO: predicate might be allowed on outputs,
3539         * which cannot be src. Workaround it. */
3540        ureg_CMP(tx->ureg, tx->regs.predicate_dst,
3541                 ureg_negate(tx_src_param(tx, &insn->pred)),
3542                 ureg_src(tx->regs.predicate_tmp),
3543                 ureg_src(tx->regs.predicate_dst));
3544    }
3545
3546    if (hr != D3D_OK)
3547        tx->failure = TRUE;
3548    tx->num_scratch = 0; /* reset */
3549
3550    TOKEN_JUMP(tx);
3551}
3552
/* Query helpers: both expect a variable named 'screen' in scope, and
 * GET_SHADER_CAP additionally expects 'info' (its type field selects the
 * shader stage). */
#define GET_CAP(n) \
    screen->get_param(screen, PIPE_CAP_##n)
#define GET_SHADER_CAP(n) \
    screen->get_shader_param(screen, info->type, PIPE_SHADER_CAP_##n)
3557
3558static HRESULT
3559tx_ctor(struct shader_translator *tx, struct pipe_screen *screen, struct nine_shader_info *info)
3560{
3561    unsigned i;
3562
3563    memset(tx, 0, sizeof(*tx));
3564
3565    tx->info = info;
3566
3567    tx->byte_code = info->byte_code;
3568    tx->parse = info->byte_code;
3569
3570    for (i = 0; i < ARRAY_SIZE(info->input_map); ++i)
3571        info->input_map[i] = NINE_DECLUSAGE_NONE;
3572    info->num_inputs = 0;
3573
3574    info->position_t = FALSE;
3575    info->point_size = FALSE;
3576
3577    memset(tx->slots_used, 0, sizeof(tx->slots_used));
3578    memset(info->int_slots_used, 0, sizeof(info->int_slots_used));
3579    memset(info->bool_slots_used, 0, sizeof(info->bool_slots_used));
3580
3581    tx->info->const_float_slots = 0;
3582    tx->info->const_int_slots = 0;
3583    tx->info->const_bool_slots = 0;
3584
3585    info->sampler_mask = 0x0;
3586    info->rt_mask = 0x0;
3587
3588    info->lconstf.data = NULL;
3589    info->lconstf.ranges = NULL;
3590
3591    info->bumpenvmat_needed = 0;
3592
3593    for (i = 0; i < ARRAY_SIZE(tx->regs.rL); ++i) {
3594        tx->regs.rL[i] = ureg_dst_undef();
3595    }
3596    tx->regs.address = ureg_dst_undef();
3597    tx->regs.a0 = ureg_dst_undef();
3598    tx->regs.p = ureg_dst_undef();
3599    tx->regs.oDepth = ureg_dst_undef();
3600    tx->regs.vPos = ureg_src_undef();
3601    tx->regs.vFace = ureg_src_undef();
3602    for (i = 0; i < ARRAY_SIZE(tx->regs.o); ++i)
3603        tx->regs.o[i] = ureg_dst_undef();
3604    for (i = 0; i < ARRAY_SIZE(tx->regs.oCol); ++i)
3605        tx->regs.oCol[i] = ureg_dst_undef();
3606    for (i = 0; i < ARRAY_SIZE(tx->regs.vC); ++i)
3607        tx->regs.vC[i] = ureg_src_undef();
3608    for (i = 0; i < ARRAY_SIZE(tx->regs.vT); ++i)
3609        tx->regs.vT[i] = ureg_src_undef();
3610
3611    sm1_read_version(tx);
3612
3613    info->version = (tx->version.major << 4) | tx->version.minor;
3614
3615    tx->num_outputs = 0;
3616
3617    create_op_info_map(tx);
3618
3619    tx->ureg = ureg_create(info->type);
3620    if (!tx->ureg) {
3621        return E_OUTOFMEMORY;
3622    }
3623
3624    tx->native_integers = GET_SHADER_CAP(INTEGERS);
3625    tx->inline_subroutines = !GET_SHADER_CAP(SUBROUTINES);
3626    tx->want_texcoord = GET_CAP(TGSI_TEXCOORD);
3627    tx->shift_wpos = !GET_CAP(FS_COORD_PIXEL_CENTER_INTEGER);
3628    tx->texcoord_sn = tx->want_texcoord ?
3629        TGSI_SEMANTIC_TEXCOORD : TGSI_SEMANTIC_GENERIC;
3630    tx->wpos_is_sysval = GET_CAP(FS_POSITION_IS_SYSVAL);
3631    tx->face_is_sysval_integer = GET_CAP(FS_FACE_IS_INTEGER_SYSVAL);
3632
3633    if (IS_VS) {
3634        tx->num_constf_allowed = NINE_MAX_CONST_F;
3635    } else if (tx->version.major < 2) {/* IS_PS v1 */
3636        tx->num_constf_allowed = 8;
3637    } else if (tx->version.major == 2) {/* IS_PS v2 */
3638        tx->num_constf_allowed = 32;
3639    } else {/* IS_PS v3 */
3640        tx->num_constf_allowed = NINE_MAX_CONST_F_PS3;
3641    }
3642
3643    if (tx->version.major < 2) {
3644        tx->num_consti_allowed = 0;
3645        tx->num_constb_allowed = 0;
3646    } else {
3647        tx->num_consti_allowed = NINE_MAX_CONST_I;
3648        tx->num_constb_allowed = NINE_MAX_CONST_B;
3649    }
3650
3651    if (info->swvp_on) {
3652        /* TODO: The values tx->version.major == 1 */
3653        tx->num_constf_allowed = 8192;
3654        tx->num_consti_allowed = 2048;
3655        tx->num_constb_allowed = 2048;
3656    }
3657
3658    /* VS must always write position. Declare it here to make it the 1st output.
3659     * (Some drivers like nv50 are buggy and rely on that.)
3660     */
3661    if (IS_VS) {
3662        tx->regs.oPos = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_POSITION, 0);
3663    } else {
3664        ureg_property(tx->ureg, TGSI_PROPERTY_FS_COORD_ORIGIN, TGSI_FS_COORD_ORIGIN_UPPER_LEFT);
3665        if (!tx->shift_wpos)
3666            ureg_property(tx->ureg, TGSI_PROPERTY_FS_COORD_PIXEL_CENTER, TGSI_FS_COORD_PIXEL_CENTER_INTEGER);
3667    }
3668
3669    tx->mul_zero_wins = GET_CAP(LEGACY_MATH_RULES);
3670    if (tx->mul_zero_wins)
3671       ureg_property(tx->ureg, TGSI_PROPERTY_LEGACY_MATH_RULES, 1);
3672
3673    /* Add additional definition of constants */
3674    if (info->add_constants_defs.c_combination) {
3675        unsigned i;
3676
3677        assert(info->add_constants_defs.int_const_added);
3678        assert(info->add_constants_defs.bool_const_added);
3679        /* We only add constants that are used by the shader
3680         * and that are not defined in the shader */
3681        for (i = 0; i < NINE_MAX_CONST_I; ++i) {
3682            if ((*info->add_constants_defs.int_const_added)[i]) {
3683                DBG("Defining const i%i : { %i %i %i %i }\n", i,
3684                    info->add_constants_defs.c_combination->const_i[i][0],
3685                    info->add_constants_defs.c_combination->const_i[i][1],
3686                    info->add_constants_defs.c_combination->const_i[i][2],
3687                    info->add_constants_defs.c_combination->const_i[i][3]);
3688                tx_set_lconsti(tx, i, info->add_constants_defs.c_combination->const_i[i]);
3689            }
3690        }
3691        for (i = 0; i < NINE_MAX_CONST_B; ++i) {
3692            if ((*info->add_constants_defs.bool_const_added)[i]) {
3693                DBG("Defining const b%i : %i\n", i, (int)(info->add_constants_defs.c_combination->const_b[i] != 0));
3694                tx_set_lconstb(tx, i, info->add_constants_defs.c_combination->const_b[i]);
3695            }
3696        }
3697    }
3698    return D3D_OK;
3699}
3700
3701static void
3702tx_dtor(struct shader_translator *tx)
3703{
3704    if (tx->slot_map)
3705        FREE(tx->slot_map);
3706    if (tx->num_inst_labels)
3707        FREE(tx->inst_labels);
3708    FREE(tx->lconstf);
3709    FREE(tx->regs.r);
3710    FREE(tx);
3711}
3712
3713/* CONST[0].xyz = width/2, -height/2, zmax-zmin
3714 * CONST[1].xyz = x+width/2, y+height/2, zmin */
3715static void
3716shader_add_vs_viewport_transform(struct shader_translator *tx)
3717{
3718    struct ureg_program *ureg = tx->ureg;
3719    struct ureg_src c0 = ureg_src_register(TGSI_FILE_CONSTANT, 0);
3720    struct ureg_src c1 = ureg_src_register(TGSI_FILE_CONSTANT, 1);
3721    /* struct ureg_dst pos_tmp = ureg_DECL_temporary(ureg);*/
3722
3723    c0 = ureg_src_dimension(c0, 4);
3724    c1 = ureg_src_dimension(c1, 4);
3725    /* TODO: find out when we need to apply the viewport transformation or not.
3726     * Likely will be XYZ vs XYZRHW in vdecl_out
3727     * ureg_MUL(ureg, ureg_writemask(pos_tmp, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oPos), c0);
3728     * ureg_ADD(ureg, ureg_writemask(tx->regs.oPos_out, TGSI_WRITEMASK_XYZ), ureg_src(pos_tmp), c1);
3729     */
3730    ureg_MOV(ureg, ureg_writemask(tx->regs.oPos_out, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oPos));
3731}
3732
3733static void
3734shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
3735{
3736    struct ureg_program *ureg = tx->ureg;
3737    struct ureg_dst oCol0 = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
3738    struct ureg_src fog_end, fog_coeff, fog_density, fog_params;
3739    struct ureg_src fog_vs, fog_color;
3740    struct ureg_dst fog_factor, depth;
3741
3742    if (!tx->info->fog_enable) {
3743        ureg_MOV(ureg, oCol0, src_col);
3744        return;
3745    }
3746
3747    if (tx->info->fog_mode != D3DFOG_NONE) {
3748        depth = tx_scratch_scalar(tx);
3749        /* Depth used for fog is perspective interpolated */
3750        ureg_RCP(ureg, depth, ureg_scalar(nine_get_position_input(tx), TGSI_SWIZZLE_W));
3751        ureg_MUL(ureg, depth, ureg_src(depth), ureg_scalar(nine_get_position_input(tx), TGSI_SWIZZLE_Z));
3752    }
3753
3754    fog_color = nine_float_constant_src(tx, 32);
3755    fog_params = nine_float_constant_src(tx, 33);
3756    fog_factor = tx_scratch_scalar(tx);
3757
3758    if (tx->info->fog_mode == D3DFOG_LINEAR) {
3759        fog_end = NINE_APPLY_SWIZZLE(fog_params, X);
3760        fog_coeff = NINE_APPLY_SWIZZLE(fog_params, Y);
3761        ureg_ADD(ureg, fog_factor, fog_end, ureg_negate(ureg_src(depth)));
3762        ureg_MUL(ureg, ureg_saturate(fog_factor), tx_src_scalar(fog_factor), fog_coeff);
3763    } else if (tx->info->fog_mode == D3DFOG_EXP) {
3764        fog_density = NINE_APPLY_SWIZZLE(fog_params, X);
3765        ureg_MUL(ureg, fog_factor, ureg_src(depth), fog_density);
3766        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), ureg_imm1f(ureg, -1.442695f));
3767        ureg_EX2(ureg, fog_factor, tx_src_scalar(fog_factor));
3768    } else if (tx->info->fog_mode == D3DFOG_EXP2) {
3769        fog_density = NINE_APPLY_SWIZZLE(fog_params, X);
3770        ureg_MUL(ureg, fog_factor, ureg_src(depth), fog_density);
3771        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), tx_src_scalar(fog_factor));
3772        ureg_MUL(ureg, fog_factor, tx_src_scalar(fog_factor), ureg_imm1f(ureg, -1.442695f));
3773        ureg_EX2(ureg, fog_factor, tx_src_scalar(fog_factor));
3774    } else {
3775        fog_vs = ureg_scalar(ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 16,
3776                                            TGSI_INTERPOLATE_PERSPECTIVE),
3777                                            TGSI_SWIZZLE_X);
3778        ureg_MOV(ureg, fog_factor, fog_vs);
3779    }
3780
3781    ureg_LRP(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_XYZ),
3782             tx_src_scalar(fog_factor), src_col, fog_color);
3783    ureg_MOV(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_W), src_col);
3784}
3785
3786static void parse_shader(struct shader_translator *tx)
3787{
3788    struct nine_shader_info *info = tx->info;
3789
3790    while (!sm1_parse_eof(tx) && !tx->failure)
3791        sm1_parse_instruction(tx);
3792    tx->parse++; /* for byte_size */
3793
3794    if (tx->failure)
3795        return;
3796
3797    if (IS_PS && tx->version.major < 3) {
3798        if (tx->version.major < 2) {
3799            assert(tx->num_temp); /* there must be color output */
3800            info->rt_mask |= 0x1;
3801            shader_add_ps_fog_stage(tx, ureg_src(tx->regs.r[0]));
3802        } else {
3803            shader_add_ps_fog_stage(tx, ureg_src(tx->regs.oCol[0]));
3804        }
3805    }
3806
3807    if (IS_VS && tx->version.major < 3 && ureg_dst_is_undef(tx->regs.oFog) && info->fog_enable) {
3808        tx->regs.oFog = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_GENERIC, 16);
3809        ureg_MOV(tx->ureg, ureg_writemask(tx->regs.oFog, TGSI_WRITEMASK_X), ureg_imm1f(tx->ureg, 0.0f));
3810    }
3811
3812    if (info->position_t)
3813        ureg_property(tx->ureg, TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION, TRUE);
3814
3815    if (IS_VS && !ureg_dst_is_undef(tx->regs.oPts)) {
3816        struct ureg_dst oPts = ureg_DECL_output(tx->ureg, TGSI_SEMANTIC_PSIZE, 0);
3817        ureg_MAX(tx->ureg, tx->regs.oPts, ureg_src(tx->regs.oPts), ureg_imm1f(tx->ureg, info->point_size_min));
3818        ureg_MIN(tx->ureg, oPts, ureg_src(tx->regs.oPts), ureg_imm1f(tx->ureg, info->point_size_max));
3819        info->point_size = TRUE;
3820    }
3821
3822    if (info->process_vertices)
3823        shader_add_vs_viewport_transform(tx);
3824
3825    ureg_END(tx->ureg);
3826}
3827
/* Bit flags accepted by nine_shader_get_debug_flag(). Each bit corresponds to
 * one entry of nine_shader_debug_options below. */
#define NINE_SHADER_DEBUG_OPTION_NIR_VS           (1 << 0)
#define NINE_SHADER_DEBUG_OPTION_NIR_PS           (1 << 1)
#define NINE_SHADER_DEBUG_OPTION_NO_NIR_VS        (1 << 2)
#define NINE_SHADER_DEBUG_OPTION_NO_NIR_PS        (1 << 3)
#define NINE_SHADER_DEBUG_OPTION_DUMP_NIR         (1 << 4)
#define NINE_SHADER_DEBUG_OPTION_DUMP_TGSI        (1 << 5)

/* Name / flag / description table handed to debug_get_flags_option() to parse
 * the NINE_SHADER option. */
static const struct debug_named_value nine_shader_debug_options[] = {
    { "nir_vs", NINE_SHADER_DEBUG_OPTION_NIR_VS, "Use NIR for vertex shaders even if the driver doesn't prefer it." },
    { "nir_ps", NINE_SHADER_DEBUG_OPTION_NIR_PS, "Use NIR for pixel shaders even if the driver doesn't prefer it." },
    { "no_nir_vs", NINE_SHADER_DEBUG_OPTION_NO_NIR_VS, "Never use NIR for vertex shaders even if the driver prefers it." },
    { "no_nir_ps", NINE_SHADER_DEBUG_OPTION_NO_NIR_PS, "Never use NIR for pixel shaders even if the driver prefers it." },
    { "dump_nir", NINE_SHADER_DEBUG_OPTION_DUMP_NIR, "Print translated NIR shaders." },
    { "dump_tgsi", NINE_SHADER_DEBUG_OPTION_DUMP_TGSI, "Print TGSI shaders." },
    DEBUG_NAMED_VALUE_END /* must be last */
};
3844
3845static inline boolean
3846nine_shader_get_debug_flag(uint64_t flag)
3847{
3848    static uint64_t flags = 0;
3849    static boolean first_run = TRUE;
3850
3851    if (unlikely(first_run)) {
3852        first_run = FALSE;
3853        flags = debug_get_flags_option("NINE_SHADER", nine_shader_debug_options, 0);
3854
3855        // Check old TGSI dump envvar too
3856        if (debug_get_bool_option("NINE_TGSI_DUMP", FALSE)) {
3857            flags |= NINE_SHADER_DEBUG_OPTION_DUMP_TGSI;
3858        }
3859    }
3860
3861    return !!(flags & flag);
3862}
3863
3864static void
3865nine_pipe_nir_shader_state_from_tgsi(struct pipe_shader_state *state, const struct tgsi_token *tgsi_tokens,
3866                                     struct pipe_screen *screen)
3867{
3868    struct nir_shader *nir = tgsi_to_nir(tgsi_tokens, screen, screen->get_disk_shader_cache != NULL);
3869
3870    if (unlikely(nine_shader_get_debug_flag(NINE_SHADER_DEBUG_OPTION_DUMP_NIR))) {
3871        nir_print_shader(nir, stdout);
3872    }
3873
3874    state->type = PIPE_SHADER_IR_NIR;
3875    state->tokens = NULL;
3876    state->ir.nir = nir;
3877    memset(&state->stream_output, 0, sizeof(state->stream_output));
3878}
3879
3880static void *
3881nine_ureg_create_shader(struct ureg_program                  *ureg,
3882                        struct pipe_context                  *pipe,
3883                        const struct pipe_stream_output_info   *so)
3884{
3885    struct pipe_shader_state state;
3886    const struct tgsi_token *tgsi_tokens;
3887    struct pipe_screen *screen = pipe->screen;
3888
3889    tgsi_tokens = ureg_finalize(ureg);
3890    if (!tgsi_tokens)
3891        return NULL;
3892
3893    assert(((struct tgsi_header *) &tgsi_tokens[0])->HeaderSize >= 2);
3894    enum pipe_shader_type shader_type = ((struct tgsi_processor *) &tgsi_tokens[1])->Processor;
3895
3896    int preferred_ir = screen->get_shader_param(screen, shader_type, PIPE_SHADER_CAP_PREFERRED_IR);
3897    bool prefer_nir = (preferred_ir == PIPE_SHADER_IR_NIR);
3898    bool use_nir = prefer_nir ||
3899        ((shader_type == PIPE_SHADER_VERTEX) && nine_shader_get_debug_flag(NINE_SHADER_DEBUG_OPTION_NIR_VS)) ||
3900        ((shader_type == PIPE_SHADER_FRAGMENT) && nine_shader_get_debug_flag(NINE_SHADER_DEBUG_OPTION_NIR_PS));
3901
3902    /* Allow user to override preferred IR, this is very useful for debugging */
3903    if (unlikely(shader_type == PIPE_SHADER_VERTEX && nine_shader_get_debug_flag(NINE_SHADER_DEBUG_OPTION_NO_NIR_VS)))
3904        use_nir = false;
3905    if (unlikely(shader_type == PIPE_SHADER_FRAGMENT && nine_shader_get_debug_flag(NINE_SHADER_DEBUG_OPTION_NO_NIR_PS)))
3906        use_nir = false;
3907
3908    DUMP("shader type: %s, preferred IR: %s, selected IR: %s\n",
3909         shader_type == PIPE_SHADER_VERTEX ? "VS" : "PS",
3910         prefer_nir ? "NIR" : "TGSI",
3911         use_nir ? "NIR" : "TGSI");
3912
3913    if (use_nir) {
3914        nine_pipe_nir_shader_state_from_tgsi(&state, tgsi_tokens, screen);
3915    } else {
3916        pipe_shader_state_from_tgsi(&state, tgsi_tokens);
3917    }
3918
3919    assert(state.tokens || state.ir.nir);
3920
3921    if (so)
3922        state.stream_output = *so;
3923
3924    switch (shader_type) {
3925    case PIPE_SHADER_VERTEX:
3926        return pipe->create_vs_state(pipe, &state);
3927    case PIPE_SHADER_FRAGMENT:
3928        return pipe->create_fs_state(pipe, &state);
3929    default:
3930        unreachable("unsupported shader type");
3931    }
3932}
3933
3934
/*
 * Create a driver shader from the ureg program p, then destroy p.
 *
 * so is the optional stream output description forwarded to the shader
 * creation. The program is destroyed whether or not creation succeeded, so
 * the caller must not use p afterwards.
 *
 * Returns the value returned by nine_ureg_create_shader() (NULL on failure).
 */
void *
nine_create_shader_with_so_and_destroy(struct ureg_program                   *p,
                                       struct pipe_context                *pipe,
                                       const struct pipe_stream_output_info *so)
{
    void *cso;

    cso = nine_ureg_create_shader(p, pipe, so);
    ureg_destroy(p);

    return cso;
}
3944
3945HRESULT
3946nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info, struct pipe_context *pipe)
3947{
3948    struct shader_translator *tx;
3949    HRESULT hr = D3D_OK;
3950    const unsigned processor = info->type;
3951    struct pipe_screen *screen = info->process_vertices ? device->screen_sw : device->screen;
3952    unsigned *const_ranges = NULL;
3953
3954    user_assert(processor != ~0, D3DERR_INVALIDCALL);
3955
3956    tx = MALLOC_STRUCT(shader_translator);
3957    if (!tx)
3958        return E_OUTOFMEMORY;
3959
3960    if (tx_ctor(tx, screen, info) == E_OUTOFMEMORY) {
3961        hr = E_OUTOFMEMORY;
3962        goto out;
3963    }
3964
3965    assert(IS_VS || !info->swvp_on);
3966
3967    if (((tx->version.major << 16) | tx->version.minor) > 0x00030000) {
3968        hr = D3DERR_INVALIDCALL;
3969        DBG("Unsupported shader version: %u.%u !\n",
3970            tx->version.major, tx->version.minor);
3971        goto out;
3972    }
3973    if (tx->processor != processor) {
3974        hr = D3DERR_INVALIDCALL;
3975        DBG("Shader type mismatch: %u / %u !\n", tx->processor, processor);
3976        goto out;
3977    }
3978    DUMP("%s%u.%u\n", processor == PIPE_SHADER_VERTEX ? "VS" : "PS",
3979         tx->version.major, tx->version.minor);
3980
3981    parse_shader(tx);
3982
3983    if (tx->failure) {
3984        /* For VS shaders, we print the warning later,
3985         * we first try with swvp. */
3986        if (IS_PS)
3987            ERR("Encountered buggy shader\n");
3988        ureg_destroy(tx->ureg);
3989        hr = D3DERR_INVALIDCALL;
3990        goto out;
3991    }
3992
3993    /* Recompile after compacting constant slots if possible */
3994    if (!tx->indirect_const_access && !info->swvp_on && tx->num_slots > 0) {
3995        unsigned *slot_map;
3996        unsigned c;
3997        int i, j, num_ranges, prev;
3998
3999        DBG("Recompiling shader for constant compaction\n");
4000        ureg_destroy(tx->ureg);
4001
4002        if (tx->num_inst_labels)
4003            FREE(tx->inst_labels);
4004        FREE(tx->lconstf);
4005        FREE(tx->regs.r);
4006
4007        num_ranges = 0;
4008        prev = -2;
4009        for (i = 0; i < NINE_MAX_CONST_ALL; i++) {
4010            if (tx->slots_used[i]) {
4011                if (prev != i - 1)
4012                    num_ranges++;
4013                prev = i;
4014            }
4015        }
4016        slot_map = MALLOC(NINE_MAX_CONST_ALL * sizeof(unsigned));
4017        const_ranges = CALLOC(num_ranges + 1, 2 * sizeof(unsigned)); /* ranges stop when last is of size 0 */
4018        if (!slot_map || !const_ranges) {
4019            hr = E_OUTOFMEMORY;
4020            goto out;
4021        }
4022        c = 0;
4023        j = -1;
4024        prev = -2;
4025        for (i = 0; i < NINE_MAX_CONST_ALL; i++) {
4026            if (tx->slots_used[i]) {
4027                if (prev != i - 1)
4028                    j++;
4029                /* Initialize first slot of the range */
4030                if (!const_ranges[2*j+1])
4031                    const_ranges[2*j] = i;
4032                const_ranges[2*j+1]++;
4033                prev = i;
4034                slot_map[i] = c++;
4035            }
4036        }
4037
4038        if (tx_ctor(tx, screen, info) == E_OUTOFMEMORY) {
4039            hr = E_OUTOFMEMORY;
4040            goto out;
4041        }
4042        tx->slot_map = slot_map;
4043        parse_shader(tx);
4044        assert(!tx->failure);
4045#if !defined(NDEBUG)
4046        i = 0;
4047        j = 0;
4048        while (const_ranges[i*2+1] != 0) {
4049            j += const_ranges[i*2+1];
4050            i++;
4051        }
4052        assert(j == tx->num_slots);
4053#endif
4054    }
4055
4056    /* record local constants */
4057    if (tx->num_lconstf && tx->indirect_const_access) {
4058        struct nine_range *ranges;
4059        float *data;
4060        int *indices;
4061        unsigned i, k, n;
4062
4063        hr = E_OUTOFMEMORY;
4064
4065        data = MALLOC(tx->num_lconstf * 4 * sizeof(float));
4066        if (!data)
4067            goto out;
4068        info->lconstf.data = data;
4069
4070        indices = MALLOC(tx->num_lconstf * sizeof(indices[0]));
4071        if (!indices)
4072            goto out;
4073
4074        /* lazy sort, num_lconstf should be small */
4075        for (n = 0; n < tx->num_lconstf; ++n) {
4076            for (k = 0, i = 0; i < tx->num_lconstf; ++i) {
4077                if (tx->lconstf[i].idx < tx->lconstf[k].idx)
4078                    k = i;
4079            }
4080            indices[n] = tx->lconstf[k].idx;
4081            memcpy(&data[n * 4], &tx->lconstf[k].f[0], 4 * sizeof(float));
4082            tx->lconstf[k].idx = INT_MAX;
4083        }
4084
4085        /* count ranges */
4086        for (n = 1, i = 1; i < tx->num_lconstf; ++i)
4087            if (indices[i] != indices[i - 1] + 1)
4088                ++n;
4089        ranges = MALLOC(n * sizeof(ranges[0]));
4090        if (!ranges) {
4091            FREE(indices);
4092            goto out;
4093        }
4094        info->lconstf.ranges = ranges;
4095
4096        k = 0;
4097        ranges[k].bgn = indices[0];
4098        for (i = 1; i < tx->num_lconstf; ++i) {
4099            if (indices[i] != indices[i - 1] + 1) {
4100                ranges[k].next = &ranges[k + 1];
4101                ranges[k].end = indices[i - 1] + 1;
4102                ++k;
4103                ranges[k].bgn = indices[i];
4104            }
4105        }
4106        ranges[k].end = indices[i - 1] + 1;
4107        ranges[k].next = NULL;
4108        assert(n == (k + 1));
4109
4110        FREE(indices);
4111        hr = D3D_OK;
4112    }
4113
4114    /* r500 */
4115    if (info->const_float_slots > device->max_vs_const_f &&
4116        (info->const_int_slots || info->const_bool_slots) &&
4117        !info->swvp_on)
4118        ERR("Overlapping constant slots. The shader is likely to be buggy\n");
4119
4120
4121    if (tx->indirect_const_access) { /* vs only */
4122        info->const_float_slots = device->max_vs_const_f;
4123        tx->num_slots = MAX2(tx->num_slots, device->max_vs_const_f);
4124    }
4125
4126    if (!info->swvp_on) {
4127        info->const_used_size = sizeof(float[4]) * tx->num_slots;
4128        if (tx->num_slots)
4129            ureg_DECL_constant2D(tx->ureg, 0, tx->num_slots-1, 0);
4130    } else {
4131         ureg_DECL_constant2D(tx->ureg, 0, 4095, 0);
4132         ureg_DECL_constant2D(tx->ureg, 0, 4095, 1);
4133         ureg_DECL_constant2D(tx->ureg, 0, 2047, 2);
4134         ureg_DECL_constant2D(tx->ureg, 0, 511, 3);
4135    }
4136
4137    if (info->process_vertices)
4138        ureg_DECL_constant2D(tx->ureg, 0, 2, 4); /* Viewport data */
4139
4140    if (unlikely(nine_shader_get_debug_flag(NINE_SHADER_DEBUG_OPTION_DUMP_TGSI))) {
4141        const struct tgsi_token *toks = ureg_get_tokens(tx->ureg, NULL);
4142        tgsi_dump(toks, 0);
4143        ureg_free_tokens(toks);
4144    }
4145
4146    if (info->process_vertices) {
4147        NineVertexDeclaration9_FillStreamOutputInfo(info->vdecl_out,
4148                                                    tx->output_info,
4149                                                    tx->num_outputs,
4150                                                    &(info->so));
4151        info->cso = nine_create_shader_with_so_and_destroy(tx->ureg, pipe, &(info->so));
4152    } else
4153        info->cso = nine_create_shader_with_so_and_destroy(tx->ureg, pipe, NULL);
4154    if (!info->cso) {
4155        hr = D3DERR_DRIVERINTERNALERROR;
4156        FREE(info->lconstf.data);
4157        FREE(info->lconstf.ranges);
4158        goto out;
4159    }
4160
4161    info->const_ranges = const_ranges;
4162    const_ranges = NULL;
4163    info->byte_size = (tx->parse - tx->byte_code) * sizeof(DWORD);
4164out:
4165    if (const_ranges)
4166        FREE(const_ranges);
4167    tx_dtor(tx);
4168    return hr;
4169}
4170