1/* 2 * 3GPP TS 26.245 Timed Text decoder 3 * Copyright (c) 2012 Philip Langdale <philipl@overt.org> 4 * 5 * This file is part of FFmpeg. 6 * 7 * FFmpeg is free software; you can redistribute it and/or 8 * modify it under the terms of the GNU Lesser General Public 9 * License as published by the Free Software Foundation; either 10 * version 2.1 of the License, or (at your option) any later version. 11 * 12 * FFmpeg is distributed in the hope that it will be useful, 13 * but WITHOUT ANY WARRANTY; without even the implied warranty of 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15 * Lesser General Public License for more details. 16 * 17 * You should have received a copy of the GNU Lesser General Public 18 * License along with FFmpeg; if not, write to the Free Software 19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 20 */ 21 22#include "avcodec.h" 23#include "ass.h" 24#include "libavutil/opt.h" 25#include "libavutil/avstring.h" 26#include "libavutil/common.h" 27#include "libavutil/bprint.h" 28#include "libavutil/intreadwrite.h" 29#include "libavutil/mem.h" 30#include "bytestream.h" 31#include "codec_internal.h" 32 33#define STYLE_FLAG_BOLD (1<<0) 34#define STYLE_FLAG_ITALIC (1<<1) 35#define STYLE_FLAG_UNDERLINE (1<<2) 36 37#define BOX_SIZE_INITIAL 40 38 39#define STYL_BOX (1<<0) 40#define HLIT_BOX (1<<1) 41#define HCLR_BOX (1<<2) 42#define TWRP_BOX (1<<3) 43 44#define BOTTOM_LEFT 1 45#define BOTTOM_CENTER 2 46#define BOTTOM_RIGHT 3 47#define MIDDLE_LEFT 4 48#define MIDDLE_CENTER 5 49#define MIDDLE_RIGHT 6 50#define TOP_LEFT 7 51#define TOP_CENTER 8 52#define TOP_RIGHT 9 53 54#define RGB_TO_BGR(c) (((c) & 0xff) << 16 | ((c) & 0xff00) | (((c) >> 16) & 0xff)) 55 56typedef struct { 57 uint16_t font_id; 58 char *font; 59} FontRecord; 60 61typedef struct { 62 uint16_t start; 63 uint16_t end; 64 uint8_t flags; 65 uint8_t bold; 66 uint8_t italic; 67 uint8_t underline; 68 int color; 69 uint8_t alpha; 70 uint8_t fontsize; 71 uint16_t font_id; 72} StyleBox; 73 74typedef struct { 75 StyleBox style; 76 const char *font; 77 int back_color; 78 uint8_t back_alpha; 79 int alignment; 80} MovTextDefault; 81 82typedef struct { 83 uint16_t hlit_start; 84 uint16_t hlit_end; 85} HighlightBox; 86 87typedef struct { 88 uint8_t hlit_color[4]; 89} HilightcolorBox; 90 91typedef struct { 92 uint8_t wrap_flag; 93} TextWrapBox; 94 95typedef struct { 96 AVClass *class; 97 StyleBox *s; 98 HighlightBox h; 99 HilightcolorBox c; 100 FontRecord *ftab; 101 TextWrapBox w; 102 MovTextDefault d; 103 uint8_t box_flags; 104 uint16_t style_entries, ftab_entries; 105 int readorder; 106 int frame_width; 107 int frame_height; 108} MovTextContext; 109 110typedef struct { 111 uint32_t type; 112 unsigned base_size; 113 int (*decode)(const uint8_t *tsmb, MovTextContext *m, uint64_t size); 114} Box; 115 116static void mov_text_cleanup(MovTextContext *m) 117{ 118 if (m->box_flags & STYL_BOX) { 119 av_freep(&m->s); 120 m->style_entries = 0; 121 } 122} 123 124static void mov_text_cleanup_ftab(MovTextContext *m) 125{ 126 for (unsigned i = 0; i < m->ftab_entries; i++) 127 av_freep(&m->ftab[i].font); 128 av_freep(&m->ftab); 129 m->ftab_entries = 0; 130} 131 132static void mov_text_parse_style_record(StyleBox *style, const uint8_t **ptr) 133{ 134 // fontID 135 style->font_id = bytestream_get_be16(ptr); 136 // face-style-flags 137 style->flags = bytestream_get_byte(ptr); 138 style->bold = !!(style->flags & STYLE_FLAG_BOLD); 139 style->italic = !!(style->flags & STYLE_FLAG_ITALIC); 140 style->underline = !!(style->flags & STYLE_FLAG_UNDERLINE); 141 // fontsize 142 style->fontsize = bytestream_get_byte(ptr); 143 // Primary color 144 style->color = bytestream_get_be24(ptr); 145 style->color = RGB_TO_BGR(style->color); 146 style->alpha = bytestream_get_byte(ptr); 147} 148 149static int mov_text_tx3g(AVCodecContext *avctx, MovTextContext *m) 150{ 151 const uint8_t *tx3g_ptr = avctx->extradata; 152 int i, j = -1, font_length, remaining = avctx->extradata_size - BOX_SIZE_INITIAL; 153 int8_t v_align, h_align; 154 unsigned ftab_entries; 155 156 m->ftab_entries = 0; 157 if (remaining < 0) 158 return -1; 159 160 // Display Flags 161 tx3g_ptr += 4; 162 // Alignment 163 h_align = bytestream_get_byte(&tx3g_ptr); 164 v_align = bytestream_get_byte(&tx3g_ptr); 165 if (h_align == 0) { 166 if (v_align == 0) 167 m->d.alignment = TOP_LEFT; 168 if (v_align == 1) 169 m->d.alignment = MIDDLE_LEFT; 170 if (v_align == -1) 171 m->d.alignment = BOTTOM_LEFT; 172 } 173 if (h_align == 1) { 174 if (v_align == 0) 175 m->d.alignment = TOP_CENTER; 176 if (v_align == 1) 177 m->d.alignment = MIDDLE_CENTER; 178 if (v_align == -1) 179 m->d.alignment = BOTTOM_CENTER; 180 } 181 if (h_align == -1) { 182 if (v_align == 0) 183 m->d.alignment = TOP_RIGHT; 184 if (v_align == 1) 185 m->d.alignment = MIDDLE_RIGHT; 186 if (v_align == -1) 187 m->d.alignment = BOTTOM_RIGHT; 188 } 189 // Background Color 190 m->d.back_color = bytestream_get_be24(&tx3g_ptr); 191 m->d.back_color = RGB_TO_BGR(m->d.back_color); 192 m->d.back_alpha = bytestream_get_byte(&tx3g_ptr); 193 // BoxRecord 194 tx3g_ptr += 8; 195 // StyleRecord 196 tx3g_ptr += 4; 197 mov_text_parse_style_record(&m->d.style, &tx3g_ptr); 198 // FontRecord 199 // FontRecord Size 200 tx3g_ptr += 4; 201 // ftab 202 tx3g_ptr += 4; 203 204 // In case of broken header, init default font 205 m->d.font = ASS_DEFAULT_FONT; 206 207 ftab_entries = bytestream_get_be16(&tx3g_ptr); 208 if (!ftab_entries) 209 return 0; 210 remaining -= 3 * ftab_entries; 211 if (remaining < 0) 212 return AVERROR_INVALIDDATA; 213 m->ftab = av_calloc(ftab_entries, sizeof(*m->ftab)); 214 if (!m->ftab) 215 return AVERROR(ENOMEM); 216 m->ftab_entries = ftab_entries; 217 218 for (i = 0; i < m->ftab_entries; i++) { 219 m->ftab[i].font_id = bytestream_get_be16(&tx3g_ptr); 220 if (m->ftab[i].font_id == m->d.style.font_id) 221 j = i; 222 font_length = bytestream_get_byte(&tx3g_ptr); 223 224 remaining -= font_length; 225 if (remaining < 0) { 226 mov_text_cleanup_ftab(m); 227 return -1; 228 } 229 m->ftab[i].font = av_malloc(font_length + 1); 230 if (!m->ftab[i].font) { 231 mov_text_cleanup_ftab(m); 232 return AVERROR(ENOMEM); 233 } 234 bytestream_get_buffer(&tx3g_ptr, m->ftab[i].font, font_length); 235 m->ftab[i].font[font_length] = '\0'; 236 } 237 if (j >= 0) 238 m->d.font = m->ftab[j].font; 239 return 0; 240} 241 242static int decode_twrp(const uint8_t *tsmb, MovTextContext *m, uint64_t size) 243{ 244 m->box_flags |= TWRP_BOX; 245 m->w.wrap_flag = bytestream_get_byte(&tsmb); 246 return 0; 247} 248 249static int decode_hlit(const uint8_t *tsmb, MovTextContext *m, uint64_t size) 250{ 251 m->box_flags |= HLIT_BOX; 252 m->h.hlit_start = bytestream_get_be16(&tsmb); 253 m->h.hlit_end = bytestream_get_be16(&tsmb); 254 return 0; 255} 256 257static int decode_hclr(const uint8_t *tsmb, MovTextContext *m, uint64_t size) 258{ 259 m->box_flags |= HCLR_BOX; 260 bytestream_get_buffer(&tsmb, m->c.hlit_color, 4); 261 return 0; 262} 263 264static int styles_equivalent(const StyleBox *a, const StyleBox *b) 265{ 266#define CMP(field) ((a)->field == (b)->field) 267 return CMP(bold) && CMP(italic) && CMP(underline) && CMP(color) && 268 CMP(alpha) && CMP(fontsize) && CMP(font_id); 269#undef CMP 270} 271 272static int decode_styl(const uint8_t *tsmb, MovTextContext *m, uint64_t size) 273{ 274 int i; 275 int style_entries = bytestream_get_be16(&tsmb); 276 StyleBox *tmp; 277 278 // A single style record is of length 12 bytes. 279 if (2 + style_entries * 12 > size) 280 return -1; 281 282 tmp = av_realloc_array(m->s, style_entries, sizeof(*m->s)); 283 if (!tmp) 284 return AVERROR(ENOMEM); 285 m->s = tmp; 286 m->style_entries = style_entries; 287 288 m->box_flags |= STYL_BOX; 289 for(i = 0; i < m->style_entries; i++) { 290 StyleBox *style = &m->s[i]; 291 292 style->start = bytestream_get_be16(&tsmb); 293 style->end = bytestream_get_be16(&tsmb); 294 if (style->end < style->start || 295 (i && style->start < m->s[i - 1].end)) { 296 mov_text_cleanup(m); 297 return AVERROR_INVALIDDATA; 298 } 299 if (style->start == style->end) { 300 /* Skip this style as it applies to no character */ 301 tsmb += 8; 302 m->style_entries--; 303 i--; 304 continue; 305 } 306 307 mov_text_parse_style_record(style, &tsmb); 308 if (styles_equivalent(style, &m->d.style)) { 309 /* Skip this style as it is equivalent to the default style */ 310 m->style_entries--; 311 i--; 312 continue; 313 } else if (i && style->start == style[-1].end && 314 styles_equivalent(style, &style[-1])) { 315 /* Merge the two adjacent styles */ 316 style[-1].end = style->end; 317 m->style_entries--; 318 i--; 319 continue; 320 } 321 } 322 return 0; 323} 324 325static const Box box_types[] = { 326 { MKBETAG('s','t','y','l'), 2, decode_styl }, 327 { MKBETAG('h','l','i','t'), 4, decode_hlit }, 328 { MKBETAG('h','c','l','r'), 4, decode_hclr }, 329 { MKBETAG('t','w','r','p'), 1, decode_twrp } 330}; 331 332const static size_t box_count = FF_ARRAY_ELEMS(box_types); 333 334// Return byte length of the UTF-8 sequence starting at text[0]. 0 on error. 335static int get_utf8_length_at(const char *text, const char *text_end) 336{ 337 const char *start = text; 338 int err = 0; 339 uint32_t c; 340 GET_UTF8(c, text < text_end ? (uint8_t)*text++ : (err = 1, 0), goto error;); 341 if (err) 342 goto error; 343 return text - start; 344error: 345 return 0; 346} 347 348static int text_to_ass(AVBPrint *buf, const char *text, const char *text_end, 349 AVCodecContext *avctx) 350{ 351 MovTextContext *m = avctx->priv_data; 352 const StyleBox *const default_style = &m->d.style; 353 int i = 0; 354 int text_pos = 0; 355 int entry = 0; 356 int color = default_style->color; 357 358 if (text < text_end && m->box_flags & TWRP_BOX) { 359 if (m->w.wrap_flag == 1) { 360 av_bprintf(buf, "{\\q1}"); /* End of line wrap */ 361 } else { 362 av_bprintf(buf, "{\\q2}"); /* No wrap */ 363 } 364 } 365 366 while (text < text_end) { 367 int len; 368 369 if ((m->box_flags & STYL_BOX) && entry < m->style_entries) { 370 const StyleBox *style = &m->s[entry]; 371 if (text_pos == style->end) { 372 av_bprintf(buf, "{\\r}"); 373 color = default_style->color; 374 entry++; 375 style++; 376 } 377 if (entry < m->style_entries && text_pos == style->start) { 378 if (style->bold ^ default_style->bold) 379 av_bprintf(buf, "{\\b%d}", style->bold); 380 if (style->italic ^ default_style->italic) 381 av_bprintf(buf, "{\\i%d}", style->italic); 382 if (style->underline ^ default_style->underline) 383 av_bprintf(buf, "{\\u%d}", style->underline); 384 if (style->fontsize != default_style->fontsize) 385 av_bprintf(buf, "{\\fs%d}", style->fontsize); 386 if (style->font_id != default_style->font_id) 387 for (i = 0; i < m->ftab_entries; i++) { 388 if (style->font_id == m->ftab[i].font_id) 389 av_bprintf(buf, "{\\fn%s}", m->ftab[i].font); 390 } 391 if (default_style->color != style->color) { 392 color = style->color; 393 av_bprintf(buf, "{\\1c&H%X&}", color); 394 } 395 if (default_style->alpha != style->alpha) 396 av_bprintf(buf, "{\\1a&H%02X&}", 255 - style->alpha); 397 } 398 } 399 if (m->box_flags & HLIT_BOX) { 400 if (text_pos == m->h.hlit_start) { 401 /* If hclr box is present, set the secondary color to the color 402 * specified. Otherwise, set primary color to white and secondary 403 * color to black. These colors will come from TextSampleModifier 404 * boxes in future and inverse video technique for highlight will 405 * be implemented. 406 */ 407 if (m->box_flags & HCLR_BOX) { 408 av_bprintf(buf, "{\\2c&H%02x%02x%02x&}", m->c.hlit_color[2], 409 m->c.hlit_color[1], m->c.hlit_color[0]); 410 } else { 411 av_bprintf(buf, "{\\1c&H000000&}{\\2c&HFFFFFF&}"); 412 } 413 } 414 if (text_pos == m->h.hlit_end) { 415 if (m->box_flags & HCLR_BOX) { 416 av_bprintf(buf, "{\\2c&H%X&}", default_style->color); 417 } else { 418 av_bprintf(buf, "{\\1c&H%X&}{\\2c&H%X&}", 419 color, default_style->color); 420 } 421 } 422 } 423 424 len = get_utf8_length_at(text, text_end); 425 if (len < 1) { 426 av_log(avctx, AV_LOG_ERROR, "invalid UTF-8 byte in subtitle\n"); 427 len = 1; 428 } 429 switch (*text) { 430 case '\r': 431 break; 432 case '\n': 433 av_bprintf(buf, "\\N"); 434 break; 435 default: 436 av_bprint_append_data(buf, text, len); 437 break; 438 } 439 text += len; 440 text_pos++; 441 } 442 443 return 0; 444} 445 446static int mov_text_init(AVCodecContext *avctx) { 447 /* 448 * TODO: Handle the default text style. 449 * NB: Most players ignore styles completely, with the result that 450 * it's very common to find files where the default style is broken 451 * and respecting it results in a worse experience than ignoring it. 452 */ 453 int ret; 454 MovTextContext *m = avctx->priv_data; 455 ret = mov_text_tx3g(avctx, m); 456 if (ret == 0) { 457 const StyleBox *const default_style = &m->d.style; 458 if (!m->frame_width || !m->frame_height) { 459 m->frame_width = ASS_DEFAULT_PLAYRESX; 460 m->frame_height = ASS_DEFAULT_PLAYRESY; 461 } 462 return ff_ass_subtitle_header_full(avctx, 463 m->frame_width, m->frame_height, 464 m->d.font, default_style->fontsize, 465 (255U - default_style->alpha) << 24 | default_style->color, 466 (255U - default_style->alpha) << 24 | default_style->color, 467 (255U - m->d.back_alpha) << 24 | m->d.back_color, 468 (255U - m->d.back_alpha) << 24 | m->d.back_color, 469 default_style->bold, default_style->italic, default_style->underline, 470 ASS_DEFAULT_BORDERSTYLE, m->d.alignment); 471 } else 472 return ff_ass_subtitle_header_default(avctx); 473} 474 475static int mov_text_decode_frame(AVCodecContext *avctx, AVSubtitle *sub, 476 int *got_sub_ptr, const AVPacket *avpkt) 477{ 478 MovTextContext *m = avctx->priv_data; 479 int ret; 480 AVBPrint buf; 481 const char *ptr = avpkt->data, *end; 482 int text_length; 483 size_t i; 484 485 if (!ptr || avpkt->size < 2) 486 return AVERROR_INVALIDDATA; 487 488 /* 489 * A packet of size two with value zero is an empty subtitle 490 * used to mark the end of the previous non-empty subtitle. 491 * We can just drop them here as we have duration information 492 * already. If the value is non-zero, then it's technically a 493 * bad packet. 494 */ 495 if (avpkt->size == 2) 496 return AV_RB16(ptr) == 0 ? 0 : AVERROR_INVALIDDATA; 497 498 /* 499 * The first two bytes of the packet are the length of the text string 500 * In complex cases, there are style descriptors appended to the string 501 * so we can't just assume the packet size is the string size. 502 */ 503 text_length = AV_RB16(ptr); 504 end = ptr + FFMIN(2 + text_length, avpkt->size); 505 ptr += 2; 506 507 mov_text_cleanup(m); 508 509 m->style_entries = 0; 510 m->box_flags = 0; 511 // Note that the spec recommends lines be no longer than 2048 characters. 512 av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED); 513 if (text_length + 2 < avpkt->size) { 514 const uint8_t *tsmb = end; 515 const uint8_t *const tsmb_end = avpkt->data + avpkt->size; 516 // A box is a minimum of 8 bytes. 517 while (tsmb_end - tsmb >= 8) { 518 uint64_t tsmb_size = bytestream_get_be32(&tsmb); 519 uint32_t tsmb_type = bytestream_get_be32(&tsmb); 520 int size_var, ret_tsmb; 521 522 if (tsmb_size == 1) { 523 if (tsmb_end - tsmb < 8) 524 break; 525 tsmb_size = bytestream_get_be64(&tsmb); 526 size_var = 16; 527 } else 528 size_var = 8; 529 //size_var is equal to 8 or 16 depending on the size of box 530 531 if (tsmb_size < size_var) { 532 av_log(avctx, AV_LOG_ERROR, "tsmb_size invalid\n"); 533 return AVERROR_INVALIDDATA; 534 } 535 tsmb_size -= size_var; 536 537 if (tsmb_end - tsmb < tsmb_size) 538 break; 539 540 for (i = 0; i < box_count; i++) { 541 if (tsmb_type == box_types[i].type) { 542 if (tsmb_size < box_types[i].base_size) 543 break; 544 ret_tsmb = box_types[i].decode(tsmb, m, tsmb_size); 545 if (ret_tsmb == -1) 546 break; 547 } 548 } 549 tsmb += tsmb_size; 550 } 551 text_to_ass(&buf, ptr, end, avctx); 552 mov_text_cleanup(m); 553 } else 554 text_to_ass(&buf, ptr, end, avctx); 555 556 ret = ff_ass_add_rect(sub, buf.str, m->readorder++, 0, NULL, NULL); 557 av_bprint_finalize(&buf, NULL); 558 if (ret < 0) 559 return ret; 560 *got_sub_ptr = sub->num_rects > 0; 561 return avpkt->size; 562} 563 564static int mov_text_decode_close(AVCodecContext *avctx) 565{ 566 MovTextContext *m = avctx->priv_data; 567 mov_text_cleanup_ftab(m); 568 mov_text_cleanup(m); 569 return 0; 570} 571 572static void mov_text_flush(AVCodecContext *avctx) 573{ 574 MovTextContext *m = avctx->priv_data; 575 if (!(avctx->flags2 & AV_CODEC_FLAG2_RO_FLUSH_NOOP)) 576 m->readorder = 0; 577} 578 579#define OFFSET(x) offsetof(MovTextContext, x) 580#define FLAGS AV_OPT_FLAG_DECODING_PARAM | AV_OPT_FLAG_SUBTITLE_PARAM 581static const AVOption options[] = { 582 { "width", "Frame width, usually video width", OFFSET(frame_width), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS }, 583 { "height", "Frame height, usually video height", OFFSET(frame_height), AV_OPT_TYPE_INT, {.i64=0}, 0, INT_MAX, FLAGS }, 584 { NULL }, 585}; 586 587static const AVClass mov_text_decoder_class = { 588 .class_name = "MOV text decoder", 589 .item_name = av_default_item_name, 590 .option = options, 591 .version = LIBAVUTIL_VERSION_INT, 592}; 593 594const FFCodec ff_movtext_decoder = { 595 .p.name = "mov_text", 596 .p.long_name = NULL_IF_CONFIG_SMALL("3GPP Timed Text subtitle"), 597 .p.type = AVMEDIA_TYPE_SUBTITLE, 598 .p.id = AV_CODEC_ID_MOV_TEXT, 599 .priv_data_size = sizeof(MovTextContext), 600 .p.priv_class = &mov_text_decoder_class, 601 .init = mov_text_init, 602 FF_CODEC_DECODE_SUB_CB(mov_text_decode_frame), 603 .close = mov_text_decode_close, 604 .flush = mov_text_flush, 605 .caps_internal = FF_CODEC_CAP_INIT_THREADSAFE, 606}; 607