1/* 2Copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com) 3 4This software is provided 'as-is', without any express or implied 5warranty. In no event will the authors be held liable for any 6damages arising from the use of this software. 7 8Permission is granted to anyone to use this software for any 9purpose, including commercial applications, and to alter it and 10redistribute it freely, subject to the following restrictions: 11 121. The origin of this software must not be misrepresented; you must 13not claim that you wrote the original software. If you use this 14software in a product, an acknowledgment in the product documentation 15would be appreciated but is not required. 16 172. Altered source versions must be plainly marked as such, and 18must not be misrepresented as being the original software. 19 203. This notice may not be removed or altered from any source 21distribution. 22*/ 23 24#include "tinyxml.h" 25#include <ctype.h> 26#include <strstream> 27using namespace std; 28 29//#define DEBUG_PARSER 30 31TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] = 32{ 33 { "&", 5, '&' }, 34 { "<", 4, '<' }, 35 { ">", 4, '>' }, 36 { """, 6, '\"' }, 37 { "'", 6, '\'' } 38}; 39 40 41const char* TiXmlBase::SkipWhiteSpace( const char* p ) 42{ 43 if ( !p || !*p ) 44 { 45 return 0; 46 } 47 while ( p && *p ) 48 { 49 if ( isspace( *p ) || *p == '\n' || *p =='\r' ) // Still using old rules for white space. 50 ++p; 51 else 52 break; 53 } 54 55 return p; 56} 57 58 59/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream* in, std::string* tag ) 60{ 61 for( ;; ) 62 { 63 if ( !in->good() ) return false; 64 65 int c = in->peek(); 66 if ( !IsWhiteSpace( c ) ) 67 return true; 68 *tag += in->get(); 69 } 70} 71 72 73/*static*/ bool TiXmlBase::StreamTo( std::istream* in, int character, std::string* tag ) 74{ 75 while ( in->good() ) 76 { 77 int c = in->peek(); 78 if ( c == character ) 79 return true; 80 81 in->get(); 82 *tag += c; 83 } 84 return false; 85} 86 87 88const char* TiXmlBase::ReadName( const char* p, string* name ) 89{ 90 *name = ""; 91 assert( p ); 92 93 // Names start with letters or underscores. 94 // After that, they can be letters, underscores, numbers, 95 // hyphens, or colons. (Colons are valid ony for namespaces, 96 // but tinyxml can't tell namespaces from names.) 97 if ( p && *p 98 && ( isalpha( (unsigned char) *p ) || *p == '_' ) ) 99 { 100 while( p && *p 101 && ( isalnum( (unsigned char ) *p ) 102 || *p == '_' 103 || *p == '-' 104 || *p == ':' ) ) 105 { 106 (*name) += *p; 107 ++p; 108 } 109 return p; 110 } 111 return 0; 112} 113 114 115const char* TiXmlBase::GetEntity( const char* p, char* value ) 116{ 117 // Presume an entity, and pull it out. 118 string ent; 119 int i; 120 121 // Ignore the &#x entities. 122 if ( strncmp( "&#x", p, 3 ) == 0 ) 123 { 124 *value = *p; 125 return p+1; 126 } 127 128 // Now try to match it. 129 for( i=0; i<NUM_ENTITY; ++i ) 130 { 131 if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 ) 132 { 133 assert( strlen( entity[i].str ) == entity[i].strLength ); 134 *value = entity[i].chr; 135 return ( p + entity[i].strLength ); 136 } 137 } 138 139 // So it wasn't an entity, its unrecognized, or something like that. 140 *value = *p; // Don't put back the last one, since we return it! 141 return p+1; 142} 143 144 145bool TiXmlBase::StringEqual( const char* p, 146 const char* tag, 147 bool ignoreCase ) 148{ 149 assert( p ); 150 if ( !p || !*p ) 151 { 152 assert( 0 ); 153 return false; 154 } 155 156 if ( tolower( *p ) == tolower( *tag ) ) 157 { 158 const char* q = p; 159 160 if (ignoreCase) 161 { 162 while ( *q && *tag && *q == *tag ) 163 { 164 ++q; 165 ++tag; 166 } 167 168 if ( *tag == 0 ) // Have we found the end of the tag, and everything equal? 169 { 170 return true; 171 } 172 } 173 else 174 { 175 while ( *q && *tag && tolower( *q ) == tolower( *tag ) ) 176 { 177 ++q; 178 ++tag; 179 } 180 181 if ( *tag == 0 ) 182 { 183 return true; 184 } 185 } 186 } 187 return false; 188} 189 190 191const char* TiXmlBase::ReadText( const char* p, 192 string* text, 193 bool trimWhiteSpace, 194 const char* endTag, 195 bool caseInsensitive ) 196{ 197 *text = ""; 198 199 if ( !trimWhiteSpace // certain tags always keep whitespace 200 || !condenseWhiteSpace ) // if true, whitespace is always kept 201 { 202 // Keep all the white space. 203 while ( p && *p 204 && !StringEqual( p, endTag, caseInsensitive ) 205 ) 206 { 207 char c; 208 p = GetChar( p, &c ); 209 text->append( &c, 1 ); 210 } 211 } 212 else 213 { 214 bool whitespace = false; 215 216 // Remove leading white space: 217 p = SkipWhiteSpace( p ); 218 while ( p && *p 219 && !StringEqual( p, endTag, caseInsensitive ) ) 220 { 221 if ( *p == '\r' || *p == '\n' ) 222 { 223 whitespace = true; 224 ++p; 225 } 226 else if ( isspace( *p ) ) 227 { 228 whitespace = true; 229 ++p; 230 } 231 else 232 { 233 // If we've found whitespace, add it before the 234 // new character. Any whitespace just becomes a space. 235 if ( whitespace ) 236 { 237 text->append( " ", 1 ); 238 whitespace = false; 239 } 240 char c; 241 p = GetChar( p, &c ); 242 text->append( &c, 1 ); 243 } 244 } 245 } 246 return p + strlen( endTag ); 247} 248 249 250void TiXmlDocument::StreamIn( std::istream* in, std::string* tag ) 251{ 252 // The basic issue with a document is that we don't know what we're 253 // streaming. Read something presumed to be a tag (and hope), then 254 // identify it, and call the appropriate stream method on the tag. 255 // 256 // This "pre-streaming" will never read the closing ">" so the 257 // sub-tag can orient itself. 258 259 if ( !StreamTo( in, '<', tag ) ) 260 { 261 SetError( TIXML_ERROR_PARSING_EMPTY ); 262 return; 263 } 264 265 while ( in->good() ) 266 { 267 int tagIndex = tag->length(); 268 while ( in->good() && in->peek() != '>' ) 269 { 270 int c = in->get(); 271 (*tag) += (char) c; 272 } 273 274 if ( in->good() ) 275 { 276 // We now have something we presume to be a node of 277 // some sort. Identify it, and call the node to 278 // continue streaming. 279 TiXmlNode* node = Identify( tag->c_str() + tagIndex ); 280 281 if ( node ) 282 { 283 node->StreamIn( in, tag ); 284 bool isElement = node->ToElement() != 0; 285 delete node; 286 node = 0; 287 288 // If this is the root element, we're done. Parsing will be 289 // done by the >> operator. 290 if ( isElement ) 291 { 292 return; 293 } 294 } 295 else 296 { 297 SetError( TIXML_ERROR ); 298 return; 299 } 300 } 301 } 302 // We should have returned sooner. 303 SetError( TIXML_ERROR ); 304} 305 306 307const char* TiXmlDocument::Parse( const char* p ) 308{ 309 // Parse away, at the document level. Since a document 310 // contains nothing but other tags, most of what happens 311 // here is skipping white space. 312 // 313 // In this variant (as opposed to stream and Parse) we 314 // read everything we can. 315 316 317 if ( !p || !*p || !( p = SkipWhiteSpace( p ) ) ) 318 { 319 SetError( TIXML_ERROR_DOCUMENT_EMPTY ); 320 return false; 321 } 322 323 while ( p && *p ) 324 { 325 TiXmlNode* node = Identify( p ); 326 if ( node ) 327 { 328 p = node->Parse( p ); 329 LinkEndChild( node ); 330 } 331 else 332 { 333 break; 334 } 335 p = SkipWhiteSpace( p ); 336 } 337 // All is well. 338 return p; 339} 340 341 342TiXmlNode* TiXmlNode::Identify( const char* p ) 343{ 344 TiXmlNode* returnNode = 0; 345 346 p = SkipWhiteSpace( p ); 347 if( !p || !*p || *p != '<' ) 348 { 349 return 0; 350 } 351 352 TiXmlDocument* doc = GetDocument(); 353 p = SkipWhiteSpace( p ); 354 355 if ( !p || !*p ) 356 { 357 return 0; 358 } 359 360 // What is this thing? 361 // - Elements start with a letter or underscore, but xml is reserved. 362 // - Comments: <!-- 363 // - Decleration: <?xml 364 // - Everthing else is unknown to tinyxml. 365 // 366 367 const char* xmlHeader = { "<?xml" }; 368 const char* commentHeader = { "<!--" }; 369 370 if ( StringEqual( p, xmlHeader, true ) ) 371 { 372 #ifdef DEBUG_PARSER 373 TIXML_LOG( "XML parsing Declaration\n" ); 374 #endif 375 returnNode = new TiXmlDeclaration(); 376 } 377 else if ( isalpha( *(p+1) ) 378 || *(p+1) == '_' ) 379 { 380 #ifdef DEBUG_PARSER 381 TIXML_LOG( "XML parsing Element\n" ); 382 #endif 383 returnNode = new TiXmlElement( "" ); 384 } 385 else if ( StringEqual( p, commentHeader, false ) ) 386 { 387 #ifdef DEBUG_PARSER 388 TIXML_LOG( "XML parsing Comment\n" ); 389 #endif 390 returnNode = new TiXmlComment(); 391 } 392 else 393 { 394 #ifdef DEBUG_PARSER 395 TIXML_LOG( "XML parsing Unknown\n" ); 396 #endif 397 returnNode = new TiXmlUnknown(); 398 } 399 400 if ( returnNode ) 401 { 402 // Set the parent, so it can report errors 403 returnNode->parent = this; 404 //p = returnNode->Parse( p ); 405 } 406 else 407 { 408 if ( doc ) 409 doc->SetError( TIXML_ERROR_OUT_OF_MEMORY ); 410 } 411 return returnNode; 412} 413 414 415void TiXmlElement::StreamIn( std::istream* in, std::string* tag ) 416{ 417 // We're called with some amount of pre-parsing. That is, some of "this" 418 // element is in "tag". Go ahead and stream to the closing ">" 419 while( in->good() ) 420 { 421 int c = in->get(); 422 (*tag) += (char) c ; 423 424 if ( c == '>' ) 425 break; 426 } 427 428 if ( tag->length() < 3 ) return; 429 430 // Okay...if we are a "/>" tag, then we're done. We've read a complete tag. 431 // If not, identify and stream. 432 433 if ( tag->at( tag->length() - 1 ) == '>' 434 && tag->at( tag->length() - 2 ) == '/' ) 435 { 436 // All good! 437 return; 438 } 439 else if ( tag->at( tag->length() - 1 ) == '>' ) 440 { 441 // There is more. Could be: 442 // text 443 // closing tag 444 // another node. 445 for ( ;; ) 446 { 447 StreamWhiteSpace( in, tag ); 448 449 // Do we have text? 450 if ( in->peek() != '<' ) 451 { 452 // Yep, text. 453 TiXmlText text( "" ); 454 text.StreamIn( in, tag ); 455 456 // What follows text is a closing tag or another node. 457 // Go around again and figure it out. 458 continue; 459 } 460 461 // We now have either a closing tag...or another node. 462 // We should be at a "<", regardless. 463 if ( !in->good() ) return; 464 assert( in->peek() == '<' ); 465 int tagIndex = tag->length(); 466 467 bool closingTag = false; 468 bool firstCharFound = false; 469 470 for( ;; ) 471 { 472 if ( !in->good() ) 473 return; 474 475 int c = in->peek(); 476 477 if ( c == '>' ) 478 break; 479 480 *tag += c; 481 in->get(); 482 483 if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) ) 484 { 485 firstCharFound = true; 486 if ( c == '/' ) 487 closingTag = true; 488 } 489 } 490 // If it was a closing tag, then read in the closing '>' to clean up the input stream. 491 // If it was not, the streaming will be done by the tag. 492 if ( closingTag ) 493 { 494 int c = in->get(); 495 assert( c == '>' ); 496 *tag += c; 497 498 // We are done, once we've found our closing tag. 499 return; 500 } 501 else 502 { 503 // If not a closing tag, id it, and stream. 504 const char* tagloc = tag->c_str() + tagIndex; 505 TiXmlNode* node = Identify( tagloc ); 506 if ( !node ) 507 return; 508 node->StreamIn( in, tag ); 509 delete node; 510 node = 0; 511 512 // No return: go around from the beginning: text, closing tag, or node. 513 } 514 } 515 } 516} 517 518 519const char* TiXmlElement::Parse( const char* p ) 520{ 521 p = SkipWhiteSpace( p ); 522 TiXmlDocument* document = GetDocument(); 523 524 if ( !p || !*p || *p != '<' ) 525 { 526 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT ); 527 return false; 528 } 529 530 p = SkipWhiteSpace( p+1 ); 531 532 // Read the name. 533 p = ReadName( p, &value ); 534 if ( !p || !*p ) 535 { 536 if ( document ) document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME ); 537 return false; 538 } 539 540 string endTag = "</"; 541 endTag += value; 542 endTag += ">"; 543 544 // Check for and read attributes. Also look for an empty 545 // tag or an end tag. 546 while ( p && *p ) 547 { 548 p = SkipWhiteSpace( p ); 549 if ( !p || !*p ) 550 { 551 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES ); 552 return 0; 553 } 554 if ( *p == '/' ) 555 { 556 ++p; 557 // Empty tag. 558 if ( *p != '>' ) 559 { 560 if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY ); 561 return 0; 562 } 563 return (p+1); 564 } 565 else if ( *p == '>' ) 566 { 567 // Done with attributes (if there were any.) 568 // Read the value -- which can include other 569 // elements -- read the end tag, and return. 570 ++p; 571 p = ReadValue( p ); // Note this is an Element method, and will set the error if one happens. 572 if ( !p || !*p ) 573 return 0; 574 575 // We should find the end tag now 576 if ( StringEqual( p, endTag.c_str(), false ) ) 577 { 578 p += endTag.length(); 579 return p; 580 } 581 else 582 { 583 if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG ); 584 return 0; 585 } 586 } 587 else 588 { 589 // Try to read an element: 590 TiXmlAttribute attrib; 591 attrib.SetDocument( document ); 592 p = attrib.Parse( p ); 593 594 if ( !p || !*p ) 595 { 596 if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT ); 597 return 0; 598 } 599 SetAttribute( attrib.Name(), attrib.Value() ); 600 } 601 } 602 return p; 603} 604 605 606const char* TiXmlElement::ReadValue( const char* p ) 607{ 608 TiXmlDocument* document = GetDocument(); 609 610 // Read in text and elements in any order. 611 p = SkipWhiteSpace( p ); 612 while ( p && *p ) 613 { 614// string text; 615// while ( p && *p && *p != '<' ) 616// { 617// text += (*p); 618// ++p; 619// } 620// 621// p = SkipWhiteSpace( p ); 622 623 if ( *p != '<' ) 624 { 625 // Take what we have, make a text element. 626 TiXmlText* textNode = new TiXmlText( "" ); 627 628 if ( !textNode ) 629 { 630 if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY ); 631 return 0; 632 } 633 634 p = textNode->Parse( p ); 635 636 if ( !textNode->Blank() ) 637 LinkEndChild( textNode ); 638 else 639 delete textNode; 640 } 641 else 642 { 643 // We hit a '<' 644 // Have we hit a new element or an end tag? 645 if ( StringEqual( p, "</", false ) ) 646 { 647 return p; 648 } 649 else 650 { 651 TiXmlNode* node = Identify( p ); 652 if ( node ) 653 { 654 p = node->Parse( p ); 655 LinkEndChild( node ); 656 } 657 else 658 { 659 return 0; 660 } 661 } 662 } 663 p = SkipWhiteSpace( p ); 664 } 665 666 if ( !p ) 667 { 668 if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE ); 669 } 670 return p; 671} 672 673 674void TiXmlUnknown::StreamIn( std::istream* in, std::string* tag ) 675{ 676 while ( in->good() ) 677 { 678 int c = in->get(); 679 (*tag) += c; 680 681 if ( c == '>' ) 682 { 683 // All is well. 684 return; 685 } 686 } 687} 688 689 690const char* TiXmlUnknown::Parse( const char* p ) 691{ 692 TiXmlDocument* document = GetDocument(); 693 p = SkipWhiteSpace( p ); 694 if ( !p || !*p || *p != '<' ) 695 { 696 if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN ); 697 return 0; 698 } 699 ++p; 700 value = ""; 701 702 while ( *p && *p != '>' ) 703 { 704 value += *p; 705 ++p; 706 } 707 708 if ( *p == '>' ) 709 return p+1; 710 return p; 711} 712 713 714void TiXmlComment::StreamIn( std::istream* in, std::string* tag ) 715{ 716 while ( in->good() ) 717 { 718 int c = in->get(); 719 (*tag) += c; 720 721 if ( c == '>' 722 && tag->at( tag->length() - 2 ) == '-' 723 && tag->at( tag->length() - 3 ) == '-' ) 724 { 725 // All is well. 726 return; 727 } 728 } 729} 730 731 732const char* TiXmlComment::Parse( const char* p ) 733{ 734 TiXmlDocument* document = GetDocument(); 735 value = ""; 736 737 p = SkipWhiteSpace( p ); 738 const char* startTag = "<!--"; 739 const char* endTag = "-->"; 740 741 if ( !StringEqual( p, startTag, false ) ) 742 { 743 document->SetError( TIXML_ERROR_PARSING_COMMENT ); 744 return 0; 745 } 746 p += strlen( startTag ); 747 p = ReadText( p, &value, false, endTag, false ); 748 return p; 749} 750 751 752const char* TiXmlAttribute::Parse( const char* p ) 753{ 754 p = SkipWhiteSpace( p ); 755 if ( !p || !*p ) return 0; 756 757 // Read the name, the '=' and the value. 758 p = ReadName( p, &name ); 759 if ( !p || !*p ) 760 { 761 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES ); 762 return 0; 763 } 764 p = SkipWhiteSpace( p ); 765 if ( !p || !*p || *p != '=' ) 766 { 767 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES ); 768 return 0; 769 } 770 771 ++p; // skip '=' 772 p = SkipWhiteSpace( p ); 773 if ( !p || !*p ) 774 { 775 if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES ); 776 return 0; 777 } 778 779 const char* end; 780 781 if ( *p == '\'' ) 782 { 783 ++p; 784 end = "\'"; 785 p = ReadText( p, &value, false, end, false ); 786 } 787 else if ( *p == '"' ) 788 { 789 ++p; 790 end = "\""; 791 p = ReadText( p, &value, false, end, false ); 792 } 793 else 794 { 795 // All attribute values should be in single or double quotes. 796 // But this is such a common error that the parser will try 797 // its best, even without them. 798 value = ""; 799 while ( p && *p // existence 800 && !isspace( *p ) && *p != '\n' && *p != '\r' // whitespace 801 && *p != '/' && *p != '>' ) // tag end 802 { 803 value += *p; 804 ++p; 805 } 806 } 807 return p; 808} 809 810 811void TiXmlText::StreamIn( std::istream* in, std::string* tag ) 812{ 813 while ( in->good() ) 814 { 815 int c = in->peek(); 816 if ( c == '<' ) 817 return; 818 819 (*tag) += c; 820 in->get(); 821 } 822} 823 824 825 826const char* TiXmlText::Parse( const char* p ) 827{ 828 value = ""; 829 830 //TiXmlDocument* doc = GetDocument(); 831 bool ignoreWhite = true; 832// if ( doc && !doc->IgnoreWhiteSpace() ) ignoreWhite = false; 833 834 const char* end = "<"; 835 p = ReadText( p, &value, ignoreWhite, end, false ); 836 if ( p ) 837 return p-1; // don't truncate the '<' 838 return 0; 839} 840 841 842void TiXmlDeclaration::StreamIn( std::istream* in, std::string* tag ) 843{ 844 while ( in->good() ) 845 { 846 int c = in->get(); 847 (*tag) += c; 848 849 if ( c == '>' ) 850 { 851 // All is well. 852 return; 853 } 854 } 855} 856 857const char* TiXmlDeclaration::Parse( const char* p ) 858{ 859 p = SkipWhiteSpace( p ); 860 // Find the beginning, find the end, and look for 861 // the stuff in-between. 862 TiXmlDocument* document = GetDocument(); 863 if ( !p || !*p || !StringEqual( p, "<?xml", true ) ) 864 { 865 if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION ); 866 return 0; 867 } 868 869 p += 5; 870// const char* start = p+5; 871// const char* end = strstr( start, "?>" ); 872 873 version = ""; 874 encoding = ""; 875 standalone = ""; 876 877 while ( p && *p ) 878 { 879 if ( *p == '>' ) 880 { 881 ++p; 882 return p; 883 } 884 885 p = SkipWhiteSpace( p ); 886 if ( StringEqual( p, "version", true ) ) 887 { 888// p += 7; 889 TiXmlAttribute attrib; 890 p = attrib.Parse( p ); 891 version = attrib.Value(); 892 } 893 else if ( StringEqual( p, "encoding", true ) ) 894 { 895// p += 8; 896 TiXmlAttribute attrib; 897 p = attrib.Parse( p ); 898 encoding = attrib.Value(); 899 } 900 else if ( StringEqual( p, "standalone", true ) ) 901 { 902// p += 10; 903 TiXmlAttribute attrib; 904 p = attrib.Parse( p ); 905 standalone = attrib.Value(); 906 } 907 else 908 { 909 // Read over whatever it is. 910 while( p && *p && *p != '>' && !isspace( *p ) ) 911 ++p; 912 } 913 } 914 return 0; 915} 916 917bool TiXmlText::Blank() const 918{ 919 for ( unsigned i=0; i<value.size(); i++ ) 920 if ( !isspace( value[i] ) ) 921 return false; 922 return true; 923} 924 925