1/*
2Copyright (c) 2000-2002 Lee Thomason (www.grinninglizard.com)
3
4This software is provided 'as-is', without any express or implied
5warranty. In no event will the authors be held liable for any
6damages arising from the use of this software.
7
8Permission is granted to anyone to use this software for any
9purpose, including commercial applications, and to alter it and
10redistribute it freely, subject to the following restrictions:
11
121. The origin of this software must not be misrepresented; you must
13not claim that you wrote the original software. If you use this
14software in a product, an acknowledgment in the product documentation
15would be appreciated but is not required.
16
172. Altered source versions must be plainly marked as such, and
18must not be misrepresented as being the original software.
19
203. This notice may not be removed or altered from any source
21distribution.
22*/
23
24#include "tinyxml.h"
25#include <ctype.h>
26#include <strstream>
27using namespace std;
28
29//#define DEBUG_PARSER
30
31TiXmlBase::Entity TiXmlBase::entity[ NUM_ENTITY ] =
32{
33	{ "&amp;",  5, '&' },
34	{ "&lt;",   4, '<' },
35	{ "&gt;",   4, '>' },
36	{ "&quot;", 6, '\"' },
37	{ "&apos;", 6, '\'' }
38};
39
40
41const char* TiXmlBase::SkipWhiteSpace( const char* p )
42{
43	if ( !p || !*p )
44	{
45		return 0;
46	}
47	while ( p && *p )
48	{
49		if ( isspace( *p ) || *p == '\n' || *p =='\r' )		// Still using old rules for white space.
50			++p;
51		else
52			break;
53	}
54
55	return p;
56}
57
58
59/*static*/ bool TiXmlBase::StreamWhiteSpace( std::istream* in, std::string* tag )
60{
61	for( ;; )
62	{
63		if ( !in->good() ) return false;
64
65		int c = in->peek();
66		if ( !IsWhiteSpace( c ) )
67			return true;
68		*tag += in->get();
69	}
70}
71
72
73/*static*/ bool TiXmlBase::StreamTo( std::istream* in, int character, std::string* tag )
74{
75	while ( in->good() )
76	{
77		int c = in->peek();
78		if ( c == character )
79			return true;
80
81		in->get();
82		*tag += c;
83	}
84	return false;
85}
86
87
88const char* TiXmlBase::ReadName( const char* p, string* name )
89{
90	*name = "";
91	assert( p );
92
93	// Names start with letters or underscores.
94	// After that, they can be letters, underscores, numbers,
95	// hyphens, or colons. (Colons are valid ony for namespaces,
96	// but tinyxml can't tell namespaces from names.)
97	if (    p && *p
98		 && ( isalpha( (unsigned char) *p ) || *p == '_' ) )
99	{
100		while(		p && *p
101				&&	(		isalnum( (unsigned char ) *p )
102						 || *p == '_'
103						 || *p == '-'
104						 || *p == ':' ) )
105		{
106			(*name) += *p;
107			++p;
108		}
109		return p;
110	}
111	return 0;
112}
113
114
115const char* TiXmlBase::GetEntity( const char* p, char* value )
116{
117	// Presume an entity, and pull it out.
118	string ent;
119	int i;
120
121	// Ignore the &#x entities.
122	if ( strncmp( "&#x", p, 3 ) == 0 )
123	{
124		*value = *p;
125		return p+1;
126	}
127
128	// Now try to match it.
129	for( i=0; i<NUM_ENTITY; ++i )
130	{
131		if ( strncmp( entity[i].str, p, entity[i].strLength ) == 0 )
132		{
133			assert( strlen( entity[i].str ) == entity[i].strLength );
134			*value = entity[i].chr;
135			return ( p + entity[i].strLength );
136		}
137	}
138
139	// So it wasn't an entity, its unrecognized, or something like that.
140	*value = *p;	// Don't put back the last one, since we return it!
141	return p+1;
142}
143
144
145bool TiXmlBase::StringEqual( const char* p,
146							 const char* tag,
147							 bool ignoreCase )
148{
149	assert( p );
150	if ( !p || !*p )
151	{
152		assert( 0 );
153		return false;
154	}
155
156    if ( tolower( *p ) == tolower( *tag ) )
157	{
158		const char* q = p;
159
160		if (ignoreCase)
161		{
162			while ( *q && *tag && *q == *tag )
163			{
164				++q;
165				++tag;
166			}
167
168			if ( *tag == 0 )		// Have we found the end of the tag, and everything equal?
169			{
170				return true;
171			}
172		}
173		else
174		{
175			while ( *q && *tag && tolower( *q ) == tolower( *tag ) )
176			{
177				++q;
178				++tag;
179			}
180
181			if ( *tag == 0 )
182			{
183				return true;
184			}
185		}
186	}
187	return false;
188}
189
190
191const char* TiXmlBase::ReadText(	const char* p,
192									string* text,
193									bool trimWhiteSpace,
194									const char* endTag,
195									bool caseInsensitive )
196{
197	*text = "";
198
199	if (    !trimWhiteSpace			// certain tags always keep whitespace
200		 || !condenseWhiteSpace )	// if true, whitespace is always kept
201	{
202		// Keep all the white space.
203		while (	   p && *p
204				&& !StringEqual( p, endTag, caseInsensitive )
205			  )
206		{
207			char c;
208			p = GetChar( p, &c );
209			text->append( &c, 1 );
210		}
211	}
212	else
213	{
214		bool whitespace = false;
215
216		// Remove leading white space:
217		p = SkipWhiteSpace( p );
218		while (	   p && *p
219				&& !StringEqual( p, endTag, caseInsensitive ) )
220		{
221			if ( *p == '\r' || *p == '\n' )
222			{
223				whitespace = true;
224				++p;
225			}
226			else if ( isspace( *p ) )
227			{
228				whitespace = true;
229				++p;
230			}
231			else
232			{
233				// If we've found whitespace, add it before the
234				// new character. Any whitespace just becomes a space.
235				if ( whitespace )
236				{
237					text->append( " ", 1 );
238					whitespace = false;
239				}
240				char c;
241				p = GetChar( p, &c );
242				text->append( &c, 1 );
243			}
244		}
245	}
246	return p + strlen( endTag );
247}
248
249
250void TiXmlDocument::StreamIn( std::istream* in, std::string* tag )
251{
252	// The basic issue with a document is that we don't know what we're
253	// streaming. Read something presumed to be a tag (and hope), then
254	// identify it, and call the appropriate stream method on the tag.
255	//
256	// This "pre-streaming" will never read the closing ">" so the
257	// sub-tag can orient itself.
258
259	if ( !StreamTo( in, '<', tag ) )
260	{
261		SetError( TIXML_ERROR_PARSING_EMPTY );
262		return;
263	}
264
265	while ( in->good() )
266	{
267		int tagIndex = tag->length();
268		while ( in->good() && in->peek() != '>' )
269		{
270			int c = in->get();
271			(*tag) += (char) c;
272		}
273
274		if ( in->good() )
275		{
276			// We now have something we presume to be a node of
277			// some sort. Identify it, and call the node to
278			// continue streaming.
279			TiXmlNode* node = Identify( tag->c_str() + tagIndex );
280
281			if ( node )
282			{
283				node->StreamIn( in, tag );
284				bool isElement = node->ToElement() != 0;
285				delete node;
286				node = 0;
287
288				// If this is the root element, we're done. Parsing will be
289				// done by the >> operator.
290				if ( isElement )
291				{
292					return;
293				}
294			}
295			else
296			{
297				SetError( TIXML_ERROR );
298				return;
299			}
300		}
301	}
302	// We should have returned sooner.
303	SetError( TIXML_ERROR );
304}
305
306
307const char* TiXmlDocument::Parse( const char* p )
308{
309	// Parse away, at the document level. Since a document
310	// contains nothing but other tags, most of what happens
311	// here is skipping white space.
312	//
313	// In this variant (as opposed to stream and Parse) we
314	// read everything we can.
315
316
317	if ( !p || !*p  || !( p = SkipWhiteSpace( p ) ) )
318	{
319		SetError( TIXML_ERROR_DOCUMENT_EMPTY );
320		return false;
321	}
322
323	while ( p && *p )
324	{
325		TiXmlNode* node = Identify( p );
326		if ( node )
327		{
328			p = node->Parse( p );
329			LinkEndChild( node );
330		}
331		else
332		{
333			break;
334		}
335		p = SkipWhiteSpace( p );
336	}
337	// All is well.
338	return p;
339}
340
341
342TiXmlNode* TiXmlNode::Identify( const char* p )
343{
344	TiXmlNode* returnNode = 0;
345
346	p = SkipWhiteSpace( p );
347	if( !p || !*p || *p != '<' )
348	{
349		return 0;
350	}
351
352	TiXmlDocument* doc = GetDocument();
353	p = SkipWhiteSpace( p );
354
355	if ( !p || !*p )
356	{
357		return 0;
358	}
359
360	// What is this thing?
361	// - Elements start with a letter or underscore, but xml is reserved.
362	// - Comments: <!--
363	// - Decleration: <?xml
364	// - Everthing else is unknown to tinyxml.
365	//
366
367	const char* xmlHeader = { "<?xml" };
368	const char* commentHeader = { "<!--" };
369
370	if ( StringEqual( p, xmlHeader, true ) )
371	{
372		#ifdef DEBUG_PARSER
373			TIXML_LOG( "XML parsing Declaration\n" );
374		#endif
375		returnNode = new TiXmlDeclaration();
376	}
377	else if (    isalpha( *(p+1) )
378			  || *(p+1) == '_' )
379	{
380		#ifdef DEBUG_PARSER
381			TIXML_LOG( "XML parsing Element\n" );
382		#endif
383		returnNode = new TiXmlElement( "" );
384	}
385	else if ( StringEqual( p, commentHeader, false ) )
386	{
387		#ifdef DEBUG_PARSER
388			TIXML_LOG( "XML parsing Comment\n" );
389		#endif
390		returnNode = new TiXmlComment();
391	}
392	else
393	{
394		#ifdef DEBUG_PARSER
395			TIXML_LOG( "XML parsing Unknown\n" );
396		#endif
397		returnNode = new TiXmlUnknown();
398	}
399
400	if ( returnNode )
401	{
402		// Set the parent, so it can report errors
403		returnNode->parent = this;
404		//p = returnNode->Parse( p );
405	}
406	else
407	{
408		if ( doc )
409			doc->SetError( TIXML_ERROR_OUT_OF_MEMORY );
410	}
411	return returnNode;
412}
413
414
415void TiXmlElement::StreamIn( std::istream* in, std::string* tag )
416{
417	// We're called with some amount of pre-parsing. That is, some of "this"
418	// element is in "tag". Go ahead and stream to the closing ">"
419	while( in->good() )
420	{
421		int c = in->get();
422		(*tag) += (char) c ;
423
424		if ( c == '>' )
425			break;
426	}
427
428	if ( tag->length() < 3 ) return;
429
430	// Okay...if we are a "/>" tag, then we're done. We've read a complete tag.
431	// If not, identify and stream.
432
433	if (    tag->at( tag->length() - 1 ) == '>'
434		 && tag->at( tag->length() - 2 ) == '/' )
435	{
436		// All good!
437		return;
438	}
439	else if ( tag->at( tag->length() - 1 ) == '>' )
440	{
441		// There is more. Could be:
442		//		text
443		//		closing tag
444		//		another node.
445		for ( ;; )
446		{
447			StreamWhiteSpace( in, tag );
448
449			// Do we have text?
450			if ( in->peek() != '<' )
451			{
452				// Yep, text.
453				TiXmlText text( "" );
454				text.StreamIn( in, tag );
455
456				// What follows text is a closing tag or another node.
457				// Go around again and figure it out.
458				continue;
459			}
460
461			// We now have either a closing tag...or another node.
462			// We should be at a "<", regardless.
463			if ( !in->good() ) return;
464			assert( in->peek() == '<' );
465			int tagIndex = tag->length();
466
467			bool closingTag = false;
468			bool firstCharFound = false;
469
470			for( ;; )
471			{
472				if ( !in->good() )
473					return;
474
475				int c = in->peek();
476
477				if ( c == '>' )
478					break;
479
480				*tag += c;
481				in->get();
482
483				if ( !firstCharFound && c != '<' && !IsWhiteSpace( c ) )
484				{
485					firstCharFound = true;
486					if ( c == '/' )
487						closingTag = true;
488				}
489			}
490			// If it was a closing tag, then read in the closing '>' to clean up the input stream.
491			// If it was not, the streaming will be done by the tag.
492			if ( closingTag )
493			{
494				int c = in->get();
495				assert( c == '>' );
496				*tag += c;
497
498				// We are done, once we've found our closing tag.
499				return;
500			}
501			else
502			{
503				// If not a closing tag, id it, and stream.
504				const char* tagloc = tag->c_str() + tagIndex;
505				TiXmlNode* node = Identify( tagloc );
506				if ( !node )
507					return;
508				node->StreamIn( in, tag );
509				delete node;
510				node = 0;
511
512				// No return: go around from the beginning: text, closing tag, or node.
513			}
514		}
515	}
516}
517
518
519const char* TiXmlElement::Parse( const char* p )
520{
521	p = SkipWhiteSpace( p );
522	TiXmlDocument* document = GetDocument();
523
524	if ( !p || !*p || *p != '<' )
525	{
526		if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
527		return false;
528	}
529
530	p = SkipWhiteSpace( p+1 );
531
532	// Read the name.
533	p = ReadName( p, &value );
534	if ( !p || !*p )
535	{
536		if ( document )	document->SetError( TIXML_ERROR_FAILED_TO_READ_ELEMENT_NAME );
537		return false;
538	}
539
540	string endTag = "</";
541	endTag += value;
542	endTag += ">";
543
544	// Check for and read attributes. Also look for an empty
545	// tag or an end tag.
546	while ( p && *p )
547	{
548		p = SkipWhiteSpace( p );
549		if ( !p || !*p )
550		{
551			if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
552			return 0;
553		}
554		if ( *p == '/' )
555		{
556			++p;
557			// Empty tag.
558			if ( *p  != '>' )
559			{
560				if ( document ) document->SetError( TIXML_ERROR_PARSING_EMPTY );
561				return 0;
562			}
563			return (p+1);
564		}
565		else if ( *p == '>' )
566		{
567			// Done with attributes (if there were any.)
568			// Read the value -- which can include other
569			// elements -- read the end tag, and return.
570			++p;
571			p = ReadValue( p );		// Note this is an Element method, and will set the error if one happens.
572			if ( !p || !*p )
573				return 0;
574
575			// We should find the end tag now
576			if ( StringEqual( p, endTag.c_str(), false ) )
577			{
578				p += endTag.length();
579				return p;
580			}
581			else
582			{
583				if ( document ) document->SetError( TIXML_ERROR_READING_END_TAG );
584				return 0;
585			}
586		}
587		else
588		{
589			// Try to read an element:
590			TiXmlAttribute attrib;
591			attrib.SetDocument( document );
592			p = attrib.Parse( p );
593
594			if ( !p || !*p )
595			{
596				if ( document ) document->SetError( TIXML_ERROR_PARSING_ELEMENT );
597				return 0;
598			}
599			SetAttribute( attrib.Name(), attrib.Value() );
600		}
601	}
602	return p;
603}
604
605
606const char* TiXmlElement::ReadValue( const char* p )
607{
608	TiXmlDocument* document = GetDocument();
609
610	// Read in text and elements in any order.
611	p = SkipWhiteSpace( p );
612	while ( p && *p )
613	{
614//		string text;
615//		while ( p && *p && *p != '<' )
616//		{
617//			text += (*p);
618//			++p;
619//		}
620//
621//		p = SkipWhiteSpace( p );
622
623		if ( *p != '<' )
624		{
625			// Take what we have, make a text element.
626			TiXmlText* textNode = new TiXmlText( "" );
627
628			if ( !textNode )
629			{
630				if ( document ) document->SetError( TIXML_ERROR_OUT_OF_MEMORY );
631				return 0;
632			}
633
634			p = textNode->Parse( p );
635
636			if ( !textNode->Blank() )
637				LinkEndChild( textNode );
638			else
639				delete textNode;
640		}
641		else
642		{
643			// We hit a '<'
644			// Have we hit a new element or an end tag?
645			if ( StringEqual( p, "</", false ) )
646			{
647				return p;
648			}
649			else
650			{
651				TiXmlNode* node = Identify( p );
652				if ( node )
653				{
654					p = node->Parse( p );
655					LinkEndChild( node );
656				}
657				else
658				{
659					return 0;
660				}
661			}
662		}
663		p = SkipWhiteSpace( p );
664	}
665
666	if ( !p )
667	{
668		if ( document ) document->SetError( TIXML_ERROR_READING_ELEMENT_VALUE );
669	}
670	return p;
671}
672
673
674void TiXmlUnknown::StreamIn( std::istream* in, std::string* tag )
675{
676	while ( in->good() )
677	{
678		int c = in->get();
679		(*tag) += c;
680
681		if ( c == '>' )
682		{
683			// All is well.
684			return;
685		}
686	}
687}
688
689
690const char* TiXmlUnknown::Parse( const char* p )
691{
692	TiXmlDocument* document = GetDocument();
693	p = SkipWhiteSpace( p );
694	if ( !p || !*p || *p != '<' )
695	{
696		if ( document ) document->SetError( TIXML_ERROR_PARSING_UNKNOWN );
697		return 0;
698	}
699	++p;
700	value = "";
701
702	while ( *p && *p != '>' )
703	{
704		value += *p;
705		++p;
706	}
707
708	if ( *p == '>' )
709		return p+1;
710	return p;
711}
712
713
714void TiXmlComment::StreamIn( std::istream* in, std::string* tag )
715{
716	while ( in->good() )
717	{
718		int c = in->get();
719		(*tag) += c;
720
721		if ( c == '>'
722			 && tag->at( tag->length() - 2 ) == '-'
723			 && tag->at( tag->length() - 3 ) == '-' )
724		{
725			// All is well.
726			return;
727		}
728	}
729}
730
731
732const char* TiXmlComment::Parse( const char* p )
733{
734	TiXmlDocument* document = GetDocument();
735	value = "";
736
737	p = SkipWhiteSpace( p );
738	const char* startTag = "<!--";
739	const char* endTag   = "-->";
740
741	if ( !StringEqual( p, startTag, false ) )
742	{
743		document->SetError( TIXML_ERROR_PARSING_COMMENT );
744		return 0;
745	}
746	p += strlen( startTag );
747	p = ReadText( p, &value, false, endTag, false );
748	return p;
749}
750
751
752const char* TiXmlAttribute::Parse( const char* p )
753{
754	p = SkipWhiteSpace( p );
755	if ( !p || !*p ) return 0;
756
757	// Read the name, the '=' and the value.
758	p = ReadName( p, &name );
759	if ( !p || !*p )
760	{
761		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
762		return 0;
763	}
764	p = SkipWhiteSpace( p );
765	if ( !p || !*p || *p != '=' )
766	{
767		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
768		return 0;
769	}
770
771	++p;	// skip '='
772	p = SkipWhiteSpace( p );
773	if ( !p || !*p )
774	{
775		if ( document ) document->SetError( TIXML_ERROR_READING_ATTRIBUTES );
776		return 0;
777	}
778
779	const char* end;
780
781	if ( *p == '\'' )
782	{
783		++p;
784		end = "\'";
785		p = ReadText( p, &value, false, end, false );
786	}
787	else if ( *p == '"' )
788	{
789		++p;
790		end = "\"";
791		p = ReadText( p, &value, false, end, false );
792	}
793	else
794	{
795		// All attribute values should be in single or double quotes.
796		// But this is such a common error that the parser will try
797		// its best, even without them.
798		value = "";
799		while (    p && *p										// existence
800				&& !isspace( *p ) && *p != '\n' && *p != '\r'	// whitespace
801				&& *p != '/' && *p != '>' )						// tag end
802		{
803			value += *p;
804			++p;
805		}
806	}
807	return p;
808}
809
810
811void TiXmlText::StreamIn( std::istream* in, std::string* tag )
812{
813	while ( in->good() )
814	{
815		int c = in->peek();
816		if ( c == '<' )
817			return;
818
819		(*tag) += c;
820		in->get();
821	}
822}
823
824
825
826const char* TiXmlText::Parse( const char* p )
827{
828	value = "";
829
830	//TiXmlDocument* doc = GetDocument();
831	bool ignoreWhite = true;
832//	if ( doc && !doc->IgnoreWhiteSpace() ) ignoreWhite = false;
833
834	const char* end = "<";
835	p = ReadText( p, &value, ignoreWhite, end, false );
836	if ( p )
837		return p-1;	// don't truncate the '<'
838	return 0;
839}
840
841
842void TiXmlDeclaration::StreamIn( std::istream* in, std::string* tag )
843{
844	while ( in->good() )
845	{
846		int c = in->get();
847		(*tag) += c;
848
849		if ( c == '>' )
850		{
851			// All is well.
852			return;
853		}
854	}
855}
856
857const char* TiXmlDeclaration::Parse( const char* p )
858{
859	p = SkipWhiteSpace( p );
860	// Find the beginning, find the end, and look for
861	// the stuff in-between.
862	TiXmlDocument* document = GetDocument();
863	if ( !p || !*p || !StringEqual( p, "<?xml", true ) )
864	{
865		if ( document ) document->SetError( TIXML_ERROR_PARSING_DECLARATION );
866		return 0;
867	}
868
869	p += 5;
870//	const char* start = p+5;
871//	const char* end  = strstr( start, "?>" );
872
873	version = "";
874	encoding = "";
875	standalone = "";
876
877	while ( p && *p )
878	{
879		if ( *p == '>' )
880		{
881			++p;
882			return p;
883		}
884
885		p = SkipWhiteSpace( p );
886		if ( StringEqual( p, "version", true ) )
887		{
888//			p += 7;
889			TiXmlAttribute attrib;
890			p = attrib.Parse( p );
891			version = attrib.Value();
892		}
893		else if ( StringEqual( p, "encoding", true ) )
894		{
895//			p += 8;
896			TiXmlAttribute attrib;
897			p = attrib.Parse( p );
898			encoding = attrib.Value();
899		}
900		else if ( StringEqual( p, "standalone", true ) )
901		{
902//			p += 10;
903			TiXmlAttribute attrib;
904			p = attrib.Parse( p );
905			standalone = attrib.Value();
906		}
907		else
908		{
909			// Read over whatever it is.
910			while( p && *p && *p != '>' && !isspace( *p ) )
911				++p;
912		}
913	}
914	return 0;
915}
916
917bool TiXmlText::Blank() const
918{
919	for ( unsigned i=0; i<value.size(); i++ )
920		if ( !isspace( value[i] ) )
921			return false;
922	return true;
923}
924
925