-
Notifications
You must be signed in to change notification settings - Fork 57
/
Copy pathxml.parse.m
3674 lines (3348 loc) · 145 KB
/
xml.parse.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
%---------------------------------------------------------------------------%
% Copyright (C) 2000, 2005-2006, 2011 The University of Melbourne.
% Copyright (C) 2014, 2018 The Mercury team.
% This file is distributed under the terms specified in COPYING.LIB.
%---------------------------------------------------------------------------%
%
% Main author: conway@cs.mu.oz.au.
%
%---------------------------------------------------------------------------%
%
% W3C REC-xml-19980210
%
% Extensible Markup Language (XML) 1.0
%
% W3C Recommendation 10-February-1998
%
% This version:
% http://www.w3.org/TR/1998/REC-xml-19980210
% http://www.w3.org/TR/1998/REC-xml-19980210.xml
% http://www.w3.org/TR/1998/REC-xml-19980210.html
% http://www.w3.org/TR/1998/REC-xml-19980210.pdf
% http://www.w3.org/TR/1998/REC-xml-19980210.ps
%
% Latest version:
% http://www.w3.org/TR/REC-xml
%
% Previous version:
% http://www.w3.org/TR/PR-xml-971208
%
% Editors:
% Tim Bray (Textuality and Netscape) <tbray@textuality.com>
% Jean Paoli (Microsoft) <jeanpa@microsoft.com>
% C. M. Sperberg-McQueen (University of Illinois at Chicago)
%
% Abstract
%
% The Extensible Markup Language (XML) is a subset of SGML that is
% completely described in this document. Its goal is to enable generic
% SGML to be served, received, and processed on the Web in the way that
% is now possible with HTML. XML has been designed for ease of
% implementation and for interoperability with both SGML and HTML.
%
% Status of this document
%
% This document has been reviewed by W3C Members and other interested
% parties and has been endorsed by the Director as a W3C Recommendation.
% It is a stable document and may be used as reference material or cited
% as a normative reference from another document. W3C's role in making
% the Recommendation is to draw attention to the specification and to
% promote its widespread deployment. This enhances the functionality and
% interoperability of the Web.
%
% This document specifies a syntax created by subsetting an existing,
% widely used international text processing standard (Standard
% Generalized Markup Language, ISO 8879:1986(E) as amended and
% corrected) for use on the World Wide Web. It is a product of the W3C
% XML Activity, details of which can be found at http://www.w3.org/XML.
% A list of current W3C Recommendations and other technical documents
% can be found at http://www.w3.org/TR.
%
% This specification uses the term URI, which is defined by [Berners-Lee
% et al.], a work in progress expected to update [IETF RFC1738] and
% [IETF RFC1808].
%
% The list of known errors in this specification is available at
% http://www.w3.org/XML/xml-19980210-errata.
%
% Please report errors in this document to xml-editor@w3.org.
%
% Extensible Markup Language (XML) 1.0
%
% Table of Contents
%
% 1. Introduction
% 1.1 Origin and Goals
% 1.2 Terminology
% 2. Documents
% 2.1 Well-Formed XML Documents
% 2.2 Characters
% 2.3 Common Syntactic Constructs
% 2.4 Character Data and Markup
% 2.5 Comments
% 2.6 Processing Instructions
% 2.7 CDATA Sections
% 2.8 Prolog and Document Type Declaration
% 2.9 Standalone Document Declaration
% 2.10 White Space Handling
% 2.11 End-of-Line Handling
% 2.12 Language Identification
% 3. Logical Structures
% 3.1 Start-Tags, End-Tags, and Empty-Element Tags
% 3.2 Element Type Declarations
% 3.2.1 Element Content
% 3.2.2 Mixed Content
% 3.3 Attribute-List Declarations
% 3.3.1 Attribute Types
% 3.3.2 Attribute Defaults
% 3.3.3 Attribute-Value Normalization
% 3.4 Conditional Sections
% 4. Physical Structures
% 4.1 Character and Entity References
% 4.2 Entity Declarations
% 4.2.1 Internal Entities
% 4.2.2 External Entities
% 4.3 Parsed Entities
% 4.3.1 The Text Declaration
% 4.3.2 Well-Formed Parsed Entities
% 4.3.3 Character Encoding in Entities
% 4.4 XML Processor Treatment of Entities and References
% 4.4.1 Not Recognized
% 4.4.2 Included
% 4.4.3 Included If Validating
% 4.4.4 Forbidden
% 4.4.5 Included in Literal
% 4.4.6 Notify
% 4.4.7 Bypassed
% 4.4.8 Included as PE
% 4.5 Construction of Internal Entity Replacement Text
% 4.6 Predefined Entities
% 4.7 Notation Declarations
% 4.8 Document Entity
% 5. Conformance
% 5.1 Validating and Non-Validating Processors
% 5.2 Using XML Processors
% 6. Notation
%
% Appendices
%
% A. References
% A.1 Normative References
% A.2 Other References
% B. Character Classes
% C. XML and SGML (Non-Normative)
% D. Expansion of Entity and Character References (Non-Normative)
% E. Deterministic Content Models (Non-Normative)
% F. Autodetection of Character Encodings (Non-Normative)
% G. W3C XML Working Group (Non-Normative)
% _________________________________________________________________
%
% 1. Introduction
%
% Extensible Markup Language, abbreviated XML, describes a class of data
% objects called XML documents and partially describes the behavior of
% computer programs which process them. XML is an application profile or
% restricted form of SGML, the Standard Generalized Markup Language [ISO
% 8879]. By construction, XML documents are conforming SGML documents.
%
% XML documents are made up of storage units called entities, which
% contain either parsed or unparsed data. Parsed data is made up of
% characters, some of which form character data, and some of which form
% markup. Markup encodes a description of the document's storage layout
% and logical structure. XML provides a mechanism to impose constraints
% on the storage layout and logical structure.
%
% A software module called an XML processor is used to read XML
% documents and provide access to their content and structure. It is
% assumed that an XML processor is doing its work on behalf of another
% module, called the application. This specification describes the
% required behavior of an XML processor in terms of how it must read XML
% data and the information it must provide to the application.
%
% 1.1 Origin and Goals
%
% XML was developed by an XML Working Group (originally known as the
% SGML Editorial Review Board) formed under the auspices of the World
% Wide Web Consortium (W3C) in 1996. It was chaired by Jon Bosak of Sun
% Microsystems with the active participation of an XML Special Interest
% Group (previously known as the SGML Working Group) also organized by
% the W3C. The membership of the XML Working Group is given in an
% appendix. Dan Connolly served as the WG's contact with the W3C.
%
% The design goals for XML are:
% 1. XML shall be straightforwardly usable over the Internet.
% 2. XML shall support a wide variety of applications.
% 3. XML shall be compatible with SGML.
% 4. It shall be easy to write programs which process XML documents.
% 5. The number of optional features in XML is to be kept to the
% absolute minimum, ideally zero.
% 6. XML documents should be human-legible and reasonably clear.
% 7. The XML design should be prepared quickly.
% 8. The design of XML shall be formal and concise.
% 9. XML documents shall be easy to create.
% 10. Terseness in XML markup is of minimal importance.
%
% This specification, together with associated standards (Unicode and
% ISO/IEC 10646 for characters, Internet RFC 1766 for language
% identification tags, ISO 639 for language name codes, and ISO 3166 for
% country name codes), provides all the information necessary to
% understand XML Version 1.0 and construct computer programs to process
% it.
%
% This version of the XML specification may be distributed freely, as
% long as all text and legal notices remain intact.
%
% 1.2 Terminology
%
% The terminology used to describe XML documents is defined in the body
% of this specification. The terms defined in the following list are
% used in building those definitions and in describing the actions of an
% XML processor:
%
% may
% Conforming documents and XML processors are permitted to but
% need not behave as described.
%
% must
% Conforming documents and XML processors are required to behave
% as described; otherwise they are in error.
%
% error
% A violation of the rules of this specification; results are
% undefined. Conforming software may detect and report an error
% and may recover from it.
%
% fatal error
% An error which a conforming XML processor must detect and
% report to the application. After encountering a fatal error,
% the processor may continue processing the data to search for
% further errors and may report such errors to the application.
% In order to support correction of errors, the processor may
% make unprocessed data from the document (with intermingled
% character data and markup) available to the application. Once a
% fatal error is detected, however, the processor must not
% continue normal processing (i.e., it must not continue to pass
% character data and information about the document's logical
% structure to the application in the normal way).
%
% at user option
% Conforming software may or must (depending on the modal verb in
% the sentence) behave as described; if it does, it must provide
% users a means to enable or disable the behavior described.
%
% validity constraint
% A rule which applies to all valid XML documents. Violations of
% validity constraints are errors; they must, at user option, be
% reported by validating XML processors.
%
% well-formedness constraint
% A rule which applies to all well-formed XML documents.
% Violations of well-formedness constraints are fatal errors.
%
% match
% (Of strings or names:) Two strings or names being compared must
% be identical. Characters with multiple possible representations
% in ISO/IEC 10646 (e.g. characters with both precomposed and
% base+diacritic forms) match only if they have the same
% representation in both strings. At user option, processors may
% normalize such characters to some canonical form. No case
% folding is performed. (Of strings and rules in the grammar:) A
% string matches a grammatical production if it belongs to the
% language generated by that production. (Of content and content
% models:) An element matches its declaration when it conforms in
% the fashion described in the constraint "Element Valid".
%
% for compatibility
% A feature of XML included solely to ensure that XML remains
% compatible with SGML.
%
% for interoperability
% A non-binding recommendation included to increase the chances
% that XML documents can be processed by the existing installed
% base of SGML processors which predate the WebSGML Adaptations
% Annex to ISO 8879.
%
% 2. Documents
%
% A data object is an XML document if it is well-formed, as defined in
% this specification. A well-formed XML document may in addition be
% valid if it meets certain further constraints.
%
% Each XML document has both a logical and a physical structure.
% Physically, the document is composed of units called entities. An
% entity may refer to other entities to cause their inclusion in the
% document. A document begins in a "root" or document entity. Logically,
% the document is composed of declarations, elements, comments,
% character references, and processing instructions, all of which are
% indicated in the document by explicit markup. The logical and physical
% structures must nest properly, as described in "4.3.2 Well-Formed
% Parsed Entities".
%
% 2.1 Well-Formed XML Documents
%
% A textual object is a well-formed XML document if:
% 1. Taken as a whole, it matches the production labeled document.
% 2. It meets all the well-formedness constraints given in this
% specification.
% 3. Each of the parsed entities which is referenced directly or
% indirectly within the document is well-formed.
%
% Document
% [1] document ::= prolog element Misc*
%
%---------------------------------------------------------------------------%
:- module xml.parse.
:- interface.
:- import_module parsing.
:- import_module xml.cat.
:- import_module xml.doc.
:- import_module xml.dtd.
:- import_module map.
% The following three globals should be set in the globals included
% in the initial parsing state.
%
% Each global is described by two types: a single-constructor "key" type
% (gCatalog, gDirs, gEncodings) and the type of the value associated with
% that key. The `global' instance declarations at the end of this group
% pair each key type with its value type.
% NOTE(review): the `global' typeclass is not declared in this module;
% presumably it comes from the imported `parsing' module -- confirm.
%
% Key type for the catalog global; its value type is `catalog'.
:- type gCatalog
---> gCatalog.
% Key type for the directories global; its value type is `parse.dirs'.
:- type gDirs
---> gDirs.
% Value type for gDirs: a wrapper around a `cat.dirs' value.
:- type dirs
---> dirs(cat.dirs).
% Key type for the encodings global; its value type is `encodings'.
:- type gEncodings
---> gEncodings.
% Value type for gEncodings: a wrapper around a map from a string to an
% `encoding'.
% NOTE(review): the string key is presumably the encoding name -- confirm.
:- type encodings
---> encodings(map(string, encoding)).
% Pair each exported key type with its value type.
:- instance global(gCatalog, catalog).
:- instance global(gDirs, parse.dirs).
:- instance global(gEncodings, encodings).
% parse_document parses a whole document following production [1]
% (document ::= prolog element Misc*): the prolog, then the root element,
% then any trailing misc items. The value returned through the parser
% state is the pair {DTD, Document}, where DTD is the value produced by
% the prolog and Document is built from the misc items that precede the
% root, the root element itself and the misc items that follow it.
:- pred parse_document(pstate(_)::pdi, pstate({dtd, document})::puo) is det.
%---------------------------------------------------------------------------%
:- implementation.
:- import_module unicode.
% Note that using import_module for char would generate many ambiguities
% between char.m in the standard library and unicode.m.
:- use_module char.
:- import_module array.
:- import_module int.
:- import_module io.
:- import_module list.
:- import_module pair.
:- import_module prolog.
:- import_module require.
:- import_module string.
:- import_module unit.
%---------------------------------------------------------------------------%
% Instance definitions for the three exported globals. The method lists
% are empty (`where []'), so these instances only record which value type
% belongs to which key type.
:- instance global(gCatalog, catalog) where [].
:- instance global(gDirs, parse.dirs) where [].
:- instance global(gEncodings, encodings) where [].
% Globals private to this module. As with the exported globals, each one
% has a single-constructor key type (gXxx) and, where needed, a wrapper
% type for the value. parse_document installs a value for every one of
% these keys.
%
% Key type for the content store; its value type is `content_store'.
:- type gContent
---> gContent.
% Key type for the element declarations; its value type is `elements'.
:- type gElements
---> gElements.
% Value type for gElements: element declarations indexed by name.
:- type elements
---> elements(map(name, dtd.element)).
% Key type for the attribute declarations; its value type is `attributes'.
:- type gAttributes
---> gAttributes.
% Value type for gAttributes: a two-level map from a name to a map from a
% name to a `dtd.attribute'.
% NOTE(review): presumably element name, then attribute name -- confirm.
:- type attributes
---> attributes(map(name, map(name, dtd.attribute))).
% Key type for the entity table; its value type is `entities'.
:- type gEntities
---> gEntities.
% Value type for gEntities: entity definitions indexed by name.
:- type entities
---> entities(map(name, entity_def)).
% Key type for the second entity table; its value type is `p_entities'.
% NOTE(review): presumably the parameter entities -- confirm.
:- type gPEntities
---> gPEntities.
% Value type for gPEntities: entity definitions indexed by name.
:- type p_entities
---> p_entities(map(name, entity_def)).
% Key type for the DTD; its value type is `dtd'.
:- type gDTD
---> gDTD.
% Key type for the external entity table; its value type is `extEntities'.
:- type gExtEntities
---> gExtEntities.
% Value type for gExtEntities: a map from an external identifier to a
% `dtd.entity'.
:- type extEntities
---> extEntities(map(external_id, dtd.entity)).
% Pair each private key type with its value type. The method lists are
% empty.
:- instance global(gContent, content_store) where [].
:- instance global(gElements, elements) where [].
:- instance global(gAttributes, attributes) where [].
:- instance global(gEntities, entities) where [].
:- instance global(gPEntities, p_entities) where [].
:- instance global(gDTD, dtd) where [].
:- instance global(gExtEntities, extEntities) where [].
% Parse a document: install fresh values for all module-private globals,
% then parse the prolog, the root element and the trailing misc items,
% and return the pair {DTD, Document}.
parse_document -->
    % Install the initial value of every private global before any input
    % is consumed. The entity table starts out holding the predefined
    % entities; every other table starts out empty.
    { init_content_store(EmptyStore) },
    set_global(gContent, EmptyStore),
    set_global(gExtEntities, extEntities(init)),
    set_global(gEntities, entities(entities)),
    set_global(gPEntities, p_entities(init)),
    set_global(gElements, elements(init)),
    set_global(gAttributes, attributes(init)),
    prolog `next` (pred({Dtd, LeadingMisc}::in, pdi, puo) is det -->
        % Record the DTD produced by the prolog before the root element
        % is parsed.
        set_global(gDTD, Dtd),
        element `next` (pred(RootElem::in, pdi, puo) is det -->
            star(misc) `next` (pred(TrailingMisc0::in, pdi, puo) is det -->
                % The content store is read back only once the whole
                % document has been consumed.
                get_global(gContent, Store),
                { filter_opt(TrailingMisc0, TrailingMisc) },
                { Elements = array(values(Store ^ e_map)) },
                { Doc = doc(LeadingMisc, RootElem, TrailingMisc, Elements) },
                return({Dtd, Doc})
            )
        )
    ).
% init_content_store(Store): Store is a content store whose first field
% is 0 and whose map is empty.
:- pred init_content_store(content_store::out) is det.

init_content_store(Store) :-
    Store = content(0, map.init).
% same_type(A, B) always succeeds and never inspects its arguments (both
% have mode `unused'). Its only effect is on type checking: both
% arguments are declared with the same type variable T, so a call forces
% the two argument expressions to have the same type.
:- pred same_type(T::unused, T::unused) is det.

same_type(_Left, _Right).
% entities: the initial entity table, holding the five entities that
% XML 1.0 predefines (lt, gt, amp, quot, apos), each mapped to an
% internal entity whose text is the character it stands for.
%
% "quot" stands for the double quote (") and "apos" for the
% apostrophe ('). These two were previously the wrong way round.
:- func entities = map(name, entity_def).

entities = Entities :-
    map.from_assoc_list([
        "lt"    - entity_internal("<"),
        "gt"    - entity_internal(">"),
        "amp"   - entity_internal("&"),
        % The double quote must be escaped inside a Mercury string literal.
        "quot"  - entity_internal("\""),
        "apos"  - entity_internal("'")
    ], Entities).
% init_dtd(Root, DTD): DTD is a DTD value for root name Root with an
% empty first map (Elems), an entity table containing only the five
% entities predefined by XML 1.0, and an empty second entity table
% (PEntities).
%
% "quot" stands for the double quote (") and "apos" for the
% apostrophe ('). These two were previously the wrong way round.
:- pred init_dtd(name::in, dtd::out) is det.

init_dtd(Root, DTD) :-
    map.init(Elems),
    map.from_assoc_list([
        "lt"    - entity_internal("<"),
        "gt"    - entity_internal(">"),
        "amp"   - entity_internal("&"),
        % The double quote must be escaped inside a Mercury string literal.
        "quot"  - entity_internal("\""),
        "apos"  - entity_internal("'")
    ], Entities),
    map.init(PEntities),
    DTD = dtd(Root, Elems, Entities, PEntities).
% Matching the document production implies that:
% 1. It contains one or more elements.
% 2. There is exactly one element, called the root, or document
% element, no part of which appears in the content of any other
% element. For all other elements, if the start-tag is in the
% content of another element, the end-tag is in the content of the
% same element. More simply stated, the elements, delimited by
% start- and end-tags, nest properly within each other.
%
% As a consequence of this, for each non-root element C in the document,
% there is one other element P in the document such that C is in the
% content of P, but is not in the content of any other element that is
% in the content of P. P is referred to as the parent of C, and C as a
% child of P.
%
% 2.2 Characters
%
% A parsed entity contains text, a sequence of characters, which may
% represent markup or character data. A character is an atomic unit of
% text as specified by ISO/IEC 10646 [ISO/IEC 10646]. Legal characters
% are tab, carriage return, line feed, and the legal graphic characters
% of Unicode and ISO/IEC 10646. The use of "compatibility characters",
% as defined in section 6.8 of [Unicode], is discouraged.
%
% Character Range
% [2] Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD]
% | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate
% blocks, FFFE, and FFFF. */
% char: read one token with `tok' and return it if its code point is
% allowed by production [2] (Char); otherwise record an error that gives
% the offending code point in hexadecimal.
:- pred char(pstate(_)::pdi, pstate(unicode)::puo) is det.

char -->
    tok `next` (pred(Code::in, pdi, puo) is det -->
        ( if
            {
                % Tab, line feed and carriage return are the only
                % permitted code points below #x20.
                ( Code = 0x09 ; Code = 0x0A ; Code = 0x0D )
            ;
                0x20 =< Code, Code =< 0xD7FF
            ;
                % The gap before #xE000 excludes the surrogate blocks.
                0xE000 =< Code, Code =< 0xFFFD
            ;
                0x10000 =< Code, Code =< 0x10FFFF
            }
        then
            return(Code)
        else
            { string.format("Unexpected character `%x'.", [i(Code)],
                Message) },
            record_error(Message)
        )
    ).
% The mechanism for encoding character code points into bit patterns
% may vary from entity to entity. All XML processors must accept the UTF-8
% and UTF-16 encodings of 10646; the mechanisms for signaling which of
% the two is in use, or for bringing other encodings into play, are
% discussed later, in "4.3.3 Character Encoding in Entities".
%
% 2.3 Common Syntactic Constructs
%
% This section defines some symbols used widely in the grammar.
%
% S (white space) consists of one or more space (#x20) characters,
% carriage returns, line feeds, or tabs.
%
% White Space
% [3] S ::= (#x20 | #x9 | #xD | #xA)+
% S (production [3]): applies `plus` to the single-character parser ws0
% and yields the matched characters as a list.
:- pred ws(pstate(_)::pdi, pstate(list(unicode))::puo) is det.

ws -->
    plus(ws0).

% Reads one token with `tok` and returns it if it is a tab, line feed,
% carriage return or space. Otherwise it calls
% record_failure("not whitespace").
:- pred ws0(pstate(_)::pdi, pstate(unicode)::puo) is det.

ws0 -->
    tok `next` (pred(Code::in, pdi, puo) is det -->
        (
            { Code = 0x09 ; Code = 0x0A ; Code = 0x0D ; Code = 0x20 }
        ->
            return(Code)
        ;
            record_failure("not whitespace")
        )
    ).
% Characters are classified for convenience as letters, digits, or other
% characters. Letters consist of an alphabetic or syllabic base
% character possibly followed by one or more combining characters, or of
% an ideographic character. Full definitions of the specific characters
% in each class are given in "B. Character Classes".
%
% A Name is a token beginning with a letter or one of a few punctuation
% characters, and continuing with letters, digits, hyphens, underscores,
% colons, or full stops, together known as name characters. Names
% beginning with the string "xml", or any string which would match
% (('X'|'x') ('M'|'m') ('L'|'l')), are reserved for standardization in
% this or future versions of this specification.
%
% Note: The colon character within XML names is reserved for
% experimentation with name spaces. Its meaning is expected to be
% standardized at some future point, at which point those documents
% using the colon for experimental purposes may need to be updated.
% (There is no guarantee that any name-space mechanism adopted for XML
% will in fact use the colon as a name-space delimiter.) In practice,
% this means that authors should not use the colon in XML names except
% as part of name-space experiments, but that XML processors should
% accept the colon as a name character.
%
% An Nmtoken (name token) is any mixture of name characters.
%
% Names and Tokens
% [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':'
% | CombiningChar | Extender
% Parses a single NameChar (production [4]). The alternatives of the
% production (letter, digit, '.', '-', '_', ':', combining character,
% extender) are joined with the `or` combinator.
%
% NOTE(review): this predicate is declared with in/out modes, whereas
% most neighbouring parsers use pdi/puo - confirm this is intended.
:- pred name_char(pstate(_)::in, pstate(unicode)::out) is det.
name_char -->
letter or digit or mchr('.') or mchr('-') or mchr('_') or mchr(':') or
combining_char or extender.
% [5] Name ::= (Letter | '_' | ':') (NameChar)*
% Name (production [5]): a first character that is a letter, '_' or ':',
% followed by `star(name_char)`. The characters are turned into a name
% with make_string.
:- pred name(pstate(_)::in, pstate(name)::out) is det.

name -->
    (letter or mchr('_') or mchr(':'))
        `next` (pred(First::in, pdi, puo) is det -->
    star(name_char)
        `next` (pred(Rest::in, pdi, puo) is det -->
            make_string([First | Rest], Str),
            return(Str)
    )).
% [6] Names ::= Name (S Name)*
% Names (production [6]): one name followed by `star` of (ws and name).
% `second` keeps the name from each such pair. The result is the list of
% names in the order read.
:- pred names(pstate(_)::pdi, pstate(list(name))::puo) is det.

names -->
    name
        `next` (pred(Head::in, pdi, puo) is det -->
    star(second(ws and name))
        `next` (pred(Tail::in, pdi, puo) is det -->
            return([Head | Tail])
    )).
% [7] Nmtoken ::= (NameChar)+
% Nmtoken (production [7]): `plus(name_char)`, with the matched
% characters turned into a name by make_string.
:- pred nm_token(pstate(_)::pdi, pstate(name)::puo) is det.

nm_token -->
    plus(name_char)
        `next` (pred(Codes::in, pdi, puo) is det -->
            make_string(Codes, Token),
            return(Token)
        ).
% [8] Nmtokens ::= Nmtoken (S Nmtoken)*
% Nmtokens (production [8]): one nm_token followed by `star` of
% (ws and nm_token). `second` keeps the token from each such pair.
:- pred nm_tokens(pstate(_)::pdi, pstate(list(name))::puo) is det.

nm_tokens -->
    nm_token
        `next` (pred(First::in, pdi, puo) is det -->
    star(second(ws and nm_token))
        `next` (pred(Rest::in, pdi, puo) is det -->
            { Tokens = [First | Rest] },
            return(Tokens)
    )).
% Literal data is any quoted string not containing the quotation mark
% used as a delimiter for that string. Literals are used for specifying
% the content of internal entities (EntityValue), the values of
% attributes (AttValue), and external identifiers (SystemLiteral). Note
% that a SystemLiteral can be parsed without scanning for markup.
%
% Literals
% [9] EntityValue ::= '"' ([^%&"] | PEReference | Reference)* '"'
% | "'" ([^%&'] | PEReference | Reference)* "'"
% EntityValue (production [9]): an opening quote, the body parsed by
% entity_value1 with that quote as delimiter, and a closing quote.
% If the two quotes differ it calls record_error("mismatched quotes").
% Otherwise the body is returned as a string.
:- pred entity_value(pstate(_)::pdi, pstate(string)::puo) is det.

entity_value -->
    quote `next` (pred(OpenQ::in, pdi, puo) is det -->
    entity_value1(OpenQ) `next` (pred(Body::in, pdi, puo) is det -->
    quote `next` (pred(CloseQ::in, pdi, puo) is det -->
        ( { OpenQ = CloseQ } ->
            make_string(Body, Value),
            return(Value)
        ;
            record_error("mismatched quotes")
        )
    ))).
% Body of an EntityValue delimited by quote character Q. It repeats,
% under `star`, three alternatives: a character reference, any
% character accepted by except([('%'), Q]), or a parameter-entity
% reference whose text is parsed with star(char). The pieces are
% flattened into one list of characters.
:- pred entity_value1(unicode::in,
    pstate(_)::pdi, pstate(list(unicode))::puo) is det.

entity_value1(Q) -->
    star(
        list(char_ref) or
        list(except([('%'), Q])) or
        pe_reference(star(char))
    )
        `next` (pred(Pieces::in, pdi, puo) is det -->
            { list.condense(Pieces, Flat) },
            return(Flat)
        ).
% [10] AttValue ::= '"' ([^<&"] | Reference)* '"'
% | "'" ([^<&'] | Reference)* "'"
% AttValue (production [10]): an opening quote, the body parsed by
% attr_value1 with that quote as delimiter, and a closing quote.
% If the two quotes differ it calls record_error("mismatched quotes").
% Otherwise the body is returned as a string.
:- pred attr_value(pstate(_)::pdi, pstate(string)::puo) is det.

attr_value -->
    quote `next` (pred(OpenQ::in, pdi, puo) is det -->
    attr_value1(OpenQ) `next` (pred(Body::in, pdi, puo) is det -->
    quote `next` (pred(CloseQ::in, pdi, puo) is det -->
        ( { OpenQ = CloseQ } ->
            make_string(Body, Value),
            return(Value)
        ;
            record_error("mismatched quotes")
        )
    ))).
% Body of an AttValue delimited by quote character Q. It repeats, under
% `star`, three alternatives: a character reference, any character
% accepted by except([('&'), ('<'), Q]), or an entity reference whose
% text is parsed with star(char). The pieces are flattened into one
% list of characters.
%
% NOTE(review): the entity text is parsed with star(char), while
% attr_value2 handles nested references in entity text - confirm which
% parser is intended here.
:- pred attr_value1(unicode::in,
    pstate(_)::pdi, pstate(list(unicode))::puo) is det.

attr_value1(Q) -->
    star(
        list(char_ref) or
        list(except([('&'), ('<'), Q])) or
        entity_ref(star(char))
    )
        `next` (pred(Pieces::in, pdi, puo) is det -->
            { condense(Pieces, Flat) },
            return(Flat)
        ).
% Undelimited attribute text. It repeats, under `star`, three
% alternatives: a character reference, an entity reference whose text
% is parsed recursively by attr_value2, or a single char. The pieces
% are flattened into one list of characters.
:- pred attr_value2(pstate(_)::pdi, pstate(list(unicode))::puo) is det.

attr_value2 -->
    star(
        list(char_ref) or
        entity_ref(attr_value2) or
        list(char)
    )
        `next` (pred(Pieces::in, pdi, puo) is det -->
            { condense(Pieces, Flat) },
            return(Flat)
        ).
% [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
% SystemLiteral (production [11]): an opening quote,
% `star(except([Q]))` for the body, and a closing quote.
% If the two quotes differ it calls record_error("mismatched quotes").
% Otherwise the body is returned as a string.
:- pred system_literal(pstate(_)::pdi, pstate(string)::puo) is det.

system_literal -->
    quote `next` (pred(OpenQ::in, pdi, puo) is det -->
    star(except([OpenQ])) `next` (pred(Body::in, pdi, puo) is det -->
    quote `next` (pred(CloseQ::in, pdi, puo) is det -->
        ( { OpenQ = CloseQ } ->
            make_string(Body, Value),
            return(Value)
        ;
            record_error("mismatched quotes")
        )
    ))).
% [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
% PubidLiteral (production [12]): an opening quote,
% `star(pub_id_char(Q))` for the body, and a closing quote.
% If the two quotes differ it calls record_error("mismatched quotes").
% Otherwise the body is returned as a string.
:- pred pub_id_literal(pstate(_)::pdi, pstate(string)::puo) is det.

pub_id_literal -->
    quote `next` (pred(OpenQ::in, pdi, puo) is det -->
    star(pub_id_char(OpenQ)) `next` (pred(Body::in, pdi, puo) is det -->
    quote `next` (pred(CloseQ::in, pdi, puo) is det -->
        ( { OpenQ = CloseQ } ->
            make_string(Body, Value),
            return(Value)
        ;
            record_error("mismatched quotes")
        )
    ))).
% [13] PubidChar ::= #x20 | #xD | #xA | [a-zA-Z0-9]
% | [-'()+,./:=?;!*#@$_%]
% pub_id_char(Q):
%
% Reads one token with `tok` and returns it if it differs from the
% delimiting quote Q and is a PubidChar (production [13]): space,
% carriage return, line feed, an ASCII letter or digit, or one of the
% punctuation characters -'()+,./:=?;!*#@$_%
% Otherwise it calls record_failure("not a public_id char").
%
% The double quote is deliberately not in the punctuation set: it is not
% a PubidChar, so it must not be accepted inside a single-quoted literal.
%
% NOTE(review): `a`, `z`, 'A', 'Z', '0' and '9' are compared against a
% unicode value, so they are presumably unicode-valued constants defined
% elsewhere in the project - confirm.
:- pred pub_id_char(unicode::in, pstate(_)::pdi, pstate(unicode)::puo) is det.

pub_id_char(Q) -->
    tok `next` (pred(C::in, pdi, puo) is det -->
        ( if
            { C \= Q },
            {
                C = 0x20
            ;
                C = 0x0D
            ;
                C = 0x0A
            ;
                C >= a, C =< z
            ;
                C >= 'A', C =< 'Z'
            ;
                C >= '0', C =< '9'
            ;
                char.to_int(Ch, C),
                contains_char("-'()+,./:=?;!*#@$_%", Ch)
            }
        then
            return(C)
        else
            record_failure("not a public_id char")
        )
    ).
% 2.4 Character Data and Markup
%
% Text consists of intermingled character data and markup. Markup takes
% the form of start-tags, end-tags, empty-element tags, entity
% references, character references, comments, CDATA section delimiters,
% document type declarations, and processing instructions.
%
% All text that is not markup constitutes the character data of the
% document.
%
% The ampersand character (&) and the left angle bracket (<) may appear
% in their literal form only when used as markup delimiters, or within a
% comment, a processing instruction, or a CDATA section. They are also
% legal within the literal entity value of an internal entity
% declaration; see "4.3.2 Well-Formed Parsed Entities". If they are
% needed elsewhere, they must be escaped using either numeric character
% references or the strings "&amp;" and "&lt;" respectively. The right
% angle bracket (>) may be represented using the string "&gt;", and
% must, for compatibility, be escaped using "&gt;" or a character
% reference when it appears in the string "]]>" in content, when that
% string is not marking the end of a CDATA section.
%
% In the content of elements, character data is any string of characters
% which does not contain the start-delimiter of any markup. In a CDATA
% section, character data is any string of characters not including the
% CDATA-section-close delimiter, "]]>".
%
% To allow attribute values to contain both single and double quotes,
% the apostrophe or single-quote character (') may be represented as
% "&apos;", and the double-quote character (") as "&quot;".
%
% Character Data
% [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
% CharData (production [14]): `plus` of either a character accepted by
% except([('<'), ('&')]) or a character reference. The text is added to
% the document as a data(...) node via `add`, and the resulting ref is
% returned.
:- pred char_data(pstate(_)::pdi, pstate(ref(doc.content))::puo) is det.

char_data -->
    plus(except([('<'), ('&')]) or char_ref)
        `next` (pred(Codes::in, pdi, puo) is det -->
            make_string(Codes, Text),
            add(data(Text), ContentRef),
            return(ContentRef)
        ).
% 2.5 Comments
%
% Comments may appear anywhere in a document outside other markup; in
% addition, they may appear within the document type declaration at
% places allowed by the grammar. They are not part of the document's
% character data; an XML processor may, but need not, make it possible
% for an application to retrieve the text of comments. For
% compatibility, the string "--" (double-hyphen) must not occur within
% comments.
%
% Comments
% [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
% Comment (production [15]): matches "<!--", then uses `upto` to collect
% chars up to the "-->" terminator. The collected text is added to the
% document as a comment(...) node via `add`, and the resulting ref is
% returned.
:- pred comment(pstate(_)::pdi, pstate(ref(doc.content))::puo) is det.

comment -->
    mstr("<!--")
        `next` (pred(_::in, pdi, puo) is det -->
    upto(char, mstr("-->"))
        `next` (pred(next(Body, _)::in, pdi, puo) is det -->
            make_string(Body, Text),
            add(comment(Text), ContentRef),
            return(ContentRef)
    )).
% An example of a comment:
%
% <!-- declarations for <head> & <body> -->
%
% 2.6 Processing Instructions
%
% Processing instructions (PIs) allow documents to contain instructions
% for applications.
%
% Processing Instructions
% [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
% PI (production [16]): matches "<?", a pi_target, and then either
%
%   - white space followed by data up to the closing "?>", or
%   - the closing "?>" directly, in which case the data is "".
%
% The instruction is added to the document as a pi(Target, Data) node
% via `add`, and the resulting ref is returned.
%
% The second alternative matters for instructions without data, such as
% <?target?>: the optional part does not match there, so the closing
% "?>" must be matched explicitly or it would be left in the input.
:- pred pi(pstate(_)::pdi, pstate(ref(doc.content))::puo) is det.

pi -->
    mstr("<?") `next` (pred(_::in, pdi, puo) is det -->
    pi_target `next` (pred(Target::in, pdi, puo) is det -->
    opt(ws and upto(char, mstr("?>")))
        `next` (pred(MD::in, pdi, puo) is det -->
            ( if { MD = yes(next(_, next(Chars, _))) } then
                % upto pairs the data with the result of its terminator
                % parser, so "?>" has already been matched here.
                make_string(Chars, Data),
                add(pi(Target, Data), Ref),
                return(Ref)
            else
                % No "S data" part: the target must be followed
                % immediately by the closing delimiter.
                mstr("?>") `next` (pred(_::in, pdi, puo) is det -->
                    add(pi(Target, ""), Ref),
                    return(Ref)
                )
            )
    ))).
% [17] PITarget ::= Name - (('X' | 'x') ('M' | 'm') ('L' | 'l'))
% PITarget (production [17]): a name other than "xml" in any mixture of
% upper and lower case ("xml", "XML", "Xml", "xMl", ...).
% A reserved target makes the parser call record_failure. Any other
% name is returned unchanged.
:- pred pi_target(pstate(_)::pdi, pstate(name)::puo) is det.

pi_target -->
    name `next` (pred(Target::in, pdi, puo) is det -->
        % Compare case-insensitively: the production excludes every
        % case variant, not just all-upper and all-lower.
        ( if { string.to_lower(Target) = "xml" } then
            record_failure("(x|X)(m|M)(l|L) is not a valid pi target")
        else
            return(Target)
        )
    ).
% PIs are not part of the document's character data, but must be passed
% through to the application. The PI begins with a target (PITarget)
% used to identify the application to which the instruction is directed.
% The target names "XML", "xml", and so on are reserved for
% standardization in this or future versions of this specification. The
% XML Notation mechanism may be used for formal declaration of PI
% targets.
%
% 2.7 CDATA Sections
%
% CDATA sections may occur anywhere character data may occur; they are
% used to escape blocks of text containing characters which would
% otherwise be recognized as markup. CDATA sections begin with the
% string "<![CDATA[" and end with the string "]]>":
%
% CDATA Sections
% [18] CDSect ::= CDStart CData CDEnd
% [19] CDStart ::= '<![CDATA['
% [20] CData ::= (Char* - (Char* ']]>' Char*))
% [21] CDEnd ::= ']]>'
% CDSect (productions [18]-[21]): matches "<![CDATA[", then uses `upto`
% to collect chars up to the "]]>" terminator. The collected text is
% added to the document as a data(...) node via `add`, and the resulting
% ref is returned.
:- pred cd_sect(pstate(_)::pdi, pstate(ref(doc.content))::puo) is det.

cd_sect -->
    mstr("<![CDATA[")
        `next` (pred(_::in, pdi, puo) is det -->
    upto(char, mstr_return("]]>", unit))
        `next` (pred(next(Body, _)::in, pdi, puo) is det -->
            make_string(Body, Text),
            add(data(Text), ContentRef),
            return(ContentRef)
    )).
% Within a CDATA section, only the CDEnd string is recognized as markup,
% so that left angle brackets and ampersands may occur in their literal
% form; they need not (and cannot) be escaped using "&lt;" and "&amp;".
% CDATA sections cannot nest.
%
% An example of a CDATA section, in which "<greeting>" and "</greeting>"
% are recognized as character data, not markup:
%
% <![CDATA[<greeting>Hello, world!</greeting>]]>
%
% 2.8 Prolog and Document Type Declaration
%
% XML documents may, and should, begin with an XML declaration which
% specifies the version of XML being used. For example, the following is
% a complete XML document, well-formed but not valid:
%
% <?xml version="1.0"?>
% <greeting>Hello, world!</greeting>
%
% and so is this:
%
% <greeting>Hello, world!</greeting>
%
% The version number "1.0" should be used to indicate conformance to
% this version of this specification; it is an error for a document to
% use the value "1.0" if it does not conform to this version of this
% specification. It is the intent of the XML working group to give later
% versions of this specification numbers other than "1.0", but this
% intent does not indicate a commitment to produce any future versions
% of XML, nor if any are produced, to use any particular numbering
% scheme. Since future versions are not ruled out, this construct is
% provided as a means to allow the possibility of automatic version
% recognition, should it become necessary. Processors may signal an
% error if they receive documents labeled with versions they do not
% support.
%
% The function of the markup in an XML document is to describe its
% storage and logical structure and to associate attribute-value pairs
% with its logical structures. XML provides a mechanism, the document
% type declaration, to define constraints on the logical structure and
% to support the use of predefined storage units. An XML document is
% valid if it has an associated document type declaration and if the
% document complies with the constraints expressed in it.
%
% The document type declaration must appear before the first element in
% the document.
%
% Prolog
% [22] prolog ::= XMLDecl? Misc* (doctype_decl Misc*)?
% prolog (production [22]): an optional XML declaration, any number of
% misc items, and optionally a doctype declaration followed by more
% misc items.
%
% Returns {DTD, Misc}. With a doctype declaration, DTD is the one it
% produced and Misc is filter_opt applied to the misc items from both
% sides of it. Without one, DTD has an empty root name and three empty
% maps, and Misc is filter_opt applied to the leading misc items.
:- pred prolog(pstate(_)::pdi, pstate({dtd, list(ref(doc.content))})::puo)
    is det.

prolog -->
    opt(xml_decl)
        `next` (pred(_::in, pdi, puo) is det -->
    star(misc)
        `next` (pred(Leading::in, pdi, puo) is det -->
    opt(doctype_decl and star(misc))
        `next` (pred(MaybeDoctype::in, pdi, puo) is det -->
            {
                MaybeDoctype = no,
                map.init(NoElems),
                map.init(NoEntities),
                map.init(NoPEntities),
                DTD = dtd("", NoElems, NoEntities, NoPEntities),
                filter_opt(Leading, Misc)
            ;
                MaybeDoctype = yes(next(DTD, Trailing)),
                list.append(Leading, Trailing, AllMisc),
                filter_opt(AllMisc, Misc)
            },
            return({DTD, Misc})
    ))).
% [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>'
% XMLDecl (production [23]): "<?xml", version info, an optional encoding
% declaration, an optional standalone declaration, optional white space
% and "?>".
%
% If an encoding name was declared, it is looked up in the encodings
% held in the gEncodings global. A known encoding is installed with
% set_encoding. An unknown one makes the parser call record_error with
% a message naming it. With no encoding declaration nothing else is done.
:- pred xml_decl(pstate(_)::in, pstate(unit)::out) is det.

xml_decl -->
    mstr("<?xml") `next` (pred(_::in, pdi, puo) is det -->
    version_info `next` (pred(_::in, pdi, puo) is det -->
    opt(encoding_decl) `next` (pred(MaybeEncName::in, pdi, puo) is det -->
    opt(sd_decl) `next` (pred(_::in, pdi, puo) is det -->
    opt(ws) `next` (pred(_::in, pdi, puo) is det -->
    mstr("?>") `next` (pred(_::in, pdi, puo) is det -->
        (
            { MaybeEncName = no },
            return_unit
        ;
            { MaybeEncName = yes(EncName) },
            get_global(gEncodings, encodings(Known)),
            ( { search(Known, EncName, Enc) } ->
                set_encoding(Enc),
                return_unit
            ;
                { string.format("unknown encoding `%s'", [s(EncName)], Msg) },
                record_error(Msg)
            )
        )
    )))))).
% [24] VersionInfo ::= S 'version' Eq (' VersionNum ' | " VersionNum ")
:- pred version_info(pstate(_)::pdi, pstate(unit)::puo) is det.
version_info -->
ws `next` (pred(_::in, pdi, puo) is det -->
mstr("version") `next` (pred(_::in, pdi, puo) is det -->
eq `next` (pred(_::in, pdi, puo) is det -->
quote `next` (pred(Q::in, pdi, puo) is det -->
version_num `next` (pred(_::in, pdi, puo) is det -->
quote `next` (pred(EndQ::in, pdi, puo) is det -->
( if { Q = EndQ } then
return_unit