1 |
110 |
olivier.gi |
#############################################################################
|
2 |
|
|
#
|
3 |
|
|
# xml.tcl -- Simple XML parser
|
4 |
|
|
# by Keith Vetter, March 2004
|
5 |
|
|
#
|
6 |
|
|
|
7 |
|
|
namespace eval ::XML { variable XML "" loc 0}
|
8 |
|
|
|
9 |
|
|
proc ::XML::Init {xmlData} {
|
10 |
|
|
variable XML
|
11 |
|
|
variable loc
|
12 |
|
|
|
13 |
|
|
set XML [string trim $xmlData];
|
14 |
|
|
regsub -all {<!--.*?-->} $XML {} XML ;# Remove all comments
|
15 |
|
|
set loc 0
|
16 |
|
|
}
|
17 |
|
|
|
18 |
|
|
# Returns {XML|TXT|EOF|PI value attributes START|END|EMPTY}
|
19 |
|
|
proc ::XML::NextToken {{peek 0}} {
|
20 |
|
|
variable XML
|
21 |
|
|
variable loc
|
22 |
|
|
|
23 |
|
|
set n [regexp -start $loc -indices {(.*?)\s*?<(/?)(.*?)(/?)>} \
|
24 |
|
|
$XML all txt stok tok etok]
|
25 |
|
|
if {! $n} {return [list EOF]}
|
26 |
|
|
foreach {all0 all1} $all {txt0 txt1} $txt \
|
27 |
|
|
{stok0 stok1} $stok {tok0 tok1} $tok {etok0 etok1} $etok break
|
28 |
|
|
|
29 |
|
|
if {$txt1 >= $txt0} { ;# Got text
|
30 |
|
|
set txt [string range $XML $txt0 $txt1]
|
31 |
|
|
if {! $peek} {set loc [expr {$txt1 + 1}]}
|
32 |
|
|
return [list TXT $txt]
|
33 |
|
|
}
|
34 |
|
|
|
35 |
|
|
set token [string range $XML $tok0 $tok1] ;# Got something in brackets
|
36 |
|
|
if {! $peek} {set loc [expr {$all1 + 1}]}
|
37 |
|
|
if {[regexp {^!\[CDATA\[(.*)\]\]} $token => txt]} { ;# Is it CDATA stuff?
|
38 |
|
|
return [list TXT $txt]
|
39 |
|
|
}
|
40 |
|
|
|
41 |
|
|
# Check for Processing Instruction <?...?>
|
42 |
|
|
set type XML
|
43 |
|
|
if {[regexp {^\?(.*)\?$} $token => token]} {
|
44 |
|
|
set type PI
|
45 |
|
|
}
|
46 |
|
|
set attr ""
|
47 |
|
|
regexp {^(.*?)\s+(.*?)$} $token => token attr
|
48 |
|
|
|
49 |
|
|
set etype START ;# Entity type
|
50 |
|
|
if {$etok0 <= $etok1} {
|
51 |
|
|
if {$stok0 <= $stok1} { set token "/$token"} ;# Bad XML
|
52 |
|
|
set etype EMPTY
|
53 |
|
|
} elseif {$stok0 <= $stok1} {
|
54 |
|
|
set etype END
|
55 |
|
|
}
|
56 |
|
|
return [list $type $token $attr $etype]
|
57 |
|
|
}
|
58 |
|
|
# ::XML::IsWellFormed
|
59 |
|
|
# checks if the XML is well-formed )http://www.w3.org/TR/1998/REC-xml-19980210)
|
60 |
|
|
#
|
61 |
|
|
# Returns "" if well-formed, error message otherwise
|
62 |
|
|
# missing:
|
63 |
|
|
# characters: doesn't check valid extended characters
|
64 |
|
|
# attributes: doesn't check anything: quotes, equals, unique, etc.
|
65 |
|
|
# text stuff: references, entities, parameters, etc.
|
66 |
|
|
# doctype internal stuff
|
67 |
|
|
#
|
68 |
|
|
proc ::XML::IsWellFormed {} {
|
69 |
|
|
set result [::XML::_IsWellFormed]
|
70 |
|
|
set ::XML::loc 0
|
71 |
|
|
return $result
|
72 |
|
|
}
|
73 |
|
|
;proc ::XML::_IsWellFormed {} {
|
74 |
|
|
array set emsg {
|
75 |
|
|
XMLDECLFIRST "The XML declaration must come first"
|
76 |
|
|
MULTIDOCTYPE "Only one DOCTYPE is allowed"
|
77 |
|
|
INVALID "Invalid document structure"
|
78 |
|
|
MISMATCH "Ending tag '$val' doesn't match starting tag"
|
79 |
|
|
BADELEMENT "Bad element name '$val'"
|
80 |
|
|
EOD "Only processing instructions allowed at end of document"
|
81 |
|
|
BADNAME "Bad name '$val'"
|
82 |
|
|
BADPI "No processing instruction starts with 'xml'"
|
83 |
|
|
}
|
84 |
|
|
|
85 |
|
|
# [1] document ::= prolog element Misc*
|
86 |
|
|
# [22] prolog ::= XMLDecl? Misc* (doctypedecl Misc*)?
|
87 |
|
|
# [27] Misc ::= Comment | PI | S
|
88 |
|
|
# [28] doctypedecl ::= <!DOCTYPE...>
|
89 |
|
|
# [16] PI ::= <? Name ...?>
|
90 |
|
|
set seen 0 ;# 1 xml, 2 pi, 4 doctype
|
91 |
|
|
while {1} {
|
92 |
|
|
foreach {type val attr etype} [::XML::NextToken] break
|
93 |
|
|
if {$type eq "PI"} {
|
94 |
|
|
if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
|
95 |
|
|
return [subst $emsg(BADNAME)]
|
96 |
|
|
}
|
97 |
|
|
if {$val eq "xml"} { ;# XMLDecl
|
98 |
|
|
if {$seen != 0} { return $emsg(XMLDECLFIRST) }
|
99 |
|
|
# TODO: check version number exist and only encoding and
|
100 |
|
|
# standalone attributes are allowed
|
101 |
|
|
incr seen ;# Mark as seen XMLDecl
|
102 |
|
|
continue
|
103 |
|
|
}
|
104 |
|
|
if {[string equal -nocase "xml" $val]} {return $emsg(BADPI)}
|
105 |
|
|
set seen [expr {$seen | 2}] ;# Mark as seen PI
|
106 |
|
|
continue
|
107 |
|
|
} elseif {$type eq "XML" && $val eq "!DOCTYPE"} { ;# Doctype
|
108 |
|
|
if {$seen & 4} { return $emsg(MULTIDOCTYPE) }
|
109 |
|
|
set seen [expr {$seen | 4}]
|
110 |
|
|
continue
|
111 |
|
|
}
|
112 |
|
|
break
|
113 |
|
|
}
|
114 |
|
|
|
115 |
|
|
# [39] element ::= EmptyElemTag | STag content ETag
|
116 |
|
|
# [40] STag ::= < Name (S Attribute)* S? >
|
117 |
|
|
# [42] ETag ::= </ Name S? >
|
118 |
|
|
# [43] content ::= CharData? ((element | Reference | CDSect | PI | Comment) CharData?)*
|
119 |
|
|
# [44] EmptyElemTag ::= < Name (S Attribute)* S? />
|
120 |
|
|
#
|
121 |
|
|
|
122 |
|
|
set stack {}
|
123 |
|
|
set first 1
|
124 |
|
|
while {1} {
|
125 |
|
|
if {! $first} { ;# Skip first time in
|
126 |
|
|
foreach {type val attr etype} [::XML::NextToken] break
|
127 |
|
|
} else {
|
128 |
|
|
if {$type ne "XML" && $type ne "EOF"} { return $emsg(INVALID) }
|
129 |
|
|
set first 0
|
130 |
|
|
}
|
131 |
|
|
|
132 |
|
|
if {$type eq "EOF"} break
|
133 |
|
|
;# TODO: check attributes: quotes, equals and unique
|
134 |
|
|
|
135 |
|
|
if {$type eq "TXT"} continue
|
136 |
|
|
if {! [regexp {^[a-zA-Z_:][a-zA-Z0-9.-_:\xB7]+$} $val]} {
|
137 |
|
|
return [subst $emsg(BADNAME)]
|
138 |
|
|
}
|
139 |
|
|
|
140 |
|
|
if {$type eq "PI"} {
|
141 |
|
|
if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
|
142 |
|
|
continue
|
143 |
|
|
}
|
144 |
|
|
if {$etype eq "START"} { ;# Starting tag
|
145 |
|
|
lappend stack $val
|
146 |
|
|
} elseif {$etype eq "END"} { ;# </tag>
|
147 |
|
|
if {$val ne [lindex $stack end]} { return [subst $emsg(MISMATCH)] }
|
148 |
|
|
set stack [lrange $stack 0 end-1]
|
149 |
|
|
if {[llength $stack] == 0} break ;# Empty stack
|
150 |
|
|
} elseif {$etype eq "EMPTY"} { ;# <tag/>
|
151 |
|
|
}
|
152 |
|
|
}
|
153 |
|
|
|
154 |
|
|
# End-of-Document can only contain processing instructions
|
155 |
|
|
while {1} {
|
156 |
|
|
foreach {type val attr etype} [::XML::NextToken] break
|
157 |
|
|
if {$type eq "EOF"} break
|
158 |
|
|
if {$type eq "PI"} {
|
159 |
|
|
if {[string equal -nocase xml $val]} { return $emsg(BADPI) }
|
160 |
|
|
continue
|
161 |
|
|
}
|
162 |
|
|
return $emsg(EOD)
|
163 |
|
|
}
|
164 |
|
|
return ""
|
165 |
|
|
}
|
166 |
|
|
|
167 |
|
|
################################################################
|
168 |
|
|
#
|
169 |
|
|
# Demo code
|
170 |
|
|
#
|
171 |
|
|
#set xml {<?xml version="1.0" encoding="ISO-8859-1"?>
|
172 |
|
|
# <loc version="1.0" src="Groundspeak">
|
173 |
|
|
# <waypoint>
|
174 |
|
|
# <name id="GCGPXK"><![CDATA[Playing Poker with the Squirrels by Rino 'n Rinette]]></name>
|
175 |
|
|
# <coord lat="40.1548166" lon="-82.5202833"/>
|
176 |
|
|
# <type>Geocache</type>
|
177 |
|
|
# <link text="Cache Details">http://www.geocaching.com/seek/cache_details.aspx?wp=GCGPXK</link>
|
178 |
|
|
# </waypoint><waypoint>
|
179 |
|
|
# <name id="GC19DF"><![CDATA[Great Playground Caper by Treasure Hunters Inc.]]></name>
|
180 |
|
|
# <coord lat="40.0667166666667" lon="-82.5358"/>
|
181 |
|
|
# <type>Geocache</type>
|
182 |
|
|
# <link text="Cache Details">http://www.geocaching.com/seek/cache_details.aspx?wp=GC19DF</link>
|
183 |
|
|
# </waypoint>
|
184 |
|
|
# </loc>
|
185 |
|
|
#}
|
186 |
|
|
|
187 |
|
|
#::XML::Init $xml
|
188 |
|
|
#set wellFormed [::XML::IsWellFormed]
|
189 |
|
|
#if {$wellFormed ne ""} {
|
190 |
|
|
# puts "The xml is not well-formed: $wellFormed"
|
191 |
|
|
#} else {
|
192 |
|
|
# puts "The xml is well-formed"
|
193 |
|
|
# while {1} {
|
194 |
|
|
# foreach {type val attr etype} [::XML::NextToken] break
|
195 |
|
|
# puts "looking at: $type '$val' '$attr' '$etype'"
|
196 |
|
|
# if {$type == "EOF"} break
|
197 |
|
|
# }
|
198 |
|
|
#}
|