Arduino DLNA Server
Loading...
Searching...
No Matches
XMLParser.h
Go to the documentation of this file.
1#pragma once
2
3#include "Print.h"
4#include "assert.h"
5#include "basic/Str.h"
6#include "basic/StrView.h"
7#include "basic/Vector.h"
8
9namespace tiny_dlna {
27class XMLParser {
28 public:
// Default constructor: every member keeps its in-class initializer; no
// buffer and no callback are set.
30 XMLParser() = default;
// Convenience constructor: stores the XML buffer via setXml() and applies
// `textOnly` via setReportTextOnly().
//   xmlStr   - XML text to parse (handed to setXml()).
//   callback - function to invoke for parsed fragments.
//   textOnly - forwarded to setReportTextOnly(); defaults to false here even
//              though the member `report_text_only` is initialized to true.
// NOTE(review): listing line 51 is absent from this extraction; nothing
// visible here stores `callback`, so the missing line presumably registers
// it (e.g. through setCallback) - confirm against the original header.
46 XMLParser(const char* xmlStr,
47 void (*callback)(Str& nodeName, Vector<Str>& path, Str& text,
48 Str& attributes, int start, int len, void* ref),
49 bool textOnly = false) {
50 setXml(xmlStr);
52 setReportTextOnly(textOnly);
53 }
54
60 void setReference(void* ref) { reference = ref; }
61
62 // Setters
68 void setXml(const char* xmlStr) {
69 str_view.set(xmlStr);
70 parsePos = 0;
71 }
72
78 void setCallback(void (*cb)(Str& nodeName, Vector<Str>& path, Str& text,
79 Str& attributes, int start, int len, void* ref)) {
80 this->callback = cb;
81 }
82
90 void parse() { do_parse(); }
91
104 bool parseSingle() { return do_parse_single(); }
105
110 void resetParse() { parsePos = 0; }
111
// Full reset: rewinds the parse position, empties the element path stack and
// blanks the current node name and the text buffer.
// NOTE(review): listing lines 121-122 are absent from this extraction; they
// presumably reset further state (e.g. `contentStarts`) - confirm against
// the original header.
118 void resetParser() {
119 parsePos = 0;
120 path.resize(0);
123 node_name.set("");
124 txt.set("");
125 }
126
128 void setReportTextOnly(bool flag) { report_text_only = flag; }
129
131 int getParsePos() { return parsePos; }
132
133 protected:
// Buffer that receives the text slice handed to the callback.
138 Str txt{100};
// NOTE(review): no use of `str` is visible in this listing - possibly unused.
139 Str str{100};
140 // last parsed attributes for the most recent start tag
// NOTE(review): the declaration this comment belongs to (listing line 141,
// `Str last_attributes` per the cross-reference index) is absent from this
// extraction.
// When true, invokeCallback() skips fragments whose text is empty. The
// in-class default is true, whereas the convenience constructor passes
// false unless told otherwise.
142 bool report_text_only = true;
143 // callback signature: nodeName, path, text (inner text for elements or
144 // trimmed text), attributes, start, len, ref
145 // NOTE: the `Str` references passed to the callback (nodeName and text)
146 // are backed by parser-owned storage (members of this class) and are only
147 // valid for the duration of the callback invocation. If the callback
148 // needs to retain the data it must make its own copy.
149 void (*callback)(Str& nodeName, Vector<Str>& path, Str& text, Str& attributes,
150 int start, int len, void* ref) = nullptr;
151 Vector<int> contentStarts; // parallel stack to `path`: start position of
152 // content (after the start tag)
153 // user-provided opaque pointer for convenience
154 void* reference = nullptr;
155 // Resume position for incremental parseSingle() calls: see `parsePos`,
156 // which is declared at the end of this class.
157 // Helper methods used by the parser loop (kept protected for
158 // testing/extension)
/// Locates the '>' that closes the tag opened at index `start`.
///
/// Scanning begins at `start + 1` and stops before `len`. A '>' inside a
/// single- or double-quoted run is ignored; a quoted run ends only at the
/// same quote character that opened it.
///
/// @param s     buffer being parsed
/// @param start index of the opening '<'
/// @param len   number of valid characters in `s`
/// @return index of the closing '>', or -1 when none is found before `len`
int findGt(const char* s, int start, int len) {
  char openQuote = 0;  // 0 while outside quotes, else the opening quote char
  for (int idx = start + 1; idx < len; ++idx) {
    const char ch = s[idx];
    if (openQuote != 0) {
      // Inside a quoted value: only the matching quote character matters.
      if (ch == openQuote) openQuote = 0;
      continue;
    }
    if (ch == '"' || ch == '\'') {
      openQuote = ch;
    } else if (ch == '>') {
      return idx;
    }
  }
  return -1;
}
177
// Prepares a text fragment for reporting: copies the slice [ts, te) of `s`
// into `txt` and sets `node_name` to the innermost open element (or to the
// empty string when the path stack is empty). Does nothing when the slice is
// empty or no callback is registered.
// NOTE(review): listing line 185 is absent from this extraction; it
// presumably forwards the prepared values to invokeCallback(...) - confirm
// against the original header.
178 void emitTextSegment(const char* s, int ts, int te) {
179 // trim whitespace bounds already provided by caller when appropriate
180 if (te > ts && callback) {
181 txt.copyFrom(s + ts, te - ts);
182 node_name = path.size() > 0 ? path.back() : empty_str;
183 // Entity expansion is stated to happen at setXml() time, so callbacks
184 // receive the text as-is. NOTE(review): the setXml() visible in this
// listing only stores the pointer; no entity expansion is apparent - confirm.
186 }
187 }
188
// Prepares the report for a start tag spanning [lt, gt] in `s`: when a
// callback is registered, `node_name` is set to the innermost open element
// (or the empty string when the path stack is empty).
// NOTE(review): listing line 192 is absent from this extraction; the
// surviving tail `gt - lt + 1);` suggests a call (presumably to
// invokeCallback) whose last argument is the tag length - confirm.
189 void emitTagSegment(const char* s, int lt, int gt) {
190 if (callback) {
191 node_name = path.size() > 0 ? path.back() : empty_str;
193 gt - lt + 1);
194 }
195 }
196
// Handles an end tag spanning [lt, gt]: pops the innermost element from the
// path stack (if any) and returns the position just past the '>'. In the
// visible lines the tag name is not compared with the popped entry.
// NOTE(review): listing line 199 is absent from this extraction; it
// presumably pops `contentStarts` in step with `path` - confirm.
197 int handleEndTag(const char* s, int lt, int gt) {
198 if (path.size() > 0) path.erase(path.size() - 1);
200 return gt + 1;
201 }
202
// Processes the start tag spanning [lt, gt] in `s` (lt = index of '<',
// gt = index of the closing '>'): extracts the element name, pushes it on
// the path stack together with the content start offset, and captures the
// raw attribute text. Nothing is pushed when the tag has no name.
203 void handleStartTag(const char* s, int lt, int gt) {
204 // Clear previous attributes first to avoid carrying attributes from a
205 // previously parsed tag into the current element when the current
206 // tag contains no attributes. Use clear() so the underlying char buffer
207 // is zero-terminated and not accidentally read as a C-string by callers
208 // that use attributes.c_str().
// NOTE(review): listing line 209 (presumably the clear() call described
// above) is absent from this extraction - confirm.
// Element name: skip leading whitespace, then run to whitespace, '/' or '>'.
210 int nameStart = lt + 1;
211 while (nameStart < gt && isspace((unsigned char)s[nameStart])) nameStart++;
212 int nameEnd = nameStart;
213 while (nameEnd < gt && !isspace((unsigned char)s[nameEnd]) &&
214 s[nameEnd] != '/' && s[nameEnd] != '>')
215 nameEnd++;
216
217 if (nameEnd > nameStart) {
218 node_name.copyFrom(s + nameStart, nameEnd - nameStart);
219 path.push_back(node_name);
// Element content begins right after the closing '>'.
220 int contentStart = gt + 1;
221 contentStarts.push_back(contentStart);
222 // extract raw attribute text (between name end and tag end), exclude
223 // trailing '/' for self-closing tags
224 int attrStart = nameEnd;
225 int attrEnd = gt;
226 // back up one if tag ends with '/>'
227 int back = gt - 1;
228 while (back > lt && isspace((unsigned char)s[back])) back--;
229 if (back > lt && s[back] == '/') attrEnd = back;
230 // trim leading/trailing whitespace for attributes
231 while (attrStart < attrEnd && isspace((unsigned char)s[attrStart]))
232 attrStart++;
233 while (attrEnd > attrStart && isspace((unsigned char)s[attrEnd - 1]))
234 attrEnd--;
235 if (attrEnd > attrStart) {
236 last_attributes.copyFrom(s + attrStart, attrEnd - attrStart);
237 } else {
// NOTE(review): listing line 238 is absent from this extraction; the
// else-branch presumably clears `last_attributes` - confirm.
239 }
240 }
241 }
242
243 int handleCommentOrPI(int lt, const char* s, int len) {
244 // comment <!-- ... -->
245 if (lt + 4 < len && s[lt + 1] == '!' && s[lt + 2] == '-' &&
246 s[lt + 3] == '-') {
247 int end = str_view.indexOf("-->", lt + 4);
248 return end < 0 ? len : end + 3;
249 }
250 // processing instruction <? ... ?>
251 if (lt + 1 < len && s[lt + 1] == '?') {
252 int end = str_view.indexOf("?>", lt + 2);
253 return end < 0 ? len : end + 2;
254 }
255 return lt; // not a comment/PI
256 }
257
258 // Helper: invoke the registered callback but pass a path that excludes the
259 // current node (only ancestor elements). This keeps the `nodeName`
260 // parameter as the current element while `path` contains only parents.
261 bool invokeCallback(Str& nodeName, Vector<Str>& fullPath, Str& text,
262 Str& attributes, int start, int len) {
263 if (!callback) return false;
264 if (report_text_only && text.isEmpty()) return false;
265 Vector<Str> ancestorPath;
266 int ancCount = fullPath.size() > 0 ? (int)fullPath.size() - 1 : 0;
267 for (int i = 0; i < ancCount; ++i) {
268 // Make a fresh copy of the ancestor name to avoid any aliasing
269 // or lifetime issues with parser-owned storage.
270 ancestorPath.push_back(Str(fullPath[i].c_str()));
271 }
272 callback(nodeName, ancestorPath, text, attributes, start, len, reference);
273 return true;
274 }
275
276 // Run the parser. This is a small, forgiving parser suitable for the
277 // embedded use-cases in this project (DIDL fragments, simple SCPD parsing).
278 // It is not a full XML validator but handles start/end tags, comments,
279 // processing instructions and self-closing tags. Text between tags is
280 // trimmed and passed to emitTextSegment(); every start tag is passed to
281 // emitTagSegment().
// The walk always starts at offset 0 and does not read or update `parsePos`.
// It stops at the first '<' that has no closing '>' and does not report text
// that follows the last '<' in the buffer.
282 void do_parse() {
283 const char* s = str_view.c_str();
284 int len = str_view.length();
285 int pos = 0;
286
287 while (pos < len) {
288 int lt = str_view.indexOf('<', pos);
289 if (lt < 0) break;
290
291 // Handle text between pos and lt
292 if (lt > pos) {
293 int ts = pos;
294 int te = lt;
// Trim whitespace on both sides; an all-whitespace run collapses to ts == te.
295 while (ts < te && isspace((unsigned char)s[ts])) ts++;
296 while (te > ts && isspace((unsigned char)s[te - 1])) te--;
297 emitTextSegment(s, ts, te);
298 }
299
300 // comment or processing instruction handling
// handleCommentOrPI() returns lt unchanged when nothing was skipped.
301 int newPos = handleCommentOrPI(lt, s, len);
302 if (newPos != lt) {
303 pos = newPos;
304 continue;
305 }
306
307 // find closing '>' (respect quotes)
308 int gt = findGt(s, lt, len);
309 if (gt < 0) break; // malformed
310
311 // end tag
312 if (lt + 1 < len && s[lt + 1] == '/') {
313 pos = handleEndTag(s, lt, gt);
314 continue;
315 }
316
317 // start tag (or self-closing)
318 handleStartTag(s, lt, gt);
319
320 // callback for the tag itself
321 emitTagSegment(s, lt, gt);
322
323 // detect self-closing and pop if needed
// The last non-whitespace character before '>' decides: '/' means "<x ... />".
324 bool selfClosing = false;
325 int back = gt - 1;
326 while (back > lt && isspace((unsigned char)s[back])) back--;
327 if (back > lt && s[back] == '/') selfClosing = true;
328
329 pos = gt + 1;
330
// NOTE(review): handleStartTag() pushes nothing for a tag without a name, yet
// a self-closing form still pops one entry here - confirm this is intended.
331 if (selfClosing && path.size() > 0) {
332 path.erase(path.size() - 1);
333 if (contentStarts.size() > 0)
// NOTE(review): listing line 334 is absent from this extraction; it
// presumably pops the matching `contentStarts` entry - confirm.
335 }
336 }
337 }
338
339 // Incremental single-callback parser. It will resume from `parsePos`
340 // and advance `parsePos` past the fragment it invoked (or to the end
341 // if nothing more is parseable). Returns true if a callback was
342 // invoked.
// NOTE(review): listing line 343, the signature (`bool do_parse_single()`
// per the cross-reference index), is absent from this extraction.
344 const char* s = str_view.c_str();
345 int len = str_view.length();
346 int pos = parsePos;
347 bool result = false;
348
349 while (pos < len) {
350 int lt = str_view.indexOf('<', pos);
351 if (lt < 0) break;
352
353 // Handle text between pos and lt
354 if (lt > pos) {
355 int ts = pos;
356 int te = lt;
// Trim whitespace on both sides; an all-whitespace run collapses to ts == te.
357 while (ts < te && isspace((unsigned char)s[ts])) ts++;
358 while (te > ts && isspace((unsigned char)s[te - 1])) te--;
359 if (te > ts && callback) {
360 // emit this text segment and advance parsePos past it
361 txt.copyFrom(s + ts, te - ts);
362 node_name = path.size() > 0 ? path.back() : empty_str;
363 // Entity expansion is handled in setXml(); the callback receives
364 // the parser text as-is.
// NOTE(review): listing line 365 is absent from this extraction; it
// presumably assigns `result` from an invokeCallback(...) call whose last
// argument is the text length below - confirm.
366 te - ts);
367 parsePos = te; // next position is end of emitted text
368 return result;
369 }
370 }
371
372 // comment or processing instruction handling
373 int newPos = handleCommentOrPI(lt, s, len);
374 if (newPos != lt) {
375 pos = newPos;
376 parsePos = pos;
377 continue; // comments/PI produce no callback
378 }
379
380 // find closing '>' (respect quotes)
381 int gt = findGt(s, lt, len);
382 if (gt < 0) break; // malformed
383
384 // end tag
385 if (lt + 1 < len && s[lt + 1] == '/') {
386 // handle end tag; this may not produce a callback by itself
387 pos = handleEndTag(s, lt, gt);
388 parsePos = pos;
389 // End tags don't generate callbacks in this parser's model unless
390 // there is trimmed text to emit; continue to next loop iteration.
391 continue;
392 }
393
394 // start tag (or self-closing)
395 handleStartTag(s, lt, gt);
396
397 // callback for the tag itself
398 if (callback) {
399 node_name = path.size() > 0 ? path.back() : empty_str;
// NOTE(review): listing line 400 is absent from this extraction; it
// presumably assigns `result` from an invokeCallback(...) call. If that call
// is suppressed (text-only mode), `return result` below would hand back
// false although input remains - confirm how callers treat false.
401 gt - lt + 1);
402 // detect self-closing and pop if needed
403 bool selfClosing = false;
404 int back = gt - 1;
405 while (back > lt && isspace((unsigned char)s[back])) back--;
406 if (back > lt && s[back] == '/') selfClosing = true;
407 pos = gt + 1;
408 if (selfClosing && path.size() > 0) {
409 path.erase(path.size() - 1);
410 if (contentStarts.size() > 0)
// NOTE(review): listing line 411 is absent from this extraction; it
// presumably pops the matching `contentStarts` entry - confirm.
412 }
413 parsePos = pos;
414 return result;
415 }
416
// Reached only when no callback is registered: the tag is skipped without the
// self-closing check, so a self-closing element stays on the path stack.
417 pos = gt + 1;
418 parsePos = pos;
419 }
420
421 // nothing left to parse
// Also reached after a malformed tag (no closing '>'): parsePos stays at the
// last position consumed before it.
422 parsePos = pos;
423 return false;
424 }
425
426 // Resume position for incremental parseSingle() calls.
// Offset into the XML buffer; set back to 0 by setXml(), resetParse() and
// resetParser(), and advanced by do_parse_single().
427 int parsePos = 0;
428};
429
430} // namespace tiny_dlna
A simple wrapper to provide string functions on char*. If the underlying char* is a const we do not a...
Definition: StrView.h:19
virtual bool isEmpty()
checks if the string is empty
Definition: StrView.h:383
virtual const char * c_str()
provides the string value as const char*
Definition: StrView.h:376
virtual int indexOf(const char c, int start=0)
Definition: StrView.h:275
virtual int length()
Definition: StrView.h:380
virtual void set(const char *alt)
assigns a value
Definition: StrView.h:45
String implementation which keeps the data on the heap. We grow the allocated memory only if the copy...
Definition: Str.h:22
void clear() override
clears the string by setting the terminating 0 at the beginning
Definition: Str.h:163
void copyFrom(const char *source, int len, int maxlen=0)
assigns a memory buffer
Definition: Str.h:93
Vector implementation which provides the most important methods as defined by std::vector....
Definition: Vector.h:21
bool resize(int newSize, T value)
Definition: Vector.h:251
int size()
Definition: Vector.h:167
void push_back(T &&value)
Definition: Vector.h:171
void erase(Iterator it)
Definition: Vector.h:279
Lightweight streaming XML parser.
Definition: XMLParser.h:27
Str last_attributes
Definition: XMLParser.h:141
int handleCommentOrPI(int lt, const char *s, int len)
Definition: XMLParser.h:243
void do_parse()
Definition: XMLParser.h:282
void setCallback(void(*cb)(Str &nodeName, Vector< Str > &path, Str &text, Str &attributes, int start, int len, void *ref))
Set the callback to be invoked for parsed fragments.
Definition: XMLParser.h:78
Str txt
Definition: XMLParser.h:138
void setReportTextOnly(bool flag)
report only nodes with text
Definition: XMLParser.h:128
int handleEndTag(const char *s, int lt, int gt)
Definition: XMLParser.h:197
Str node_name
Definition: XMLParser.h:137
Str empty_str
Definition: XMLParser.h:136
int parsePos
Definition: XMLParser.h:427
void(* callback)(Str &nodeName, Vector< Str > &path, Str &text, Str &attributes, int start, int len, void *ref)
Definition: XMLParser.h:149
XMLParser(const char *xmlStr, void(*callback)(Str &nodeName, Vector< Str > &path, Str &text, Str &attributes, int start, int len, void *ref), bool textOnly=false)
Construct with XML buffer and callback.
Definition: XMLParser.h:46
void setXml(const char *xmlStr)
Set the XML buffer to parse.
Definition: XMLParser.h:68
void resetParser()
Fully reset parser state (parse position, path stack and content starts). Use this when the underlyin...
Definition: XMLParser.h:118
void * reference
Definition: XMLParser.h:154
Vector< Str > path
Definition: XMLParser.h:135
void handleStartTag(const char *s, int lt, int gt)
Definition: XMLParser.h:203
StrView str_view
Definition: XMLParser.h:134
Str str
Definition: XMLParser.h:139
void emitTagSegment(const char *s, int lt, int gt)
Definition: XMLParser.h:189
Vector< int > contentStarts
Definition: XMLParser.h:151
void setReference(void *ref)
Attach an opaque user pointer to the parser instance.
Definition: XMLParser.h:60
bool do_parse_single()
Definition: XMLParser.h:343
bool report_text_only
Definition: XMLParser.h:142
void parse()
Parse the previously set XML buffer and invoke the callback.
Definition: XMLParser.h:90
int findGt(const char *s, int start, int len)
Definition: XMLParser.h:159
void resetParse()
Reset the internal parse position so subsequent parseSingle() calls start from the beginning of the b...
Definition: XMLParser.h:110
bool parseSingle()
Parse a single fragment (one callback invocation) from the previously set XML buffer.
Definition: XMLParser.h:104
int getParsePos()
Expose current parse position for incremental wrappers.
Definition: XMLParser.h:131
void emitTextSegment(const char *s, int ts, int te)
Definition: XMLParser.h:178
bool invokeCallback(Str &nodeName, Vector< Str > &fullPath, Str &text, Str &attributes, int start, int len)
Definition: XMLParser.h:261
Definition: Allocator.h:6