Arduino DLNA Server
Loading...
Searching...
No Matches
XMLParser.h
Go to the documentation of this file.
#pragma once

#include <ctype.h>

#include "Print.h"
#include "assert.h"
#include "basic/Str.h"
#include "basic/StrView.h"
#include "basic/Vector.h"

9namespace tiny_dlna {
27class XMLParser {
28 public:
30 XMLParser() = default;
46 XMLParser(const char* xmlStr,
47 void (*callback)(Str& nodeName, Vector<Str>& path, Str& text,
48 Str& attributes, int start, int len, void* ref),
49 bool textOnly = false) {
50 setXml(xmlStr);
52 setReportTextOnly(textOnly);
53 }
54
60 void setReference(void* ref) { reference = ref; }
61
62 // Setters
68 void setXml(const char* xmlStr) {
69 str_view.set(xmlStr);
70 parsePos = 0;
71 }
72
78 void setCallback(void (*cb)(Str& nodeName, Vector<Str>& path, Str& text,
79 Str& attributes, int start, int len, void* ref)) {
80 this->callback = cb;
81 }
82
90 void parse() { do_parse(); }
91
104 bool parseSingle() { return do_parse_single(); }
105
110 void resetParse() { parsePos = 0; }
111
118 void end() {
119 parsePos = 0;
120 path.reset();
124 txt.release();
125 }
126
128 void setReportTextOnly(bool flag) { report_text_only = flag; }
129
131 int getParsePos() { return parsePos; }
132
133 protected:
138 Str txt{100};
139 Str str{100};
140 // last parsed attributes for the most recent start tag
142 bool report_text_only = true;
143 // callback signature: nodeName, path, text (inner text for elements or
144 // trimmed text), start, len, ref
145 // NOTE: the `Str` references passed to the callback (nodeName and text)
146 // are backed by parser-owned storage (members of this class) and are only
147 // valid for the duration of the callback invocation. If the callback
148 // needs to retain the data it must make its own copy.
149 void (*callback)(Str& nodeName, Vector<Str>& path, Str& text, Str& attributes,
150 int start, int len, void* ref) = nullptr;
151 Vector<int> contentStarts; // parallel stack to `path`: start position of
152 // content (after the start tag)
153 // user-provided opaque pointer for convenience
154 void* reference = nullptr;
155
156 // Resume position for incremental parseSingle() calls.
157 // (declared above in protected members)
158 // Helper methods used by the parser loop (kept protected for
159 // testing/extension)
160 int findGt(const char* s, int start, int len) {
161 int gt = -1;
162 bool inQuote = false;
163 char qchar = 0;
164 for (int i = start + 1; i < len; ++i) {
165 char c = s[i];
166 if (!inQuote && (c == '"' || c == '\'')) {
167 inQuote = true;
168 qchar = c;
169 } else if (inQuote && c == qchar) {
170 inQuote = false;
171 } else if (!inQuote && c == '>') {
172 gt = i;
173 break;
174 }
175 }
176 return gt;
177 }
178
179 // Check if there is non-whitespace text immediately after this tag
180 // (before the next '<'). If true, the element has text content and we
181 // should suppress the start-tag-only callback to avoid duplicate events
182 // (first empty, then text) for the same node.
183 bool hasNonWhitespaceTextAhead(const char* s, int gt, int len) {
184 int start = gt + 1;
185 if (start >= len) return false;
186 int next = str_view.indexOf('<', start);
187 if (next < 0) next = len;
188 for (int i = start; i < next; ++i) {
189 if (!isspace((unsigned char)s[i])) {
190 return true;
191 }
192 }
193 return false;
194 }
195
196 void emitTextSegment(const char* s, int ts, int te) {
197 // trim whitespace bounds already provided by caller when appropriate
198 if (te > ts && callback) {
199 txt.copyFrom(s + ts, te - ts);
200 node_name = path.size() > 0 ? path.back() : empty_str;
201 // Entity expansion is performed at setXml() time. Callbacks receive
202 // the parser-owned (or expanded) text directly.
204 }
205 }
206
207 void emitTagSegment(const char* s, int lt, int gt) {
208 if (callback) {
209 node_name = path.size() > 0 ? path.back() : empty_str;
211 gt - lt + 1);
212 }
213 }
214
215 int handleEndTag(const char* s, int lt, int gt) {
216 if (path.size() > 0) path.erase(path.size() - 1);
217 if (contentStarts.size() > 0) contentStarts.erase(contentStarts.size() - 1);
218 return gt + 1;
219 }
220
221 void handleStartTag(const char* s, int lt, int gt) {
222 // Clear previous attributes first to avoid carrying attributes from a
223 // previously parsed tag into the current element when the current
224 // tag contains no attributes. Use clear() so the underlying char buffer
225 // is zero-terminated and not accidentally read as a C-string by callers
226 // that use attributes.c_str().
228 int nameStart = lt + 1;
229 while (nameStart < gt && isspace((unsigned char)s[nameStart])) nameStart++;
230 int nameEnd = nameStart;
231 while (nameEnd < gt && !isspace((unsigned char)s[nameEnd]) &&
232 s[nameEnd] != '/' && s[nameEnd] != '>')
233 nameEnd++;
234
235 if (nameEnd > nameStart) {
236 node_name.copyFrom(s + nameStart, nameEnd - nameStart);
237 path.push_back(node_name);
238 int contentStart = gt + 1;
239 contentStarts.push_back(contentStart);
240 // extract raw attribute text (between name end and tag end), exclude
241 // trailing '/' for self-closing tags
242 int attrStart = nameEnd;
243 int attrEnd = gt;
244 // back up one if tag ends with '/>'
245 int back = gt - 1;
246 while (back > lt && isspace((unsigned char)s[back])) back--;
247 if (back > lt && s[back] == '/') attrEnd = back;
248 // trim leading/trailing whitespace for attributes
249 while (attrStart < attrEnd && isspace((unsigned char)s[attrStart]))
250 attrStart++;
251 while (attrEnd > attrStart && isspace((unsigned char)s[attrEnd - 1]))
252 attrEnd--;
253 if (attrEnd > attrStart) {
254 last_attributes.copyFrom(s + attrStart, attrEnd - attrStart);
255 } else {
257 }
258 }
259 }
260
261 int handleCommentOrPI(int lt, const char* s, int len) {
262 // comment <!-- ... -->
263 if (lt + 4 < len && s[lt + 1] == '!' && s[lt + 2] == '-' &&
264 s[lt + 3] == '-') {
265 int end = str_view.indexOf("-->", lt + 4);
266 return end < 0 ? len : end + 3;
267 }
268 // processing instruction <? ... ?>
269 if (lt + 1 < len && s[lt + 1] == '?') {
270 int end = str_view.indexOf("?>", lt + 2);
271 return end < 0 ? len : end + 2;
272 }
273 return lt; // not a comment/PI
274 }
275
276 // Helper: invoke the registered callback but pass a path that excludes the
277 // current node (only ancestor elements). This keeps the `nodeName`
278 // parameter as the current element while `path` contains only parents.
279 bool invokeCallback(Str& nodeName, Vector<Str>& fullPath, Str& text,
280 Str& attributes, int start, int len) {
281 if (!callback) return false;
282 if (report_text_only && text.isEmpty()) return false;
283 Vector<Str> ancestorPath;
284 int ancCount = fullPath.size() > 0 ? (int)fullPath.size() - 1 : 0;
285 for (int i = 0; i < ancCount; ++i) {
286 // Make a fresh copy of the ancestor name to avoid any aliasing
287 // or lifetime issues with parser-owned storage.
288 ancestorPath.push_back(Str(fullPath[i].c_str()));
289 }
290 callback(nodeName, ancestorPath, text, attributes, start, len, reference);
291 return true;
292 }
293
294 // Run the parser. This is a small, forgiving parser suitable for the
295 // embedded use-cases in this project (DIDL fragments, simple SCPD parsing).
296 // It is not a full XML validator but handles start/end tags, comments,
297 // processing instructions and self-closing tags. For each text node that
298 // contains non-whitespace characters the provided callback is invoked with
299 // the text and the current element path.
300 void do_parse() {
301 const char* s = str_view.c_str();
302 int len = str_view.length();
303 int pos = 0;
304
305 while (pos < len) {
306 int lt = str_view.indexOf('<', pos);
307 if (lt < 0) break;
308
309 // Handle text between pos and lt
310 if (lt > pos) {
311 int ts = pos;
312 int te = lt;
313 while (ts < te && isspace((unsigned char)s[ts])) ts++;
314 while (te > ts && isspace((unsigned char)s[te - 1])) te--;
315 emitTextSegment(s, ts, te);
316 }
317
318 // comment or processing instruction handling
319 int newPos = handleCommentOrPI(lt, s, len);
320 if (newPos != lt) {
321 pos = newPos;
322 continue;
323 }
324
325 // find closing '>' (respect quotes)
326 int gt = findGt(s, lt, len);
327 if (gt < 0) break; // malformed
328
329 // end tag
330 if (lt + 1 < len && s[lt + 1] == '/') {
331 pos = handleEndTag(s, lt, gt);
332 continue;
333 }
334
335 // start tag (or self-closing)
336 handleStartTag(s, lt, gt);
337
338 // detect self-closing and pop if needed
339 bool selfClosing = false;
340 int back = gt - 1;
341 while (back > lt && isspace((unsigned char)s[back])) back--;
342 if (back > lt && s[back] == '/') selfClosing = true;
343
344 // Only emit the start tag callback if the element is self-closing
345 // or if there is no non-whitespace text ahead. This suppresses the
346 // first (empty-text) callback for nodes that will immediately yield
347 // a text callback.
348 if (selfClosing || !hasNonWhitespaceTextAhead(s, gt, len)) {
349 emitTagSegment(s, lt, gt);
350 }
351
352 pos = gt + 1;
353
354 if (selfClosing && path.size() > 0) {
355 path.erase(path.size() - 1);
356 if (contentStarts.size() > 0)
358 }
359 }
360 }
361
362 // Incremental single-callback parser. It will resume from `parsePos`
363 // and advance `parsePos` past the fragment it invoked (or to the end
364 // if nothing more is parseable). Returns true if a callback was
365 // invoked.
367 const char* s = str_view.c_str();
368 int len = str_view.length();
369 int pos = parsePos;
370 bool result = false;
371
372 while (pos < len) {
373 int lt = str_view.indexOf('<', pos);
374 if (lt < 0) break;
375
376 // Handle text between pos and lt
377 if (lt > pos) {
378 int ts = pos;
379 int te = lt;
380 while (ts < te && isspace((unsigned char)s[ts])) ts++;
381 while (te > ts && isspace((unsigned char)s[te - 1])) te--;
382 if (te > ts && callback) {
383 // emit this text segment and advance parsePos past it
384 txt.copyFrom(s + ts, te - ts);
385 node_name = path.size() > 0 ? path.back() : empty_str;
386 // Entity expansion is handled in setXml(); the callback receives
387 // the parser text as-is.
389 te - ts);
390 parsePos = te; // next position is end of emitted text
391 return result;
392 }
393 }
394
395 // comment or processing instruction handling
396 int newPos = handleCommentOrPI(lt, s, len);
397 if (newPos != lt) {
398 pos = newPos;
399 parsePos = pos;
400 continue; // comments/PI produce no callback
401 }
402
403 // find closing '>' (respect quotes)
404 int gt = findGt(s, lt, len);
405 if (gt < 0) break; // malformed
406
407 // end tag
408 if (lt + 1 < len && s[lt + 1] == '/') {
409 // handle end tag; this may not produce a callback by itself
410 pos = handleEndTag(s, lt, gt);
411 parsePos = pos;
412 // End tags don't generate callbacks in this parser's model unless
413 // there is trimmed text to emit; continue to next loop iteration.
414 continue;
415 }
416
417 // start tag (or self-closing)
418 handleStartTag(s, lt, gt);
419
420 // detect self-closing and whether to emit start-tag callback
421 bool selfClosing = false;
422 int back = gt - 1;
423 while (back > lt && isspace((unsigned char)s[back])) back--;
424 if (back > lt && s[back] == '/') selfClosing = true;
425
426 bool emitStartTag = selfClosing || !hasNonWhitespaceTextAhead(s, gt, len);
427
428 if (callback && emitStartTag) {
429 node_name = path.size() > 0 ? path.back() : empty_str;
431 gt - lt + 1);
432 pos = gt + 1;
433 if (selfClosing && path.size() > 0) {
434 path.erase(path.size() - 1);
435 if (contentStarts.size() > 0)
437 }
438 parsePos = pos;
439 return result;
440 }
441
442 // Suppressed start-tag callback (expecting text ahead) or no callback
443 // registered; just advance position and continue scanning.
444 pos = gt + 1;
445 if (selfClosing && path.size() > 0) {
446 path.erase(path.size() - 1);
447 if (contentStarts.size() > 0)
449 }
450
451 parsePos = pos;
452 }
453
454 // nothing left to parse
455 parsePos = pos;
456 return false;
457 }
458
459 // Resume position for incremental parseSingle() calls.
460 int parsePos = 0;
461};
462
463} // namespace tiny_dlna
A simple wrapper to provide string functions on char*. If the underlying char* is a const we do not a...
Definition: StrView.h:18
virtual const char * c_str()
provides the string value as const char*
Definition: StrView.h:376
virtual int indexOf(const char c, int start=0)
Definition: StrView.h:274
virtual int length()
Definition: StrView.h:380
virtual void set(const char *alt)
assigns a value
Definition: StrView.h:44
Heap-backed string utility used throughout tiny_dlna.
Definition: Str.h:27
bool isEmpty() const
True if empty.
Definition: Str.h:54
void copyFrom(const char *source, int len)
Copy from raw buffer with length.
Definition: Str.h:60
void clear()
Clear contents (size -> 0)
Definition: Str.h:93
void release()
Clear and shrink capacity to fit.
Definition: Str.h:161
Lightweight wrapper around std::vector with Arduino-friendly helpers and a pluggable allocator.
Definition: Vector.h:39
void reset()
Reset the container by clearing and shrinking capacity to fit.
Definition: Vector.h:83
iterator erase(size_t index)
Convenience overload to erase by index.
Definition: Vector.h:73
Lightweight streaming XML parser.
Definition: XMLParser.h:27
Str last_attributes
Definition: XMLParser.h:141
int handleCommentOrPI(int lt, const char *s, int len)
Definition: XMLParser.h:261
void do_parse()
Definition: XMLParser.h:300
void setCallback(void(*cb)(Str &nodeName, Vector< Str > &path, Str &text, Str &attributes, int start, int len, void *ref))
Set the callback to be invoked for parsed fragments.
Definition: XMLParser.h:78
Str txt
Definition: XMLParser.h:138
void setReportTextOnly(bool flag)
report only nodes with text
Definition: XMLParser.h:128
int handleEndTag(const char *s, int lt, int gt)
Definition: XMLParser.h:215
Str node_name
Definition: XMLParser.h:137
Str empty_str
Definition: XMLParser.h:136
int parsePos
Definition: XMLParser.h:460
void(* callback)(Str &nodeName, Vector< Str > &path, Str &text, Str &attributes, int start, int len, void *ref)
Definition: XMLParser.h:149
XMLParser(const char *xmlStr, void(*callback)(Str &nodeName, Vector< Str > &path, Str &text, Str &attributes, int start, int len, void *ref), bool textOnly=false)
Construct with XML buffer and callback.
Definition: XMLParser.h:46
void setXml(const char *xmlStr)
Set the XML buffer to parse.
Definition: XMLParser.h:68
void * reference
Definition: XMLParser.h:154
Vector< Str > path
Definition: XMLParser.h:135
void handleStartTag(const char *s, int lt, int gt)
Definition: XMLParser.h:221
StrView str_view
Definition: XMLParser.h:134
Str str
Definition: XMLParser.h:139
void emitTagSegment(const char *s, int lt, int gt)
Definition: XMLParser.h:207
Vector< int > contentStarts
Definition: XMLParser.h:151
void setReference(void *ref)
Attach an opaque user pointer to the parser instance.
Definition: XMLParser.h:60
bool do_parse_single()
Definition: XMLParser.h:366
void end()
Fully reset parser state (parse position, path stack and content starts). Use this when the underlyin...
Definition: XMLParser.h:118
bool report_text_only
Definition: XMLParser.h:142
void parse()
Parse the previously set XML buffer and invoke the callback.
Definition: XMLParser.h:90
int findGt(const char *s, int start, int len)
Definition: XMLParser.h:160
void resetParse()
Reset the internal parse position so subsequent parseSingle() calls start from the beginning of the b...
Definition: XMLParser.h:110
bool parseSingle()
Parse a single fragment (one callback invocation) from the previously set XML buffer.
Definition: XMLParser.h:104
int getParsePos()
Expose current parse position for incremental wrappers.
Definition: XMLParser.h:131
void emitTextSegment(const char *s, int ts, int te)
Definition: XMLParser.h:196
bool invokeCallback(Str &nodeName, Vector< Str > &fullPath, Str &text, Str &attributes, int start, int len)
Definition: XMLParser.h:279
bool hasNonWhitespaceTextAhead(const char *s, int gt, int len)
Definition: XMLParser.h:183
Definition: Allocator.h:13