Merge pull request #9729 from huwpascoe/rxml-fix

Fixed xml parsing
2025-04-17 11:43:00 +00:00 · 2019-11-17 20:23:07 +01:00 · 2019-11-17 20:23:07 +01:00 · 8e6e30370d
commit 8e6e30370d
parent 80e49531b1 4eebfa237b
6 changed files with 1343 additions and 355 deletions
--- a/Makefile.common
+++ b/Makefile.common
@ -1718,6 +1718,7 @@ OBJ += $(LIBRETRO_COMM_DIR)/formats/bmp/rbmp_encode.o \
       $(LIBRETRO_COMM_DIR)/formats/json/jsonsax.o \
       $(LIBRETRO_COMM_DIR)/formats/json/jsonsax_full.o \
       $(LIBRETRO_COMM_DIR)/formats/xml/rxml.o \
+       deps/yxml/yxml.o \
       $(LIBRETRO_COMM_DIR)/formats/image_transfer.o

 # Easter Egg
--- a/deps/yxml/yxml.c
+++ b/deps/yxml/yxml.c
--- a/deps/yxml/yxml.h
+++ b/deps/yxml/yxml.h
@ -0,0 +1,167 @@
+/* Copyright (c) 2013-2014 Yoran Heling
+
+  Permission is hereby granted, free of charge, to any person obtaining
+  a copy of this software and associated documentation files (the
+  "Software"), to deal in the Software without restriction, including
+  without limitation the rights to use, copy, modify, merge, publish,
+  distribute, sublicense, and/or sell copies of the Software, and to
+  permit persons to whom the Software is furnished to do so, subject to
+  the following conditions:
+
+  The above copyright notice and this permission notice shall be included
+  in all copies or substantial portions of the Software.
+
+  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+#ifndef YXML_H
+#define YXML_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <retro_environment.h>
+
+#if defined(_MSC_VER) && !defined(__cplusplus) && !defined(inline)
+#define inline __inline
+#endif
+
+#if defined(__STDC_C89__) && !defined(inline)
+#define inline
+#endif
+
+/* Full API documentation for this library can be found in the "yxml.pod" file
+ * in the yxml git repository, or online at http://dev.yorhel.nl/yxml/man */
+
+typedef enum { /* Result codes of yxml_parse()/yxml_eof(): negative = error, YXML_OK = no token, positive = token. */
+	YXML_EEOF        = -5, /* Unexpected EOF                             */
+	YXML_EREF        = -4, /* Invalid character or entity reference (&whatever;) */
+	YXML_ECLOSE      = -3, /* Close tag does not match open tag (<Tag> .. </OtherTag>) */
+	YXML_ESTACK      = -2, /* Stack overflow (too deeply nested tags or too long element/attribute name) */
+	YXML_ESYN        = -1, /* Syntax error (unexpected byte)             */
+	YXML_OK          =  0, /* Character consumed, no new token present   */
+	YXML_ELEMSTART   =  1, /* Start of an element:   '<Tag ..'           */
+	YXML_CONTENT     =  2, /* Element content                            */
+	YXML_ELEMEND     =  3, /* End of an element:     '.. />' or '</Tag>' */
+	YXML_ATTRSTART   =  4, /* Attribute:             'Name=..'           */
+	YXML_ATTRVAL     =  5, /* Attribute value                            */
+	YXML_ATTREND     =  6, /* End of attribute       '.."'               */
+	YXML_PISTART     =  7, /* Start of a processing instruction          */
+	YXML_PICONTENT   =  8, /* Content of a PI                            */
+	YXML_PIEND       =  9  /* End of a processing instruction            */
+} yxml_ret_t;
+
+/* When, exactly, are tokens returned?
+ *
+ * <TagName
+ *   '>' ELEMSTART
+ *   '/' ELEMSTART, '>' ELEMEND
+ *   ' ' ELEMSTART
+ *     '>'
+ *     '/', '>' ELEMEND
+ *     Attr
+ *       '=' ATTRSTART
+ *         "X ATTRVAL
+ *           'Y'  ATTRVAL
+ *             'Z'  ATTRVAL
+ *               '"' ATTREND
+ *                 '>'
+ *                 '/', '>' ELEMEND
+ *
+ * </TagName
+ *   '>' ELEMEND
+ */
+
+
+typedef struct {
+	/* PUBLIC (read-only) */
+
+	/* Name of the current element, zero-length if not in any element. Changed
+	 * after YXML_ELEMSTART. The pointer will remain valid up to and including
+	 * the next non-YXML_ATTR* token, the pointed-to buffer will remain valid
+	 * up to and including the YXML_ELEMEND for the corresponding element. */
+	char *elem;
+
+	/* The last read character(s) of an attribute value (YXML_ATTRVAL), element
+	 * data (YXML_CONTENT), or processing instruction (YXML_PICONTENT). Changed
+	 * after one of the respective YXML_ values is returned, and only valid
+	 * until the next yxml_parse() call. Usually, this string only consists of
+	 * a single byte, but multiple bytes are returned in the following cases:
+	 * - "<?SomePI ?x ?>": The two characters "?x"
+	 * - "<![CDATA[ ]x ]]>": The two characters "]x"
+	 * - "<![CDATA[ ]]x ]]>": The three characters "]]x"
+	 * - "&#N;" and "&#xN;", where dec(N) > 127. The referenced Unicode
+	 *   character is then encoded in multiple UTF-8 bytes.
+	 */
+	char data[8];
+
+	/* Name of the current attribute. Changed after YXML_ATTRSTART, valid up to
+	 * and including the next YXML_ATTREND. */
+	char *attr;
+
+	/* Name/target of the current processing instruction, zero-length if not in
+	 * a PI. Changed after YXML_PISTART, valid up to (but excluding)
+	 * the next YXML_PIEND. */
+	char *pi;
+
+	/* Line number, byte offset within that line, and total bytes read. These
+	 * values refer to the position _after_ the last byte given to
+	 * yxml_parse(). These are useful for debugging and error reporting. */
+	uint64_t byte; /* byte offset within the current line */
+	uint64_t total; /* total bytes read */
+	uint32_t line; /* line number */
+
+
+	/* PRIVATE */
+	int state;
+	unsigned char *stack; /* Stack of element names + attribute/PI name, separated by \0. Also starts with a \0. */
+	size_t stacksize, stacklen; /* NOTE(review): presumably capacity and used length of `stack` (yxml_symlen() treats stack + stacklen as the end of the current name) - confirm in yxml.c */
+	unsigned reflen;
+	unsigned quote;
+	int nextstate; /* Used for '@' state remembering and for the "string" consuming state */
+	unsigned ignore;
+	unsigned char *string;
+} yxml_t;
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void yxml_init(yxml_t *, void *, size_t); /* Prepares a parser state; rxml.c passes a caller-owned buffer and that buffer's size in bytes. */
+
+
+yxml_ret_t yxml_parse(yxml_t *, int); /* Consumes one input byte; returns a token, YXML_OK (no token yet), or a negative error code. */
+
+
+/* May be called after the last character has been given to yxml_parse().
+ * Returns YXML_OK if the XML document is valid, YXML_EEOF otherwise.  Using
+ * this function isn't really necessary, but can be used to detect documents
+ * that don't end correctly. In particular, an error is returned when the XML
+ * document did not contain a (complete) root element, or when the document
+ * ended while in a comment or processing instruction. */
+yxml_ret_t yxml_eof(yxml_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+
+/* Returns the length of the element name (x->elem), attribute name (x->attr),
+ * or PI name (x->pi). This function should ONLY be used directly after the
+ * YXML_ELEMSTART, YXML_ATTRSTART or YXML_PISTART (respectively) tokens have
+ * been returned by yxml_parse(), calling this at any other time may not give
+ * the correct results. This function should also NOT be used on strings other
+ * than x->elem, x->attr or x->pi. */
+static inline size_t yxml_symlen(yxml_t *x, const char *s) {
+	return (x->stack + x->stacklen) - (const unsigned char*)s; /* byte distance from s to x->stack + x->stacklen */
+}
+
+#endif
+
+/* vim: set noet sw=4 ts=4: */
--- a/griffin/griffin.c
+++ b/griffin/griffin.c
@ -1486,6 +1486,7 @@ DEPENDENCIES
 XML
 ============================================================ */
 #include "../libretro-common/formats/xml/rxml.c"
+#include "../deps/yxml/yxml.c"

 /*============================================================
 AUDIO UTILS
--- a/libretro-common/formats/xml/rxml.c
+++ b/libretro-common/formats/xml/rxml.c
@ -20,14 +20,6 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

-#include <stdio.h>
-#include <stdint.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <ctype.h>
-
 #include <boolean.h>
 #include <streams/file_stream.h>
 #include <compat/posix_string.h>
@ -35,6 +27,16 @@

 #include <formats/rxml.h>

+#include "../../deps/yxml/yxml.h"
+
+#define BUFSIZE 4096 /* size in bytes of each char scratch array in rxml_parse_buffer */
+
+struct rxml_parse_buffer { /* heap-allocated scratch space used by rxml_load_document_string() */
+   char xml[BUFSIZE]; /* memory handed to yxml_init() for the parser's own use */
+   char val[BUFSIZE]; /* element text / attribute value is accumulated here before being strdup()'d */
+   rxml_node_t* stack[32]; /* parent nodes saved while descending into child elements */
+};
+
 struct rxml_document
 {
   struct rxml_node *root_node;
@ -89,338 +91,6 @@ static void rxml_free_node(struct rxml_node *node)
      free(node);
 }

-static bool validate_header(const char **ptr)
-{
-   if (memcmp(*ptr, "<?xml", 5) == 0)
-   {
-      const char *eol = strstr(*ptr, "?>\n");
-      if (!eol)
-         return false;
-
-      /* Always use UTF-8. Don't really care to check. */
-      *ptr = eol + 3;
-      return true;
-   }
-   return true;
-}
-
-static bool range_is_space(const char *begin, const char *end)
-{
-   for (; begin < end; begin++)
-      if (!isspace(*begin))
-         return false;
-
-   return true;
-}
-
-static void rxml_skip_spaces(const char **ptr_)
-{
-   const char *ptr = *ptr_;
-   while (isspace(*ptr))
-      ptr++;
-
-   *ptr_ = ptr;
-}
-
-static char *strdup_range(const char *begin, const char *end)
-{
-   ptrdiff_t len = end - begin;
-   char *ret = (char*)malloc(len + 1);
-
-   if (!ret)
-      return NULL;
-
-   memcpy(ret, begin, len);
-   ret[len] = '\0';
-   return ret;
-}
-
-static char *strdup_range_escape(const char *begin, const char *end)
-{
-   /* Escaping is ignored. Assume we don't deal with that. */
-   return strdup_range(begin, end);
-}
-
-static struct rxml_attrib_node *rxml_parse_attrs(const char *str)
-{
-   const char *elem;
-   struct rxml_attrib_node *list = NULL;
-   struct rxml_attrib_node *tail = NULL;
-   char *attrib                  = NULL;
-   char *value                   = NULL;
-   char *last_char               = NULL;
-   char *save                    = NULL;
-   char *copy                    = strdup(str);
-   if (!copy)
-      return NULL;
-
-   last_char = copy + strlen(copy) - 1;
-   if (*last_char == '/')
-      *last_char = '\0';
-
-   elem = strtok_r(copy, " \n\t\f\v\r", &save);
-   while (elem)
-   {
-      const char *end;
-      struct rxml_attrib_node *new_node;
-      const char *eq = strstr(elem, "=\"");
-      if (!eq)
-         goto end;
-
-      end = strrchr(eq + 2, '\"');
-      if (!end || end != (elem + strlen(elem) - 1))
-         goto end;
-
-      attrib = strdup_range_escape(elem, eq);
-      value  = strdup_range_escape(eq + 2, end);
-      if (!attrib || !value)
-         goto end;
-
-      new_node =
-         (struct rxml_attrib_node*)calloc(1, sizeof(*new_node));
-      if (!new_node)
-         goto end;
-
-      new_node->attrib = attrib;
-      new_node->value  = value;
-      attrib           = NULL;
-      value            = NULL;
-
-      if (tail)
-      {
-         tail->next = new_node;
-         tail = new_node;
-      }
-      else
-         list = tail = new_node;
-
-      elem = strtok_r(NULL, " \n\t\f\v\r", &save);
-   }
-
-end:
-   if (copy)
-      free(copy);
-   if (attrib)
-      free(attrib);
-   if (value)
-      free(value);
-   return list;
-}
-
-static char *find_first_space(const char *str)
-{
-   while (*str && !isspace(*str))
-      str++;
-
-   return isspace(*str) ? (char*)str : NULL;
-}
-
-static bool rxml_parse_tag(struct rxml_node *node, const char *str)
-{
-   const char *name_end;
-   const char *str_ptr = str;
-   rxml_skip_spaces(&str_ptr);
-
-   name_end = find_first_space(str_ptr);
-   if (name_end)
-   {
-      node->name = strdup_range(str_ptr, name_end);
-      if (!node->name || !*node->name)
-         return false;
-
-      node->attrib = rxml_parse_attrs(name_end);
-      return true;
-   }
-   else
-   {
-      node->name = strdup(str_ptr);
-      return node->name && *node->name;
-   }
-}
-
-static struct rxml_node *rxml_parse_node(const char **ptr_)
-{
-   const char *ptr        = NULL;
-   const char *closing    = NULL;
-   char *str              = NULL;
-   bool is_closing        = false;
-
-   struct rxml_node *node = (struct rxml_node*)calloc(1, sizeof(*node));
-   if (!node)
-      return NULL;
-
-   rxml_skip_spaces(ptr_);
-
-   ptr = *ptr_;
-   if (*ptr != '<')
-      goto error;
-
-   closing = strchr(ptr, '>');
-   if (!closing)
-      goto error;
-
-   str = strdup_range(ptr + 1, closing);
-   if (!str)
-      goto error;
-
-   if (!rxml_parse_tag(node, str))
-      goto error;
-
-   /* Are spaces between / and > allowed? */
-   is_closing = strstr(ptr, "/>") + 1 == closing;
-
-   /* Look for more data. Either child nodes or data. */
-   if (!is_closing)
-   {
-      size_t copied             = 0;
-      size_t closing_tag_size   = strlen(node->name) + 4;
-      char *closing_tag         = (char*)malloc(closing_tag_size);
-
-      const char *cdata_start   = NULL;
-      const char *child_start   = NULL;
-      const char *closing_start = NULL;
-
-      if (!closing_tag)
-         goto error;
-
-      closing_tag[copied]       = '<';
-      closing_tag[copied+1]     = '/';
-      closing_tag[copied+2]     = '\0';
-
-      copied  = strlcat(closing_tag, node->name, closing_tag_size);
-
-      closing_tag[copied]       = '>';
-      closing_tag[copied+1]     = '\0';
-
-      cdata_start   = strstr(closing + 1, "<![CDATA[");
-      child_start   = strchr(closing + 1, '<');
-      closing_start = strstr(closing + 1, closing_tag);
-
-      if (!closing_start)
-      {
-         free(closing_tag);
-         goto error;
-      }
-
-      if (cdata_start && range_is_space(closing + 1, cdata_start))
-      {
-         /* CDATA section */
-         const char *cdata_end = strstr(cdata_start, "]]>");
-         if (!cdata_end)
-         {
-            free(closing_tag);
-            goto error;
-         }
-
-         node->data = strdup_range(cdata_start +
-               STRLEN_CONST("<![CDATA["), cdata_end);
-      }
-      else if (closing_start && closing_start == child_start) /* Simple Data */
-         node->data = strdup_range(closing + 1, closing_start);
-      else
-      {
-         /* Parse all child nodes. */
-         struct rxml_node *list = NULL;
-         struct rxml_node *tail = NULL;
-         const char *first_start = NULL;
-         const char *first_closing = NULL;
-
-         ptr           = child_start;
-         first_start   = strchr(ptr, '<');
-         first_closing = strstr(ptr, "</");
-
-         while (
-                first_start &&
-                first_closing &&
-                (first_start < first_closing)
-                )
-         {
-            struct rxml_node *new_node = rxml_parse_node(&ptr);
-
-            if (!new_node)
-            {
-               free(closing_tag);
-               goto error;
-            }
-
-            if (tail)
-            {
-               tail->next = new_node;
-               tail = new_node;
-            }
-            else
-               list = tail = new_node;
-
-            first_start   = strchr(ptr, '<');
-            first_closing = strstr(ptr, "</");
-         }
-
-         node->children = list;
-
-         closing_start = strstr(ptr, closing_tag);
-         if (!closing_start)
-         {
-            free(closing_tag);
-            goto error;
-         }
-      }
-
-      *ptr_ = closing_start + strlen(closing_tag);
-      free(closing_tag);
-   }
-   else
-      *ptr_ = closing + 1;
-
-   if (str)
-      free(str);
-   return node;
-
-error:
-   if (str)
-      free(str);
-   rxml_free_node(node);
-   return NULL;
-}
-
-static char *purge_xml_comments(const char *str)
-{
-   char *copy_dest;
-   const char *copy_src;
-   size_t len    = strlen(str);
-   char *new_str = (char*)malloc(len + 1);
-   if (!new_str)
-      return NULL;
-
-   new_str[len]          = '\0';
-
-   copy_dest             = new_str;
-   copy_src              = str;
-
-   for (;;)
-   {
-      ptrdiff_t copy_len;
-      const char *comment_start = strstr(copy_src, "<!--");
-      const char *comment_end   = strstr(copy_src, "-->");
-
-      if (!comment_start || !comment_end)
-         break;
-
-      copy_len = comment_start - copy_src;
-      memcpy(copy_dest, copy_src, copy_len);
-
-      copy_dest += copy_len;
-      copy_src   = comment_end + STRLEN_CONST("-->");
-   }
-
-   /* Avoid strcpy() as OpenBSD is anal and hates you
-    * for using it even when it's perfectly safe. */
-   len = strlen(copy_src);
-   memcpy(copy_dest, copy_src, len);
-   copy_dest[len] = '\0';
-
-   return new_str;
-}
-
 rxml_document_t *rxml_load_document(const char *path)
 {
   rxml_document_t *doc;
@ -458,35 +128,175 @@ error:

+/**
+ * rxml_load_document_string:
+ * @str : NUL-terminated XML text to parse.
+ *
+ * Feeds @str one byte at a time to the yxml tokenizer and builds the
+ * rxml node/attribute tree from the tokens it returns.
+ *
+ * Text is only stored (as node->data) for elements without child
+ * elements. Parsing fails if a single text run or attribute value
+ * does not fit in BUFSIZE - 1 bytes, if elements nest deeper than
+ * the fixed parent stack, if an allocation fails, or if the input
+ * ends before the document is complete.
+ *
+ * Returns: new document (release with rxml_free_document()), or
+ * NULL on any error.
+ **/
 rxml_document_t *rxml_load_document_string(const char *str)
 {
-   rxml_document_t *doc;
-   char *memory_buffer = NULL;
-   const char *mem_ptr = NULL;
+   rxml_document_t *doc          = NULL;
+   struct rxml_parse_buffer *buf = NULL;
+   size_t stack_i                = 0;
+   size_t level                  = 0;
+   size_t c                      = 0;
+   char *valptr                  = NULL;
+   char *valend                  = NULL;
+   yxml_t x;
+
+   rxml_node_t *node             = NULL;
+   struct rxml_attrib_node *attr = NULL;
+
+   buf = (struct rxml_parse_buffer*)malloc(sizeof(*buf));
+   if (!buf)
+      goto error;
+
+   /* Text accumulates in [buf->val, valend); the byte at valend is
+    * kept free for the terminating NUL. */
+   valptr = buf->val;
+   valend = buf->val + sizeof(buf->val) - 1;

    doc = (rxml_document_t*)calloc(1, sizeof(*doc));
    if (!doc)
       goto error;

-   mem_ptr = str;
+   yxml_init(&x, buf->xml, BUFSIZE);

-   if (!validate_header(&mem_ptr))
-      goto error;
+   for (; *str; ++str) {
+      yxml_ret_t r = yxml_parse(&x, *str);

-   memory_buffer = purge_xml_comments(mem_ptr);
-   if (!memory_buffer)
-      goto error;
+      if (r < 0)
+         goto error;

-   mem_ptr = memory_buffer;
+      switch (r) {

-   doc->root_node = rxml_parse_node(&mem_ptr);
-   if (!doc->root_node)
-      goto error;
+      case YXML_ELEMSTART:
+         if (node) {
+            if (level > stack_i) {
+               /* Descending one level: remember the parent, but
+                * never write past the fixed-size stack. */
+               if (stack_i >= sizeof(buf->stack) / sizeof(buf->stack[0]))
+                  goto error;
+
+               buf->stack[stack_i] = node;
+               ++stack_i;

-   free(memory_buffer);
+               node->children = (rxml_node_t*)calloc(1, sizeof(*node));
+               node = node->children;
+            }
+            else {
+               node->next = (rxml_node_t*)calloc(1, sizeof(*node));
+               node = node->next;
+            }
+         }
+         else {
+            node = doc->root_node = (rxml_node_t*)calloc(1, sizeof(*node));
+         }
+
+         if (!node)
+            goto error;
+
+         node->name = strdup(x.elem);
+         if (!node->name)
+            goto error;
+
+         attr = NULL;
+         /* Text seen before a child element is not stored; drop it
+          * so it cannot end up in the child's data. */
+         valptr = buf->val;
+
+         ++level;
+         break;
+
+      case YXML_ELEMEND:
+         --level;
+
+         if (valptr > buf->val) {
+            *valptr = '\0';
+
+            /* level == stack_i only when the closing element has no
+             * child elements, i.e. `node` is that element. When a
+             * parent closes, `node` still points at its last child,
+             * whose data must not be overwritten. */
+            if (level == stack_i) {
+               node->data = strdup(buf->val);
+               if (!node->data)
+                  goto error;
+            }
+
+            valptr = buf->val;
+         }
+
+         if (level < stack_i) {
+            --stack_i;
+            node = buf->stack[stack_i];
+         }
+         break;
+
+      case YXML_CONTENT:
+      case YXML_ATTRVAL:
+         /* x.data holds the newest byte(s) as a short NUL-terminated
+          * string; append them, failing instead of overflowing. */
+         for (c = 0; c < sizeof(x.data) && x.data[c]; ++c) {
+            if (valptr >= valend)
+               goto error;
+
+            *valptr = x.data[c];
+            ++valptr;
+         }
+         break;
+
+      case YXML_ATTRSTART:
+         if (attr)
+            attr = attr->next = (struct rxml_attrib_node*)calloc(1, sizeof(*attr));
+         else
+            attr = node->attrib = (struct rxml_attrib_node*)calloc(1, sizeof(*attr));
+
+         if (!attr)
+            goto error;
+
+         attr->attrib = strdup(x.attr);
+         if (!attr->attrib)
+            goto error;
+
+         valptr = buf->val;
+         break;
+
+      case YXML_ATTREND:
+         /* Store the value even when it is empty, so that attr=""
+          * yields "" rather than a NULL value. */
+         *valptr = '\0';
+         attr->value = strdup(buf->val);
+         if (!attr->value)
+            goto error;
+
+         valptr = buf->val;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   /* Reject input that ends before the document is complete. */
+   if (yxml_eof(&x) < 0)
+      goto error;
+
+   free(buf);
    return doc;

 error:
-   free(memory_buffer);
    rxml_free_document(doc);
+   free(buf);
    return NULL;
 }

--- a/libretro-common/formats/xml/test/Makefile
+++ b/libretro-common/formats/xml/test/Makefile
@ -2,10 +2,12 @@ TARGET := rxml

 LIBRETRO_XML_DIR  := ..
 LIBRETRO_COMM_DIR := ../../..
+LIBRETRO_DEPS_DIR := ../../../../deps

 SOURCES := \
 	rxml_test.c \
 	$(LIBRETRO_XML_DIR)/rxml.c \
+	$(LIBRETRO_DEPS_DIR)/yxml/yxml.c \
 	$(LIBRETRO_COMM_DIR)/streams/file_stream.c

 OBJS := $(SOURCES:.c=.o)