From 83699774fa669abfbc5c5c3dc9265308246bd4f6 Mon Sep 17 00:00:00 2001 From: Michael Meeks Date: Fri, 1 May 2009 15:23:23 +0100 Subject: [PATCH] Patch originally committed only to the ChangeLog & tests ... MIME-Version: 1.0 Content-Type: text/plain; charset=utf-8 Content-Transfer-Encoding: 8bit Bug 572508 – gmarkup speedup ... * glib/gmarkup.c: Various optimizations: do less allocations by keeping a pool of GStrings, do in-place unescaping, avoid redundant utf-8 validation. --- glib/gmarkup.c | 1129 +++++++++++++++++++++++--------------------------------- 1 files changed, 455 insertions(+), 674 deletions(-) diff --git a/glib/gmarkup.c b/glib/gmarkup.c index b1f4f46..35118ad 100644 --- a/glib/gmarkup.c +++ b/glib/gmarkup.c @@ -82,11 +82,15 @@ struct _GMarkupParseContext * the callback for it. */ GString *partial_chunk; + GSList *spare_chunks; GMarkupParseState state; GSList *tag_stack; - gchar **attr_names; - gchar **attr_values; + GSList *tag_stack_gstr; + GSList *spare_list_nodes; + + GString **attr_names; + GString **attr_values; gint cur_attr; gint alloc_attrs; @@ -94,8 +98,6 @@ struct _GMarkupParseContext gssize current_text_len; const gchar *current_text_end; - GString *leftover_char_portion; - /* used to save the start of the last interesting thingy */ const gchar *start; @@ -112,6 +114,39 @@ struct _GMarkupParseContext gpointer held_user_data; }; +/* + * Helpers to reduce our allocation overhead, we have + * a well defined allocation lifecycle. + */ +static GSList * +get_list_node (GMarkupParseContext *context, gpointer data) +{ + GSList *node; + if (context->spare_list_nodes != NULL) + { + node = context->spare_list_nodes; + context->spare_list_nodes = g_slist_remove_link (context->spare_list_nodes, node); + } + else + node = g_slist_alloc(); + node->data = data; + return node; +} + +static void +free_list_node (GMarkupParseContext *context, GSList *node) +{ + node->data = NULL; + context->spare_list_nodes = g_slist_concat (node, context->spare_list_nodes); +} + +static inline void +string_blank (GString *string) +{ + string->str[0] = '\0'; + string->len = 0; +} + /** * g_markup_parse_context_new: * @parser: a #GMarkupParser @@ -148,9 +183,12 @@ g_markup_parse_context_new (const GMarkupParser *parser, context->char_number = 1; context->partial_chunk = NULL; + context->spare_chunks = NULL; + context->spare_list_nodes = NULL; context->state = STATE_START; context->tag_stack = NULL; + context->tag_stack_gstr = NULL; context->attr_names = NULL; context->attr_values = NULL; context->cur_attr = -1; @@ -159,7 +197,6 @@ g_markup_parse_context_new (const GMarkupParser *parser, context->current_text = NULL; context->current_text_len = -1; context->current_text_end = NULL; - context->leftover_char_portion = NULL; context->start = NULL; context->iter = NULL; @@ -179,6 +216,14 @@ g_markup_parse_context_new (const GMarkupParser *parser, return context; } +static void +string_full_free (gpointer ptr, gpointer user_data) +{ + g_string_free (ptr, TRUE); +} + +static void clear_attributes (GMarkupParseContext *context); + /** * g_markup_parse_context_free: * @context: a #GMarkupParseContext @@ -198,18 +243,21 @@ g_markup_parse_context_free (GMarkupParseContext *context) if (context->dnotify) (* context->dnotify) (context->user_data); - g_strfreev (context->attr_names); - g_strfreev (context->attr_values); + clear_attributes (context); + g_free (context->attr_names); + g_free (context->attr_values); - g_slist_foreach (context->tag_stack, (GFunc)g_free, NULL); + g_slist_foreach (context->tag_stack_gstr, string_full_free, NULL); + g_slist_free (context->tag_stack_gstr); g_slist_free (context->tag_stack); + g_slist_foreach (context->spare_chunks, string_full_free, NULL); + g_slist_free (context->spare_chunks); + g_slist_free (context->spare_list_nodes); + if (context->partial_chunk) g_string_free (context->partial_chunk, TRUE); - if (context->leftover_char_portion) - g_string_free (context->leftover_char_portion, TRUE); - g_free (context); } @@ -301,43 +349,101 @@ propagate_error (GMarkupParseContext *context, g_propagate_error (dest, src); } -/* To make these faster, we first use the ascii-only tests, then check - * for the usual non-alnum name-end chars, and only then call the - * expensive unicode stuff. Nobody uses non-ascii in XML tag/attribute - * names, so this is a reasonable hack that virtually always avoids - * the guniprop call. - */ #define IS_COMMON_NAME_END_CHAR(c) \ ((c) == '=' || (c) == '/' || (c) == '>' || (c) == ' ') static gboolean -is_name_start_char (const gchar *p) +slow_name_validate (GMarkupParseContext *context, const char *name, GError **error) { - if (g_ascii_isalpha (*p) || - (!IS_COMMON_NAME_END_CHAR (*p) && - (*p == '_' || - *p == ':' || - g_unichar_isalpha (g_utf8_get_char (p))))) - return TRUE; - else - return FALSE; + const char *p = name; + + if (!g_utf8_validate (name, strlen (name), NULL)) + { + set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, + _("Invalid UTF-8 encoded text in name - not valid '%s'"), name); + return FALSE; + } + + if (!(g_ascii_isalpha (*p) || + (!IS_COMMON_NAME_END_CHAR (*p) && + (*p == '_' || + *p == ':' || + g_unichar_isalpha (g_utf8_get_char (p)))))) + { + set_error (context, error, G_MARKUP_ERROR_PARSE, + _("'%s' is not a valid name "), name); + return FALSE; + } + + for (p = g_utf8_next_char (name); *p != '\0'; p = g_utf8_next_char (p)) + { + /* is_name_char */ + if (!(g_ascii_isalnum (*p) || + (!IS_COMMON_NAME_END_CHAR (*p) && + (*p == '.' || + *p == '-' || + *p == '_' || + *p == ':' || + g_unichar_isalpha (g_utf8_get_char (p)))))) + { + set_error (context, error, G_MARKUP_ERROR_PARSE, + _("'%s' is not a valid name: '%c' "), name, *p); + return FALSE; + } + } + return TRUE; } +/* + * Use me for elements, attributes etc. + */ static gboolean -is_name_char (const gchar *p) +name_validate (GMarkupParseContext *context, const char *name, GError **error) { - if (g_ascii_isalnum (*p) || - (!IS_COMMON_NAME_END_CHAR (*p) && - (*p == '.' || - *p == '-' || - *p == '_' || - *p == ':' || - g_unichar_isalpha (g_utf8_get_char (p))))) - return TRUE; - else - return FALSE; + char mask; + const char *p; + + /* name start char */ + p = name; + if (G_UNLIKELY (IS_COMMON_NAME_END_CHAR (*p) || + !(g_ascii_isalpha (*p) || *p == '_' || *p == ':'))) + goto slow_validate; + + for (mask = *p++; *p != '\0'; p++) + { + mask |= *p; + + /* is_name_char */ + if (G_UNLIKELY (!(g_ascii_isalnum (*p) || + (!IS_COMMON_NAME_END_CHAR (*p) && + (*p == '.' || + *p == '-' || + *p == '_' || + *p == ':'))))) + goto slow_validate; + } + + if (mask & 0x80) /* un-common / non-ascii */ + goto slow_validate; + + return TRUE; + + slow_validate: + return slow_name_validate (context, name, error); } +static gboolean +text_validate (GMarkupParseContext *context, const char *p, int len, GError **error) +{ + if (!g_utf8_validate (p, len, NULL)) + { + set_error (context, error, G_MARKUP_ERROR_BAD_UTF8, + _("Invalid UTF-8 encoded text in name - not valid '%s'"), p); + return FALSE; + } + else + return TRUE; +} static gchar* char_str (gunichar c, @@ -360,7 +466,6 @@ static void set_unescape_error (GMarkupParseContext *context, GError **error, const gchar *remaining_text, - const gchar *remaining_text_end, GMarkupError code, const gchar *format, ...) @@ -373,7 +478,7 @@ set_unescape_error (GMarkupParseContext *context, remaining_newlines = 0; p = remaining_text; - while (p != remaining_text_end) + while (*p != '\0') { if (*p == '\n') ++remaining_newlines; @@ -397,414 +502,199 @@ set_unescape_error (GMarkupParseContext *context, g_propagate_error (error, tmp_error); } -typedef enum -{ - USTATE_INSIDE_TEXT, - USTATE_AFTER_AMPERSAND, - USTATE_INSIDE_ENTITY_NAME, - USTATE_AFTER_CHARREF_HASH -} UnescapeState; - -typedef struct -{ - GMarkupParseContext *context; - GString *str; - UnescapeState state; - const gchar *text; - const gchar *text_end; - const gchar *entity_start; -} UnescapeContext; - -static const gchar* -unescape_text_state_inside_text (UnescapeContext *ucontext, - const gchar *p, - GError **error) +/* + * re-write the GString in-place, unescaping anything that escaped. + * most XML does not contain entities, or escaping. + */ +static gboolean +unescape_gstring_inplace (GMarkupParseContext *context, + GString *string, + gboolean *is_ascii, + GError **error) { - const gchar *start; + char mask, *to; + int line_num = 1; + const char *from; gboolean normalize_attribute; - if (ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ || - ucontext->context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ) + *is_ascii = FALSE; + + /* are we unescaping an attribute or not ? */ + if (context->state == STATE_INSIDE_ATTRIBUTE_VALUE_SQ || + context->state == STATE_INSIDE_ATTRIBUTE_VALUE_DQ) normalize_attribute = TRUE; else normalize_attribute = FALSE; - start = p; - - while (p != ucontext->text_end) - { - if (*p == '&') - { - break; - } - else if (normalize_attribute && (*p == '\t' || *p == '\n')) - { - g_string_append_len (ucontext->str, start, p - start); - g_string_append_c (ucontext->str, ' '); - p = g_utf8_next_char (p); - start = p; - } - else if (*p == '\r') - { - g_string_append_len (ucontext->str, start, p - start); - g_string_append_c (ucontext->str, normalize_attribute ? ' ' : '\n'); - p = g_utf8_next_char (p); - if (p != ucontext->text_end && *p == '\n') - p = g_utf8_next_char (p); - start = p; - } - else - p = g_utf8_next_char (p); - } - - if (p != start) - g_string_append_len (ucontext->str, start, p - start); - - if (p != ucontext->text_end && *p == '&') - { - p = g_utf8_next_char (p); - ucontext->state = USTATE_AFTER_AMPERSAND; - } - - return p; -} - -static const gchar* -unescape_text_state_after_ampersand (UnescapeContext *ucontext, - const gchar *p, - GError **error) -{ - ucontext->entity_start = NULL; - - if (*p == '#') - { - p = g_utf8_next_char (p); - - ucontext->entity_start = p; - ucontext->state = USTATE_AFTER_CHARREF_HASH; - } - else if (!is_name_start_char (p)) - { - if (*p == ';') - { - set_unescape_error (ucontext->context, error, - p, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Empty entity '&;' seen; valid " - "entities are: & " < > '")); - } - else - { - gchar buf[8]; - - set_unescape_error (ucontext->context, error, - p, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Character '%s' is not valid at " - "the start of an entity name; " - "the & character begins an entity; " - "if this ampersand isn't supposed " - "to be an entity, escape it as " - "&"), - utf8_str (p, buf)); - } - } - else - { - ucontext->entity_start = p; - ucontext->state = USTATE_INSIDE_ENTITY_NAME; - } - - return p; -} - -static const gchar* -unescape_text_state_inside_entity_name (UnescapeContext *ucontext, - const gchar *p, - GError **error) -{ - while (p != ucontext->text_end) - { - if (*p == ';') - break; - else if (!is_name_char (p)) - { - gchar ubuf[8]; - - set_unescape_error (ucontext->context, error, - p, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Character '%s' is not valid " - "inside an entity name"), - utf8_str (p, ubuf)); - break; - } - - p = g_utf8_next_char (p); - } - - if (ucontext->context->state != STATE_ERROR) - { - if (p != ucontext->text_end) - { - gint len = p - ucontext->entity_start; - - /* move to after semicolon */ - p = g_utf8_next_char (p); - ucontext->state = USTATE_INSIDE_TEXT; - - if (strncmp (ucontext->entity_start, "lt", len) == 0) - g_string_append_c (ucontext->str, '<'); - else if (strncmp (ucontext->entity_start, "gt", len) == 0) - g_string_append_c (ucontext->str, '>'); - else if (strncmp (ucontext->entity_start, "amp", len) == 0) - g_string_append_c (ucontext->str, '&'); - else if (strncmp (ucontext->entity_start, "quot", len) == 0) - g_string_append_c (ucontext->str, '"'); - else if (strncmp (ucontext->entity_start, "apos", len) == 0) - g_string_append_c (ucontext->str, '\''); - else - { - gchar *name; - - name = g_strndup (ucontext->entity_start, len); - set_unescape_error (ucontext->context, error, - p, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Entity name '%s' is not known"), - name); - g_free (name); - } - } - else - { - set_unescape_error (ucontext->context, error, - /* give line number of the & */ - ucontext->entity_start, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Entity did not end with a semicolon; " - "most likely you used an ampersand " - "character without intending to start " - "an entity - escape ampersand as &")); - } - } -#undef MAX_ENT_LEN - - return p; -} - -static const gchar* -unescape_text_state_after_charref_hash (UnescapeContext *ucontext, - const gchar *p, - GError **error) -{ - gboolean is_hex = FALSE; - const char *start; - - start = ucontext->entity_start; - - if (*p == 'x') - { - is_hex = TRUE; - p = g_utf8_next_char (p); - start = p; - } - - while (p != ucontext->text_end && *p != ';') - p = g_utf8_next_char (p); - - if (p != ucontext->text_end) + /* + * Meeks' theorum: unescaping can only shrink text. + * for < etc. this is obvious, for ￿ more + * thought is required, but this is patently so. + */ + mask = 0; + for (from = to = string->str; *from != '\0'; from++, to++) { - g_assert (*p == ';'); + *to = *from; + + mask |= *to; + if (*to == '\n') + line_num++; + if (normalize_attribute && (*to == '\t' || *to == '\n')) + *to = ' '; + if (*to == '\r') + { + *to = normalize_attribute ? ' ' : '\n'; + if (from[1] == '\n') + from++; + } + if (*from == '&') + { + from++; + if (*from == '#') + { + gboolean is_hex = FALSE; + gulong l; + gchar *end = NULL; - /* digit is between start and p */ + from++; - if (start != p) - { - gulong l; - gchar *end = NULL; - - errno = 0; - if (is_hex) - l = strtoul (start, &end, 16); - else - l = strtoul (start, &end, 10); + if (*from == 'x') + { + is_hex = TRUE; + from++; + } - if (end != p || errno != 0) - { - set_unescape_error (ucontext->context, error, - start, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Failed to parse '%-.*s', which " - "should have been a digit " - "inside a character reference " - "(ê for example) - perhaps " - "the digit is too large"), - p - start, start); - } - else - { - /* characters XML 1.1 permits */ - if ((0 < l && l <= 0xD7FF) || - (0xE000 <= l && l <= 0xFFFD) || - (0x10000 <= l && l <= 0x10FFFF)) - { - gchar buf[8]; - g_string_append (ucontext->str, char_str (l, buf)); - } - else - { - set_unescape_error (ucontext->context, error, - start, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Character reference '%-.*s' does not " - "encode a permitted character"), - p - start, start); - } + /* digit is between start and p */ + errno = 0; + if (is_hex) + l = strtoul (from, &end, 16); + else + l = strtoul (from, &end, 10); + + if (end == from || errno != 0) + { + set_unescape_error (context, error, + from, G_MARKUP_ERROR_PARSE, + _("Failed to parse '%-.*s', which " + "should have been a digit " + "inside a character reference " + "(ê for example) - perhaps " + "the digit is too large"), + end - from, from); + return FALSE; + } + else if (*end != ';') + { + set_unescape_error (context, error, + from, G_MARKUP_ERROR_PARSE, + _("Character reference did not end with a " + "semicolon; " + "most likely you used an ampersand " + "character without intending to start " + "an entity - escape ampersand as &")); + return FALSE; + } + else + { + /* characters XML 1.1 permits */ + if ((0 < l && l <= 0xD7FF) || + (0xE000 <= l && l <= 0xFFFD) || + (0x10000 <= l && l <= 0x10FFFF)) + { + gchar buf[8]; + char_str (l, buf); + strcpy (to, buf); + to += strlen (buf) - 1; + from = end; + if (l >= 0x80) /* not ascii */ + mask |= 0x80; + } + else + { + set_unescape_error (context, error, + from, G_MARKUP_ERROR_PARSE, + _("Character reference '%-.*s' does not " + "encode a permitted character"), + end - from, from); + return FALSE; + } + } } - /* Move to next state */ - p = g_utf8_next_char (p); /* past semicolon */ - ucontext->state = USTATE_INSIDE_TEXT; - } - else - { - set_unescape_error (ucontext->context, error, - start, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Empty character reference; " - "should include a digit such as " - "dž")); - } - } - else - { - set_unescape_error (ucontext->context, error, - start, ucontext->text_end, - G_MARKUP_ERROR_PARSE, - _("Character reference did not end with a " - "semicolon; " - "most likely you used an ampersand " - "character without intending to start " - "an entity - escape ampersand as &")); + else if (strncmp (from, "lt;", 3) == 0) + { + *to = '<'; + from += 2; + } + else if (strncmp (from, "gt;", 3) == 0) + { + *to = '>'; + from += 2; + } + else if (strncmp (from, "amp;", 4) == 0) + { + *to = '&'; + from += 3; + } + else if (strncmp (from, "quot;", 5) == 0) + { + *to = '"'; + from += 4; + } + else if (strncmp (from, "apos;", 5) == 0) + { + *to = '\''; + from += 4; + } + else + { + if (*from == ';') + set_unescape_error (context, error, + from, G_MARKUP_ERROR_PARSE, + _("Empty entity '&;' seen; valid " + "entities are: & " < > '")); + else + { + const char *end = strchr (from, ';'); + if (end) + set_unescape_error (context, error, + from, G_MARKUP_ERROR_PARSE, + _("Entity name '%-.*s' is not known"), + end-from, from); + else + set_unescape_error (context, error, + from, G_MARKUP_ERROR_PARSE, + _("Entity did not end with a semicolon; " + "most likely you used an ampersand " + "character without intending to start " + "an entity - escape ampersand as &")); + } + return FALSE; + } + } } - return p; -} + g_assert (to - string->str <= string->len); + if (to - string->str != string->len) + g_string_truncate (string, to - string->str); -static gboolean -unescape_text (GMarkupParseContext *context, - const gchar *text, - const gchar *text_end, - GString **unescaped, - GError **error) -{ - UnescapeContext ucontext; - const gchar *p; - - ucontext.context = context; - ucontext.text = text; - ucontext.text_end = text_end; - ucontext.entity_start = NULL; - - ucontext.str = g_string_sized_new (text_end - text); - - ucontext.state = USTATE_INSIDE_TEXT; - p = text; - - while (p != text_end && context->state != STATE_ERROR) - { - g_assert (p < text_end); - - switch (ucontext.state) - { - case USTATE_INSIDE_TEXT: - { - p = unescape_text_state_inside_text (&ucontext, - p, - error); - } - break; - - case USTATE_AFTER_AMPERSAND: - { - p = unescape_text_state_after_ampersand (&ucontext, - p, - error); - } - break; - - - case USTATE_INSIDE_ENTITY_NAME: - { - p = unescape_text_state_inside_entity_name (&ucontext, - p, - error); - } - break; - - case USTATE_AFTER_CHARREF_HASH: - { - p = unescape_text_state_after_charref_hash (&ucontext, - p, - error); - } - break; + *is_ascii = !(mask & 0x80); - default: - g_assert_not_reached (); - break; - } - } - - if (context->state != STATE_ERROR) - { - switch (ucontext.state) - { - case USTATE_INSIDE_TEXT: - break; - case USTATE_AFTER_AMPERSAND: - case USTATE_INSIDE_ENTITY_NAME: - set_unescape_error (context, error, - NULL, NULL, - G_MARKUP_ERROR_PARSE, - _("Unfinished entity reference")); - break; - case USTATE_AFTER_CHARREF_HASH: - set_unescape_error (context, error, - NULL, NULL, - G_MARKUP_ERROR_PARSE, - _("Unfinished character reference")); - break; - } - } - - if (context->state == STATE_ERROR) - { - g_string_free (ucontext.str, TRUE); - *unescaped = NULL; - return FALSE; - } - else - { - *unescaped = ucontext.str; - return TRUE; - } + return TRUE; } static inline gboolean advance_char (GMarkupParseContext *context) { - context->iter = g_utf8_next_char (context->iter); - context->char_number += 1; + context->iter++; + context->char_number++; - if (context->iter == context->current_text_end) - { + if (G_UNLIKELY (context->iter == context->current_text_end)) return FALSE; - } - else if (*context->iter == '\n') + + else if (G_UNLIKELY (*context->iter == '\n')) { - context->line_number += 1; + context->line_number++; context->char_number = 1; } @@ -833,37 +723,62 @@ advance_to_name_end (GMarkupParseContext *context) { do { - if (!is_name_char (context->iter)) + if (IS_COMMON_NAME_END_CHAR (*(context->iter))) return; + if (xml_isspace (*(context->iter))) + return; } while (advance_char (context)); } static void +release_chunk (GMarkupParseContext *context, GString *str) +{ + GSList *node; + if (!str) + return; + if (str->allocated_len > 256) + { /* large strings are unusual and worth freeing */ + g_string_free (str, TRUE); + return; + } + string_blank (str); + node = get_list_node (context, str); + context->spare_chunks = g_slist_concat (node, context->spare_chunks); +} + +static void add_to_partial (GMarkupParseContext *context, const gchar *text_start, const gchar *text_end) { if (context->partial_chunk == NULL) - context->partial_chunk = g_string_sized_new (text_end - text_start); + { /* allocate a new chunk to parse into */ - if (text_start != text_end) - g_string_append_len (context->partial_chunk, text_start, - text_end - text_start); + if (context->spare_chunks != NULL) + { + GSList *node = context->spare_chunks; + context->spare_chunks = g_slist_remove_link (context->spare_chunks, node); + context->partial_chunk = node->data; + free_list_node (context, node); + } + else + context->partial_chunk = g_string_sized_new (MAX (28, text_end - text_start)); + } - /* Invariant here that partial_chunk exists */ + if (text_start != text_end) + g_string_insert_len (context->partial_chunk, -1, + text_start, text_end - text_start); } -static void +static inline void truncate_partial (GMarkupParseContext *context) { if (context->partial_chunk != NULL) - { - context->partial_chunk = g_string_truncate (context->partial_chunk, 0); - } + string_blank (context->partial_chunk); } -static const gchar* +static inline const gchar* current_element (GMarkupParseContext *context) { return context->tag_stack->data; @@ -891,6 +806,30 @@ pop_subparser_stack (GMarkupParseContext *context) } static void +push_partial_as_tag (GMarkupParseContext *context) +{ + GString *str = context->partial_chunk; + /* sadly, this is exported by gmarkup_get_element_stack as-is */ + context->tag_stack = g_slist_concat (get_list_node (context, str->str), context->tag_stack); + context->tag_stack_gstr = g_slist_concat (get_list_node (context, str), context->tag_stack_gstr); + context->partial_chunk = NULL; +} + +static void +pop_tag (GMarkupParseContext *context) +{ + GSList *nodea, *nodeb; + + nodea = context->tag_stack; + nodeb = context->tag_stack_gstr; + release_chunk (context, nodeb->data); + context->tag_stack = g_slist_remove_link (context->tag_stack, nodea); + context->tag_stack_gstr = g_slist_remove_link (context->tag_stack_gstr, nodeb); + free_list_node (context, nodea); + free_list_node (context, nodeb); +} + +static void possibly_finish_subparser (GMarkupParseContext *context) { if (current_element (context) == context->subparser_element) @@ -916,60 +855,42 @@ static const gchar* current_attribute (GMarkupParseContext *context) { g_assert (context->cur_attr >= 0); - return context->attr_names[context->cur_attr]; -} - -static void -find_current_text_end (GMarkupParseContext *context) -{ - /* This function must be safe (non-segfaulting) on invalid UTF8. - * It assumes the string starts with a character start - */ - const gchar *end = context->current_text + context->current_text_len; - const gchar *p; - const gchar *next; - - g_assert (context->current_text_len > 0); - - p = g_utf8_find_prev_char (context->current_text, end); - - g_assert (p != NULL); /* since current_text was a char start */ - - /* p is now the start of the last character or character portion. */ - g_assert (p != end); - next = g_utf8_next_char (p); /* this only touches *p, nothing beyond */ - - if (next == end) - { - /* whole character */ - context->current_text_end = end; - } - else - { - /* portion */ - context->leftover_char_portion = g_string_new_len (p, end - p); - context->current_text_len -= (end - p); - context->current_text_end = p; - } + return context->attr_names[context->cur_attr]->str; } - static void -add_attribute (GMarkupParseContext *context, char *name) +add_attribute (GMarkupParseContext *context, GString *str) { if (context->cur_attr + 2 >= context->alloc_attrs) { context->alloc_attrs += 5; /* silly magic number */ - context->attr_names = g_realloc (context->attr_names, sizeof(char*)*context->alloc_attrs); - context->attr_values = g_realloc (context->attr_values, sizeof(char*)*context->alloc_attrs); + context->attr_names = g_realloc (context->attr_names, sizeof(GString*)*context->alloc_attrs); + context->attr_values = g_realloc (context->attr_values, sizeof(GString*)*context->alloc_attrs); } context->cur_attr++; - context->attr_names[context->cur_attr] = name; + context->attr_names[context->cur_attr] = str; context->attr_values[context->cur_attr] = NULL; context->attr_names[context->cur_attr+1] = NULL; context->attr_values[context->cur_attr+1] = NULL; } +static void +clear_attributes (GMarkupParseContext *context) +{ + /* Go ahead and free the attributes. */ + for (; context->cur_attr >= 0; context->cur_attr--) + { + int pos = context->cur_attr; + release_chunk (context, context->attr_names[pos]); + release_chunk (context, context->attr_values[pos]); + context->attr_names[pos] = context->attr_values[pos] = NULL; + } + g_assert (context->cur_attr == -1); + g_assert (context->attr_names == NULL || + context->attr_names[0] == NULL); + g_assert (context->attr_values == NULL || + context->attr_values[0] == NULL); +} /** * g_markup_parse_context_parse: * @context: a #GMarkupParseContext @@ -994,8 +915,6 @@ g_markup_parse_context_parse (GMarkupParseContext *context, gssize text_len, GError **error) { - const gchar *first_invalid; - g_return_val_if_fail (context != NULL, FALSE); g_return_val_if_fail (text != NULL, FALSE); g_return_val_if_fail (context->state != STATE_ERROR, FALSE); @@ -1009,130 +928,16 @@ g_markup_parse_context_parse (GMarkupParseContext *context, context->parsing = TRUE; - if (context->leftover_char_portion) - { - const gchar *first_char; - - if ((*text & 0xc0) != 0x80) - first_char = text; - else - first_char = g_utf8_find_next_char (text, text + text_len); - - if (first_char) - { - /* leftover_char_portion was completed. Parse it. */ - GString *portion = context->leftover_char_portion; - - g_string_append_len (context->leftover_char_portion, - text, first_char - text); - - /* hacks to allow recursion */ - context->parsing = FALSE; - context->leftover_char_portion = NULL; - - if (!g_markup_parse_context_parse (context, - portion->str, portion->len, - error)) - { - g_assert (context->state == STATE_ERROR); - } - - g_string_free (portion, TRUE); - context->parsing = TRUE; - - /* Skip the fraction of char that was in this text */ - text_len -= (first_char - text); - text = first_char; - } - else - { - /* another little chunk of the leftover char; geez - * someone is inefficient. - */ - g_string_append_len (context->leftover_char_portion, - text, text_len); - - if (context->leftover_char_portion->len > 7) - { - /* The leftover char portion is too big to be - * a UTF-8 character - */ - set_error_literal (context, - error, - G_MARKUP_ERROR_BAD_UTF8, - _("Invalid UTF-8 encoded text - overlong sequence")); - } - - goto finished; - } - } context->current_text = text; context->current_text_len = text_len; + context->current_text_end = context->current_text + text_len; context->iter = context->current_text; context->start = context->iter; - /* Nothing left after finishing the leftover char, or nothing - * passed in to begin with. - */ if (context->current_text_len == 0) goto finished; - /* find_current_text_end () assumes the string starts at - * a character start, so we need to validate at least - * that much. It doesn't assume any following bytes - * are valid. - */ - if ((*context->current_text & 0xc0) == 0x80) /* not a char start */ - { - set_error_literal (context, - error, - G_MARKUP_ERROR_BAD_UTF8, - _("Invalid UTF-8 encoded text - not a start char")); - goto finished; - } - - /* Initialize context->current_text_end, possibly adjusting - * current_text_len, and add any leftover char portion - */ - find_current_text_end (context); - - /* Validate UTF8 (must be done after we find the end, since - * we could have a trailing incomplete char) - */ - if (!g_utf8_validate (context->current_text, - context->current_text_len, - &first_invalid)) - { - gint newlines = 0; - const gchar *p, *q; - gchar *current_text_dup; - - q = p = context->current_text; - while (p != first_invalid) - { - if (*p == '\n') - { - ++newlines; - q = p + 1; - context->char_number = 1; - } - ++p; - } - - context->line_number += newlines; - context->char_number += g_utf8_strlen (q, first_invalid - q); - - current_text_dup = g_strndup (context->current_text, context->current_text_len); - set_error (context, - error, - G_MARKUP_ERROR_BAD_UTF8, - _("Invalid UTF-8 encoded text - not valid '%s'"), - current_text_dup); - g_free (current_text_dup); - goto finished; - } - while (context->iter != context->current_text_end) { switch (context->state) @@ -1191,7 +996,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context, context->state = STATE_AFTER_CLOSE_TAG_SLASH; } - else if (is_name_start_char (context->iter)) + else if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) { context->state = STATE_INSIDE_OPEN_TAG_NAME; @@ -1247,7 +1052,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context, tmp_error = NULL; if (context->parser->end_element) (* context->parser->end_element) (context, - context->tag_stack->data, + current_element (context), context->user_data, &tmp_error); @@ -1279,10 +1084,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context, current_element (context)); } } - - g_free (context->tag_stack->data); - context->tag_stack = g_slist_delete_link (context->tag_stack, - context->tag_stack); + pop_tag (context); } break; @@ -1309,12 +1111,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context, * if any; push it on the stack; enter next state. */ add_to_partial (context, context->start, context->iter); - context->tag_stack = - g_slist_prepend (context->tag_stack, - g_string_free (context->partial_chunk, - FALSE)); - - context->partial_chunk = NULL; + push_partial_as_tag (context); context->state = STATE_BETWEEN_ATTRIBUTES; context->start = NULL; @@ -1345,7 +1142,10 @@ g_markup_parse_context_parse (GMarkupParseContext *context, /* The name has ended. Combine it with the partial chunk * if any; push it on the stack; enter next state. */ - add_attribute (context, g_string_free (context->partial_chunk, FALSE)); + if (!name_validate (context, context->partial_chunk->str, error)) + break; + + add_attribute (context, context->partial_chunk); context->partial_chunk = NULL; context->start = NULL; @@ -1387,11 +1187,10 @@ g_markup_parse_context_parse (GMarkupParseContext *context, } else if (*context->iter == '>') { - advance_char (context); context->state = STATE_AFTER_CLOSE_ANGLE; } - else if (is_name_start_char (context->iter)) + else if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) { context->state = STATE_INSIDE_ATTRIBUTE_NAME; /* start of attribute name */ @@ -1419,44 +1218,35 @@ g_markup_parse_context_parse (GMarkupParseContext *context, if (context->state == STATE_AFTER_ELISION_SLASH || context->state == STATE_AFTER_CLOSE_ANGLE) { - const gchar *start_name; - /* Ugly, but the current code expects an empty array instead of NULL */ - const gchar *empty = NULL; - const gchar **attr_names = ∅ - const gchar **attr_values = ∅ + int i; + const gchar *start_name; + const gchar **attr_names; + const gchar **attr_values; GError *tmp_error; - /* Call user callback for element start */ - start_name = current_element (context); - - if (context->cur_attr >= 0) + attr_names = g_newa (const gchar *, context->cur_attr + 2); + attr_values = g_newa (const gchar *, context->cur_attr + 2); + for (i = 0; i < context->cur_attr + 1; i++) { - attr_names = (const gchar**)context->attr_names; - attr_values = (const gchar**)context->attr_values; + attr_names[i] = context->attr_names[i]->str; + attr_values[i] = context->attr_values[i]->str; } + attr_names[i] = NULL; + attr_values[i] = NULL; + /* Call user callback for element start */ tmp_error = NULL; - if (context->parser->start_element) + start_name = current_element (context); + + if (context->parser->start_element && + name_validate (context, start_name, error)) (* context->parser->start_element) (context, - start_name, + start_name, (const gchar **)attr_names, (const gchar **)attr_values, context->user_data, &tmp_error); - - /* Go ahead and free the attributes. */ - for (; context->cur_attr >= 0; context->cur_attr--) - { - int pos = context->cur_attr; - g_free (context->attr_names[pos]); - g_free (context->attr_values[pos]); - context->attr_names[pos] = context->attr_values[pos] = NULL; - } - g_assert (context->cur_attr == -1); - g_assert (context->attr_names == NULL || - context->attr_names[0] == NULL); - g_assert (context->attr_values == NULL || - context->attr_values[0] == NULL); + clear_attributes (context); if (tmp_error != NULL) propagate_error (context, error, tmp_error); @@ -1531,25 +1321,22 @@ g_markup_parse_context_parse (GMarkupParseContext *context, } else { + gboolean is_ascii; /* The value has ended at the quote mark. Combine it * with the partial chunk if any; set it for the current * attribute. */ - GString *unescaped; - add_to_partial (context, context->start, context->iter); g_assert (context->cur_attr >= 0); - if (unescape_text (context, - context->partial_chunk->str, - context->partial_chunk->str + - context->partial_chunk->len, - &unescaped, - error)) + if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) && + (is_ascii || text_validate (context, context->partial_chunk->str, + context->partial_chunk->len, error))) { /* success, advance past quote and set state. */ - context->attr_values[context->cur_attr] = g_string_free (unescaped, FALSE); + context->attr_values[context->cur_attr] = context->partial_chunk; + context->partial_chunk = NULL; advance_char (context); context->state = STATE_BETWEEN_ATTRIBUTES; context->start = NULL; @@ -1576,30 +1363,25 @@ g_markup_parse_context_parse (GMarkupParseContext *context, if (context->iter != context->current_text_end) { - GString *unescaped = NULL; + gboolean is_ascii; /* The text has ended at the open angle. Call the text * callback. */ - if (unescape_text (context, - context->partial_chunk->str, - context->partial_chunk->str + - context->partial_chunk->len, - &unescaped, - error)) + if (unescape_gstring_inplace (context, context->partial_chunk, &is_ascii, error) && + (is_ascii || text_validate (context, context->partial_chunk->str, + context->partial_chunk->len, error))) { GError *tmp_error = NULL; if (context->parser->text) (*context->parser->text) (context, - unescaped->str, - unescaped->len, + context->partial_chunk->str, + context->partial_chunk->len, context->user_data, &tmp_error); - g_string_free (unescaped, TRUE); - if (tmp_error == NULL) { /* advance past open angle and set state. */ @@ -1618,7 +1400,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context, case STATE_AFTER_CLOSE_TAG_SLASH: /* Possible next state: INSIDE_CLOSE_TAG_NAME */ - if (is_name_start_char (context->iter)) + if (!IS_COMMON_NAME_END_CHAR (*(context->iter))) { context->state = STATE_INSIDE_CLOSE_TAG_NAME; @@ -1656,13 +1438,9 @@ g_markup_parse_context_parse (GMarkupParseContext *context, if (context->iter != context->current_text_end) { - gchar *close_name; + GString *close_name; - /* The name has ended. Combine it with the partial chunk - * if any; check that it matches stack top and pop - * stack; invoke proper callback; enter next state. - */ - close_name = g_string_free (context->partial_chunk, FALSE); + close_name = context->partial_chunk; context->partial_chunk = NULL; if (*context->iter != '>') @@ -1676,7 +1454,7 @@ g_markup_parse_context_parse (GMarkupParseContext *context, "the close element name '%s'; the allowed " "character is '>'"), utf8_str (context->iter, buf), - close_name); + close_name->str); } else if (context->tag_stack == NULL) { @@ -1685,16 +1463,16 @@ g_markup_parse_context_parse (GMarkupParseContext *context, G_MARKUP_ERROR_PARSE, _("Element '%s' was closed, no element " "is currently open"), - close_name); + close_name->str); } - else if (strcmp (close_name, current_element (context)) != 0) + else if (strcmp (close_name->str, current_element (context)) != 0) { set_error (context, error, G_MARKUP_ERROR_PARSE, _("Element '%s' was closed, but the currently " "open element is '%s'"), - close_name, + close_name->str, current_element (context)); } else @@ -1710,22 +1488,18 @@ g_markup_parse_context_parse (GMarkupParseContext *context, tmp_error = NULL; if (context->parser->end_element) (* context->parser->end_element) (context, - close_name, + close_name->str, context->user_data, &tmp_error); ensure_no_outstanding_subparser (context); - - /* Pop the tag stack */ - g_free (context->tag_stack->data); - context->tag_stack = g_slist_delete_link (context->tag_stack, - context->tag_stack); + pop_tag (context); if (tmp_error) propagate_error (context, error, tmp_error); } - - g_free (close_name); + context->partial_chunk = close_name; + truncate_partial (context); } break; @@ -1784,14 +1558,22 @@ g_markup_parse_context_parse (GMarkupParseContext *context, if (context->flags & G_MARKUP_TREAT_CDATA_AS_TEXT && strncmp (context->partial_chunk->str, "parser->text) + if (context->parser->text && + text_validate (context, + context->partial_chunk->str + 9, + context->partial_chunk->len - 12, + error)) (*context->parser->text) (context, context->partial_chunk->str + 9, context->partial_chunk->len - 12, context->user_data, &tmp_error); } - else if (context->parser->passthrough) + else if (context->parser->passthrough && + text_validate (context, + context->partial_chunk->str, + context->partial_chunk->len, + error)) (*context->parser->passthrough) (context, context->partial_chunk->str, context->partial_chunk->len, @@ -1999,7 +1781,6 @@ G_CONST_RETURN GSList * g_markup_parse_context_get_element_stack (GMarkupParseContext *context) { g_return_val_if_fail (context != NULL, NULL); - return context->tag_stack; } -- 1.6.1.3