Skip to content

Commit 46d5e4d

Browse files
Add dynamic estimated size for the token array, XML attribute fixes
1 parent 17f796f commit 46d5e4d

File tree

6 files changed

+104
-21
lines changed

6 files changed

+104
-21
lines changed

README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,13 +313,19 @@ The echttp library provides functions to handle JSON and XML data: a small JSON
313313
These parsers support UTF-8 only.
314314

315315
```
316+
int echttp_json_estimate (const char *json);
316317
const char *echttp_json_parse (char *json, ParserToken *token, int *count);
317318
```
318-
Parses the provided JSON string and populate the array of token. The content of the string is modified during the parsing. The variable pointed by count must contain the size of the token array before the call, and is set to the actual number of JSON items found by the parser. The parser return a null pointer on success, or an error message on failure. The error message container is a static buffer and it thus overwritten on the next call.
319+
Parses the provided JSON string and populates the array of tokens. The content of the string is modified during the parsing. The variable pointed to by count must contain the size of the token array before the call, and is set to the actual number of JSON items found by the parser (even on error). The parser returns a null pointer on success, or an error message on failure. The error message container is a static buffer and is thus overwritten on the next call.
320+
321+
The token array must be large enough to hold all the tokens found, or else an error is returned. The `echttp_json_estimate()` function calculates an estimated size for the token array that should be sufficient. If the same token array is used multiple times, it is recommended to allocate an initial size large enough for most common cases and rely on the estimate only to protect against larger data sets. This avoids thrashing the heap with frequent re-allocations.
319322
```
323+
int echttp_xml_estimate (const char *xml);
320324
const char *echttp_xml_parse (char *xml, ParserToken *token, int *count);
321325
```
322-
Parses the provided XML string and populate the array of token. The content of the string is modified during the parsing. The variable pointed by count must contain the size of the token array before the call, and is set to the actual number of JSON items found by the parser. The parser return a null pointer on success, or an error message on failure. The error message container is a static buffer and it thus overwritten on the next call.
326+
Parses the provided XML string and populates the array of tokens. The content of the string is modified during the parsing. The variable pointed to by count must contain the size of the token array before the call, and is set to the actual number of XML items found by the parser (even on error). The parser returns a null pointer on success, or an error message on failure. The error message container is a static buffer and is thus overwritten on the next call.
327+
328+
The `echttp_xml_estimate()` function is the XML equivalent of the `echttp_json_estimate()` function.
323329

324330
The ParserToken type is defined as follows:
325331
```

echttp_json.c

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
* but is a totally independent implementation, using recursive descent
2727
* instead of a state machine.
2828
*
29+
* int echttp_json_estimate (const char *json);
30+
*
2931
* const char *echttp_json_parse (char *json, ParserToken *token, int *count);
3032
*
3133
* Decode a JSON string and return a list of tokens. The decoding breaks
@@ -423,6 +425,27 @@ static const char *echttp_json_object (ParserContext context) {
423425
return "object processing error";
424426
}
425427

428+
/* Estimate the number of tokens needed to parse the given JSON text.
 *
 * json: a null-terminated JSON string (not modified).
 *
 * Returns a token count that is intended to be sufficient for the parser.
 * The result is always at least 1, so that an array allocated from this
 * estimate can hold token[0] even when the input is empty or is a single
 * top-level value with no ',', ']' or '}' in it.
 */
int echttp_json_estimate (const char *json) {
    // This method of counting does not escape the literal strings content
    // and does not account for a ',' after an object or array: it might
    // overestimate the number of tokens needed. This is OK because
    // we are looking for enough space, not for the smallest space.
    //
    // Start at 1 (not 0): the parser initializes token[0] before it looks
    // at the input, so a zero-sized token array is never acceptable.
    int count = 1;
    for (;;) {
        switch (*(json++)) {
        case ']':
        case '}':
            count += 2; // the item before, plus the object/array.
            break;
        case ',':
            count += 1; // the item before the separator.
            break;
        case 0:
            return count;
        default:
            break; // any other character does not delimit a token.
        }
    }
}
448+
426449
void echttp_json_enable_debug (void) {
427450
echttp_json_debug = 1;
428451
}
@@ -443,6 +466,7 @@ const char *echttp_json_parse (char *json, ParserToken *token, int *count) {
443466
context.max = *count;
444467

445468
echttp_json_error_text[0] = 0;
469+
*count = 0;
446470

447471
token[0].key = 0;
448472

echttp_json.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
void echttp_json_enable_debug (void);
1212

13+
int echttp_json_estimate (const char *json);
1314
const char *echttp_json_parse (char *json, ParserToken *token, int *count);
1415

1516
int echttp_json_search (const ParserToken *parent, const char *path);

echttp_print.c

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@ static char *inbuffer = 0;
4141
static char *outbuffer = 0;
4242
static int outbuffer_size = 0;
4343

44-
#define PRINT_MAX 20480
45-
4644

4745
static void print_tokens (ParserToken *token, int count) {
4846
int i;
@@ -90,11 +88,14 @@ int main (int argc, const char **argv) {
9088
int size;
9189
int count;
9290
int show_tokens = 0;
93-
int xml_input = 0;
94-
int consume_xml = 0;
91+
int force_xml = 0; // default is JSON.
92+
int xml_input;
9593
int pretty = PRINT_OPTION_PRETTY;
94+
int silent = 0;
9695
const char *error;
97-
ParserToken token[PRINT_MAX];
96+
ParserToken *token = 0;
97+
int max = 0;
98+
int estimated;
9899

99100
for (i = 1; i < argc; ++i) {
100101
if (strcmp (argv[i], "-d") == 0) {
@@ -107,14 +108,18 @@ int main (int argc, const char **argv) {
107108
continue;
108109
}
109110
if (strcmp (argv[i], "-x") == 0) {
110-
consume_xml = 1;
111+
force_xml = 1; // Force XML no matter what.
111112
continue;
112113
}
113114
if (strcmp (argv[i], "-r") == 0) {
114115
pretty = 0;
115116
continue;
116117
}
117-
xml_input = consume_xml;
118+
if (strcmp (argv[i], "-s") == 0) {
119+
silent = 1;
120+
continue;
121+
}
122+
xml_input = force_xml;
118123
if (strstr(argv[i], ".xml")) xml_input = 1;
119124

120125
inbuffer = echttp_parser_load (argv[i]);
@@ -128,26 +133,44 @@ int main (int argc, const char **argv) {
128133
outbuffer = (char *) realloc (outbuffer, outbuffer_size);
129134
}
130135

131-
count = PRINT_MAX;
132-
if (consume_xml)
136+
if (xml_input) {
137+
int estimated = echttp_xml_estimate (inbuffer);
138+
printf ("// File %s: estimated %d XML tokens\n", argv[i], estimated);
139+
if (estimated > max) {
140+
token = realloc (token, estimated * sizeof(*token));
141+
max = estimated;
142+
}
143+
count = max;
133144
error = echttp_xml_parse (inbuffer, token, &count);
134-
else
145+
} else {
146+
int estimated = echttp_json_estimate (inbuffer);
147+
printf ("// File %s: estimated %d JSON tokens\n", argv[i], estimated);
148+
if (estimated > max) {
149+
token = realloc (token, estimated * sizeof(*token));
150+
max = estimated;
151+
}
152+
count = max;
135153
error = echttp_json_parse (inbuffer, token, &count);
154+
}
136155
if (error) {
137-
fprintf (stderr, "Cannot decode %s: %s\n", argv[i], error);
156+
fprintf (stderr,
157+
"%s: error after %d tokens, %s\n", argv[i], count, error);
138158
continue;
139159
}
140160
if (show_tokens) print_tokens (token, count);
141161
printf ("// File %s (%d characters, %d tokens)\n",
142162
argv[i], size, count);
143163

144-
error = echttp_json_format (token, count, outbuffer, outbuffer_size, pretty);
145-
if (error) {
146-
fprintf (stderr, "Cannot format: %s: %s\n", argv[i], error);
147-
continue;
164+
if (!silent) {
165+
error = echttp_json_format
166+
(token, count, outbuffer, outbuffer_size, pretty);
167+
if (error) {
168+
fprintf (stderr, "Cannot format: %s: %s\n", argv[i], error);
169+
continue;
170+
}
171+
printf ("%s", outbuffer);
172+
echttp_parser_free (inbuffer);
148173
}
149-
printf ("%s", outbuffer);
150-
echttp_parser_free (inbuffer);
151174
}
152175
}
153176

echttp_xml.c

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
* but is a totally independent implementation, using recursive descent
2727
* instead of a state machine.
2828
*
29+
* int echttp_xml_estimate (const char *xml);
30+
*
2931
* const char *echttp_xml_parse (char *xml, ParserToken *token, int *count);
3032
*
3133
* Decode a XML string and return a list of tokens. The decoding breaks
@@ -209,6 +211,10 @@ static const char *echttp_xml_tagname (ParserContext context, int index) {
209211
return 0;
210212
}
211213

214+
/* Tell whether a character may appear inside an XML attribute name.
 *
 * Accepted characters are ASCII letters and digits (per isalnum() in the
 * current locale), plus '_', ':' and '-'.
 *
 * Returns 1 if the character is accepted, 0 otherwise.
 */
static int echttp_xml_valid_attribute (char c) {
    // The <ctype.h> functions require a value representable as an
    // unsigned char (or EOF): passing a negative plain char, e.g. a byte
    // of a UTF-8 sequence where char is signed, is undefined behavior.
    return isalnum((unsigned char)c) || c == '_' || c == ':' || c == '-';
}
217+
212218
static const char *echttp_xml_attributes (ParserContext context, int parent) {
213219
char *xml = context->xml;
214220
ParserToken *token = context->token;
@@ -234,7 +240,7 @@ static const char *echttp_xml_attributes (ParserContext context, int parent) {
234240
token[this_attribute].key = xml + context->cursor;
235241
do {
236242
context->cursor += 1;
237-
} while (isalnum(xml[context->cursor]));
243+
} while (echttp_xml_valid_attribute(xml[context->cursor]));
238244
if (xml[context->cursor] != '=') return "invalid attributes syntax";
239245
xml[context->cursor] = 0;
240246
context->cursor += 1;
@@ -311,6 +317,27 @@ static const char *echttp_xml_element (ParserContext context, int parent) {
311317
return echttp_xml_content (context, index);
312318
}
313319

320+
/* Estimate the number of tokens needed to parse the given XML text.
 *
 * xml: a null-terminated XML string (not modified).
 *
 * The scan is purely lexical: it does not skip the content of literal
 * strings, it counts one token per '=' (taken as the sign of an attribute)
 * and two tokens per '>', i.e. 4 per start/end tag pair instead of the
 * maximum of 3. Overestimating is fine because the goal is a sufficient
 * size, not the smallest possible one.
 *
 * Returns the estimated token count, which is at least 1.
 */
int echttp_xml_estimate (const char *xml) {
    int needed = 1; // Implicit outer object.
    const char *cursor;

    for (cursor = xml; *cursor != 0; ++cursor) {
        if (*cursor == '=') {
            needed += 1; // One token for the attribute.
        } else if (*cursor == '>') {
            needed += 2; // Adds up to 4 for a start + end tag pair.
        }
    }
    return needed;
}
340+
314341
const char *echttp_xml_parse (char *xml, ParserToken *token, int *count) {
315342

316343
const char *error;
@@ -327,6 +354,7 @@ const char *echttp_xml_parse (char *xml, ParserToken *token, int *count) {
327354
context.max = *count;
328355

329356
echttp_xml_error_text[0] = 0;
357+
*count = 0;
330358

331359
token[0].key = 0;
332360

@@ -364,6 +392,7 @@ const char *echttp_xml_parse (char *xml, ParserToken *token, int *count) {
364392
break;
365393
default: return "probably not XML data";
366394
}
395+
*count = context.count;
367396

368397
if (! error) {
369398
if (next_word(&context) != 0) error = "data left at the end of input";
@@ -375,7 +404,6 @@ const char *echttp_xml_parse (char *xml, ParserToken *token, int *count) {
375404
error, context.line_count, context.cursor-context.line_start);
376405
return echttp_xml_error_text;
377406
}
378-
*count = context.count;
379407
return 0;
380408
}
381409

echttp_xml.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@
1010

1111
void echttp_xml_enable_debug (void);
1212

13+
int echttp_xml_estimate (const char *xml);
1314
const char *echttp_xml_parse (char *xml, ParserToken *token, int *count);
1415

0 commit comments

Comments
 (0)