1#include <limits.h>
2#include <stdio.h>
3#include <stdlib.h>
4#include <string.h>
5
6#include <myaw.h>
7#include <pwlib/ctype.h>
8#include <pwlib/parsers.h>
9
10#define DEFAULT_LINE_CAPACITY 250
11
#ifdef TRACE_ENABLED
    // Nesting depth of traced calls; each level indents trace output by four spaces.
    static unsigned tracelevel = 0;

    // Write the indentation for the current trace level to stderr.
    // Expands to a complete statement, so call sites use it without a trailing semicolon.
#   define _TRACE_INDENT() \
        for (unsigned i = 0; i < tracelevel * 4; i++) { \
            fputc(' ', stderr); \
        }

    // Write indentation followed by function name, line number and block indent.
    // Relies on a variable named `parser` being in scope at the expansion site.
#   define _TRACE_POS() \
        _TRACE_INDENT() \
        fprintf(stderr, "%s; line %u, block indent %u", \
                __func__, parser->line_number, parser->block_indent);

    // Log the current position, open a brace and go one trace level deeper.
#   define TRACE_ENTER() \
        do { \
            _TRACE_POS() \
            fputs(" {\n", stderr); \
            tracelevel++; \
        } while (false)

    // Go one trace level up and log the closing brace.
#   define TRACE_EXIT() \
        do { \
            tracelevel--; \
            _TRACE_INDENT() \
            fputs("}\n", stderr); \
        } while (false)

    // Log the current position on a line of its own.
#   define TRACEPOINT() \
        do { \
            _TRACE_POS() \
            fputc('\n', stderr); \
        } while (false)

    // Log a printf-style message prefixed with the function name.
#   define TRACE(...) \
        do { \
            _TRACE_INDENT() \
            fprintf(stderr, "%s: ", __func__); \
            fprintf(stderr, __VA_ARGS__); \
            fputc('\n', stderr); \
        } while (false)
#else
    // Tracing disabled: all trace macros expand to nothing.
#   define TRACEPOINT()
#   define TRACE_ENTER()
#   define TRACE_EXIT()
#   define TRACE(...)
#endif
58
59// forward declarations
60[[nodiscard]] static bool parse_value(MwParser* parser, unsigned* nested_value_pos, PwValuePtr convspec, PwValuePtr result);
61[[nodiscard]] static bool value_parser_func(MwParser* parser, PwValuePtr result);
62[[nodiscard]] static bool parse_raw_value(MwParser* parser, PwValuePtr result);
63[[nodiscard]] static bool parse_literal_string(MwParser* parser, PwValuePtr result);
64[[nodiscard]] static bool parse_folded_string(MwParser* parser, PwValuePtr result);
65[[nodiscard]] static bool parse_datetime(MwParser* parser, PwValuePtr result);
66[[nodiscard]] static bool parse_timestamp(MwParser* parser, PwValuePtr result);
67
68static char32_t number_terminators[] = { MW_COMMENT, ':', 0 };
69
70
71MwParser* mw_create_parser(PwValuePtr markup)
72{
73 MwParser* parser = allocate(sizeof(MwParser), true);
74 if (!parser) {
75 pw_set_status(PwOOM());
76 return nullptr;
77 }
78 parser->markup = pw_clone(markup);
79
80 parser->blocklevel = 1;
81 parser->max_blocklevel = MW_MAX_RECURSION_DEPTH;
82
83 parser->json_depth = 1;
84 parser->max_json_depth = MW_MAX_RECURSION_DEPTH;
85
86 parser->skip_comments = true;
87
88 if (!pw_create_empty_string(DEFAULT_LINE_CAPACITY, 1, &parser->current_line)) {
89 goto error;
90 }
91 if (!pw_map_va(&parser->custom_parsers,
92 PwString("raw"), PwPtr((void*) parse_raw_value),
93 PwString("literal"), PwPtr((void*) parse_literal_string),
94 PwString("folded"), PwPtr((void*) parse_folded_string),
95 PwString("datetime"), PwPtr((void*) parse_datetime),
96 PwString("timestamp"), PwPtr((void*) parse_timestamp),
97 PwString("json"), PwPtr((void*) _mw_json_parser_func)
98 )) {
99 goto error;
100 }
101 if (!pw_start_read_lines(markup)) {
102 goto error;
103 }
104 return parser;
105
106error:
107 mw_delete_parser(&parser);
108 return nullptr;
109}
110
111void mw_delete_parser(MwParser** parser_ptr)
112{
113 MwParser* parser = *parser_ptr;
114 *parser_ptr = nullptr;
115 pw_destroy(&parser->markup);
116 pw_destroy(&parser->current_line);
117 pw_destroy(&parser->custom_parsers);
118 release((void**) &parser, sizeof(MwParser));
119}
120
121[[nodiscard]] bool mw_set_custom_parser(MwParser* parser, char* convspec, MwBlockParserFunc parser_func)
122{
123 PwValue key = PwStaticString(convspec);
124 PwValue value = PwPtr((void*) parser_func);
125 return pw_map_update(&parser->custom_parsers, &key, &value);
126}
127
128static inline bool have_custom_parser(MwParser* parser, PwValuePtr convspec)
129{
130 return pw_map_has_key(&parser->custom_parsers, convspec);
131}
132
133static inline MwBlockParserFunc get_custom_parser(MwParser* parser, PwValuePtr convspec)
134{
135 PwValue parser_func = PW_NULL;
136 if (!pw_map_get(&parser->custom_parsers, convspec, &parser_func)) {
137 return nullptr;
138 }
139 return (MwBlockParserFunc) (parser_func.func_ptr);
140}
141
142bool _mw_end_of_block()
143{
144 PwValuePtr status = ¤t_task->status;
145 return (status->type_id == PwTypeId_Status)
146 && (status->kind == PwStatusKind_Basic)
147 && (status->status_code == MweEndOfBlock);
148}
149
150static inline bool end_of_line(PwValuePtr str, unsigned position)
151/*
152 * Return true if position is beyond end of line.
153 */
154{
155 return !pw_string_index_valid(str, position);
156}
157
158static inline bool isspace_or_eol_at(PwValuePtr str, unsigned position)
159{
160 if (end_of_line(str, position)) {
161 return true;
162 } else {
163 return pw_isspace(pw_char_at(str, position));
164 }
165}
166
167[[nodiscard]] static bool read_line(MwParser* parser)
168/*
169 * Read line into parser->current line and strip trailing spaces.
170 */
171{
172 if (!pw_read_line_inplace(&parser->markup, &parser->current_line)) {
173 return false;
174 }
175 // strip trailing spaces
176 if (!pw_string_rstrip(&parser->current_line)) {
177 return false;
178 }
179 // measure indent
180 parser->current_indent = pw_string_skip_spaces(&parser->current_line, 0);
181
182 // set current_line
183 parser->line_number = pw_get_line_number(&parser->markup);
184
185 return true;
186}
187
188static inline bool is_comment_line(MwParser* parser)
189/*
190 * Return true if current line starts with MW_COMMENT char.
191 */
192{
193 return pw_char_at(&parser->current_line, parser->current_indent) == MW_COMMENT;
194}
195
196[[nodiscard]] bool _mw_read_block_line(MwParser* parser)
197{
198 TRACEPOINT();
199
200 if (parser->eof) {
201 if (parser->blocklevel) {
202 // continue returning this for nested blocks
203 pw_set_status(PwStatus(MweEndOfBlock));
204 } else {
205 pw_set_status(PwStatus(PweEOF));
206 }
207 return false;
208 }
209 for (;;) {
210 if (!read_line(parser)) {
211 PwValue status = pw_get_status();
212 if (pw_is_eof(&status)) {
213 parser->eof = true;
214 pw_destroy(&parser->current_line);
215 pw_set_status(PwStatus(MweEndOfBlock));
216 }
217 return false;
218 }
219 if (parser->skip_comments) {
220 // skip empty lines too
221 if (pw_strlen(&parser->current_line) == 0) {
222 continue;
223 }
224 if (is_comment_line(parser)) {
225 continue;
226 }
227 parser->skip_comments = false;
228 }
229 if (pw_strlen(&parser->current_line) == 0) {
230 // return empty line as is
231 return true;
232 }
233 if (parser->current_indent >= parser->block_indent) {
234 // indentation is okay, return line
235 return true;
236 }
237 // unindent detected
238 if (is_comment_line(parser)) {
239 // skip unindented comments
240 continue;
241 }
242 TRACE("unindent");
243 // end of block
244 if (!pw_unread_line(&parser->markup, &parser->current_line)) {
245 return false;
246 }
247 if (!pw_string_truncate(&parser->current_line, 0)) {
248 return false;
249 }
250 pw_set_status(PwStatus(MweEndOfBlock));
251 return false;
252 }
253}
254
255[[nodiscard]] bool _mw_read_block(MwParser* parser, PwValuePtr result)
256{
257 TRACEPOINT();
258
259 if (!pw_create(PwTypeId_BasicArray, result)) {
260 return false;
261 }
262 for (;;) {
263 // append line
264 PwValue line = PW_NULL;
265 if (!pw_substr(&parser->current_line, parser->block_indent, UINT_MAX, &line)) {
266 return false;
267 }
268 if (!pw_array_append(result, &line)){
269 return false;
270 }
271 // read next line
272 if (!_mw_read_block_line(parser)) {
273 if (_mw_end_of_block()) {
274 return true;
275 }
276 return false;
277 }
278 }
279}
280
281[[nodiscard]] static bool parse_nested_block(MwParser* parser, unsigned block_pos,
282 MwBlockParserFunc parser_func, PwValuePtr result)
283/*
284 * Set block indent to `block_pos` and call parser_func.
285 */
286{
287 if (parser->blocklevel >= parser->max_blocklevel) {
288 mw_exception(parser->line_number, parser->current_indent, "Too many nested blocks");
289 return false;
290 }
291
292 // start nested block
293 parser->blocklevel++;
294 unsigned saved_block_indent = parser->block_indent;
295 parser->block_indent = block_pos;
296
297 TRACE_ENTER();
298
299 // call parser function
300 bool ret = parser_func(parser, result);
301
302 // end nested block
303 parser->block_indent = saved_block_indent;
304 parser->blocklevel--;
305
306 TRACE_EXIT();
307 return ret;
308}
309
310[[nodiscard]] static bool parse_nested_block_from_next_line(MwParser* parser,
311 MwBlockParserFunc parser_func, PwValuePtr result)
312/*
313 * Read next line, set block indent to current indent plus one, and call parser_func.
314 */
315{
316 TRACEPOINT();
317 TRACE("new block_pos %u", parser->block_indent + 1);
318
319 // temporarily increment block indent by one and read next line
320 parser->block_indent++;
321 parser->skip_comments = true;
322 bool ret = _mw_read_block_line(parser);
323 parser->block_indent--;
324
325 if (!ret) {
326 if (_mw_end_of_block()) {
327 mw_exception(parser->line_number, parser->current_indent, "Empty block");
328 }
329 return false;
330 }
331
332 // call parse_nested_block
333 return parse_nested_block(parser, parser->block_indent + 1, parser_func, result);
334}
335
336unsigned _mw_get_start_position(MwParser* parser)
337{
338 if (parser->block_indent < parser->current_indent) {
339 return parser->current_indent;
340 } else {
341 return pw_string_skip_spaces(&parser->current_line, parser->block_indent);
342 }
343}
344
345bool _mw_comment_or_end_of_line(MwParser* parser, unsigned position)
346{
347 position = pw_string_skip_spaces(&parser->current_line, position);
348 return (end_of_line(&parser->current_line, position)
349 || pw_char_at(&parser->current_line, position) == MW_COMMENT);
350}
351
352[[nodiscard]] static bool parse_convspec(MwParser* parser, unsigned opening_colon_pos,
353 unsigned* end_pos, PwValuePtr result)
354/*
355 * Extract conversion specifier starting from `opening_colon_pos` in the `current_line`.
356 *
357 * On success return string and write `end_pos`.
358 *
359 * If conversion specified is not detected, return PwNull()
360 */
361{
362 // make result Null
363 pw_destroy(result);
364
365 PwValuePtr current_line = &parser->current_line;
366
367 unsigned start_pos = opening_colon_pos + 1;
368 unsigned closing_colon_pos;
369 if (!pw_strchr(current_line, ':', start_pos, &closing_colon_pos)) {
370 return true;
371 }
372 if (closing_colon_pos == start_pos) {
373 // empty conversion specifier
374 return true;
375 }
376 if (!isspace_or_eol_at(current_line, closing_colon_pos + 1)) {
377 // not a conversion specifier
378 return true;
379 }
380 PwValue convspec = PW_NULL;
381 if (!pw_substr(current_line, start_pos, closing_colon_pos, &convspec)) {
382 return false;
383 }
384 if (!pw_string_strip(&convspec)) {
385 return false;
386 }
387 if (!have_custom_parser(parser, &convspec)) {
388 // such a conversion specifier is not defined
389 return true;
390 }
391 *end_pos = closing_colon_pos + 1;
392 pw_move(result, &convspec);
393 return true;
394}
395
396[[nodiscard]] static bool parse_raw_value(MwParser* parser, PwValuePtr result)
397{
398 TRACEPOINT();
399
400 PwValue lines = PW_NULL;
401 if (!_mw_read_block(parser, &lines)) {
402 return false;
403 }
404 if (pw_array_length(&lines) > 1) {
405 // append one empty line for ending line break
406 PwValue empty_line = PW_STRING("");
407 if (!pw_array_append(&lines, &empty_line)) {
408 return false;
409 }
410 }
411 // return concatenated lines
412 return pw_array_join(&lines, '\n', result);
413}
414
415[[nodiscard]] static bool parse_literal_string(MwParser* parser, PwValuePtr result)
416/*
417 * Parse current block as a literal string.
418 */
419{
420 TRACEPOINT();
421
422 PwValue lines = PW_NULL;
423 if (!_mw_read_block(parser, &lines)) {
424 return false;
425 }
426
427 // normalize list of lines
428
429 if (!pw_dedent(&lines)) {
430 return false;
431 }
432
433 // drop empty trailing lines
434 unsigned len = pw_array_length(&lines);
435 while (len--) {
436 PwValue line = PW_NULL;
437 if (!pw_array_item(&lines, len, &line)) {
438 return false;
439 }
440 if (pw_strlen(&line) != 0) {
441 break;
442 }
443 if (!pw_array_del(&lines, len, len + 1)) {
444 return false;
445 }
446 }
447
448 // append one empty line for ending line break
449 if (pw_array_length(&lines) > 1) {
450 PwValue empty_line = PW_STRING("");
451 if (!pw_array_append(&lines, &empty_line)) {
452 return false;
453 }
454 }
455
456 // return concatenated lines
457 return pw_array_join(&lines, '\n', result);
458}
459
460[[nodiscard]] bool _mw_unescape_line(MwParser* parser, PwValuePtr line, unsigned line_number,
461 char32_t quote, unsigned start_pos, unsigned end_pos, PwValuePtr result)
462{
463 if (!pw_create_empty_string(end_pos - start_pos, // unescaped string can be shorter
464 line->str_params.char_size, result)) {
465 return false;
466 }
467 unsigned pos = start_pos;
468 while (pos < end_pos) {
469 char32_t chr = pw_char_at(line, pos);
470 if (chr == quote) {
471 // closing quotation mark detected
472 break;
473 }
474 if (chr != '\\') {
475 if (!pw_string_append(result, chr)) {
476 return false;
477 }
478 } else {
479 // start of escape sequence
480 pos++;
481 if (pos >= end_pos) {
482 if (!pw_string_append(result, chr)) { // leave backslash in the result
483 return false;
484 }
485 }
486 int hexlen;
487 chr = pw_char_at(line, pos);
488 switch (chr) {
489
490 // Simple escape sequences
491 case '\'': // \' single quote byte 0x27
492 case '"': // \" double quote byte 0x22
493 case '?': // \? question mark byte 0x3f
494 case '\\': // \\ backslash byte 0x5c
495 if (!pw_string_append(result, chr)) { return false; };
496 break;
497 case 'a': if (!pw_string_append(result, 0x07)) { return false; } break; // audible bell
498 case 'b': if (!pw_string_append(result, 0x08)) { return false; } break; // backspace
499 case 'f': if (!pw_string_append(result, 0x0c)) { return false; } break; // form feed
500 case 'n': if (!pw_string_append(result, 0x0a)) { return false; } break; // line feed
501 case 'r': if (!pw_string_append(result, 0x0d)) { return false; } break; // carriage return
502 case 't': if (!pw_string_append(result, 0x09)) { return false; } break; // horizontal tab
503 case 'v': if (!pw_string_append(result, 0x0b)) { return false; } break; // vertical tab
504
505 // Numeric escape sequences
506 case 'o': {
507 // \on{1:3} code unit n... (1-3 octal digits)
508 char32_t v = 0;
509 for (int i = 0; i < 3; i++) {
510 pos++;
511 if (pos >= end_pos) {
512 if (i == 0) {
513 mw_exception(parser->line_number, pos, "Incomplete octal value");
514 return false;
515 }
516 break;
517 }
518 char32_t c = pw_char_at(line, pos);
519 if ('0' <= c && c <= '7') {
520 v <<= 3;
521 v += c - '0';
522 } else {
523 mw_exception(parser->line_number, pos, "Bad octal value");
524 return false;
525 }
526 }
527 if (!pw_string_append(result, v)) { return false; }
528 break;
529 }
530 case 'x':
531 // \xn{2} code unit n... (exactly 2 hexadecimal digits are required)
532 hexlen = 2;
533 goto parse_hex_value;
534
535 // Unicode escape sequences
536 case 'u':
537 // \un{4} code point U+n... (exactly 4 hexadecimal digits are required)
538 hexlen = 4;
539 goto parse_hex_value;
540 case 'U':
541 // \Un{8} code point U+n... (exactly 8 hexadecimal digits are required)
542 hexlen = 8;
543
544 parse_hex_value: {
545 char32_t v = 0;
546 for (int i = 0; i < hexlen; i++) {
547 pos++;
548 if (pos >= end_pos) {
549 mw_exception(parser->line_number, pos, "Incomplete hexadecimal value");
550 return false;
551 }
552 char32_t c = pw_char_at(line, pos);
553 if ('0' <= c && c <= '9') {
554 v <<= 4;
555 v += c - '0';
556 } else if ('a' <= c && c <= 'f') {
557 v <<= 4;
558 v += c - 'a' + 10;
559 } else if ('A' <= c && c <= 'F') {
560 v <<= 4;
561 v += c - 'A' + 10;
562 } else {
563 mw_exception(parser->line_number, pos, "Bad hexadecimal value");
564 return false;
565 }
566 }
567 if (!pw_string_append(result, v)) { return false; }
568 break;
569 }
570 default:
571 // not a valid escape sequence
572 if (!pw_string_append(result, '\\')) { return false; }
573 if (!pw_string_append(result, chr)) { return false; }
574 break;
575 }
576 }
577 pos++;
578 }
579 return true;
580}
581
582[[nodiscard]] static bool fold_lines(MwParser* parser, PwValuePtr lines,
583 char32_t quote, PwValuePtr line_numbers, PwValuePtr result)
584/*
585 * Fold list of lines and return concatenated string.
586 *
587 * If `quote` is nonzero, unescape lines.
588 */
589{
590 if (!pw_dedent(lines)) {
591 return false;
592 }
593 unsigned len = pw_array_length(lines);
594
595 // skip leading empty lines
596 unsigned start_i = 0;
597 for (; start_i < len; start_i++) {
598 PwValue line = PW_NULL;
599 if (!pw_array_item(lines, start_i, &line)) {
600 return false;
601 }
602 if (pw_strlen(&line) != 0 && !pw_string_isspace(&line)) {
603 break;
604 }
605 }
606 if (start_i == len) {
607 // return empty string
608 pw_destroy(result);
609 *result = PwString("");
610 return true;
611 }
612
613 // skip trailing empty lines
614 unsigned end_i = len;
615 for (; end_i; end_i--) {
616 PwValue line = PW_NULL;
617 if (!pw_array_item(lines, end_i - 1, &line)) {
618 return false;
619 }
620 if (pw_strlen(&line) != 0 && !pw_string_isspace(&line)) {
621 break;
622 }
623 }
624 if (end_i == 0) {
625 // return empty string
626 pw_destroy(result);
627 *result = PwString("");
628 return true;
629 }
630
631 // calculate length of result
632 unsigned result_len = end_i - start_i - 1; // reserve space for separators
633 uint8_t char_size = 1;
634 for (unsigned i = start_i; i < end_i; i++) {
635 PwValue line = PW_NULL;
636 if (!pw_array_item(lines, i, &line)) {
637 return false;
638 }
639 result_len += pw_strlen(&line);
640 uint8_t cs = line.str_params.char_size;
641 if (cs > char_size) {
642 char_size = cs;
643 }
644 }
645
646 // allocate result
647 if (!pw_create_empty_string(result_len, char_size, result)) {
648 return false;
649 }
650
651 // concatenate lines
652 bool prev_LF = false;
653 for (unsigned i = start_i; i < end_i; i++) {
654 PwValue line = PW_NULL;
655 if (!pw_array_item(lines, i, &line)) {
656 return false;
657 }
658 if (i > start_i) {
659 if (pw_strlen(&line) == 0) {
660 // treat empty lines as LF
661 if (!pw_string_append(result, '\n')) {
662 return false;
663 }
664 prev_LF = true;
665 } else {
666 if (prev_LF) {
667 // do not append separator if previous line was empty
668 prev_LF = false;
669 } else {
670 if (pw_isspace(pw_char_at(&line, 0))) {
671 // do not append separator if the line aleady starts with space
672 } else {
673 if (!pw_string_append(result, ' ')) {
674 return false;
675 }
676 }
677 }
678 }
679 }
680 if (quote) {
681 PwValue line_number = PW_NULL;
682 if (!pw_array_item(line_numbers, i, &line_number)) {
683 return false;
684 }
685 PwValue unescaped = PW_NULL;
686 if (!_mw_unescape_line(parser, &line, line_number.unsigned_value, quote, 0, pw_strlen(&line), &unescaped)) {
687 return false;
688 }
689 if (!pw_string_append(result, &unescaped)) {
690 return false;
691 }
692 } else {
693 if (!pw_string_append(result, &line)) {
694 return false;
695 }
696 }
697 }
698 return true;
699}
700
701[[nodiscard]] static bool parse_folded_string(MwParser* parser, PwValuePtr result)
702{
703 TRACEPOINT();
704
705 PwValue lines = PW_NULL;
706 if (!_mw_read_block(parser, &lines)) {
707 return false;
708 }
709 return fold_lines(parser, &lines, 0, nullptr, result);
710}
711
712bool _mw_find_closing_quote(PwValuePtr line, char32_t quote, unsigned start_pos, unsigned* end_pos)
713{
714 for (;;) {
715 if (!pw_strchr(line, quote, start_pos, end_pos)) {
716 return false;
717 }
718 // check if the quotation mark is not escaped
719 if (*end_pos && pw_char_at(line, *end_pos - 1) == '\\') {
720 // continue searching
721 start_pos = *end_pos + 1;
722 } else {
723 return true;
724 }
725 }
726}
727
728[[nodiscard]] static bool parse_quoted_string(MwParser* parser, unsigned opening_quote_pos,
729 unsigned* end_pos, PwValuePtr result)
730/*
731 * Parse quoted string starting from `opening_quote_pos` in the current line.
732 *
733 * Write next position after the closing quotation mark to `end_pos`.
734 */
735{
736 TRACEPOINT();
737
738 // Get opening quote. The closing quote should be the same.
739 char32_t quote = pw_char_at(&parser->current_line, opening_quote_pos);
740
741 // process first line
742 unsigned closing_quote_pos;
743 if (_mw_find_closing_quote(&parser->current_line, quote, opening_quote_pos + 1, &closing_quote_pos)) {
744 // single-line string
745 *end_pos = closing_quote_pos + 1;
746 return _mw_unescape_line(parser, &parser->current_line, parser->line_number,
747 quote, opening_quote_pos + 1, closing_quote_pos, result);
748 }
749
750 unsigned block_indent = opening_quote_pos + 1;
751
752 // make parser read nested block
753 unsigned saved_block_indent = parser->block_indent;
754 parser->block_indent = block_indent;
755 parser->blocklevel++;
756
757 // read block
758 PwValue lines = PW_NULL;
759 if (!pw_create(PwTypeId_BasicArray, &lines)) {
760 return false;
761 }
762 PwValue line_numbers = PW_NULL;
763 if (!pw_create(PwTypeId_BasicArray, &line_numbers)) {
764 return false;
765 }
766 bool closing_quote_detected = false;
767 for (;;) {
768 // append line number
769 PwValue n = PwUnsigned(parser->line_number);
770 if (!pw_array_append(&line_numbers, &n)) {
771 return false;
772 }
773 // append line
774 if (_mw_find_closing_quote(&parser->current_line, quote, block_indent, end_pos)) {
775 // final line
776 PwValue final_line = PW_NULL;
777 if (!pw_substr(&parser->current_line, block_indent, *end_pos, &final_line)) {
778 return false;
779 }
780 // strip trailing spaces
781 if (!pw_string_rstrip(&final_line)) {
782 return false;
783 }
784 if (!pw_array_append(&lines, &final_line)) {
785 return false;
786 }
787 (*end_pos)++;
788 closing_quote_detected = true;
789 break;
790 } else {
791 // intermediate line
792 PwValue line = PW_NULL;
793 if (!pw_substr(&parser->current_line, block_indent, UINT_MAX, &line)) {
794 return false;
795 }
796 if (!pw_array_append(&lines, &line)) {
797 return false;
798 }
799 }
800 // read next line
801 if (!_mw_read_block_line(parser)) {
802 if (_mw_end_of_block()) {
803 break;
804 }
805 return false;
806 }
807 }
808
809 // finished reading nested block
810 parser->block_indent = saved_block_indent;
811 parser->blocklevel--;
812
813 if (!closing_quote_detected) {
814
815 static char unterminated[] = "String has no closing quote";
816
817 // the above loop terminated abnormally, need to read next line
818 if (!_mw_read_block_line(parser)) {
819 if (_mw_end_of_block()) {
820 mw_exception(parser->line_number, parser->current_indent, unterminated);
821 }
822 return false;
823 }
824 // check if the line starts with a quote with the same indent as the opening quote
825 if (parser->current_indent == opening_quote_pos
826 && pw_char_at(&parser->current_line, parser->current_indent) == quote) {
827
828 *end_pos = opening_quote_pos + 1;
829 } else {
830 mw_exception(parser->line_number, parser->current_indent, unterminated);
831 return false;
832 }
833 }
834
835 // fold and unescape
836 return fold_lines(parser, &lines, quote, &line_numbers, result);
837}
838
839[[nodiscard]] static bool parse_datetime(MwParser* parser, PwValuePtr result)
840/*
841 * Parse value date/time starting from block indent in the current line.
842 */
843{
844 static char32_t allowed_terminators[] = { MW_COMMENT, 0 };
845
846 unsigned start_pos = _mw_get_start_position(parser);
847 unsigned end_pos;
848 if (!_pw_parse_datetime(&parser->current_line, start_pos, &end_pos, allowed_terminators, result)) {
849 if (pw_is_basic_error(¤t_task->status, PweBadDatetime)) {
850 mw_exception(parser->line_number, start_pos, "Bad date/time");
851 }
852 return false;
853 }
854 if (!_mw_comment_or_end_of_line(parser, end_pos)) {
855 mw_exception(parser->line_number, start_pos, "Bad date/time");
856 return false;
857 }
858 return true;
859}
860
861[[nodiscard]] static bool parse_timestamp(MwParser* parser, PwValuePtr result)
862/*
863 * Parse value as timestamp starting from block indent in the current line.
864 */
865{
866 static char32_t allowed_terminators[] = { MW_COMMENT, 0 };
867
868 unsigned start_pos = _mw_get_start_position(parser);
869 unsigned end_pos;
870 if (!_pw_parse_timestamp(&parser->current_line, start_pos, &end_pos, allowed_terminators, result)) {
871 if (pw_is_basic_error(¤t_task->status, PweBadTimestamp)
872 || pw_is_basic_error(¤t_task->status, PweNumericOverflow)) {
873 mw_exception(parser->line_number, start_pos, "Bad timestamp");
874 }
875 return false;
876 }
877 if (!_mw_comment_or_end_of_line(parser, end_pos)) {
878 mw_exception(parser->line_number, end_pos, "Bad timestamp");
879 return false;
880 }
881 return true;
882}
883
884[[nodiscard]] bool _mw_parse_number(MwParser* parser, unsigned start_pos, int sign,
885 unsigned* end_pos, char32_t* allowed_terminators, PwValuePtr result)
886{
887 TRACEPOINT();
888 TRACE("start_pos %u", start_pos);
889
890 if (!_pw_parse_num_str(&parser->current_line, start_pos, sign, end_pos, allowed_terminators, false, result)) {
891 if (pw_is_basic_error(¤t_task->status, PweBadNumber)
892 || pw_is_basic_error(¤t_task->status, PweNumericOverflow)) {
893 mw_exception(parser->line_number, start_pos, "Bad number");
894 }
895 return false;
896 }
897 return true;
898}
899
900[[nodiscard]] static bool parse_list(MwParser* parser, PwValuePtr result)
901/*
902 * Parse list.
903 */
904{
905 TRACE_ENTER();
906
907 if (!pw_create(PwTypeId_BasicArray, result)) {
908 return false;
909 }
910
911 /*
912 * All list items must have the same indent.
913 * Save indent of the first item (current one) and check it for subsequent items.
914 */
915 unsigned item_indent = _mw_get_start_position(parser);
916
917 for (;;) {
918 {
919 // check if hyphen is followed by space or end of line
920 unsigned next_pos = item_indent + 1;
921 if (!isspace_or_eol_at(&parser->current_line, next_pos)) {
922 mw_exception(parser->line_number, item_indent, "Bad list item");
923 return false;
924 }
925
926 // parse item as a nested block
927
928 PwValue item = PW_NULL;
929 if (_mw_comment_or_end_of_line(parser, next_pos)) {
930 if (!parse_nested_block_from_next_line(parser, value_parser_func, &item)) {
931 return false;
932 }
933 } else {
934 // nested block starts on the same line, increment block position
935 next_pos++;
936 if (!parse_nested_block(parser, next_pos, value_parser_func, &item)) {
937 return false;
938 }
939 }
940 if (!pw_array_append(result, &item)) {
941 return false;
942 }
943 if (!_mw_read_block_line(parser)) {
944 if (_mw_end_of_block()) {
945 break;
946 }
947 return false;
948 }
949 if (parser->current_indent != item_indent) {
950 mw_exception(parser->line_number, parser->current_indent, "Bad indentation of list item");
951 return false;
952 }
953 }
954 }
955 TRACE_EXIT();
956 return true;
957}
958
959[[nodiscard]] static bool parse_map(MwParser* parser, PwValuePtr first_key,
960 PwValuePtr convspec_arg, unsigned value_pos, PwValuePtr result)
961/*
962 * Parse map.
963 *
964 * Key is already parsed, continue parsing from `value_pos` in the `current_line`.
965 */
966{
967 TRACE_ENTER();
968
969 if (!pw_create(PwTypeId_BasicMap, result)) {
970 TRACE_EXIT();
971 return false;
972 }
973
974 PwValue key = pw_clone(first_key);
975 PwValue convspec = pw_clone(convspec_arg);
976
977 /*
978 * All keys in the map must have the same indent.
979 * Save indent of the first key (current one) and check it for subsequent keys.
980 */
981 unsigned key_indent = _mw_get_start_position(parser);
982
983 for (;;) {
984 TRACE("parse value (line %u) from position %u", parser->line_number, value_pos);
985 {
986 // parse value as a nested block
987
988 MwBlockParserFunc parser_func = value_parser_func;
989 if (pw_is_string(&convspec)) {
990 parser_func = get_custom_parser(parser, &convspec);
991 }
992 PwValue value = PW_NULL;
993 if (_mw_comment_or_end_of_line(parser, value_pos)) {
994 if (!parse_nested_block_from_next_line(parser, parser_func, &value)) {
995 TRACE_EXIT();
996 return false;
997 }
998 } else {
999 if (!parse_nested_block(parser, value_pos, parser_func, &value)) {
1000 TRACE_EXIT();
1001 return false;
1002 }
1003 }
1004 if (!pw_map_update(result, &key, &value)) {
1005 TRACE_EXIT();
1006 return false;
1007 }
1008 }
1009 TRACE("parse next key");
1010 {
1011 pw_destroy(&key);
1012 pw_destroy(&convspec);
1013
1014 if (!_mw_read_block_line(parser)) {
1015 if (_mw_end_of_block()) {
1016 TRACE("end of map");
1017 break;
1018 }
1019 TRACE_EXIT();
1020 return false;
1021 }
1022 if (parser->current_indent != key_indent) {
1023 mw_exception(parser->line_number, parser->current_indent, "Bad indentation of map key");
1024 TRACE_EXIT();
1025 return false;
1026 }
1027 if (!parse_value(parser, &value_pos, &convspec, &key)) {
1028 TRACE_EXIT();
1029 return false;
1030 }
1031 }
1032 }
1033 TRACE_EXIT();
1034 return true;
1035}
1036
1037[[nodiscard]] static bool is_kv_separator(MwParser* parser, unsigned colon_pos,
1038 PwValuePtr convspec_out, unsigned *value_pos, bool* result)
1039/*
1040 * Set result true if colon_pos is followed by end of line, space, or conversion specifier.
1041 * Write conversion specifier to `convspec_out` if value is followed by conversion specifier.
1042 * Write position of value to value_pos.
1043 */
1044{
1045 PwValuePtr current_line = &parser->current_line;
1046
1047 unsigned next_pos = colon_pos + 1;
1048
1049 if (end_of_line(current_line, next_pos)) {
1050 *value_pos = next_pos;
1051 *result = true;
1052 return true;
1053 }
1054 char32_t chr = pw_char_at(current_line, next_pos);
1055 if (pw_isspace(chr)) {
1056 *value_pos = next_pos + 1; // value should be separated from key by at least one space
1057 next_pos = pw_string_skip_spaces(current_line, next_pos);
1058 // cannot be end of line here because current line is R-stripped and EOL is already checked
1059 chr = pw_char_at(current_line, next_pos);
1060 if (chr != ':') {
1061 // separator without conversion specifier
1062 *result = true;
1063 return true;
1064 }
1065 } else if (chr != ':') {
1066 // key not followed immediately by conversion specifier -> not a separator
1067 *result = false;
1068 return true;
1069 }
1070
1071 // try parsing conversion specifier
1072 // value_pos will be updated only if conversion specifier is valid
1073 PwValue convspec = PW_NULL;
1074 if (!parse_convspec(parser, next_pos, value_pos, &convspec)) {
1075 return false;
1076 }
1077 if (pw_is_string(&convspec)) {
1078 if (convspec_out) {
1079 pw_move(convspec_out, &convspec);
1080 }
1081 *result = true;
1082 return true;
1083 }
1084
1085 // bad conversion specifier -> not a separator
1086 *result = false;
1087 return true;
1088}
1089
1090[[nodiscard]] static bool check_value_end(MwParser* parser, PwValuePtr value, unsigned end_pos,
1091 unsigned* nested_value_pos, PwValuePtr convspec_out, PwValuePtr result)
1092/*
1093 * Helper function for parse_value.
1094 *
1095 * Check if value ends with key-value separator and parse map.
1096 * If not, check if end_pos points to end of line or comment.
1097 *
1098 * If `nested_value_pos` is provided, the value is _expected_ to be a map key
1099 * and _must_ end with key-value separator.
1100 * The position of the next char after colon is stored in it
1101 * and conversion specifier, if any, is stored in `convspec_out`.
1102 *
1103 * Read next line if nothing to parse on the current_line.
1104 *
1105 * If there's no nested map to parse, return cloned value.
1106 */
1107{
1108 end_pos = pw_string_skip_spaces(&parser->current_line, end_pos);
1109 if (end_of_line(&parser->current_line, end_pos)) {
1110 if (nested_value_pos) {
1111 mw_exception(parser->line_number, end_pos, "Map key expected");
1112 return false;
1113 }
1114 // read next line
1115 if (!_mw_read_block_line(parser)) {
1116 if (!_mw_end_of_block()) {
1117 return false;
1118 }
1119 }
1120 pw_clone2(result, value);
1121 return true;
1122 }
1123
1124 char32_t chr = pw_char_at(&parser->current_line, end_pos);
1125 if (chr == ':') {
1126 // check key-value separator
1127 PwValue convspec = PW_NULL;
1128 unsigned value_pos;
1129 bool kvs;
1130 if (!is_kv_separator(parser, end_pos, &convspec, &value_pos, &kvs)) {
1131 return false;
1132 }
1133 if (kvs) {
1134 // found key-value separator
1135 if (nested_value_pos) {
1136 // the separator was expected, just return the value
1137 *nested_value_pos = value_pos;
1138 pw_move(convspec_out, &convspec);
1139 pw_clone2(result, value);
1140 return true;
1141 }
1142 // parse map
1143 PwValue first_key = pw_clone(value);
1144 return parse_map(parser, &first_key, &convspec, value_pos, result);
1145 }
1146 mw_exception(parser->line_number, end_pos + 1, "Bad character encountered");
1147 return false;
1148 }
1149
1150 if (chr != MW_COMMENT) {
1151 mw_exception(parser->line_number, end_pos, "Bad character encountered");
1152 return false;
1153 }
1154
1155 // read next line
1156 if (!_mw_read_block_line(parser)) {
1157 if (!_mw_end_of_block()) {
1158 return false;
1159 }
1160 }
1161 pw_clone2(result, value);
1162 return true;
1163}
1164
1165[[nodiscard]] static bool parse_value(MwParser* parser, unsigned* nested_value_pos,
1166 PwValuePtr convspec_out, PwValuePtr result)
1167/*
1168 * Parse value starting from `current_line[block_indent]` .
1169 *
1170 * If `nested_value_pos` is provided, the value is _expected_ to be a map key
1171 * and _must_ end with colon or include a colon if it's a literal strings.
1172 *
1173 * On success return parsed value.
1174 * If `nested_value_pos' is provided, write position of the next char after colon to it
1175 * and write conversion specifier to `convspec_out` if it's followed by conversion specifier.
1176 *
1177 * On error return status and set `parser->result["error"]`.
1178 */
1179{
1180 TRACEPOINT();
1181
1182 unsigned start_pos = _mw_get_start_position(parser);
1183
1184 // Analyze first character.
1185 char32_t chr = pw_char_at(&parser->current_line, start_pos);
1186
1187 // first, check if value starts with colon that may denote conversion specifier
1188
1189 if (chr == ':') {
1190 // this might be conversion specifier
1191 if (nested_value_pos) {
1192 // we expect map key, and map keys cannot start with colon
1193 // because they would look same as conversion specifier
1194 mw_exception(parser->line_number, start_pos, "Map key expected and it cannot start with colon");
1195 return false;
1196 }
1197 unsigned value_pos;
1198 PwValue convspec =PW_NULL;
1199 if (!parse_convspec(parser, start_pos, &value_pos, &convspec)) {;
1200 return false;
1201 }
1202 if (!pw_is_string(&convspec)) {
1203 // not a conversion specifier
1204 return parse_literal_string(parser, result);
1205 }
1206 // we have conversion specifier
1207 if (end_of_line(&parser->current_line, value_pos)) {
1208
1209 // conversion specifier is followed by LF
1210 // continue parsing CURRENT block from next line
1211 if (!_mw_read_block_line(parser)) {
1212 if (_mw_end_of_block()) {
1213 mw_exception(parser->line_number, parser->current_indent, "Empty block");
1214 }
1215 return false;
1216 }
1217 // call parser function
1218 MwBlockParserFunc parser_func = get_custom_parser(parser, &convspec);
1219 return parser_func(parser, result);
1220
1221 } else {
1222 // value is on the same line, parse it as nested block
1223 return parse_nested_block(
1224 parser, value_pos, get_custom_parser(parser, &convspec), result
1225 );
1226 }
1227 }
1228
1229 // other values can be map keys
1230
1231 // check for dash
1232
1233 if (chr == '-') {
1234 unsigned next_pos = start_pos + 1;
1235 char32_t next_chr = pw_char_at(&parser->current_line, next_pos);
1236
1237 // if followed by digit, it's a number
1238 if ('0' <= next_chr && next_chr <= '9') {
1239 unsigned end_pos;
1240 PwValue number = PW_NULL;
1241 if (!_mw_parse_number(parser, next_pos, -1, &end_pos, number_terminators, &number)) {
1242 return false;
1243 }
1244 return check_value_end(parser, &number, end_pos, nested_value_pos, convspec_out, result);
1245 }
1246 // if followed by space or end of line, that's a list item
1247 if (isspace_or_eol_at(&parser->current_line, next_pos)) {
1248 if (nested_value_pos) {
1249 mw_exception(parser->line_number, start_pos, "Map key expected and it cannot be a list");
1250 return false;
1251 }
1252 // yes, it's a list item
1253 return parse_list(parser, result);
1254 }
1255 // otherwise, it's a literal string or map
1256 goto parse_literal_string_or_map;
1257 }
1258
1259 // check for quoted string
1260
1261 if (chr == '"' || chr == '\'') {
1262 // quoted string
1263 unsigned start_line = parser->line_number;
1264 unsigned end_pos;
1265 PwValue str = PW_NULL;
1266 if (!parse_quoted_string(parser, start_pos, &end_pos, &str)) {
1267 return false;
1268 }
1269 unsigned end_line = parser->line_number;
1270 if (end_line == start_line) {
1271 // single-line string can be a map key
1272 return check_value_end(parser, &str, end_pos, nested_value_pos, convspec_out, result);
1273 } else if (_mw_comment_or_end_of_line(parser, end_pos)) {
1274 // multi-line string cannot be a key
1275 pw_move(result, &str);
1276 return true;
1277 } else {
1278 mw_exception(parser->line_number, end_pos, "Bad character after quoted string");
1279 return false;
1280 }
1281 }
1282
1283 // check for reserved keywords
1284
1285 TRACE("trying reserved keywords");
1286 if (pw_substring_eq(&parser->current_line, start_pos, start_pos + 4, "null")) {
1287 PwValue null_value = PW_NULL;
1288 return check_value_end(parser, &null_value, start_pos + 4, nested_value_pos, convspec_out, result);
1289 }
1290 if (pw_substring_eq(&parser->current_line, start_pos, start_pos + 4, "true")) {
1291 PwValue true_value = PW_BOOL(true);
1292 return check_value_end(parser, &true_value, start_pos + 4, nested_value_pos, convspec_out, result);
1293 }
1294 if (pw_substring_eq(&parser->current_line, start_pos, start_pos + 5, "false")) {
1295 PwValue false_value = PW_BOOL(false);
1296 return check_value_end(parser, &false_value, start_pos + 5, nested_value_pos, convspec_out, result);
1297 }
1298
1299 // try parsing number
1300
1301 TRACE("not a keyword, trying number");
1302 if (chr == '+') {
1303 char32_t next_chr = pw_char_at(&parser->current_line, start_pos + 1);
1304 if ('0' <= next_chr && next_chr <= '9') {
1305 start_pos++;
1306 chr = next_chr;
1307 }
1308 }
1309 if ('0' <= chr && chr <= '9') {
1310 unsigned end_pos;
1311 PwValue number = PW_NULL;
1312 if (!_mw_parse_number(parser, start_pos, 1, &end_pos, number_terminators, &number)) {
1313 return false;
1314 }
1315 return check_value_end(parser, &number, end_pos, nested_value_pos, convspec_out, result);
1316 }
1317 TRACE("not a number, pasring literal string or map");
1318
1319parse_literal_string_or_map:
1320
1321 // look for key-value separator
1322 for (unsigned pos = start_pos;;) {
1323 unsigned colon_pos;
1324 if (!pw_strchr(&parser->current_line, ':', pos, &colon_pos)) {
1325 break;
1326 }
1327 PwValue convspec = PW_NULL;
1328 unsigned value_pos;
1329 bool kvs;
1330 if (!is_kv_separator(parser, colon_pos, &convspec, &value_pos, &kvs)) {
1331 return false;
1332 }
1333 if (kvs) {
1334 // found key-value separator, get key
1335 PwValue key = PW_NULL;
1336 if (!pw_substr(&parser->current_line, start_pos, colon_pos, &key)) {
1337 return false;
1338 }
1339 // strip trailing spaces
1340 if (!pw_string_rstrip(&key)) {
1341 return false;
1342 }
1343 if (nested_value_pos) {
1344 // key was anticipated, simply return it
1345 *nested_value_pos = value_pos;
1346 pw_move(convspec_out, &convspec);
1347 pw_move(result, &key);
1348 return true;
1349 }
1350 // parse map
1351 return parse_map(parser, &key, &convspec, value_pos, result);
1352 }
1353 pos = colon_pos + 1;
1354 }
1355
1356 // separator not found
1357
1358 if (nested_value_pos) {
1359 // expecting key, but it's a bare literal string
1360 mw_exception(parser->line_number, parser->current_indent, "Not a key");
1361 return false;
1362 }
1363 return parse_literal_string(parser, result);
1364}
1365
1366[[nodiscard]] static bool value_parser_func(MwParser* parser, PwValuePtr result)
1367{
1368 return parse_value(parser, nullptr, nullptr, result);
1369}
1370
1371[[nodiscard]] bool mw_parse(PwValuePtr markup, PwValuePtr result)
1372{
1373 [[ gnu::cleanup(mw_delete_parser) ]] MwParser* parser = mw_create_parser(markup);
1374 if (!parser) {
1375 return false;
1376 }
1377 // read first line to prepare for parsing and to detect EOF
1378 if (!_mw_read_block_line(parser)) {
1379 if (_mw_end_of_block() && parser->eof) {
1380 pw_set_status(PwStatus(PweEOF));
1381 }
1382 return false;
1383 }
1384
1385 // parse top-level value
1386 if (!value_parser_func(parser, result)) {
1387 return false;
1388 }
1389
1390 // make sure markup has no more data
1391 if (_mw_read_block_line(parser)) {
1392 mw_exception(parser->line_number, parser->current_indent, "Extra data after parsed value");
1393 return false;
1394 }
1395 if (_mw_end_of_block() && parser->eof) {
1396 // EOF means success
1397 pw_destroy(¤t_task->status);
1398 return true;
1399 }
1400 return false;
1401}