A powerful Parser Combinator library with error reporting and input rewriting capabilities
define QParse_RULES_USE_QT_FRAMEWORK when building with QT6 for QT6 specialization support, uses QString and friends
otherwise leave QParse_RULES_USE_QT_FRAMEWORK undefined to build with std (non-QT) support, uses std::string and friends
to add support for another framework, see framework_defines.h
#include <QParse/Rules_Extra.h>

we first start with an Iterator

using namespace QParse;
Iterator iter = "some string";

we then create a Grammar
first, let's get the obvious out of the way
QParse supports the creation of user defined Rule objects
a custom Rule object can be defined by extending from QParse::Rules::Rule (for a single rule), QParse::Rules::RuleHolder (for a collection of rules), or any of their subclasses
it is recommended to extend from Rule and store Rule input arguments in one or more RuleHolder variables, where the number of rules is known, or in a vector of RuleHolder if the number of rules is unknown
it is recommended to extend from RuleHolder directly if the input is a single rule
for a single input rule, either Rule or RuleHolder will do, but RuleHolder is recommended if possible
here is the rule for the custom If rule, implemented via Rule
#include "Rules.h"
namespace QParse {
namespace Rules {
using Condition = std::function<bool()>;
class If : public Rule {
Condition condition;
RuleHolder rule_if_true;
RuleHolder rule_if_false;
public:
If(Condition cond, Rule * rule_if_true) : condition(cond), rule_if_true(rule_if_true), rule_if_false(nullptr) {}
If(Condition cond, Rule * rule_if_true, Rule* rule_if_false) : condition(cond), rule_if_true(rule_if_true), rule_if_false(rule_if_false) {}
using Rule::match;
virtual std::optional<IteratorMatcher::MatchData> match(Iterator &iterator, UndoRedo *undo, bool doAction, bool logErrors = true) override {
IteratorMatcher::MatchData match;
match.begin = iterator.current();
match.end = iterator.current();
match.matched = false;
if (condition()) {
auto tmp_ = rule_if_true.match(iterator, undo, doAction, logErrors);
if (!tmp_.has_value()) return std::nullopt;
auto tmp = *tmp_;
match.matched = tmp.matched;
match.end = tmp.end;
match.matches += tmp.matches;
} else {
auto tmp_ = rule_if_false.match(iterator, undo, doAction, logErrors);
if (!tmp_.has_value()) return std::nullopt;
auto tmp = *tmp_;
match.matched = tmp.matched;
match.end = tmp.end;
match.matches += tmp.matches;
}
return match;
}
};
}
}

here is the rule for Optional, implemented via RuleHolder
// Optional rule: wraps a single rule (held by the RuleHolder base) and reports a
// successful match whether or not the wrapped rule matched.
struct Optional : RuleHolder {
    Optional(Rule * rule, Action action = NO_ACTION);
    using Rule::match;
    virtual std::optional<IteratorMatcher::MatchData> match(Iterator &iterator, UndoRedo *undo, bool doAction = true, bool logErrors = true) override;
};

// Forwards the wrapped rule and the action to RuleHolder.
QParse::Rules::Optional::Optional(Rule *rule, Action action) : RuleHolder(rule, action) {}

// Runs the wrapped rule. A std::nullopt from the wrapped rule is propagated
// unchanged; otherwise the result is always marked as matched, and it adopts the
// wrapped rule's `end` and `matches` only when the wrapped rule itself matched.
std::optional<QParse::IteratorMatcher::MatchData> QParse::Rules::Optional::match(Iterator &iterator, UndoRedo *undo, bool doAction, bool logErrors) {
    IteratorMatcher::MatchData result;
    result.begin = iterator.current();
    result.end = iterator.current();

    auto attempt = rule->match(iterator, undo, doAction, logErrors);
    if (!attempt) {
        return std::nullopt;
    }

    auto & inner = *attempt;
    if (inner) {
        result.end = inner.end;
        result.matches = inner.matches;
    }

    // the Optional itself always succeeds and contributes one more info entry
    result.matched = true;
    iterator.pushInfo();
    result.matches++;
    if (doAction) {
        action(Input(iterator, result, undo, result.matches));
    }
    return result;
}

and here is the rule for Or, implemented via Rule
// Or rule: holds any number of alternatives (each wrapped in a RuleHolder) and
// succeeds with the first alternative that matches.
struct Or : Rule {
    QParse_RULES____VECTOR<RuleHolder> rules;
    Or(std::initializer_list<Rule*> rules, Action action = NO_ACTION);
    using Rule::match;
    virtual std::optional<IteratorMatcher::MatchData> match(Iterator &iterator, UndoRedo *undo, bool doAction = true, bool logErrors = true) override;
};

// Stores every given rule, in order, in its own RuleHolder.
QParse::Rules::Or::Or(std::initializer_list<Rule *> rules, Action action) : Rule(action) {
    for (Rule * alternative : rules) {
        this->rules.push_back(alternative);
    }
}

// Tries the alternatives in order:
//  - with no alternatives at all, the rule succeeds without consuming input;
//  - a std::nullopt from an alternative aborts the whole Or with std::nullopt;
//  - the first alternative that matches is returned (after running the action);
//  - if none matches, the (unmatched) result of the last alternative is returned.
std::optional<QParse::IteratorMatcher::MatchData> QParse::Rules::Or::match(Iterator &iterator, UndoRedo *undo, bool doAction, bool logErrors) {
    IteratorMatcher::MatchData outcome;
    outcome.begin = iterator.current();
    outcome.end = iterator.current();
    outcome.matched = false;

    if (rules.size() == 0) {
        outcome.matched = true;
        iterator.pushInfo();
        outcome.matches++;
        if (doAction) {
            action(Input(iterator, outcome, undo, outcome.matches));
        }
        return outcome;
    }

    for (Rule & alternative : rules) {
        auto attempt = alternative.match(iterator, undo, doAction, logErrors);
        if (!attempt.has_value()) {
            iterator.popInfo(outcome.matches);
            return std::nullopt;
        }

        outcome = *attempt;
        if (!outcome) {
            // this alternative failed: discard what it pushed and try the next one
            iterator.popInfo(outcome.matches);
            outcome.matches = 0;
            continue;
        }

        if (doAction) {
            action(Input(iterator, outcome, undo, outcome.matches));
        }
        return outcome;
    }
    return outcome;
}

a Grammar is a set of Rule objects that define the Grammar Definition
all rule objects accept an ACTION in the form of the following lambda: [&] (QParse::Rules::Input input) { code_here(); }, or a std::function<void(QParse::Rules::Input)>
rule objects are sometimes called expressions as they express how you want to parse something
for example, lets parse "some string"
using namespace QParse;
Iterator iter = "some string";
Rules::String("some string").match(iter);

here, we match "some string" exactly, character for character
this is too easy, let's match it via an expression
we know it consists of alphabetical characters and a white space
for this we will use Rules::Sequence and Rules::Range
auto text = new Rules::OneOrMore(
new Rules::Range('a', 'z')
);
auto space = new Rules::Char(' ');
Rules::Sequence({ text, space, text }).match(iter);

here, we match against one or more range of a to z, followed by a space, followed by one or more range of a to z
you may notice we do not delete text or space, this is because each Rules:: takes a Rule* object, and manages its lifetime automatically
{
Rules::At(
new Rules::Char('a') // managed by At
);
}
// At is destructed
// Char gets deleted by the destructor
this is done via a hidden RuleHolder*
a RuleHolder object extends Rule, and manages the lifetime of a Rule object via reference counting
this makes it possible to use a Rule in multiple places without worrying about dangling references or use-after-free
all Rules:: that accept Rule* objects must store each Rule* object in a RuleHolder* object
this is done by simply extending RuleHolder or by extending Rule and then storing each Rule* inside a RuleHolder
a RuleHolder can only contain one Rule at a time
we will cover some advanced topics such as conditional expressions, stack expressions, error reporting, and input modification later
see above
see above
unless explicitly stated, all rules fail if they encounter EOF
fails the entire rule, except if inside an Or, Optional, or At rules
any further execution is halted, and the input stack (see Input Modification) is unwound
the name will be displayed for identifying iterators
the name can be changed via the iterator.name variable
the default name is unknown
same as Error but conditionally, actions are invoked conditionally, respectively
explicitly matches against EOF (succeeds if and ONLY if EOF is encountered)
matches if either \n or \r\n are encountered
matches if either Newline or EOF are encountered
self explanatory, the rule itself always succeeds, even at EOF
self explanatory, the rule itself always fails
accepts a single character 'c', use this for matching individual characters or hexadecimal
matches characters in a range
syntax is as follows
one or more pairs of [start, end]
may end with a single extra character, which is matched as if it were Or-ed with the ranges: Or([ range ... ], 'extra character')
accepts a string of characters "string", use this for matching individual strings or character/hexadecimal sequences
matches and consumes any input
advances the input by N characters, this is faster than Any as it can skip multiple characters in a single go
temporarily overrides the action of the specified rule, prevents all other actions from executing within the rule
TemporaryAction B1
TemporaryAction B2
TemporaryAction B3
only B1 will run, any Error rules will be logged but their actions will not be executed
executes the given rule but does not consume any input and does not execute any actions, no errors are logged
executes the given rule but does not consume any input and does not execute any actions, no errors are logged
fails if the given rule succeeds, succeeds if the given rule fails
always matches regardless of if the given Rule matches or not, any Error rules will execute as normal
various rules for logging information about the current given rule
prints the current character, does not consume input
combines all Log* into a single rule
keeps matching until the given rule fails to match, always succeeds
equivalent to Optional(OneOrMore(rule))
matches if the given rule matches at least once, and keeps matching until the given rule fails to match
matches rule B until rule A is matched
1. A is matched, if A succeeds, the rule succeeds
2. if A fails, B is matched
3. if B succeeds, go to 1
4. if B fails to match, the rule fails
matches if any of the given rules match, any Error rules will execute as normal
matches only if all of the given rules match
advances the input by 1 until the given rule matches
an if statement in the form of a Rule
syntax: if([&] { return CONDITION; }, RTrue, RFalse)
use this to conditionally execute rules based on conditional input that may be determined by executed actions or outside factors such as threads or input variables
a Rule stack, use setBase to set the base rule, and an optional action
use push to push to the top of the stack
use pop/popAll to pop from top of the stack
the top most rule is executed
it is an error to have a nullptr base
a powerful feature of QParse is its input modification and rescan capabilities
input modification and rescan are done via actions, specifically the Input object that is passed to every ACTION argument
this works based on captured input, if a rule succeeds, it will capture any input it sees, otherwise it will capture nothing
note that a capture of nothing is itself a valid capture
the input stream can be modified in the following ways:
rescans the input stream starting at the beginning of the capture
if you capture a, and the input is ab, then it will rescan a on the next rule, and then b on the next rule, depending on the rule
removes the captured input from the input stream and then rescans
replaces the captured input with a character or a string, then continues from the start of the next character at the end of the replacement
the replacement can be smaller or larger than the capture
if you capture ab and replace it with f, and the input is abcd, then it will transform the input into fcd and then continue at c on the next rule, and then d on the next rule, depending on the rule
same as doing replace and then rescan, with above, it will continue at f on the next rule, and then c on the next rule, depending on the rule
inserts a character or a string ahead of the captured input, then continues from the start of the inserted text
if you capture ab and insert f, and the input is abcd, then it will transform the input into abfcd and then continue at f on the next rule, and then c on the next rule, depending on the rule
same as doing insert and then rescan, with above, it will continue at a on the next rule, and then b on the next rule, and then f on the next rule, depending on the rule
// Parses a yes/no answer: y, ye, yes, n, no
// case insensitive
auto y_ = new Rules::Or({new Rules::Char('y'), new Rules::Char('Y')});
auto e_ = new Rules::Or({new Rules::Char('e'), new Rules::Char('E')});
auto s_ = new Rules::Or({new Rules::Char('s'), new Rules::Char('S')});
auto n_ = new Rules::Or({new Rules::Char('n'), new Rules::Char('N')});
auto o_ = new Rules::Or({new Rules::Char('o'), new Rules::Char('O')});

// Initialised so that an answer matching neither alternative yields a defined
// result (treated as "no") instead of returning an indeterminate value.
bool a = false;

// Or returns the first alternative that matches, so the longest spelling must be
// listed first; otherwise the single-letter form always wins and the longer
// sequences are never tried.
auto yes = new Rules::Or({
    new Rules::Sequence({y_, e_, s_}),
    new Rules::Sequence({y_, e_}),
    y_,
}, [&](Rules::Input) {
    a = true;
});
auto no = new Rules::Or({
    new Rules::Sequence({n_, o_}),
    n_,
}, [&](Rules::Input) {
    a = false;
});

Rules::Or({yes, no}).match(answer);
return a;

Iterator it = content;
// Example: splits the input into single-line comments, block comments, empty lines
// and ordinary lines, printing what was captured for comments and lines.

// "//", an optional single space, then everything up to a newline or EOF.
// At() does not consume input, so the newline is left for the trailing Optional.
// The action prints the text captured by the Until rule.
auto single_comment = new Rules::Sequence({
new Rules::String("//"),
new Rules::Optional(new Rules::Char(' ')),
new Rules::Until(new Rules::At(new Rules::NewlineOrEOF), [](QParse::Rules::Input i) {
std::cout << "comment: " << i.quotedString() << std::endl;
}),
new Rules::Optional(new Rules::Newline)
});
// "#COMMENT_BEGIN", then advance until a newline immediately followed by "#COMMENT_END".
// NOTE(review): whether Until consumes the terminating sequence is not visible here - confirm.
auto block_comment = new Rules::Sequence({
new Rules::String("#COMMENT_BEGIN"),
new Rules::Until(new Rules::Sequence({
new Rules::Newline(),
new Rules::String("#COMMENT_END")}
))
}, [](QParse::Rules::Input i) {
//std::cout << "block comment: " << i.quotedString() << std::endl;
});
// Any other line: everything up to a newline or EOF, then the newline if present.
// The action prints the captured text.
auto line = new Rules::Sequence({
new Rules::Until(new Rules::At(new Rules::NewlineOrEOF), [](QParse::Rules::Input i) {
std::cout << "line: " << i.quotedString() << std::endl;
}),
new Rules::Optional(new Rules::Newline)
});
// A line that contains only spaces and tabs before its newline.
auto empty_line = new Rules::Sequence({
new Rules::MatchBUntilA(new Rules::At(new Rules::Newline), new Rules::Or({new Rules::Char(' '), new Rules::Char('\t')})),
new Rules::Newline
});
// Keep matching one of the alternatives (tried in the listed order) until EOF is reached.
// The Error rule is the last alternative, so it is only tried when every other one failed.
if (!Rules::MatchBUntilA(new Rules::EndOfFile,
new Rules::Or({
single_comment,
block_comment,
empty_line,
line,
new Rules::Error("unexpected token")
}
)).match(it)) {
return -1;
}
std::string a = "../../wl_syscalls.decl";
auto content = readFile(a);
// State collected while parsing one declaration of the syscall file.
// The parser actions fill the fields in; a copy is stored per parsed declaration.
struct Info {
    // true between seeing the "typedef" keyword and the identifier that follows it;
    // default-initialised so a fresh Info never holds an indeterminate value
    bool is_typedef = false;
    // comment lines seen directly above the declaration
    std::vector<std::string> comment;
    // first identifier after "typedef" (empty for a plain declaration)
    std::string current_typedef;
    // name of the declared syscall (stored lower-cased by the parser action)
    std::string current_syscall;
    // raw argument declarations, or "..." for a variadic declaration
    std::string current_arguments;
    // argument count exactly as written in the file
    std::string current_argument_count;
    // raw argument-usage text
    std::string current_arguments_usages;
};
// Working state: `info` is filled in by the rule actions below, `syscalls` collects
// one Info per parsed declaration.
Info info;
info.is_typedef = false;
std::vector<Info> syscalls;
using namespace QParse;
Iterator it = content;
// A single blank character (space or tab) and a possibly empty run of them.
auto space = new Rules::Or({new Rules::Char(' '), new Rules::Char('\t')});
auto spaces = new Rules::ZeroOrMore(space);
// "//", optional blanks, then everything up to a newline or EOF.
// The captured text is appended to info.comment; the newline is consumed if present.
auto single_comment = new Rules::Sequence({
new Rules::String("//"),
spaces,
new Rules::Sequence({
new Rules::Until(new Rules::At(new Rules::NewlineOrEOF), [&](QParse::Rules::Input i) {
info.comment.push_back(i.string());
}),
new Rules::Optional(new Rules::Newline)
})
});
// "#COMMENT_BEGIN" on its own line, then lines until "#COMMENT_END" followed by a newline.
// Each line in between is appended to info.comment.
auto block_comment = new Rules::Sequence({
new Rules::String("#COMMENT_BEGIN"), new Rules::Newline(),
new Rules::MatchBUntilA(
new Rules::Sequence({
new Rules::String("#COMMENT_END"), new Rules::Newline()
}),
new Rules::Sequence({
new Rules::Until(new Rules::At(new Rules::NewlineOrEOF), [&](QParse::Rules::Input i) {
info.comment.push_back(i.string());
}),
new Rules::Optional(new Rules::Newline)
})
)
});
// C identifier: a letter or '_' followed by any number of letters, digits or '_'.
// NOTE(review): Rules_NS_LogTrace1 is defined elsewhere; presumably it wraps the rule
// with trace logging under the given name - confirm.
auto c_ident = Rules_NS_LogTrace1(new Rules::Sequence({
new Rules::Range({'a', 'z', 'A', 'Z', '_'}),
new Rules::Optional(new Rules::OneOrMore(new Rules::Range({'a', 'z', 'A', 'Z', '0', '9', '_'})))
}), "c_ident");
// One or more decimal digits.
auto number = Rules_NS_LogTrace1(new Rules::OneOrMore(new Rules::Range({'0', '9'})), "number");
// Either an identifier or a number.
auto c_value = Rules_NS_LogTrace1(new Rules::Or({c_ident, number}), "c_value");
// Matches a C identifier and records it lower-cased.
// While info.is_typedef is set, the identifier is stored as the typedef name and the
// flag is cleared; otherwise it is stored as the syscall name.
auto syscall = new Rules::TemporaryAction(c_ident, [&](QParse::Rules::Input i) {
    std::string sl = i.string();
    // std::tolower is overloaded (<cctype> and <locale>), so passing it by name to
    // std::transform can be ambiguous; the <cctype> version also requires its argument
    // to be representable as unsigned char, hence the lambda taking unsigned char.
    std::transform(sl.begin(), sl.end(), sl.begin(), [](unsigned char c) {
        return static_cast<char>(std::tolower(c));
    });
    if (!info.is_typedef) {
        info.current_syscall = sl;
    } else {
        info.current_typedef = sl;
        info.is_typedef = false;
    }
});
// Argument list of a declaration. Two forms are accepted after the opening '<':
//   1. numbered arguments:  <argc, arg declaration> <argument usage>
//   2. variadic or empty:   <...>  or  <>
// In both cases the rule ends by requiring a closing '>'.
auto arguments = new Rules::Sequence({
new Rules::Char('<'),
spaces,
new Rules::Or({
// form 1: numbered arguments
Rules_NS_LogTrace1(new Rules::Sequence({
// only attempt this form when the input is neither an immediate '>' that ends the
// line (the "<>" case) nor "..." (the variadic case)
Rules_NS_LogTrace(new Rules::NotAt(
new Rules::Or({
new Rules::Sequence({
new Rules::Char('>'),
spaces,
new Rules::Or({
single_comment,
new Rules::Newline()
})
}),
new Rules::String("...")
})
)),
// argument count; stored as text in info.current_argument_count
new Rules::ErrorIfNotMatch(new Rules::TemporaryAction(number, [&](Rules::Input i) { info.current_argument_count = i.string(); }), "expected an integer"),
new Rules::ErrorIfNotMatch(new Rules::Char(','), "expected comma after number of arguments"),
spaces,
// argument declarations: an identifier followed by everything up to optional blanks
// and '>'; the captured text is stored in info.current_arguments
Rules_NS_LogTrace1(new Rules::ErrorIfNotMatch(new Rules::Sequence({
c_ident,
new Rules::Until(new Rules::At(new Rules::Sequence({spaces, new Rules::Char('>')})))
}, [&](QParse::Rules::Input i) {
info.current_arguments = i.string();
}), "expected argument declarations, followed by closing '>'"), "argument declaration"),
spaces,
new Rules::ErrorIfNotMatch(new Rules::Char('>'), "expected closing '>'"),
spaces,
// second bracket group: the argument usages
new Rules::ErrorIfNotMatch(new Rules::Char('<'), "expected opening '<'"),
spaces,
// optional '&', a value, then everything up to the closing '>';
// the captured text is stored in info.current_arguments_usages
new Rules::ErrorIfNotMatch(new Rules::Sequence({
new Rules::Optional(new Rules::Char('&')),
c_value,
// we cannot use an Until rule here since we need to skip '->' if it occurs
Rules_NS_LogTrace1(new Rules::MatchBUntilA(
Rules_NS_LogTrace1(new Rules::At(new Rules::Char('>')), "at >"),
new Rules::Or({
new Rules::Sequence({
new Rules::At(new Rules::String("->")),
Rules_NS_LogTrace1(new Rules::String("->"), "ignore ->")
}),
new Rules::Any
})
), "argument usages"),
}, [&](QParse::Rules::Input i) { info.current_arguments_usages = i.string(); }), "expected argument usages, followed by closing '>'"),
}), "numbered arguments"),
// form 2: when a typedef name has been recorded, "..." is reported as an error;
// otherwise an optional "..." is accepted and stored in info.current_arguments
Rules_NS_LogTrace1(new Rules::If(
[&]() { return info.current_typedef.length() != 0; },
new Rules::ErrorIfMatch(
new Rules::At(new Rules::String("..."))
, "typedef declaration does not support varadic arguments (...)"
),
new Rules::Optional(
new Rules::String("...", [&](QParse::Rules::Input i) { info.current_arguments = i.string(); })
)
), "any number of arguments")
}),
spaces,
new Rules::ErrorIfNotMatch(new Rules::Char('>'), "expected closing '>'")
});
// End of a declaration line: a trailing single-line comment, a newline, or EOF
// (EOF is only checked via At, which does not consume input).
auto syscall_line_end = new Rules::Or({
single_comment,
new Rules::Newline,
new Rules::At(new Rules::EndOfFile)
});
// One declaration: the syscall name, an optional argument list, then the line end.
auto syscall_line = new Rules::Sequence({
syscall,
spaces,
new Rules::Optional(Rules_NS_LogTrace(arguments)),
spaces,
syscall_line_end,
});
// Either a plain declaration or a "typedef <from> <to> ..." declaration.
// Each alternative starts with an At() look-ahead that checks the overall shape of the
// line without consuming input or running actions, and only then parses it for real.
auto syscall_line__or__typedef_syscall_line = new Rules::Or({
// plain declaration: identifier followed by '<' or by the end of the line
new Rules::Sequence({
new Rules::At(new Rules::Sequence({
Rules_NS_LogTrace1(syscall, "at syscall"), spaces,
new Rules::Or({
Rules_NS_LogTrace1(new Rules::Char('<'), "at <"),
Rules_NS_LogTrace1(syscall_line_end, "at syscall line end")
})
})),
syscall_line
}),
// typedef declaration: "typedef", two identifiers, then '<' or the end of the line
new Rules::Sequence({
new Rules::At(new Rules::Sequence({
Rules_NS_LogTrace1(new Rules::String("typedef"), "at typedef"),
spaces,
Rules_NS_LogTrace1(syscall, "at syscall"), spaces,
Rules_NS_LogTrace1(syscall, "at syscall"), spaces,
new Rules::Or({
Rules_NS_LogTrace1(new Rules::Char('<'), "at <"),
Rules_NS_LogTrace1(syscall_line_end, "at syscall line end")
})
})),
// setting is_typedef makes the next `syscall` match store its identifier in
// info.current_typedef instead of info.current_syscall
new Rules::String("typedef", [&](QParse::Rules::Input i) { info.is_typedef = true; }),
spaces,
new Rules::ErrorIfNotMatch(syscall, "expected syscall"),
spaces,
syscall_line
}),
}, [&](Rules::Input i) {
// a declaration was parsed: keep a copy and reset the working state for the next one
syscalls.push_back(info);
info.is_typedef = false;
info.current_typedef.clear();
info.comment.clear();
info.comment.shrink_to_fit();
info.current_syscall.clear();
info.current_argument_count.clear();
info.current_arguments.clear();
info.current_arguments_usages.clear();
});
// A line containing only spaces and tabs (possibly none) before a newline or EOF.
// Its action discards the comments collected so far, so only comments directly above
// a declaration end up attached to it.
auto empty_line = new Rules::Sequence({
new Rules::MatchBUntilA(new Rules::At(new Rules::NewlineOrEOF), space),
new Rules::Optional(new Rules::Newline)
}, [&](QParse::Rules::Input i) {
info.comment.clear();
info.comment.shrink_to_fit();
});
// Parse the whole input: keep matching one of the line kinds (tried in the listed
// order) until EOF; an error is reported when none of them matches.
if (!Rules::MatchBUntilA(
new Rules::EndOfFile,
new Rules::ErrorIfNotMatch(
new Rules::Or({
single_comment,
block_comment,
empty_line,
Rules_NS_LogTrace(syscall_line__or__typedef_syscall_line),
}),
"expected a comment, empty line, typedef syscall, or a syscall declaration"
)
).match(it)) {
printf("failed to parse syscalls.decl\n");
return -1;
}
// A file without any declaration is not treated as an error.
if (syscalls.empty()) {
printf("no syscalls specified in syscalls.decl\n");
return 0;
}
parses the following
// syscall.decl
//
// file format:
// // comment, this can < contain anything % at @ all !
//
// #COMMENT_BEGIN
// this is a block comment
// this can < contain anything % at @ all !
// #COMMENT_END
//
// // syscall documentation goes directly above syscall
// // this can be
// // multiple comments
// #COMMENT_BEGIN
// or multiple
// #COMMENT_END
// #COMMENT_BEGIN
// block comments
// #COMMENT_END
// // or both
// syscall // you can even put documentation here! zero arguments example: foo
// syscall <> // zero arguments example: foo <>
// syscall <argc, arg declaration> <argument usage> // argc arguments, example: foo <2, int foo, float bar> <foo, bar>
// syscall <...> // up to 125 arguments of any type example: foo <...>
//
// // syscalls can also be typedef'd
// //
// // a typedef is like a mapping that maps the input of syscall_to to the input of syscall_from
//
// // can map to any syscall that can accept specified argument types
// // eg, struct IO arg -> int, int*
// // eg, struct IO arg -> arg.input, &arg.outputInt // passing arg.input would produce a compile error since it expects int* but we passed int
//
// // eg, struct IO arg -> int, float*
// // eg, struct IO arg -> arg.input, &arg.outputFloat // passing arg.outputInt or arg.input would produce a compile error since it expects float* but we passed int or int*
// //
// typedef syscall_from syscall_to <argc, arg declaration> <argument usage> // typedef argc arguments, example: typedef foo foo1 <2, foobar f> <f.foo, f.bar>
// typedef syscall_from syscall_to <...> // NOT SUPPORTED !!! IT IS IMPOSSIBLE TO RELIABLY TYPEDEF A VARIADIC ARGUMENT !