First version

This commit is contained in:
Андреев Григорий 2024-07-28 19:54:57 +03:00
commit b4381e9238
83 changed files with 7060 additions and 0 deletions

11
.gitignore vendored Normal file
View File

@ -0,0 +1,11 @@
# Never use CMAKE in production
CMakeLists.txt
cmake-build-debug/
# Output of build system
built/
# This is the compiled build system script
building/main
building/*.png
building/*.svg
.idea/

8
README.txt Normal file
View File

@ -0,0 +1,8 @@
libregexis024
Library for Regular Expressions, implementation of summer 2024
libregexis024vm
Provides only means of configuration and running my regexp virtual machine bytecode
libregexis024sol
Provides functions to compile regular expressions into libregexis024 virtual machine bytecode

9
building/build_build_system.sh Executable file
View File

@ -0,0 +1,9 @@
#!/bin/sh
# Compiles the build system driver building/main.cpp into building/main.
# Must be run from the project root (paths are relative to it).
BUILDING_DIR="./building"
[ -d "$BUILDING_DIR" ] || exit 1
MAIN_FILE="$BUILDING_DIR/main.cpp"
[ -f "$MAIN_FILE" ] || exit 1
# Stop right here if pkg-config cannot describe the build system package,
# instead of handing an empty flag list to the compiler.
COOL_FLAGS="$(pkg-config --cflags regexis024-build-system)" || exit 1
# COOL_FLAGS is intentionally left unquoted: it has to be split into separate flags.
g++ $COOL_FLAGS -o "$BUILDING_DIR/main" "$MAIN_FILE" || exit 1

156
building/main.cpp Normal file
View File

@ -0,0 +1,156 @@
#include <regexis024_build_system.h>
/*
* LIBREGEXIS024 SPECIFIC BUILD COMMANDS BEGIN
*/
struct Libregexis024BuildSystem {
/* Building runlevel */
BuildUnitsArray runlevel_1;
/* Installation runlevel */
BuildUnitsArray runlevel_2;
/* "debug" or "release" */
std::string build_type;
bool build_tests = false;
std::vector<std::string> warning_flags = {"-Wall", "-Wno-unused-variable", "-Werror=return-type","-pedantic",
"-Wno-unused-but-set-variable", "-Wno-reorder"};
std::vector<std::string> version_flags = {"--std", "c++14", "-D", "_POSIX_C_SOURCE=200809L"};
std::vector<std::string> debug_defines_release = {"_GLIBCXX_DEBUG"};
std::vector<std::string> debug_defines_debug = {"_GLIBCXX_DEBUG", "LIBREGEXIS024_DEBUG", "LIBREGEXIS024_ALLOW_LOUD"};
std::vector<std::string> opt_flags_release = {"-g", "-O2"};
std::vector<std::string> opt_flags_debug = {"-g", "-ggdb", "-O0"};
std::vector<std::string> getSomeRadFlags() {
std::vector<std::string> my_flag_collection;
gxx_add_cli_options(my_flag_collection, warning_flags);
gxx_add_cli_options(my_flag_collection, version_flags);
if (build_type == "release") {
gxx_add_cli_defines(my_flag_collection, debug_defines_release);
gxx_add_cli_options(my_flag_collection, opt_flags_release);
} else if (build_type == "debug") {
gxx_add_cli_defines(my_flag_collection, debug_defines_debug);
gxx_add_cli_options(my_flag_collection, opt_flags_debug);
}
return my_flag_collection;
}
Libregexis024BuildSystem(const std::string& build_type, const NormalCBuildSystemCommandMeaning& cmd)
:build_type(build_type)
{
ASSERT(build_type == "release" || build_type == "debug", "Unknown build type");
std::vector<ExternalLibraryTarget> ext_targets;
std::vector<CTarget> my_targets;
{
std::vector<std::string> compilation_units_release = {
"libregexis024vm/utils.cpp",
"libregexis024vm/vm_errno.cpp",
"libregexis024vm/vm_opcodes_disassembly.cpp",
"libregexis024vm/libregexis024vm_interface.cpp",
"libregexis024vm/libregexis024vm_disassembly.cpp",
"libregexis024vm/libregexis024vm_context.cpp",
"libregexis024vm/instruction_implementation.cpp",
"libregexis024vm/libregex024opcodes_stringification.cpp",
"libregexis024fa/codeset.cpp",
"libregexis024fa/colored_codeset.cpp",
"libregexis024fa/fa_first_stage_fix.cpp",
"libregexis024fa/finite_automaton.cpp",
"libregexis024fa/misc_fa_funcs.cpp",
"libregexis024fa/selarr_priority_table.cpp",
"libregexis024fa/tracking_fa_nodes.cpp",
"libregexis024fa/fa_make_deterministic.cpp",
"libregexis024fa/graph_to_bytecode/natural_compiler_utils.cpp",
"libregexis024fa/graph_to_bytecode/writing_commands.cpp",
"libregexis024fa/graph_to_bytecode/filter.cpp",
"libregexis024fa/graph_to_bytecode/fa_compiler.cpp",
"libregexis024fa/graph_to_bytecode/core.cpp",
"libregexis024sol/common_codesets.cpp",
"libregexis024sol/part_of_expr_that_tracks.cpp",
"libregexis024sol/expr_compiler.cpp",
"libregexis024sol/square_bracket_expression.cpp",
"libregexis024sol/sol_misc_base.cpp",
"libregexis024sol/command_expression.cpp",
"libregexis024sol/backslash_expression.cpp",
"libregexis024sol/subexpr_fa_transformed.cpp",
"libregexis024sol/expr_parse_functions/tracking_units.cpp",
"libregexis024sol/expr_parse_functions/ep_sequence.cpp",
"libregexis024sol/expr_parse_functions/command_recognition.cpp",
"libregexis024tools/stringmatching.cpp",
};
/* These are added to compilation_units_of_release */
std::vector<std::string> additional_compilation_units_debug = {
"debugging_regexis024/prettyprint/prettyprint_util.cpp",
"debugging_regexis024/vm/libregexis024vm_debug.cpp",
"debugging_regexis024/debug_through_graphviz.cpp",
};
/* Suitable forr both release and debug (even though you will pretty much never need to export headers of build of
* debug build type */
std::vector<std::string> exported_headers = {
"libregexis024vm/vm_errno.h",
"libregexis024vm/vm_opcodes_types.h",
"libregexis024vm/vm_opcodes.h",
"libregexis024vm/libregexis024vm_interface.h",
"libregexis024fa/tracking_variables.h",
"libregexis024sol/part_of_expr_that_tracks.h",
"libregexis024sol/expr_compiler.h",
"libregexis024tools/stringmatching.h",
};
CTarget T("libregexis024", "shared_library");
T.additional_compilation_flags = getSomeRadFlags();
array_concat(T.units, compilation_units_release);
if (build_type == "debug")
array_concat(T.units, additional_compilation_units_debug);
T.include_pr = "";
T.include_ir = "";
T.exported_headers = exported_headers;
T.installation_dir = "";
T.pc_output_path = "libregexis024.pc";
my_targets.push_back(T);
}
if (build_tests) {
CTarget T("libregexis024_test4", "executable");
T.additional_compilation_flags = getSomeRadFlags();
T.proj_deps = {CTargetDependenceOnProjectsLibrary("libregexis024")};
T.units = {"libregexis024test/test4.cpp"};
my_targets.push_back(T);
}
regular_ctargets_to_2bus_conversion(ext_targets, my_targets, runlevel_1, runlevel_2,
cmd.project_root, cmd.installation_root);
}
};
/* Entry point of the build script: interprets the command line, constructs the build graph
 * and executes the requested runlevels.
 * Returns 0 on success and 1 when the build system reports a failure, so that calling
 * scripts can detect a broken build through the exit status. */
int main(int argc, char** argv) {
    try {
        assert(argc > 0);
        /* argv[0] is the program name; only the remaining arguments are interpreted */
        std::vector<std::string> args(argv + 1, argv + argc);
        NormalCBuildSystemCommandMeaning cmd;
        regular_bs_cli_cmd_interpret(args, cmd);
        Libregexis024BuildSystem bs("debug", cmd);
        show_build_units_array_with_image_viewer(bs.runlevel_1, "true");
        show_build_units_array_with_image_viewer(bs.runlevel_2, "true");
        if (cmd.need_to_build)
            complete_tasks_of_build_units(bs.runlevel_1);
        if (cmd.need_to_install)
            complete_tasks_of_build_units(bs.runlevel_2);
    } catch (const buildSystemFailure& e) {
        printf("Build system failure\n""%s\n", e.toString().c_str());
        /* Previously the failure was printed but the process still exited with status 0 */
        return 1;
    }
    return 0;
}

View File

@ -0,0 +1,325 @@
#include <debugging_regexis024/debug_through_graphviz.h>
#include <unistd.h>
#include <fcntl.h>
#include <wait.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string>
#include <algorithm>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024fa/tracking_fa_nodes.h>
/* Graphviz color names, one per FA node type. The names follow the pattern
 * <node type>_color because get_associated_color() builds them by token pasting. */
const char* one_char_read_color = "black";
const char* forking_color = "darkorchid1";
const char* look_one_behind_color = "darkslateblue";
const char* look_one_ahead_color = "coral1";
const char* track_array_mov_imm_color = "lightblue2";
const char* track_array_mov_halfinvariant_color = "lightseagreen";
/* Used for a match node whose ext_filter_added flag is set */
const char* match_pending_lob_color = "darkgoldenrod2";
const char* match_color = "gold";
const char* det_char_crossroads_color = "navy";
/* Color of edges that point to NULL */
const char* error_color = "crimson";
/* Marker appended to labels of nodes/edges whose second_ns flag is set.
 * NOTE(review): this is an empty string here - confirm no character was lost. */
const char* STAR = "";
/* Returns the Graphviz color name associated with the type of the given node.
 * A match node gets a different color when its ext_filter_added flag is set.
 * Unknown node types are drawn black. */
const char* get_associated_color(FA_Node* node){
    const auto kind = node->type;
    if (kind == one_char_read)
        return one_char_read_color;
    if (kind == forking)
        return forking_color;
    if (kind == look_one_behind)
        return look_one_behind_color;
    if (kind == look_one_ahead)
        return look_one_ahead_color;
    if (kind == track_array_mov_imm)
        return track_array_mov_imm_color;
    if (kind == track_array_mov_halfinvariant)
        return track_array_mov_halfinvariant_color;
    if (kind == det_char_crossroads)
        return det_char_crossroads_color;
    if (kind == match) {
        if (dynamic_cast<FA_NodeOfMatch*>(node)->ext_filter_added)
            return match_pending_lob_color;
        return match_color;
    }
    return "black";
}
/* Per-node diagnostics gathered while a finite automaton is being printed */
struct NodesProblems{
    /* Number of references to the node that were actually counted */
    size_t actual_refcount{0};
    /* Set when the counted number differs from the one stored in the node */
    bool refcount_problem{false};
    /* Number of outgoing edges of the node that point to NULL (so far) */
    size_t edges_point_to_null{0};
};
/* Per-edge diagnostics used when choosing the attributes of a printed edge */
struct EdgesProblems {
    /* True when the edge has no destination node */
    bool points_to_null{false};

    explicit EdgesProblems(bool points_to_null)
        : points_to_null{points_to_null}
    {}
};
/* Builds the Graphviz attribute list of an edge leaving `node`.
 * An edge pointing to NULL is painted with the error color; any other edge inherits the color
 * of its source node and is drawn bold when that node is a one_char_read or
 * det_char_crossroads node. `np` is not consulted. */
std::string get_applied_edge_attributes(FA_Node* node, const NodesProblems& np, const EdgesProblems& ep){
    if (ep.points_to_null)
        return std::string("color=") + error_color;

    std::string attributes = std::string("color=") + get_associated_color(node);
    const bool drawn_bold = (node->type == one_char_read || node->type == det_char_crossroads);
    if (drawn_bold)
        attributes += " style=bold";
    return attributes;
}
/* Builds the Graphviz attribute list of a node: its type color, a crimson font when the
 * refcount check failed, and a double circle for match nodes and for det_char_crossroads
 * nodes whose `matching` flag is set. */
std::string get_applied_node_attributes(FA_Node* node, const NodesProblems& bd){
    std::string attributes("color=");
    attributes.append(get_associated_color(node));
    if (bd.refcount_problem)
        attributes.append(" fontcolor=crimson");

    bool double_circle = (node->type == match);
    if (!double_circle && node->type == det_char_crossroads)
        double_circle = dynamic_cast<FA_NodeOfDetCharCrossroads*>(node)->matching;
    if (double_circle)
        attributes.append(" shape=doublecircle");
    return attributes;
}
/* Appends the lowercase hexadecimal digits of `num` to `res`, least significant digit first
 * (i.e. the number is written backwards). Zero is written as a single "0". */
void append_reverse_hex(std::string& res, uint32_t num){
    static const char hex_digits[] = "0123456789abcdef";
    /* do-while guarantees that at least one digit is produced, which covers num == 0 */
    do {
        res.push_back(hex_digits[num & 0x0F]);
        num >>= 4;
    } while (num != 0);
}
std::string stringify_codeset(const codeset_t& cs){
std::string res;
for (long i = static_cast<long>(cs.size()) - 1; i >= 0; i--) {
uint64_t start = cs[i].first, end = cs[i].second;
if (start == end) {
append_reverse_hex(res, start);
} else {
append_reverse_hex(res, end);
res += '-';
append_reverse_hex(res, start);
}
if (i != 0)
res += ',';
}
std::reverse(res.begin(), res.end()); /* ascii works wonders */
return res;
}
/* Returns the optional suffix of a node label:
 *  - " " + STAR for one_char_read / det_char_crossroads nodes with second_ns set,
 *  - " pending loa <codeset>" for match nodes with ext_filter_added set,
 *  - an empty string otherwise. */
std::string get_extended_node_lable(FA_Node* node){
    bool starred = false;
    if (node->type == one_char_read)
        starred = dynamic_cast<FA_NodeOfOneCharRead*>(node)->second_ns;
    else if (node->type == det_char_crossroads)
        starred = dynamic_cast<FA_NodeOfDetCharCrossroads*>(node)->second_ns;
    if (starred)
        return std::string(" ") + STAR;

    if (node->type != match)
        return "";
    FA_NodeOfMatch* match_node = static_cast<FA_NodeOfMatch*>(node);
    if (!match_node->ext_filter_added)
        return "";
    return std::string(" pending loa ") + stringify_codeset(match_node->pending_filter);
}
/* Builds the text shown inside a node: a short tag of the node type, the node id in square
 * brackets, the optional extended suffix and, when the refcount check failed, the refcount
 * stored in the node. */
std::string get_node_lable(FA_Node* node, const NodesProblems& bd){
    const char* type_tag = "";
    switch (node->type) {
        case one_char_read:
            type_tag = "ocr";
            break;
        case match:
            type_tag = "m";
            break;
        case forking:
            type_tag = "f";
            break;
        case look_one_behind:
            type_tag = "lob";
            break;
        case look_one_ahead:
            type_tag = "loa";
            break;
        case track_array_mov_imm:
            type_tag = "tami";
            break;
        case track_array_mov_halfinvariant:
            type_tag = "tamh";
            break;
        case det_char_crossroads:
            type_tag = "dcc";
            break;
    }
    std::string label(type_tag);
    label += "[";
    label += std::to_string(node->nodeId);
    label += "]";
    label += get_extended_node_lable(node);
    if (bd.refcount_problem) {
        label += "!refcount: ";
        label += std::to_string(node->refs);
        label += "!";
    }
    return label;
}
/* Writes one Graphviz edge statement going from `start` to `dest` with the given label.
 * A NULL `dest` is reported on stderr and drawn as an edge to a uniquely named NULL_<id>_<k>
 * node, where k is taken from (and increments) np.edges_point_to_null.
 * Every emitted statement ends with a newline, and both kinds of edges take their attributes
 * from get_applied_edge_attributes(). */
void print_edge(FA_Node* start, const FA_Node* dest, const std::string& label, FILE* fd, NodesProblems& np){
    if (!dest){
        fprintf(stderr, "NULL transition going from node %lu\n", start->nodeId);
        const size_t null_edge_index = np.edges_point_to_null++;
        const std::string null_edge_attributes = get_applied_edge_attributes(start, np, EdgesProblems(true));
        /* The trailing newline was missing here, gluing the next statement onto this line */
        fprintf(fd, "%lu->NULL_%lu_%zu [label=\"%s\" %s]\n", start->nodeId,
            start->nodeId, null_edge_index, label.c_str(), null_edge_attributes.c_str());
        return;
    }
    fprintf(fd, "%lu->%lu [label=\"%s\" %s]\n", start->nodeId, dest->nodeId, label.c_str(),
        get_applied_edge_attributes(start, np, EdgesProblems(false)).c_str());
}
/* Writes the finite automaton `fa` to `fd` as a Graphviz "digraph" description.
 * Besides nodes and edges the graph contains two record-shaped info boards: one built from
 * the tracking units in `ktr`, one built from the actions in `priority_table`.
 * While printing, the reference counts stored in the nodes are compared with the counts
 * derived from the transitions; mismatches and NULL transitions are reported on stderr.
 * Both fa.start and fd must be non-NULL (asserted). */
void print_fa(const FA_Container& fa, FILE* fd, const KnownTrackingTools& ktr,
const RegexPriorityTable& priority_table){
assert(fa.start);
assert(fd);
/* Graph header: global graph, node and edge defaults plus the pseudo node "start_state" */
fprintf(fd, "digraph finite_automaton {\ngraph ["
"fontname = \"Helvetica\" charset = \"UTF-8\" label = \"Finite Automaton\" labelloc = \"t\" labeljust = \"c\" "
"bgcolor = \"#FFFAF4\" fontcolor = black fontsize = 18 style = \"filled\" rankdir = LR margin = 0.2 "
"splines = spline nodesep = 0.9 ranksep = 1.2 ]\n node [ style = \"solid,filled\" fontsize = 15 "
"fontcolor = black fontname = \"Helvetica\" color = black fillcolor = white margin = \"0.2,0.2\" shape=circle "
"]\n edge [ style = solid fontsize = 16 fontcolor = black fontname = \"Helvetica\" color = black "
"labelfloat = false labeldistance = 2.5 labelangle = 70 arrowhead = normal ]\n"
"start_state [label = \"start\\nfrom\\nhere\" shape=none style=\"\" ]\n");
size_t n = fa.all.size();
/* breakdown[i] holds the diagnostics of the node with nodeId == i */
std::vector<NodesProblems> breakdown;
breakdown.resize(n);
/* The start pointer counts as one reference to the start node */
breakdown[fa.start->nodeId].actual_refcount++;
for (size_t i = 0; i < n; i++){
/* Node ids are expected to be equal to the positions inside fa.all */
assert(fa.all[i]->nodeId == static_cast<int64_t>(i));
for (FA_Node** nxtN: fa.all[i]->get_all_transitions())
if ((*nxtN) != NULL)
breakdown[(**nxtN).nodeId].actual_refcount++;
}
/* Compare stored reference counts with the counted ones */
for (size_t i = 0; i < n; i++){
if (fa.all[i]->refs != breakdown[i].actual_refcount){
breakdown[i].refcount_problem = true;
fprintf(stderr, "Corrupted FA: wrong refcount on node %lu\n", fa.all[i]->nodeId);
}
}
/* Node statements */
for (size_t i = 0; i < n; i++){
fprintf(fd, "%lu [label=\"%s\" %s]\n", i, get_node_lable(fa.all[i], breakdown[i]).c_str(),
get_applied_node_attributes(fa.all[i], breakdown[i]).c_str());
}
/* Two Infoboxes */
auto stringifyTrackingVarType = [](tracking_var_type type) -> std::string {
switch (type) {
case tracking_var_types::range:
return "range";
case tracking_var_types::dot_cur_pos:
return "dot of cur pos";
default:
return "dot of immediate";
}
};
/* First info board: one record field per tracking unit name */
std::string infoText;
for (auto& p: ktr.track_names){
const SubtrackingNameInfo& tu = ktr.retrieval_info[p.second];
/* Describes which slots of the array ARR_NAME the tracking unit occupies: none, a pair
 * (for ranges) or a single slot. Unused positions are expected to be -1 (asserted). */
auto getRole = [](bool presence, tracking_var_type type, int first, int second,
const std::string& ARR_NAME) -> std::string {
if (!presence) {
assert(first == -1 && second == -1);
return "Not involved in " + ARR_NAME;
}
if (type == tracking_var_types::range){
assert(first != -1 && second != -1);
return "In " + ARR_NAME + ": " + std::to_string(first) + " &lt;&minus;&gt; " + std::to_string(second);
}
assert(first != -1 && second == -1);
return "In " + ARR_NAME + ": ( " + std::to_string(first) + " )";
};
/* snprintf truncates the description if it does not fit into buf */
char buf[2048] = {0};
snprintf(buf, 2048, "Tracking unit name: %s\\n" "Discovered: %s\\n" "Type: %s\\n" "%s\\n%s",
p.first.c_str(), tu.discovered ? "ofcourse" : "no",
stringifyTrackingVarType(tu.type).c_str(),
getRole(tu.stored_in_ca, tu.type, tu.colarr_first, tu.colarr_second, "colarr").c_str(),
getRole(tu.stored_in_sa, tu.type, tu.selarr_first, tu.selarr_second, "selarr").c_str());
/* "|" separates the fields of a Graphviz record */
if (!infoText.empty())
infoText += "|";
infoText += buf;
}
fprintf(fd, "infoBoard1 [label=\"%s\" shape = record]\n", infoText.c_str());
/* Second info board: one record field per priority table action */
infoText = "";
for (size_t i = 0; i < priority_table.size(); i++){
const RegexPriorityTableAction& tu = priority_table[i];
if (!infoText.empty())
infoText += "|";
infoText += tu.minimize ? "Minimize " : "Maximize ";
if (tu.pos.isForRange()){
infoText += "[" + std::to_string(tu.pos.second) + "] - [" + std::to_string(tu.pos.first) + "]";
} else {
infoText += "[" + std::to_string(tu.pos.first) + "]";
}
}
fprintf(fd, "infoBoard2 [label=\"%s\" shape = record]\n", infoText.c_str());
assert(fa.start);
/* Dotted gray arrow that marks the start node */
fprintf(fd, "start_state->%lu [color=gray style=dotted]\n", fa.start->nodeId);
/* Edge statements; the edge label depends on the type of the source node */
for (FA_Node* node: fa.all){
NodesProblems& bd = breakdown[node->nodeId];
if (node->type == one_char_read){
FA_NodeOfOneCharRead* cn = dynamic_cast<FA_NodeOfOneCharRead *>(node);
std::string str = stringify_codeset(cn->filter);
print_edge(node, cn->nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""), fd, bd);
} else if (node->type == forking){
FA_NodeOfForking* cn = dynamic_cast<FA_NodeOfForking *>(node);
for (FA_Node* nxt: cn->nxt_options){
print_edge(node, nxt, "", fd, bd);
}
} else if (node->type == look_one_behind){
FA_NodeOfLookOneBehind* cn = dynamic_cast<FA_NodeOfLookOneBehind *>(node);
print_edge(node, cn->nxt_node, stringify_codeset(cn->filter), fd, bd);
} else if (node->type == look_one_ahead){
FA_NodeOfLookOneAhead* cn = dynamic_cast<FA_NodeOfLookOneAhead *>(node);
print_edge(node, cn->nxt_node, stringify_codeset(cn->restriction), fd, bd);
} else if (node->type == track_array_mov_imm){
FA_NodeOfTrackArrayMovImm* cn = dynamic_cast<FA_NodeOfTrackArrayMovImm *>(node);
char buf[1024];
/* An unexpected opcode is only reported; the edge is printed anyway */
if (!isImmMovOpcode(cn->operation))
fprintf(stderr, "bad operation in node %lu\n", node->nodeId);
snprintf(buf, 1024, "%s %hu %lu",
regex024_opcode_tostr(cn->operation), cn->key, cn->imm_value);
print_edge(node, cn->nxt_node,std::string(buf), fd, bd);
} else if (node->type == track_array_mov_halfinvariant){
FA_NodeOfTrackArrayMovHalfinvariant* cn = dynamic_cast<FA_NodeOfTrackArrayMovHalfinvariant *>(node);
char buf[1024];
if (!isCurPosMovOpcode(cn->operation))
fprintf(stderr, "bad operation in node %lu\n", node->nodeId);
snprintf(buf, 1024, "%s %hu",
regex024_opcode_tostr(cn->operation), cn->key);
print_edge(node, cn->nxt_node,std::string(buf), fd, bd);
} else if (node->type == det_char_crossroads){
FA_NodeOfDetCharCrossroads* cn = dynamic_cast<FA_NodeOfDetCharCrossroads *>(node);
for (const auto& transition: cn->crossroads){
std::string str = stringify_codeset(transition.input);
print_edge(node, transition.nxt_node, str + (cn->second_ns ? std::string(" ") + STAR : ""),
fd, bd);
}
}
}
/* Closes the digraph */
fprintf(fd, "}\n");
}
/* Empties the file at `apath` (creating it if needed) and returns a stream opened on it in
 * append mode. Returns NULL, after reporting the reason with perror(), when the file cannot
 * be opened.
 * The previous version passed a NULL stream to fileno() when the first fopen() failed and
 * never closed the first stream. */
FILE* get_fd(const char* apath){
    errno = 0;
    /* Mode "w" creates the file or truncates it to zero length; no extra ftruncate is needed */
    FILE* truncating_stream = fopen(apath, "w");
    if (!truncating_stream) {
        perror("fopen w");
        return NULL;
    }
    if (fclose(truncating_stream) != 0)
        perror("fclose");
    FILE* fd = fopen(apath, "a");
    if (!fd)
        perror("fopen a");
    return fd;
}
/* Dumps the automaton into the temporary file FAGraph.gv, renders it to FAGraph.png with the
 * `dot` command, shows the picture with `sxiv` and finally removes both files.
 * Every step is checked with assert(), so a failing external command aborts the program in
 * builds where assertions are enabled. */
void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr,
        const RegexPriorityTable& priority_table) {
    const char* temp_gv = "FAGraph.gv";
    const char* temp_png = "FAGraph.png";
    /* This descriptor only makes sure that the file exists with the wanted permissions;
     * all writing goes through the stream returned by get_fd() */
    int temp_descriptor = open(temp_gv, O_CLOEXEC | O_APPEND | O_CREAT | O_WRONLY, S_IRWXU | S_IRWXG);
    assert(temp_descriptor >= 0);
    /* The descriptor used to be leaked on every call */
    if (temp_descriptor >= 0)
        close(temp_descriptor);
    assert(fa.start);
    FILE* fd = get_fd(temp_gv);
    print_fa(fa, fd, ktr, priority_table);
    fclose(fd);
    char cmdBuf[1024];
    // todo: get rid of temporary dot file and shell usage
    snprintf(cmdBuf, 1024, "dot %s -Tpng >%s", temp_gv, temp_png);
    int chw = system(cmdBuf);
    assert(WIFEXITED(chw));
    assert(WEXITSTATUS(chw) == 0);
    snprintf(cmdBuf, 1024, "sxiv %s", temp_png);
    chw = system(cmdBuf);
    assert(WIFEXITED(chw));
    assert(WEXITSTATUS(chw) == 0);
    assert(chw >= 0);
    unlink(temp_gv);
    unlink(temp_png);
}

View File

@ -0,0 +1,12 @@
#ifndef DEBUGGING_REGEXIS024_DEBUG_THROUGH_GRAPHVIZ_H
#define DEBUGGING_REGEXIS024_DEBUG_THROUGH_GRAPHVIZ_H
#include <libregexis024fa/finite_automaton.h>
#include <libregexis024sol/part_of_expr_that_tracks.h>
#include <libregexis024fa/selarr_priority_table.h>
/* Debug helper: renders the finite automaton `fa`, together with info boards built from `ktr`
 * and `priority_table`, and shows the picture.
 * Uses the temporary files FAGraph.gv and FAGraph.png in the current working directory and
 * runs the external commands `dot` and `sxiv`. */
void show_fa_with_sxiv_after_dot(const FA_Container& fa, const KnownTrackingTools& ktr,
const RegexPriorityTable& priority_table);
#endif

View File

@ -0,0 +1,89 @@
#include <debugging_regexis024/prettyprint/prettyprint_util.h>
#include <functional>
#include <libregexis024vm/utils.h>
TreeWithStringsNode::TreeWithStringsNode(const std::string &val): val(val) {
}
/* Pieces used to draw the tree: one of them is emitted per ancestor level of a line.
 * NOTE(review): the two connector pieces are four columns wide while ch_empty and
 * ch_passing_by look narrower here - confirm that no padding spaces were lost. */
static const char* ch_empty = " ";
static const char* ch_passing_by = "\u2502 ";
static const char* ch_connect_right_and_forward = "\u251c\u2500\u2500\u2500";
static const char* ch_connect_right_last = "\u2514\u2500\u2500\u2500";
/* Pieces of the double-line frame drawn by wrapWithBox() */
static const char* ch_box_left_side = "\u2551";
static const char* ch_box_right_side = "\u2551";
static const char* ch_box_top_side = "\u2550";
static const char* ch_box_bottom_side = "\u2550";
static const char* ch_box_crn_top_left = "\u2554";
static const char* ch_box_crn_top_right = "\u2557";
static const char* ch_box_crn_bottom_left = "\u255A";
static const char* ch_box_crn_bottom_right = "\u255D";
/* Counts the code points of `str` as decoded by utf8_string_iterat().
 * Counting stops at the first position for which the decoder reports a negative code;
 * the number of code points decoded up to that position is returned. */
size_t length_of_line(const std::string& str) {
    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(str.data());
    const size_t total = str.size();
    size_t counted = 0;
    size_t pos = 0;
    while (pos < total) {
        int32_t code;
        size_t adj;
        utf8_string_iterat(code, adj, pos, bytes, total);
        if (code < 0)
            break;
        ++counted;
        pos += adj;
    }
    return counted;
}
/* Appends to `out` the line of `node` followed by the lines of all its descendants.
 * `prefix` has one entry per ancestor level; an entry is true while the node on that level
 * still has siblings below the current line. The vector is restored before returning.
 * Warning: recursion is used, its depth equals the depth of the tree. */
void toLines_dfs(const TreeWithStringsNode& node, lines& out, std::vector<bool>& prefix) {
    const size_t depth = prefix.size();
    std::string line;
    for (size_t level = 0; level < depth; level++) {
        const bool continues_below = prefix[level];
        if (level + 1 == depth) {
            /* Innermost level: draw the connector leading to this node */
            line += continues_below ? ch_connect_right_and_forward : ch_connect_right_last;
        } else {
            line += continues_below ? ch_passing_by : ch_empty;
        }
    }
    line += node.val;
    out.push_back(line);

    prefix.push_back(true);
    const size_t child_count = node.childeren.size();
    for (size_t c = 0; c < child_count; c++) {
        /* Nothing follows the last child on this level */
        if (c + 1 == child_count)
            prefix[depth] = false;
        toLines_dfs(node.childeren[c], out, prefix);
    }
    prefix.pop_back();
}
/* Appends the textual drawing of the tree rooted at this node to `out`, one entry per node */
void TreeWithStringsNode::toLines(lines &out) const {
    std::vector<bool> ancestor_levels;
    toLines_dfs(*this, out, ancestor_levels);
}
/* Returns `str` repeated `n` times (an empty string for n == 0) */
std::string strMul(size_t n, const char* str) {
    std::string repeated;
    while (n > 0) {
        repeated.append(str);
        --n;
    }
    return repeated;
}
/* Returns the given lines surrounded by a frame. Every line is padded with spaces on the
 * right up to the width of the longest one, widths being measured with length_of_line(). */
lines wrapWithBox(const lines &in) {
    size_t widest = 0;
    for (size_t i = 0; i < in.size(); i++) {
        const size_t width = length_of_line(in[i]);
        if (widest < width)
            widest = width;
    }

    lines boxed;
    boxed.push_back(ch_box_crn_top_left + strMul(widest, ch_box_top_side) + ch_box_crn_top_right);
    for (size_t i = 0; i < in.size(); i++) {
        const std::string& row = in[i];
        const std::string padding = strMul(widest - length_of_line(row), " ");
        boxed.push_back(ch_box_left_side + row + padding + ch_box_right_side);
    }
    boxed.push_back(ch_box_crn_bottom_left + strMul(widest, ch_box_bottom_side) + ch_box_crn_bottom_right);
    return boxed;
}
void printLines(const lines &in) {
for (auto& l: in)
printf("%s\n", l.c_str());
}

View File

@ -0,0 +1,25 @@
#ifndef DEBUGGING_REGEXIS024_PRETTYPRINT_UTIL_H
#define DEBUGGING_REGEXIS024_PRETTYPRINT_UTIL_H
/* Used for debug. Do not give to user */
#include <vector>
#include <string>
/* A block of text, one string per line */
typedef std::vector<std::string> lines;
/* A tree whose nodes carry a text; it can be drawn as indented lines */
struct TreeWithStringsNode {
/* Text of this node */
std::string val;
/* Child nodes in the order in which they are drawn */
std::vector<TreeWithStringsNode> childeren;
explicit TreeWithStringsNode(const std::string &val);
TreeWithStringsNode() = default;
/* Appends the drawing of the tree rooted at this node to `out` */
void toLines(lines& out) const;
};
/* Returns the lines of `in` surrounded by a frame */
lines wrapWithBox(const lines& in);
/* Prints the lines of `in` to stdout */
void printLines(const lines& in);
#endif

View File

@ -0,0 +1,58 @@
#include <debugging_regexis024/vm/libregexis024vm_debug.h>
#include <stdio.h>
#include <string>
/* Describes a thread slot: "{ unoccupied }" when the SLOT_OCCUPIED bit is clear, otherwise
 * the instruction pointer of the thread. */
std::string thread_to_str(const REGEX_IS024_Thread& thread){
    const bool occupied = (thread.slot_occupation_status & SLOT_OCCUPIED) != 0;
    if (occupied) {
        char text[1024];
        snprintf(text, 1024, "{ IP = %lu }", thread.IP);
        return text;
    }
    return "{ unoccupied }";
}
/* Lists the first `sz` slots of the stack as "{ a, b, c }" ("{  }" for an empty stack) */
std::string stack_to_str(const REGEX_IS024_Stack& stack){
    std::string text = "{ ";
    const char* separator = "";
    for (uint32_t k = 0; k < stack.sz; k++){
        text += separator;
        text += std::to_string(stack.slots[k]);
        separator = ", ";
    }
    text += " }";
    return text;
}
/* Describes the halted slots and stacks of a VM context in one line.
 * READ slots are shown as 'N' (occupied and new), 'O' (occupied) or 'x' (free); FORK slots
 * as 'O' or 'x'. Returns "uninitialized" for a context that is not initialized.
 * The text is assembled in a std::string, so it is never cut off; the previous fixed
 * 4096-byte snprintf buffer silently truncated the dump of large contexts. */
std::string slots_to_str(const REGEX_IS024_CONTEXT& ctx){
    if (!ctx.initialized)
        return "uninitialized";
    std::string READ_slots;
    for (size_t i = 0; i < ctx.read_slots_number; i++){
        uint8_t stat = ctx.READ_halted_slots[i].slot_occupation_status;
        READ_slots += (stat & SLOT_OCCUPIED) ? ((stat & SLOT_NEW) ? "N" : "O") : "x";
    }
    std::string FORK_slots;
    for (size_t i = 0; i < ctx.fork_slots_number; i++){
        uint8_t stat = ctx.FORK_halted_slots[i].slot_occupation_status;
        FORK_slots += (stat & SLOT_OCCUPIED) ? "O" : "x";
    }
    std::string res;
    res += "READ_slots: ";
    res += READ_slots;
    res += " ; FORK_slots: ";
    res += FORK_slots;
    res += " ; READ_stack_new_main: ";
    res += stack_to_str(ctx.READ_halted_stack_new_first);
    res += " ; READ_stack_new_second: ";
    res += stack_to_str(ctx.READ_halted_stack_new_second);
    res += " ; READ_stack_old: ";
    res += stack_to_str(ctx.READ_halted_stack_old);
    res += " ; FORK_stack: ";
    res += stack_to_str(ctx.FORK_halted_stack);
    return res;
}
/* Prints a dump of the VM context to stdout: a header naming `place`, the active thread,
 * the thread pointed to by sifting_with ("NO" when the pointer is null), the matched thread
 * and the slots summary. */
void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place) {
    printf("== DEBUG `%s` ==\n", place);
    const std::string active = thread_to_str(ctx.active_thread);
    const std::string sifting = ctx.sifting_with ? thread_to_str(*ctx.sifting_with) : std::string("NO");
    const std::string matched = thread_to_str(ctx.matched_thread);
    const std::string slots = slots_to_str(ctx);
    printf("Active thread: %s, sifting_with: %s, match: %s\n%s\n",
        active.c_str(), sifting.c_str(), matched.c_str(), slots.c_str());
}
/* Prints a header naming `place` and the description of a single thread to stdout */
void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place) {
    printf("== DEBUG `%s` ==\n", place);
    const std::string description = thread_to_str(thr);
    printf("This thread: %s\n", description.c_str());
}

View File

@ -0,0 +1,11 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024VM_DEBUG_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024VM_DEBUG_H
#include <libregexis024vm/libregexis024vm.h>
#include <libregexis024vm/instruction_implementation.h>
/* Prints a dump of the whole VM context to stdout; `place` is a free-form tag that is put
 * into the header of the dump. */
void debug_print_context(const REGEX_IS024_CONTEXT& ctx, const char* place);
/* Prints the description of a single thread to stdout; `place` is put into the header */
void debug_print_thread(const REGEX_IS024_Thread& thr, const char *place);
#endif

View File

@ -0,0 +1,120 @@
#include <libregexis024fa/codeset.h>
#include <assert.h>
/* Returns the complement of X within [0, UINT32_MAX]: the gap before the first segment, the
 * gaps between neighbouring segments and the gap after the last one.
 * presumably X is sorted and its segments neither overlap nor touch - verify against callers. */
codeset_t invert_set(const codeset_t &X) {
    codeset_t gaps;
    if (X.empty()) {
        gaps.emplace_back(0, UINT32_MAX);
        return gaps;
    }
    if (X.front().first > 0)
        gaps.emplace_back(0, X.front().first - 1);
    for (size_t k = 1; k < X.size(); k++) {
        gaps.emplace_back(X[k - 1].second + 1, X[k].first - 1);
    }
    if (X.back().second < UINT32_MAX)
        gaps.emplace_back(X.back().second + 1, UINT32_MAX);
    return gaps;
}
/* Helper macros for functions that walk two codesets A and B in parallel.
 * `prepare` declares the sizes An, Bn and the cursors i, j; the other macros read or
 * advance those cursors. They are not #undef'ed within the visible part of this file. */
/* Segment of A / B under the cursor */
#define elA (A[i])
#define elB (B[j])
/* Move the cursor of A / B to the next segment */
#define Ainc i++
#define Binc j++
#define prepare size_t An = A.size(); size_t Bn = B.size(); size_t i = 0; size_t j = 0;
/* True once every segment of A / B has been consumed */
#define Aended (i == An)
#define Bended (j == Bn)
/* Returns the union of the codesets A and B. Segments that overlap or directly follow each
 * other are fused into a single segment.
 * presumably both inputs are sorted by their left ends - verify against callers. */
codeset_t merge_sets(const codeset_t &A, const codeset_t &B) {
    const size_t a_count = A.size();
    const size_t b_count = B.size();
    size_t a_pos = 0;
    size_t b_pos = 0;
    codeset_t merged;
    while (a_pos < a_count || b_pos < b_count) {
        /* Open a new run with the pending segment that has the smaller left end
         * (the one from B on a tie) */
        const bool open_with_a = (b_pos == b_count) ||
            (a_pos < a_count && A[a_pos].first < B[b_pos].first);
        std::pair<uint32_t, uint32_t> run = open_with_a ? A[a_pos++] : B[b_pos++];
        /* Keep absorbing segments that start inside the run or right after it */
        for (;;) {
            /* A run that already ends at UINT32_MAX swallows everything; this check also
             * keeps run.second + 1 from wrapping around */
            const bool unbounded = (run.second == UINT32_MAX);
            if (a_pos < a_count && (unbounded || A[a_pos].first <= run.second + 1)) {
                run.second = std::max(A[a_pos].second, run.second);
                a_pos++;
            } else if (b_pos < b_count && (unbounded || B[b_pos].first <= run.second + 1)) {
                run.second = std::max(B[b_pos].second, run.second);
                b_pos++;
            } else {
                break;
            }
        }
        merged.push_back(run);
    }
    return merged;
}
/* Returns the intersection of the codesets A and B by walking both of them in parallel.
 * presumably both inputs are sorted lists of disjoint segments - verify against callers. */
codeset_t intersect_sets(const codeset_t &A, const codeset_t &B) {
    codeset_t common;
    size_t a_pos = 0;
    size_t b_pos = 0;
    while (a_pos < A.size() && b_pos < B.size()) {
        const std::pair<uint32_t, uint32_t>& a = A[a_pos];
        const std::pair<uint32_t, uint32_t>& b = B[b_pos];
        const uint32_t right_end = std::min(a.second, b.second);
        if (b.first <= a.first && a.first <= b.second) {
            /* a starts inside b */
            common.emplace_back(a.first, right_end);
        } else if (a.first <= b.first && b.first <= a.second) {
            /* b starts inside a */
            common.emplace_back(b.first, right_end);
        }
        /* Drop the segment that ends first; the other one may still meet later segments */
        if (a.second <= b.second)
            a_pos++;
        else
            b_pos++;
    }
    return common;
}
/* Returns the codes of A that are not in B, computed as A intersected with the complement of B */
codeset_t subtract_sets(const codeset_t &A, const codeset_t &B) {
    const codeset_t outside_of_b = invert_set(B);
    return intersect_sets(A, outside_of_b);
}
bool is_inside(uint32_t start, uint32_t end, codeset_t &X) {
for (auto& p: X){
if (p.first <= start && end <= p.second)
return true;
assert(end < p.first || p.second < start);
}
return false;
}
/* Returns X with the single code `cp` added */
codeset_t set_add_char(const codeset_t& X, uint32_t cp) {
    codeset_t single_code;
    single_code.emplace_back(cp, cp);
    return merge_sets(X, single_code);
}
/* Returns X with the range [start, end] added */
codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end) {
    codeset_t single_range;
    single_range.emplace_back(start, end);
    return merge_sets(X, single_range);
}
/* Returns the codeset that contains the code `ch` and nothing else */
codeset_t codeset_of_one_char(uint32_t ch) {
    codeset_t result;
    result.emplace_back(ch, ch);
    return result;
}
std::string stringifyCodesetBase10(const codeset_t& CS) {
std::string cs;
for (auto p: CS) {
if (!cs.empty())
cs += "; ";
cs += std::to_string(p.first) + "-" + std::to_string(p.second);
}
return cs;
}

View File

@ -0,0 +1,27 @@
#ifndef LIBREGEXIS024_CODESET_H
#define LIBREGEXIS024_CODESET_H
#include <vector>
#include <utility>
#include <stdint.h>
#include <string>
/* A set of 32-bit codes stored as a list of inclusive (first, second) ranges */
typedef std::vector<std::pair<uint32_t, uint32_t>> codeset_t;
/* Complement of X within [0, UINT32_MAX] */
codeset_t invert_set(const codeset_t& X);
/* Union of A and B */
codeset_t merge_sets(const codeset_t& A, const codeset_t& B);
/* Intersection of A and B */
codeset_t intersect_sets(const codeset_t& A, const codeset_t& B);
/* Codes of A that are not in B */
codeset_t subtract_sets(const codeset_t& A, const codeset_t& B);
/* Tells whether [start, end] lies inside one segment of X.
 * Aborts (through assert) if the range only partially overlaps a segment (unsafe function) */
bool is_inside(uint32_t start, uint32_t end, codeset_t& X);
/* X with the single code cp added */
codeset_t set_add_char(const codeset_t& X, uint32_t cp);
/* X with the range [start, end] added */
codeset_t set_add_range(const codeset_t& X, uint32_t start, uint32_t end);
/* Codeset that contains only ch */
codeset_t codeset_of_one_char(uint32_t ch);
/* Codeset that contains every code */
#define codeset_of_all codeset_t({{0, UINT32_MAX}})
/* Human readable form "first-second; first-second; ..." with decimal numbers */
std::string stringifyCodesetBase10(const codeset_t& CS);
#endif //LIBREGEXIS024_CODESET_H

View File

@ -0,0 +1,183 @@
#include <libregexis024fa/colored_codeset.h>
#include <assert.h>
ColoredCodesetSegment::ColoredCodesetSegment(uint32_t color, uint32_t right_code): color(color), right_code(right_code) {}
/* Creates a list made of one segment of color 0 whose right end is UINT32_MAX */
ColoredCodesetSegmentList::ColoredCodesetSegmentList()
    : first(new ColoredCodesetSegment(0, UINT32_MAX))
{}
void ColoredCodesetSegmentList::replace_myself(const ColoredCodesetSegmentList &other) {
assert(other.first);
ColoredCodesetSegment** in_cur = &first;
ColoredCodesetSegment* in_other = other.first;
while (in_other) {
*in_cur = new ColoredCodesetSegment(*in_other);
in_cur = &((**in_cur).next);
in_other = in_other->next;
}
}
/* Copy constructor: builds an independent copy of the chain of `other` */
ColoredCodesetSegmentList::ColoredCodesetSegmentList(const ColoredCodesetSegmentList &other)
{
    this->replace_myself(other);
}
void ColoredCodesetSegmentList::free_myself() {
ColoredCodesetSegment* cur = first;
while (cur) {
ColoredCodesetSegment* nxt = cur->next;
delete cur;
cur = nxt;
}
}
/* Releases every segment owned by the list */
ColoredCodesetSegmentList::~ColoredCodesetSegmentList()
{
    this->free_myself();
}
/* Copy assignment: replaces the own chain with an independent copy of the chain of `other`.
 * Self-assignment is a no-op; without the guard the chain would be deleted first and then
 * copied from its own freed segments. */
ColoredCodesetSegmentList& ColoredCodesetSegmentList::operator=(const ColoredCodesetSegmentList &other) {
    if (this != &other) {
        free_myself();
        replace_myself(other);
    }
    return *this;
}
/* Creates a codeset with a single color (index 0) that has no requests yet.
 * The first dummy_n requests applied later are treated as dummy requests. */
ColoredCodeset::ColoredCodeset(uint64_t dummy_n): DummyN(dummy_n) {
    requests.assign(1, std::vector<size_t>());
}
/* First half of apply_divisor(): cuts the segments of the list wherever a range of X begins
 * or ends inside them and sets divisor_on_left on the segments that start exactly at a
 * left end of a range of X or right after a right end of one. Colors are not changed here.
 * presumably X is sorted and made of disjoint ranges - verify against callers. */
void ColoredCodeset::split_phase(const codeset_t &X) {
/* cA is the first code of cur_seg; cur_seg->right_code (cB below) is its last code */
uint32_t cA = 0;
ColoredCodesetSegment* cur_seg = list.first;
/* Index of the range of X that is being processed */
uint32_t pi = 0;
/* Steps to the next segment of the list */
auto advance_old = [&]()->void{
cA = cur_seg->right_code + 1;
cur_seg = cur_seg->next;
};
/* How to use: splits are made from left to right. After each split cur_seg
* points to the rightest among sub-segments of cur_segment. */
auto SPLIT = [&](uint32_t code_before_split)->void {
assert(code_before_split < cur_seg->right_code);
/* The new segment takes over the right part, the color and the old right end */
ColoredCodesetSegment* new_next = new ColoredCodesetSegment(cur_seg->color, cur_seg->right_code);
new_next->divisor_on_left = true;
cur_seg->right_code = code_before_split;
new_next->next = cur_seg->next;
cur_seg->next = new_next;
advance_old();
};
while (cur_seg && pi < X.size()) {
uint32_t cB = cur_seg->right_code;
uint32_t L = X[pi].first, R = X[pi].second;
if (L < cA) {
/* The range began before the current segment */
if (R != UINT32_MAX && R + 1 < cA) {
/* ...and ended before it as well: nothing left to do for this range */
pi++;
} else if (R != UINT32_MAX && R + 1 == cA) {
/* ...and ended right before it: the boundary is the left edge of the segment */
cur_seg->divisor_on_left = true;
pi++;
} else if (R < cB) {
/* ...and ends inside it */
SPLIT(R);
pi++;
} else {
/* ...and covers it up to its right end (or beyond) */
advance_old();
}
} else if (L == cA) {
/* The range begins exactly at the left edge of the segment */
cur_seg->divisor_on_left = true;
if (R < cB) {
SPLIT(R);
pi++;
} else {
advance_old();
}
} else if (L <= cB) {
/* The range begins inside the segment */
SPLIT(L - 1);
if (R < cB) {
SPLIT(R);
pi++;
} else {
advance_old();
}
} else {
/* The range begins after the segment */
advance_old();
}
}
}
/* Registers the request X: after split_phase() has aligned the segment boundaries with X,
 * every color that lies partly inside and partly outside X is divided into two colors, and
 * the id of this request is recorded for the colors that are inside X.
 * Request ids below DummyN are not recorded; the others are stored as (id - DummyN). */
void ColoredCodeset::apply_divisor(const codeset_t &X) {
split_phase(X);
size_t X_id = nxt_request_id++;
/* m grows while new colors are created, bm keeps the number of colors before this request */
size_t m = requests.size();
size_t bm = m;
/* skipped[c]: some segment of color c is outside X; overlapped[c]: some segment is inside X */
std::vector<bool> skipped(bm, false);
std::vector<bool> overlapped(bm, false);
{
/* `inside` flips at every segment that carries the divisor_on_left mark */
bool inside = false;
ColoredCodesetSegment* cur = list.first;
while (cur) {
inside = (inside != cur->divisor_on_left);
if (inside) {
overlapped[cur->color] = true;
} else {
skipped[cur->color] = true;
}
cur = cur->next;
}
}
/* alt_color[c] is the new color given to the inside part of a divided color c */
std::vector<uint32_t> alt_color(bm, 0);
for (size_t i = 0; i < bm; i++) {
if (skipped[i] && overlapped[i]) {
/* Divided color: the new color inherits the requests of the old one */
alt_color[i] = m++;
requests.push_back(requests[i]);
if (X_id >= DummyN)
requests.back().push_back(X_id - DummyN);
} else if (overlapped[i]) {
/* Color completely inside X: it keeps its index */
if (X_id >= DummyN)
requests[i].push_back(X_id - DummyN);
} else
assert(skipped[i]);
}
{
/* Second pass: recolor the inside segments of divided colors and clear the marks */
bool inside = false;
ColoredCodesetSegment* cur = list.first;
while (cur) {
inside = (inside != cur->divisor_on_left);
cur->divisor_on_left = false;
uint32_t c = cur->color;
if (inside && skipped[c] && overlapped[c]) {
cur->color = alt_color[c];
}
cur = cur->next;
}
}
}
void ColoredCodeset::get_splits_of_non_dummy(std::vector<codeset_t> &res_input,
std::vector<std::vector<size_t>> &res_color_to_requests) {
size_t n = requests.size();
std::vector<ssize_t> nonclean_to_clean(n, -1);
res_color_to_requests = {};
for (size_t i = 0; i < n; i++) {
if (!requests[i].empty()) {
nonclean_to_clean[i] = res_color_to_requests.size();
res_color_to_requests.push_back(requests[i]);
}
}
ColoredCodesetSegment* cur = list.first;
uint32_t L = 0;
res_input.assign(res_color_to_requests.size(), {});
while (cur) {
size_t Sc = cur->color;
if (nonclean_to_clean[Sc] >= 0) {
res_input[nonclean_to_clean[Sc]].emplace_back(L, cur->right_code);
}
L = cur->right_code + 1;
cur = cur->next;
}
}

View File

@ -0,0 +1,66 @@
#ifndef LIBREGEXIS024_COLORED_CODESET_H
#define LIBREGEXIS024_COLORED_CODESET_H
#include <stdlib.h>
#include <stdint.h>
#include <vector>
#include <libregexis024fa/codeset.h>
/* Used for determinizer. Nowhere else */
/* One node of a singly linked list of colored segments. Only the right end of a segment is
 * stored. */
struct ColoredCodesetSegment {
/* Index of the color of this segment */
uint32_t color;
/* Last code that belongs to this segment (inclusive) */
uint32_t right_code;
ColoredCodesetSegment* next = NULL;
/* Temporary variable (used by apply_divisor() method) */
bool divisor_on_left = false;
ColoredCodesetSegment(uint32_t color, uint32_t right_code);
};
/* Owner of a linked list of ColoredCodesetSegment nodes.
 * Warning!!! This class is OOM-unsafe!!!
 * This is not an issue as long as you don't show any of its instances to the user of libregexis024 */
struct ColoredCodesetSegmentList {
/* Head of the list, NULL when there are no segments */
ColoredCodesetSegment* first = NULL;
ColoredCodesetSegmentList();
/* NOTE(review): presumably makes this list a copy of `other` (shared by copy constructor and operator=);
 * the implementation is not visible here - confirm */
void replace_myself(const ColoredCodesetSegmentList& other);
ColoredCodesetSegmentList(const ColoredCodesetSegmentList& other);
/* Use only internally */
void free_myself();
~ColoredCodesetSegmentList();
ColoredCodesetSegmentList& operator=(const ColoredCodesetSegmentList& other);
};
/* Partition of the code axis into colored segments that gets refined by successive "divisor" codesets.
 * Highly unoptimized algorithm on this data structure: O(C^2) time */
class ColoredCodeset {
ColoredCodesetSegmentList list;
/* Size of this vector is equal to the number of colors.
 * requests[color] lists the (non-dummy) request ids recorded for that color. */
std::vector<std::vector<size_t>> requests;
/* Number of leading requests that are treated as dummy requests */
uint64_t DummyN;
size_t nxt_request_id = 0;
void split_phase(const codeset_t& X);
public:
/* First dummy_n split requests will be viewed as 'dummy requests'. When the complete map of splits is requested,
 * colors that are registered inside only dummy requests won't be returned. */
ColoredCodeset(uint64_t dummy_n);
/* O(C), which is bad, but my library's compiler is already slow by itself, so who cares */
void apply_divisor(const codeset_t& X);
/* Returned 'requests' mapping will feature request ids with DummyN subtracted from them */
void get_splits_of_non_dummy(std::vector<codeset_t>& res_input,
std::vector<std::vector<size_t>>& res_color_to_requests);
};
#endif

View File

@ -0,0 +1,191 @@
#include <libregexis024fa/fa_first_stage_fix.h>
#include <libregexis024fa/misc_fa_funcs.h>
#include <libregexis024vm/utils.h>
#include <assert.h>
// #ifdef LIBREGEXIS024_DEBUG
// #include <debugging_regexis024/debug_through_graphviz.h>
// #endif
/* First stage fix: gets rid of look_one_ahead nodes.
 * For every look_one_ahead node of sourceFa, the nodes reachable from it through empty transitions get the
 * lookahead restriction applied (nodes that are also referenced from outside of that region are copied first,
 * so that outside paths stay unrestricted). After that the automaton is copied into resultFa with all
 * look_one_ahead nodes skipped.
 * sourceFa is modified in the process and should not be used afterwards.
 * Returns flags telling whether the fed input has to be extended by one char to the left / to the right. */
REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa) {
    assert(sourceFa.start);
    REGEX_IS024_FA_FirstStageFixInfo info;
    /* The size of sourceFa.all is re-read on every iteration on purpose: copy_fa_node() below is given sourceFa,
     * presumably registering the copies there, so copied look_one_ahead nodes get their turn too */
    for (size_t I_scans = 0; I_scans < sourceFa.all.size(); I_scans++){
        FA_Node* beg = sourceFa.all[I_scans];
        if (beg->type != look_one_ahead)
            continue;
        FA_NodeOfLookOneAhead& loa = (*(FA_NodeOfLookOneAhead*)beg);
        codeset_t& restriction = loa.restriction;
        assert(loa.nxt_node);

        /* Per-node bookkeeping. search_mark of a visited node is its index in `searched` */
        struct Marked{
            FA_Node* node;
            /* How many references to the node come from inside of the searched region */
            size_t refs_from_my = 1;
            bool making_copy = false;
            FA_Node* copy = NULL;
            explicit Marked(FA_Node *node) : node(node) {}
        };
        std::vector<Marked> searched;
        searched.emplace_back(loa.nxt_node);
        /* The root of the search is searched[0], so it is the root that has to carry mark 0.
         * (Previously `beg` was marked instead: the root stayed unmarked, which made searched[m.search_mark]
         * below index with -1 whenever the root needed a copy, and beg's mark was never reset to -1,
         * corrupting later searches that run into this look_one_ahead node.) */
        loa.nxt_node->search_mark = 0;

        /* BFS through empty transitions, counting references that originate inside the region */
        for (size_t done = 0; done < searched.size(); done++){
            FA_Node& cur = *searched[done].node;
            for (FA_Node** nxtN : cur.get_all_empty_valid_transitions()){
                if ((**nxtN).search_mark == -1){
                    /* An empty cycle leading back to the look_one_ahead node itself is not supported */
                    assert((**nxtN).nodeId != loa.nodeId);
                    (**nxtN).search_mark = (int64_t)searched.size();
                    searched.emplace_back(*nxtN);
                } else {
                    searched[(**nxtN).search_mark].refs_from_my++;
                }
            }
        }

        /* Nodes that are referenced from outside of the region must be copied... */
        std::vector<FA_Node*> s2s;
        for (auto& v_sete: searched){
            if (v_sete.refs_from_my < v_sete.node->refs){
                v_sete.making_copy = true;
                s2s.push_back(v_sete.node);
            }
        }
        /* ...and so must everything reachable from them through empty transitions */
        while (!s2s.empty()){
            FA_Node& m = *s2s.back(); s2s.pop_back();
            assert(searched[m.search_mark].making_copy);
            /* Because of this operation sourceFa is not read-only. It becomes useless after generating resultFa */
            searched[m.search_mark].copy = copy_fa_node(m, sourceFa);
            for (FA_Node** nxtN: m.get_all_empty_valid_transitions()){
                Marked& nxtNaux = searched[(**nxtN).search_mark];
                if (!nxtNaux.making_copy){
                    nxtNaux.making_copy = true;
                    s2s.push_back(*nxtN);
                }
            }
        }

        /* Redirect edges inside the region to the copies and apply the restriction to the region's own nodes */
        for (auto& v_sete : searched){
            FA_Node* my = v_sete.making_copy ? v_sete.copy : v_sete.node;
            for (FA_Node** nxtN: my->get_all_empty_valid_transitions()){
                Marked& nxtNaux = searched[(**nxtN).search_mark];
                if (nxtNaux.making_copy)
                    reattach_fa_node_edge(nxtN, nxtNaux.copy);
            }
            my->apply_lookahead_restriction(restriction);
            /* A restricted match node needs one more char of input on the right to check its filter */
            if (my->type == match)
                info.fed_chars_extend_one_right = true;
        }
        /* Restore the "not visited" state of every node touched by this search */
        for (auto& v_sete: searched)
            v_sete.node->search_mark = -1;
    }
    // show_fa_with_sxiv_after_dot(sourceFa, {{}, {}}, {});
    {
        /* Now it's time to fill resultFa. Skipping all look one ahead's */
        auto skip_useless = [&](FA_Node* v) -> FA_Node* {
            while (v->type == look_one_ahead){
                v = ((FA_NodeOfLookOneAhead*)v)->nxt_node;
            }
            return v;
        };
        resultFa.start = sourceFa.start;
        /* Pointers (located in resultFa) that still point to sourceFa nodes */
        std::vector<FA_Node**> homework = {&(resultFa.start)};
        /* Indexed by source node id; NULL until the node has been copied into resultFa */
        std::vector<FA_Node*> sourceIdToResNode(sourceFa.all.size(), NULL);
        while (!homework.empty()) {
            FA_Node** vPtr = homework.back(); homework.pop_back();
            FA_Node* right_source_v = skip_useless(*vPtr);
            size_t vid = right_source_v->nodeId;
            if (!sourceIdToResNode[vid]) {
                sourceIdToResNode[vid] = copy_fa_node_to_another_fa(*right_source_v, resultFa);
                for (FA_Node** uuPtr: sourceIdToResNode[vid]->get_all_transitions())
                    homework.push_back(uuPtr);
            }
            *vPtr = sourceIdToResNode[vid];
            sourceIdToResNode[vid]->refs++;
        }
    }
    {
        /* Guessing info.fed_chars_extend_one_left: it is set when a look_one_behind node is reachable
         * from the start through empty transitions */
        size_t done = 0;
        std::vector<FA_Node*> searched;
        searched.push_back(resultFa.start);
        resultFa.start->search_mark = 0;
        while (done < searched.size()){
            if (searched[done]->type == look_one_behind){
                info.fed_chars_extend_one_left = true;
                break;
            }
            for (FA_Node** nxtN: searched[done]->get_all_empty_valid_transitions()){
                if ((**nxtN).search_mark < 0){
                    (**nxtN).search_mark = 0;
                    searched.push_back(*nxtN);
                }
            }
            done++;
        }
        for (FA_Node* d: searched)
            d->search_mark = -1;
    }
    return info;
}
/* Creates a two node tail inside `fa`: a one-char read (second namespace) filtered by `restriction`
 * that leads to a fresh match node. Returns the reading node. */
FA_NodeOfOneCharRead* generate_alt_ending(const codeset_t& restriction, FA_Container& fa){
    FA_NodeOfOneCharRead* reader = fa.makeOneCharRead(restriction, true);
    FA_NodeOfMatch* terminal = fa.makeMatch();
    // Won't actually be used
    terminal->ext_filter_added = true;
    reattach_fa_node_edge(&(reader->nxt_node), terminal);
    return reader;
}
void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa,
const REGEX_IS024_FA_FirstStageFixInfo &info1)
{
assert(resultFa.all.empty() && !resultFa.start);
if (!sourceFa.start)
return;
// todo: rewrite first stage using that cool technique I just invented
resultFa.start = sourceFa.start;
// A vector of pointers in resutFa to nodes that belong to sourceFa. They should undergo a little bit of copying.
std::vector<FA_Node**> homework = {&(resultFa.start)};
// source node id s index. Element is NULL if no copy (in resultFa) exists and resFa node if copying was performed
std::vector<FA_Node*> sourceIdToResNode(sourceFa.all.size(), NULL);
while (!homework.empty()) {
FA_Node** vPtr = homework.back(); homework.pop_back();
FA_Node* sourceV = *vPtr; assert(sourceV);
size_t sourceVId = sourceV->nodeId;
if (!sourceIdToResNode[sourceVId]) {
if (sourceV->type == match) {
FA_NodeOfMatch& mn = dynamic_cast<FA_NodeOfMatch&>(*sourceV);
FA_NodeOfMatch* res_mn = resultFa.makeMatch();
if (mn.ext_filter_added && mn.pending_filter != codeset_of_all) {
assert(info1.fed_chars_extend_one_right);
FA_NodeOfOneCharRead* res_ocr2n = resultFa.makeOneCharRead(mn.pending_filter, true);
reattach_nxt_node(res_ocr2n, res_mn);
sourceIdToResNode[sourceVId] = res_ocr2n;
} else {
sourceIdToResNode[sourceVId] = res_mn;
}
} else {
sourceIdToResNode[sourceVId] = copy_fa_node_to_another_fa(*sourceV, resultFa);
/* O_o */
for (FA_Node** uuPtr: sourceIdToResNode[sourceVId]->get_all_transitions())
homework.push_back(uuPtr);
}
}
*vPtr = sourceIdToResNode[sourceVId];
sourceIdToResNode[sourceVId]->refs++;
}
if (info1.fed_chars_extend_one_left) {
FA_NodeOfOneCharRead* ns = resultFa.makeOneCharRead(codeset_of_all, true);
yay_new_start(resultFa, ns);
}
}

View File

@ -0,0 +1,18 @@
#ifndef LIBREGEXIS024_FA_FIRST_STAGE_FIX_H
#define LIBREGEXIS024_FA_FIRST_STAGE_FIX_H
#include "finite_automaton.h"
/* Summary produced by first_stage_fix_fa() and consumed by the later stages */
struct REGEX_IS024_FA_FirstStageFixInfo{
/* Set when a look_one_behind node is reachable from the start through empty transitions */
bool fed_chars_extend_one_left = false;
/* Set when a lookahead restriction was applied to a match node */
bool fed_chars_extend_one_right = false;
};
/* Will look for look_one_ahead nodes and apply their filter to the reading filters ahead.
 * sourceFa will be ruined. The output will be in resultFa */
REGEX_IS024_FA_FirstStageFixInfo first_stage_fix_fa(FA_Container& sourceFa, FA_Container& resultFa);
/* Copies sourceFa into the empty resultFa, turning pending match filters into explicit reads and
 * prepending a read of any char when info1.fed_chars_extend_one_left is set */
void regular_second_stage_fix(const FA_Container& sourceFa, FA_Container& resultFa,
const REGEX_IS024_FA_FirstStageFixInfo &info1);
#endif //LIBREGEXIS024_FA_FIRST_STAGE_FIX_H

View File

@ -0,0 +1,665 @@
#include <libregexis024fa/fa_make_deterministic.h>
#include <libregexis024fa/misc_fa_funcs.h>
#include <libregexis024vm/utils.h> /* to get exitf */
#include <assert.h>
#include <libregexis024fa/tracking_fa_nodes.h>
#include <vector>
#include <map>
#include <algorithm>
#include <set>
#include <libregexis024fa/colored_codeset.h>
#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD)
#include <debugging_regexis024/prettyprint/prettyprint_util.h>
#include <string>
#include <functional>
#include <stdio.h>
#define PR_DEB
#endif
/* debug nonsence */
void input_fa_assert(const FA_Container& fa){
assert(fa.start);
for (FA_Node* node: fa.all){
if (node->type == one_char_read){
assert(!dynamic_cast<FA_NodeOfOneCharRead*>(node)->second_ns);
} else if (node->type == look_one_ahead ||
node->type == det_char_crossroads){
exitf("not allowed at this stage\n");
}
}
}
/* Edge of the operation history tree: performing tracking operation `op` leads to history node `u` */
struct OperHistoryNodeTransition {
TrackingOperationInFa op;
/* Index of the destination history node */
size_t u;
OperHistoryNodeTransition(const TrackingOperationInFa &op, size_t u): op(op), u(u) {}
};
/* Node of the (not yet cleaned) operation history built by building_detour() */
struct OperHistoryNode {
std::vector<OperHistoryNodeTransition> next;
/* Selarr values (S2 indexing of SelarrCompressionScheme) as they are at this point of history.
 * NOTE(review): the original comment here was left unfinished ("When it is part of clean history, this") */
std::vector<uint64_t> compressed_selarr;
/* Ids of match / one_char_read nodes at which the detour stopped with this history */
std::vector<uint64_t> raisin;
OperHistoryNode() = default;
};
/* State of the determinized automaton: a sorted set of stop nodes (raisin) plus the compressed selarr.
 * This object can also describe an empty superstate (needed to describe clean history nodes without raisin):
 * a superstate is empty exactly when sorted_raisin is empty. */
struct SuperState {
    std::vector<uint64_t> sorted_raisin;
    std::vector<uint64_t> double_compressed_selarr;

    bool empty() const {
        return sorted_raisin.size() == 0;
    }
#ifdef PR_DEB
    /* Debug rendering of both vectors */
    std::string toString() const {
        auto join = [](const std::vector<uint64_t>& values) -> std::string {
            std::string joined;
            for (size_t i = 0; i < values.size(); i++) {
                if (i > 0)
                    joined += ", ";
                joined += std::to_string(values[i]);
            }
            return joined;
        };
        return "sorted_raisin: {" + join(sorted_raisin) + "}, double_comp_selarr: {" +
               join(double_compressed_selarr) + "}";
    }
#endif
};
/* Node of the cleaned operation history (see RaisinBush) */
struct CleanOperHistoryNode {
std::vector<OperHistoryNodeTransition> next;
/* Superstate reached by stopping at this node; empty when the node has no raisin of its own */
SuperState exit;
};
struct SelarrCompressionScheme {
size_t SN1, SN2 = 0, SN3 = 0;
std::vector<int32_t> S1_to_S2;
std::vector<regex_tai_t> S2_to_sifter;
std::vector<regex_tai_t> S3_to_sifter;
const RegexPriorityTable& sifter;
SelarrCompressionScheme(size_t sn1, const RegexPriorityTable &sifter) : SN1(sn1), sifter(sifter) {
assert(sifter.size() <= UINT32_MAX);
S1_to_S2.assign(SN1, -1);
for (regex_tai_t i = 0; i < sifter.size(); i++) {
auto& act = sifter[i].pos;
regex_tai_t first_on_s2 = S2_to_sifter.size();
S2_to_sifter.push_back(i);
S1_to_S2[act.first] = first_on_s2;
if (act.type != tracking_var_types::dot_cur_pos) {
S3_to_sifter.push_back(i);
}
if (act.type == tracking_var_types::range) {
regex_tai_t second_on_s2 = S2_to_sifter.size();
S2_to_sifter.push_back(i);
S1_to_S2[act.second] = second_on_s2;
}
}
SN2 = S2_to_sifter.size();
SN3 = S3_to_sifter.size();
assert(SN3 <= SN2 && SN2 <= SN1 && SN1 <= UINT16_MAX);
}
};
std::vector<uint64_t> compress_compressed_selarr(const std::vector<uint64_t>& S2,
const SelarrCompressionScheme& cmp) {
std::vector<uint64_t> S3(cmp.SN3);
for (size_t i = 0; i < cmp.SN3; i++) {
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos;
if (act.type == tracking_var_types::dot_immediate) {
S3[i] = S2[cmp.S1_to_S2[act.first]];
} else {
assert(act.type == tracking_var_types::range); // It must be range type
uint64_t onFirstBorder = S2[cmp.S1_to_S2[act.first]];
uint64_t onSecondBorder = S2[cmp.S1_to_S2[act.second]];
S3[i] = (onFirstBorder > onSecondBorder) ? 1 : 0;
}
}
return S3;
}
/* Compares two S2-indexed selarrs according to the priority table.
 * Rules are examined in table order; the first rule whose values differ decides:
 * A outranks B when A's value is smaller and the rule minimizes, or A's value is greater and the rule maximizes.
 * For range variables the compared value is the length (second - first, clamped at 0).
 * Returns false when no rule distinguishes A from B. */
bool compressed_selarr_A_outranks_B(const std::vector<uint64_t>& A, const std::vector<uint64_t>& B,
                                    const SelarrCompressionScheme& cmp) {
    for (const RegexPriorityTableAction& act: cmp.sifter) {
        uint64_t valA = A[cmp.S1_to_S2[act.pos.first]];
        uint64_t valB = B[cmp.S1_to_S2[act.pos.first]];
        if (act.pos.type == tracking_var_types::range) {
            uint64_t valAsecond = A[cmp.S1_to_S2[act.pos.second]];
            /* Second border of B must be read from B (it used to be read from A by mistake) */
            uint64_t valBsecond = B[cmp.S1_to_S2[act.pos.second]];
            valA = valAsecond > valA ? valAsecond - valA : 0;
            valB = valBsecond > valB ? valBsecond - valB : 0;
        }
        if (valA == valB)
            continue;
        return (valA < valB) == act.minimize;
    }
    return false;
}
/* Cleaned operation history: only the history nodes that lead to some raisin are kept.
 * Filled by wash_history_bush(). `start` is the index of the root inside clean_history, or -1 when
 * the original history contained no raisin (in that case the bush is empty).
 * Author's note: rankdir is from left to right (guaranteed). */
struct RaisinBush {
    std::vector<CleanOperHistoryNode> clean_history;
    ssize_t start = -1;

    bool empty() const {
        return start < 0;
    }
#ifdef PR_DEB
    /* Debug dump of the bush as a boxed tree */
    void print() {
        lines text;
        text.push_back("Raisin bush");
        if (start >= 0) {
            size_t n = clean_history.size();
            /* m[v] is set once node v has been printed */
            std::vector<bool> m(n, false);
            TreeWithStringsNode e{""};
            std::function<void(TreeWithStringsNode&, size_t)> dfs = [&]
                    (TreeWithStringsNode& fill, size_t nodeId)
            {
                if (m[nodeId]) {
                    /* A node reached twice means the bush is not a tree */
                    fill.val = "PARADOX";
                    return;
                }
                m[nodeId] = true;
                const CleanOperHistoryNode& node = clean_history[nodeId];
                fill.val = "[" + std::to_string(nodeId) + "]";
                if (!node.exit.empty())
                    fill.val += (" EXIT: " + node.exit.toString());
                size_t CN = node.next.size();
                fill.childeren.resize(CN);
                for (size_t i = 0; i < CN; i++) {
                    fill.childeren[i].val = node.next[i].op.toString();
                    fill.childeren[i].childeren = {{}};
                    dfs(fill.childeren[i].childeren[0], node.next[i].u);
                }
            };
            dfs(e, start);
            size_t am = 0;
            for (bool el: m)
                am += static_cast<size_t>(el);
            if (am < n)
                text[0] += ": " + std::to_string(n - am) + " nodes are unreachable by detour";
            e.toLines(text);
        } else {
            if (clean_history.empty())
                text[0] = "Empty Raisin Bush";
            else
                /* '+' here: the former '=' assigned to the temporary and left only " nodes missed" as the text */
                text[0] = "Raisin bush with no root and " + std::to_string(clean_history.size()) + " nodes missed";
        }
        printLines(wrapWithBox(text));
    }
#endif
};
/* Builds the cleaned history (`answer`) out of the raw history tree rooted at history[0]:
 * a node is kept only if it has raisin itself or one of its descendants does.
 * The raw tree is walked with an explicit stack (no recursion). */
void wash_history_bush(const std::vector<OperHistoryNode>& history, RaisinBush& answer,
const SelarrCompressionScheme& cmp) {
assert(!history.empty());
/* has_raisin[v]: node v is kept; dirty_to_clean[v]: its index in answer.clean_history (or -1) */
std::vector<bool> has_raisin(history.size());
std::vector<ssize_t> dirty_to_clean(history.size(), -1);
/* (node, number of children already entered) */
std::vector<std::pair<size_t, size_t> > callStack = {{0, 0}};
/* Returns the clean id of v, allocating a clean node on first use */
auto hist_clean_detour_init_clean = [&](uint64_t v) -> uint64_t {
if (!has_raisin[v]) {
has_raisin[v] = true;
dirty_to_clean[v] = answer.clean_history.size();
answer.clean_history.emplace_back();
}
return dirty_to_clean[v];
};
while (!callStack.empty()) {
size_t v = callStack.back().first;
size_t od = callStack.back().second;
if (od == 0) {
/* First visit of v: record its own raisin as the exit superstate */
if (!history[v].raisin.empty()) {
size_t cleanVId = hist_clean_detour_init_clean(v);
std::vector<uint64_t>& sr = answer.clean_history[cleanVId].exit.sorted_raisin;
sr = history[v].raisin;
std::sort(sr.begin(), sr.end());
answer.clean_history[cleanVId].exit.double_compressed_selarr = compress_compressed_selarr(history[v].compressed_selarr, cmp);
}
} else {
/* Back from child number od - 1: keep the edge only if the child was kept */
const OperHistoryNodeTransition& old_hist_tr = history[v].next[od - 1];
uint64_t ou = old_hist_tr.u;
if (has_raisin[ou]) {
size_t cleanVId = hist_clean_detour_init_clean(v);
answer.clean_history[cleanVId].next.emplace_back(old_hist_tr.op, dirty_to_clean[ou]);
}
}
if (od == history[v].next.size()) {
callStack.pop_back();
} else {
callStack.back().second++;
callStack.emplace_back(history[v].next[od].u, 0);
}
}
/* The root is kept iff anything was kept at all */
if (has_raisin[0]) {
assert(dirty_to_clean[0] >= 0);
answer.start = dirty_to_clean[0];
}
}
/* Explores everything reachable from the nodes of `zeroeps` through empty transitions, for one class of
 * input `I`, and stores in `answer` the cleaned tree of tracking operations that leads to the
 * match / one_char_read nodes where the exploration stops.
 * outer_selarr is the S3-indexed selarr of the superstate the exploration starts from.
 * If is_it_after_read is false, unknown selarr range variable border and cur pos are evaluated to 0.
 * Otherwise, cur pos is considered to be greater than previous values of selarr range variable boundaries */
void building_detour(const SelarrCompressionScheme& cmp,
const std::vector<uint64_t>& outer_selarr, const std::vector<FA_Node*>& zeroeps, const codeset_t& I,
RaisinBush& answer, bool is_it_after_read)
{
#ifdef PR_DEB
printf("Det Debug: build_detour started with zeroeps:{");
for (FA_Node* node: zeroeps)
printf("%lu,", node->nodeId);
printf("}, I: {%s}\n", stringifyCodesetBase10(I).c_str());
#endif
assert(cmp.SN3 == outer_selarr.size());
if (!is_it_after_read)
for (uint64_t val: outer_selarr)
assert(val == 0);
/* Bookkeeping for one visited node; search_mark of the node is its index in `marks` */
struct SearchMark {
FA_Node* domain_node;
/* Number of empty transitions (found by this detour) that point to the node */
uint64_t epsilon_refs = 0;
/* Number of those transitions that have been walked already in the second pass */
uint64_t detour_sat = 0;
/* id of corresponding history node */
size_t Hv = 0;
explicit SearchMark(FA_Node *domain_node) : domain_node(domain_node) {}
};
/* Default values are good for me */
std::vector<SearchMark> marks;
/* NOTE(review): if zeroeps contains the same node twice, it gets two marks and is processed twice
 * by the second pass - confirm callers never pass duplicates */
for (size_t i = 0; i < zeroeps.size(); i++) {
marks.emplace_back(zeroeps[i]);
zeroeps[i]->search_mark = i;
}
/* A look_one_behind node lets the detour through iff its filter intersects I
 * (I is then expected to lie entirely inside the filter) */
auto lob_allows_to_pass = [&](FA_NodeOfLookOneBehind* lob) -> bool {
if (!intersect_sets(lob->filter, I).empty()) {
assert(merge_sets(lob->filter, I) == lob->filter);
return true;
}
return false;
};
{ /* First I need to know exactly how many of MINE epsilon transitions are referencing each NODE */
std::vector<FA_Node*> domain_detour = zeroeps;
while (!domain_detour.empty()) {
FA_Node* v = domain_detour.back(); domain_detour.pop_back();
if (v->type == look_one_behind && !lob_allows_to_pass(dynamic_cast<FA_NodeOfLookOneBehind*>(v)))
continue;
for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) {
assert(*uPtr);
int64_t &rds = (**uPtr).search_mark;
if (rds == -1) {
rds = marks.size();
domain_detour.push_back(*uPtr);
marks.emplace_back(*uPtr);
}
marks[rds].epsilon_refs++;
}
}
}
/* History root: its S2-indexed selarr is reconstructed from the S3-indexed outer_selarr */
std::vector<OperHistoryNode> history = {OperHistoryNode()};
history[0].compressed_selarr.assign(cmp.SN2, 0);
for (size_t i = 0; i < cmp.SN3; i++) {
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S3_to_sifter[i]].pos;
if (act.type == tracking_var_types::range) {
if (outer_selarr[i]) {
history[0].compressed_selarr[cmp.S1_to_S2[act.second]] = 1;
}
} else {
assert(act.type == tracking_var_types::dot_immediate);
history[0].compressed_selarr[cmp.S1_to_S2[act.first]] = outer_selarr[i];
}
}
/* As a result, dot_cur_pos variables will be initialized as zero (always) */
/* In my second detour, I will pass each vertex here only one time: after hitting the total epsilon refcount */
std::vector<FA_Node*> can_process = zeroeps;
/*
auto increase_sat_refcount = [&](SearchMark& mark) {
mark.detour_sat++;
if (mark.detour_sat == mark.epsilon_refs && mark.ever_walked_in) {
can_process.push_back(mark.domain_node);
}
};
*/
/* Adds edge from_where --how--> where to the history tree */
auto add_history_update = [&](TrackingOperationInFa how, uint64_t where, uint64_t from_where) {
history[from_where].next.emplace_back(how, where);
};
while (!can_process.empty()) {
FA_Node* v = can_process.back(); can_process.pop_back();
SearchMark& Vmark = marks[v->search_mark];
assert(Vmark.detour_sat == Vmark.epsilon_refs);
/* Hv: history node v is entered with; Hop: history node v is left with */
uint64_t Hv = Vmark.Hv;
uint64_t Hop = Hv;
if (v->type == look_one_behind) {
FA_NodeOfLookOneBehind* tv = dynamic_cast<FA_NodeOfLookOneBehind*>(v);
if (!lob_allows_to_pass(tv))
continue;
} else if (isTrackingFaNode(v)) {
/* A tracking operation creates a new history node with an updated copy of the selarr */
Hop = history.size();
history.emplace_back();
std::vector<uint64_t>& val2 = history.back().compressed_selarr;
val2 = history[Hv].compressed_selarr;
if (v->type == track_array_mov_imm) {
FA_NodeOfTrackArrayMovImm* tv = dynamic_cast<FA_NodeOfTrackArrayMovImm*>(v);
if (isSelarrOpcode(tv->operation)) {
int key_s2 = cmp.S1_to_S2[tv->key];
if (key_s2 >= 0){
assert(cmp.sifter[cmp.S2_to_sifter[key_s2]].pos.type == tracking_var_types::dot_immediate);
val2[key_s2] = tv->imm_value;
}
}
add_history_update(TrackingOperationInFa(tv->operation, tv->key, tv->imm_value), Hop, Hv);
} else if (v->type == track_array_mov_halfinvariant) {
FA_NodeOfTrackArrayMovHalfinvariant* tv = dynamic_cast<FA_NodeOfTrackArrayMovHalfinvariant*>(v);
if (isSelarrOpcode(tv->operation)) {
int key_s2 = cmp.S1_to_S2[tv->key];
if (key_s2 >= 0){
const RegexPriorityTableAction_Pos& act = cmp.sifter[cmp.S2_to_sifter[key_s2]].pos;
assert(act.type != tracking_var_types::dot_immediate);
/* The current position is only known symbolically: 0 before any read,
 * 1 (dot_cur_pos) or 2 (range border) after a read */
if (act.type == tracking_var_types::dot_cur_pos) {
val2[key_s2] = is_it_after_read ? 1 : 0;
} else {
val2[key_s2] = is_it_after_read ? 2 : 0;
}
}
}
add_history_update(TrackingOperationInFa(tv->operation, tv->key), Hop, Hv);
}
} else if (v->type == match || v->type == one_char_read) {
// Determinization stop
history[Hv].raisin.push_back(v->nodeId);
}
for (FA_Node** uPtr: v->get_all_empty_valid_transitions()) {
assert(*uPtr);
SearchMark& Umark = marks[(**uPtr).search_mark];
/* Here I use Hop to determine Hv value of u */
if (Umark.detour_sat == 0) {
Umark.Hv = Hop;
} else if (Umark.Hv != Hop) {
/* Several histories lead to u: the one preferred by the priority table wins */
if (compressed_selarr_A_outranks_B(
history[Hop].compressed_selarr, history[Umark.Hv].compressed_selarr, cmp)){
Umark.Hv = Hop;
}
}
/* Collision calculation finished */
Umark.detour_sat++;
if (Umark.detour_sat == Umark.epsilon_refs) {
can_process.push_back(Umark.domain_node);
}
}
}
/* Cleaning this mess */
for (auto& m: marks) {
m.domain_node->search_mark = -1;
}
/* Packaging the answer (we do a little bit of dfs here) */
wash_history_bush(history, answer, cmp);
}
void update_had_to_fork_status(const RaisinBush& bush, int& had_to_fork) {
for (const CleanOperHistoryNode& node: bush.clean_history) {
if (node.next.size() > 1 || (!node.next.empty() && !node.exit.empty())) {
had_to_fork = 1;
return;
}
}
}
/* Id of a superstate: the value stored in GlobalDetourProgress::superstates */
typedef size_t superstate_id_t;
/* Postponed links: the pointer at .first has to be attached to the megabush of superstate .second */
typedef std::vector<std::pair<FA_Node**, superstate_id_t>> homework_t;
/* Strict weak ordering of superstates: lexicographic by sorted_raisin, ties broken lexicographically
 * by double_compressed_selarr. */
struct LessSuperState {
    bool operator()(const SuperState& A, const SuperState& B) const {
        if (A.sorted_raisin != B.sorted_raisin)
            return A.sorted_raisin < B.sorted_raisin;
        return A.double_compressed_selarr < B.double_compressed_selarr;
    }
};
/* Shared state of the determinization loop */
struct GlobalDetourProgress {
/* Every superstate discovered so far, mapped to its id */
std::map<SuperState, superstate_id_t, LessSuperState> superstates;
/* Indexed by superstate id. Each element is a root of some megabush in resFa (NULL until it is built) */
std::vector<FA_Node*> superstate_megabush_constructed;
/* Discovered superstates whose megabush has not been built yet */
std::vector<SuperState> todo_superstaes;
};
/* Returns the id of superstate x. A superstate seen for the first time gets the next free id,
 * is queued on the todo list of the global detour and gets a NULL megabush slot. */
superstate_id_t convertSuperstateToId(const SuperState& x, GlobalDetourProgress& gdp) {
    auto known = gdp.superstates.find(x);
    if (known != gdp.superstates.end())
        return known->second;
    const superstate_id_t fresh_id = gdp.superstates.size();
    gdp.superstates.emplace(x, fresh_id);
    gdp.todo_superstaes.push_back(x);
    gdp.superstate_megabush_constructed.push_back(NULL);
    return fresh_id;
}
/* A dead end is represented by a freshly made forking node (no options are attached to it here) */
FA_Node* build_dead_end(FA_Container& resFa) {
    FA_NodeOfForking* optionless_fork = resFa.makeForking();
    return optionless_fork;
}
/* Materializes bush `alpha` as nodes of resFa and attaches its root to *sowing_location.
 * Tracking operations become path nodes, branching history nodes become forking nodes.
 * Exits to superstates are not linked right away: they are recorded in `homework`
 * (and the superstate is registered in gdp) to be linked once all megabushes exist.
 * An empty bush is materialized as a dead end. */
void build_bush(const RaisinBush& alpha, FA_Node** sowing_location, FA_Container& resFa,
homework_t& homework, GlobalDetourProgress& gdp) {
size_t n = alpha.clean_history.size();
if (n == 0) {
FA_Node* dead_end = build_dead_end(resFa);
reattach_fa_node_edge(sowing_location, dead_end);
return;
}
/* (pointer that has to receive the subtree, clean history node that roots the subtree) */
std::vector<std::pair<FA_Node**, size_t>> todo = {{sowing_location, alpha.start}};
while (!todo.empty()) {
FA_Node** sl = todo.back().first;
const CleanOperHistoryNode& hnode = alpha.clean_history[todo.back().second];
todo.pop_back();
/* Plants the node of the i-th operation at of_sl and schedules the subtree behind it */
auto history_transition = [&](size_t i, FA_Node** of_sl) {
FA_NodePathPart* pn = convert_to_node(hnode.next[i].op, resFa);
reattach_fa_node_edge(of_sl, pn);
todo.emplace_back(&(pn->nxt_node), hnode.next[i].u);
};
if (hnode.next.empty()) {
/* Leaf: nothing but the exit */
assert(!hnode.exit.empty());
superstate_id_t w = convertSuperstateToId(hnode.exit, gdp);
homework.emplace_back(sl, w);
} else if (hnode.next.size() == 1 && hnode.exit.empty()) {
/* Exactly one way to continue: no forking node is needed */
history_transition(0, sl);
} else {
FA_NodeOfForking* forker = resFa.makeForking();
bool raisin = !hnode.exit.empty();
size_t k = hnode.next.size();
/* nxt_options gets its final size here: pointers to its elements are stored in todo / homework */
forker->nxt_options.assign(k + static_cast<size_t>(raisin), NULL);
for (size_t i = 0; i < k; i++) {
history_transition(i, &(forker->nxt_options[i]));
}
if (raisin) {
superstate_id_t w = convertSuperstateToId(hnode.exit, gdp);
homework.emplace_back(&(forker->nxt_options[k]), w);
}
reattach_fa_node_edge(sl, forker);
}
}
}
/* Builds a ColoredCodeset that is already divided by every distinct look_one_behind filter of sourceFa.
 * All these divisors are registered as dummy requests. */
ColoredCodeset get_pretreated_cc(FA_Container& sourceFa) {
    std::set<codeset_t> distinct_filters;
    for (FA_Node* node: sourceFa.all) {
        if (node->type != look_one_behind)
            continue;
        distinct_filters.insert(static_cast<FA_NodeOfLookOneBehind*>(node)->filter);
    }
    ColoredCodeset result(distinct_filters.size());
    for (const codeset_t& filter: distinct_filters)
        result.apply_divisor(filter);
    return result;
}
// todo add a check on size of dfa
/* Builds in resFa (which must be empty) a determinized version of sourceFa.
 * sifter / selarr_sz describe the priority table and the size of selarr,
 * info1 is the result of the first stage fix.
 * `error` is only ever set to 0 by the code below.
 * `had_to_fork` is set to 1 when some produced bush contains a branching node, 0 otherwise. */
void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz,
const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork)
{
/* During execution, I will create pointers to field res.start and store them (inside the scope of this function)
* Luckily res argument is already immovable in this scope. */
error = 0;
had_to_fork = 0;
assert(resFa.start == NULL && resFa.all.empty());
input_fa_assert(sourceFa);
SelarrCompressionScheme cmp(selarr_sz, sifter);
GlobalDetourProgress gdp;
homework_t homework;
ColoredCodeset pretreated_cc = get_pretreated_cc(sourceFa);
FA_Node** res_start_ptr = &(resFa.start);
if (info1.fed_chars_extend_one_left) {
/* The char to the left of the input is fed first: start with a crossroads (second namespace)
 * that distinguishes the classes of that char and grow a separate initial bush for each class */
ColoredCodeset inp_distinction = pretreated_cc;
inp_distinction.apply_divisor(codeset_of_all);
std::vector<codeset_t> starting_Is;
std::vector<std::vector<size_t>> starting_Cids; /* Filler variable */
inp_distinction.get_splits_of_non_dummy(starting_Is, starting_Cids);
size_t R = starting_Is.size();
for (auto& rdh: starting_Cids) {
assert(rdh.size() == 1 && rdh[0] == 0);
}
FA_NodeOfDetCharCrossroads* very_first_cr = resFa.makeDetCharCrossroads();
very_first_cr->second_ns = true;
reattach_fa_node_edge(res_start_ptr, very_first_cr);
very_first_cr->crossroads.resize(R); /* After that, nobody has right to resize crossroads array */
for (size_t i = 0; i < R; i++) {
very_first_cr->crossroads[i].input = starting_Is[i];
FA_Node** sowing_place = &(very_first_cr->crossroads[i].nxt_node);
RaisinBush alpha;
building_detour(cmp, std::vector<uint64_t>(cmp.SN3, 0), {sourceFa.start}, starting_Is[i], alpha, false);
#ifdef PR_DEB
printf("Initialization hard %ld/%ld\n", i + 1, R);
alpha.print();
#endif
update_had_to_fork_status(alpha, had_to_fork);
build_bush(alpha, sowing_place, resFa, homework, gdp);
}
} else {
/* No left extension: a single initial bush grown right at the start */
RaisinBush alpha;
building_detour(cmp, std::vector<uint64_t>(cmp.SN3, 0), {sourceFa.start}, codeset_of_all, alpha, false);
#ifdef PR_DEB
printf("Initialization easy\n");
alpha.print();
#endif
update_had_to_fork_status(alpha, had_to_fork);
build_bush(alpha, res_start_ptr, resFa, homework, gdp);
}
/* Now we start the actual detour. */
while (!gdp.todo_superstaes.empty()) {
SuperState SS = gdp.todo_superstaes.back(); gdp.todo_superstaes.pop_back();
// printf("Global detour turn: %s\n", SS.toString().c_str());
/* Split the stops of the superstate into reading nodes and match nodes */
std::vector<FA_NodeOfOneCharRead*> reading_stops;
/* Union of the filters under which the superstate may finish (everything, if some match has no filter) */
codeset_t how_can_i_finish = {};
for (size_t v: SS.sorted_raisin) {
FA_Node* node = sourceFa.all[v];
if (node->type == one_char_read) {
reading_stops.push_back(static_cast<FA_NodeOfOneCharRead*>(node));
} else if (node->type == match) {
auto fn = static_cast<FA_NodeOfMatch*>(node);
assert(!fn->ext_filter_added || info1.fed_chars_extend_one_right);
if (fn->ext_filter_added) {
how_can_i_finish = merge_sets(how_can_i_finish, fn->pending_filter);
} else {
how_can_i_finish = codeset_of_all;
}
} else
assert(false);
}
// Determinization stop: one char read (input)
/* Request number i of inp_distinction (after the dummy ones) is the filter of reading_stops[i] */
ColoredCodeset inp_distinction = pretreated_cc;
size_t pr = reading_stops.size();
for (size_t i = 0; i < pr; i++) {
inp_distinction.apply_divisor(reading_stops[i]->filter);
}
std::vector<codeset_t> Is;
std::vector<std::vector<size_t>> Cids;
inp_distinction.get_splits_of_non_dummy(Is, Cids);
size_t R = Is.size();
FA_NodeOfDetCharCrossroads* my_cr = NULL;
if (R > 0) {
my_cr = resFa.makeDetCharCrossroads();
if (!info1.fed_chars_extend_one_right && !how_can_i_finish.empty()) {
assert(how_can_i_finish == codeset_of_all);
my_cr->matching = true;
}
my_cr->crossroads.resize(R);
}
for (size_t i = 0; i < R; i++) {
my_cr->crossroads[i].input = Is[i];
my_cr->crossroads[i].nxt_node = NULL;
/* Nodes that follow the reading stops whose filter accepts input class Is[i] */
std::vector<FA_Node*> fl_passed_filters;
for (size_t j: Cids[i]) {
fl_passed_filters.push_back(reading_stops[j]->nxt_node);
}
// todo: make a function out of next 6 lines of code
RaisinBush alpha;
building_detour(cmp, SS.double_compressed_selarr, fl_passed_filters, Is[i], alpha, true);
#ifdef PR_DEB
printf("That same turn, subbush %ld/%ld\n", i + 1, R);
alpha.print();
#endif
update_had_to_fork_status(alpha, had_to_fork);
build_bush(alpha, &(my_cr->crossroads[i].nxt_node), resFa, homework, gdp);
}
// Determinization stop: match (finish)
FA_Node* finish_route = NULL;
if (!how_can_i_finish.empty() && (info1.fed_chars_extend_one_right || R == 0)) {
FA_NodeOfMatch* matcher = resFa.makeMatch();
finish_route = matcher;
if (info1.fed_chars_extend_one_right) {
FA_NodeOfOneCharRead* right_ext_read = resFa.makeOneCharRead(how_can_i_finish, true);
reattach_nxt_node(right_ext_read, matcher);
finish_route = right_ext_read;
}
}
// Combining these two cases
/* NOTE(review): this presumably fails for a superstate that has no match stop and whose reading
 * filters yield no input class (R == 0); without asserts endsUp would then stay NULL - confirm
 * that such superstates cannot occur */
assert(finish_route || my_cr);
FA_Node*& endsUp = gdp.superstate_megabush_constructed[gdp.superstates[SS]];
if (!finish_route) {
endsUp = my_cr;
} else if (!my_cr) {
endsUp = finish_route;
} else {
FA_NodeOfForking* F = resFa.makeForking();
F->nxt_options = {NULL, NULL};
reattach_fa_node_edge(&(F->nxt_options[0]), my_cr);
reattach_fa_node_edge(&(F->nxt_options[1]), finish_route);
endsUp = F;
}
}
/* Now it's time to do the homework: link all megabushes */
for (auto& p: homework) {
reattach_fa_node_edge(p.first, gdp.superstate_megabush_constructed[p.second]);
}
}

View File

@ -0,0 +1,10 @@
#ifndef LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H
#define LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H
#include <libregexis024fa/fa_first_stage_fix.h>
#include <libregexis024fa/selarr_priority_table.h>
/* Builds in the empty resFa a determinized version of sourceFa.
 * sifter and selarr_sz describe the selarr priority table and the size of selarr,
 * info1 is the output of first_stage_fix_fa().
 * had_to_fork is set to 1 if forking could not be avoided, to 0 otherwise. */
void try_determinize_fa(FA_Container &sourceFa, const RegexPriorityTable &sifter, regex_tai_t selarr_sz,
const REGEX_IS024_FA_FirstStageFixInfo &info1, FA_Container &resFa, int &error, int& had_to_fork);
#endif //LIBREGEXIS024_FA_MAKE_DETERMINISTIC_H

View File

@ -0,0 +1,141 @@
#include <libregexis024fa/finite_automaton.h>
#include <libregexis024vm/utils.h>
#include <assert.h>
/* A node is "empty" unless it is a one_char_read or a det_char_crossroads node */
bool FA_Node::empty() {
    const bool is_reading_kind = (type == one_char_read || type == det_char_crossroads);
    return !is_reading_kind;
}
/* Default implementation: the node ignores lookahead restrictions */
void FA_Node::apply_lookahead_restriction(const codeset_t &restriction) {
    /* intentionally does nothing */
}
void FA_Node::reAdd_references() {
for (FA_Node** nxtPtr: get_all_transitions()){
if (*nxtPtr)
(**nxtPtr).refs++;
}
}
/* Default implementation: the node has no outgoing transitions */
std::vector<FA_Node **> FA_Node::get_all_transitions() {
    return std::vector<FA_Node **>();
}
/* Default implementation: the node has no empty transitions */
std::vector<FA_Node **> FA_Node::get_all_empty_valid_transitions() {
    return std::vector<FA_Node **>();
}
std::vector<FA_Node **> FA_NodePathPart::get_all_transitions() {
return {&nxt_node};
}
std::vector<FA_Node **> FA_NodePathPart::get_all_empty_valid_transitions() {
if (nxt_node)
return {&nxt_node};
return {};
}
/* Tags the node as a match node */
FA_NodeOfMatch::FA_NodeOfMatch() {
    type = match;
}
void FA_NodeOfMatch::apply_lookahead_restriction(const codeset_t &restriction) {
ext_filter_added = true;
pending_filter = restriction;
}
/* Reading node that accepts chars from `filter`; second_namespace is stored in second_ns */
FA_NodeOfOneCharRead::FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace)
    : filter(filter), second_ns(second_namespace)
{
    type = one_char_read;
}
void FA_NodeOfOneCharRead::apply_lookahead_restriction(const codeset_t &restriction) {
filter = intersect_sets(filter, restriction);
}
/* The transition of a reading node consumes a char, so it has no empty transitions */
std::vector<FA_Node **> FA_NodeOfOneCharRead::get_all_empty_valid_transitions() {
    return std::vector<FA_Node **>();
}
/* Tags the node as a forking node */
FA_NodeOfForking::FA_NodeOfForking() {
    type = forking;
}
std::vector<FA_Node **> FA_NodeOfForking::get_all_empty_valid_transitions() {
std::vector<FA_Node**> res;
for (size_t i = 0; i < nxt_options.size(); i++)
if (nxt_options[i])
res.push_back(&nxt_options[i]);
return res;
}
std::vector<FA_Node **> FA_NodeOfForking::get_all_transitions() {
std::vector<FA_Node**> res;
for (size_t i = 0; i < nxt_options.size(); i++)
res.push_back(&nxt_options[i]);
return res;
}
/* Look-one-behind node with the given filter */
FA_NodeOfLookOneBehind::FA_NodeOfLookOneBehind(const codeset_t &filter)
    : filter(filter)
{
    type = look_one_behind;
}
/* Look-one-ahead node with the given restriction */
FA_NodeOfLookOneAhead::FA_NodeOfLookOneAhead(const codeset_t &restriction)
    : restriction(restriction)
{
    type = look_one_ahead;
}
/* Tracking node that stores operation, key and the immediate value */
FA_NodeOfTrackArrayMovImm::FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue)
    : operation(operation), key(key), imm_value(immValue)
{
    type = track_array_mov_imm;
}
//
/* Tracking node that stores operation and key (no immediate value) */
FA_NodeOfTrackArrayMovHalfinvariant::FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key)
    : operation(operation), key(key)
{
    type = track_array_mov_halfinvariant;
}
//
/* Lookahead restrictions are not supported on DFA crossroad nodes: the call is reported through
 * exitf and the restriction itself is never looked at. */
void FA_NodeOfDetCharCrossroads::apply_lookahead_restriction(const codeset_t &restriction) {
    (void)restriction;
    exitf("What?? Oh, no, no. I am NOT doing it");
}
/* Copies the list of crossroad paths and tags the node as `det_char_crossroads`. */
FA_NodeOfDetCharCrossroads::FA_NodeOfDetCharCrossroads(const std::vector<DFA_CrossroadPath> &crossroads)
    : crossroads(crossroads)
{
    this->type = det_char_crossroads;
}
/* A crossroad node reports no edges here. */
std::vector<FA_Node **> FA_NodeOfDetCharCrossroads::get_all_empty_valid_transitions() {
    std::vector<FA_Node **> no_edges;
    return no_edges;
}
std::vector<FA_Node **> FA_NodeOfDetCharCrossroads::get_all_transitions() {
std::vector<FA_Node**> res;
for (auto& tr: crossroads)
res.push_back(&tr.nxt_node);
return res;
}
/* Hands `node` over to the container.
 * - On success the node gets nodeId == its index in `all`, stays valid for the caller to use, and is
 *   guaranteed to be deleted when the FA_Container is destroyed.
 * - If storing the pointer throws (std::bad_alloc or anything else push_back may raise), the node is
 *   deleted (which means the caller's pointer is invalidated) and the exception is rethrown.
 */
void FA_Container::registerNew(FA_Node *node) {
    try {
        node->nodeId = (int64_t)all.size();
        all.push_back(node);
    } catch (...) {
        /* Catch everything, not only std::bad_alloc: whatever went wrong, nobody else owns the node yet */
        delete node;
        throw;
    }
}
/* Pairs an input character set with the node that this input leads to. */
DFA_CrossroadPath::DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node)
    : input(input),
      nxt_node(nxt_node)
{
}
//
/* Deletes every node that was ever registered in `all`. */
FA_Container::~FA_Container() {
    for (size_t i = 0; i < all.size(); i++)
        delete all[i];
}
/* Generates FA_Container::make<Kind>(...): allocate an FA_NodeOf<Kind>, pass it to registerNew
 * (which takes care of the node if registration fails) and return the raw pointer.
 *   kind      - suffix shared by the node struct and the factory name
 *   arg_decls - parameter list of the factory
 *   ctor_args - arguments forwarded to the node constructor
 */
#define FA_DEFINE_NODE_MAKER(kind, arg_decls, ctor_args) \
    FA_NodeOf ## kind *FA_Container::make ## kind(arg_decls) { \
        FA_NodeOf ## kind *fresh = new FA_NodeOf ## kind(ctor_args); \
        registerNew(fresh); \
        return fresh; \
    }
/* Allows a single macro argument to contain commas */
#define COMMA ,

FA_DEFINE_NODE_MAKER(Match, , )
FA_DEFINE_NODE_MAKER(OneCharRead,
    const codeset_t& filter COMMA bool second_namespace,
    filter COMMA second_namespace)
FA_DEFINE_NODE_MAKER(Forking, , )
FA_DEFINE_NODE_MAKER(LookOneBehind, const codeset_t& filter, filter)
FA_DEFINE_NODE_MAKER(LookOneAhead, const codeset_t& filter, filter)
FA_DEFINE_NODE_MAKER(TrackArrayMovImm,
    regex024_opcode operation COMMA uint16_t key COMMA uint64_t immValue,
    operation COMMA key COMMA immValue)
FA_DEFINE_NODE_MAKER(TrackArrayMovHalfinvariant,
    regex024_opcode operation COMMA uint16_t key,
    operation COMMA key)
/* A crossroads node starts with an empty list of paths */
FA_DEFINE_NODE_MAKER(DetCharCrossroads, ,{})

View File

@ -0,0 +1,149 @@
#ifndef LIBREGEXIS024_FINITE_AUTOMATON_H
#define LIBREGEXIS024_FINITE_AUTOMATON_H
#include <stdint.h>
#include <vector>
#include <libregexis024fa/codeset.h>
#include <libregexis024vm/vm_opcodes.h>
/* Discriminator stored in FA_Node::type. Each concrete node struct assigns its own value in its constructor. */
enum FA_Node_type: uint8_t {
/* FA_NodeOfMatch */
match,
/* FA_NodeOfOneCharRead */
one_char_read,
/* FA_NodeOfForking */
forking,
/* FA_NodeOfLookOneBehind */
look_one_behind,
/* FA_NodeOfLookOneAhead */
look_one_ahead,
/* FA_NodeOfTrackArrayMovImm */
track_array_mov_imm,
/* FA_NodeOfTrackArrayMovHalfinvariant */
track_array_mov_halfinvariant,
/* Used for DFA (FA_NodeOfDetCharCrossroads) */
det_char_crossroads,
};
/* Common base of all automaton nodes. Nodes registered in an FA_Container are deleted by its destructor. */
struct FA_Node{
/* Reference counter; adjusted by the edge helpers (reattach_fa_node_edge, add_option_to_fork_node,
 * yay_new_start), not by the node itself */
size_t refs = 0;
/* If node is not in searched subset (at least yet), `search mark == -1`, otherwise
 * it is an index (for that particular node) in the vector that captures all nodes in
 * searched subset*/
int64_t search_mark = -1;
/* Not initialized here: every derived constructor assigns its own FA_Node_type value */
FA_Node_type type;
/* Not initialized here: FA_Container::registerNew sets it to the node's index in FA_Container::all */
int64_t nodeId;
/* NOTE(review): defined outside of this view; semantics not documented here */
bool empty();
/* Base version returns an empty vector. Overrides return pointers to edge slots inside the node;
 * presumably only the non-NULL edges that can be followed without reading a character - TODO confirm */
virtual std::vector<FA_Node**> get_all_empty_valid_transitions();
virtual void apply_lookahead_restriction(const codeset_t &restriction);
/* NOTE(review): defined outside of this view; copy_fa_node calls it right after registering a copy */
void reAdd_references();
virtual ~FA_Node() = default;
/* Base version returns an empty vector. Overrides return pointers to every edge slot, NULL ones included */
virtual std::vector<FA_Node**> get_all_transitions();
};
/* Base for nodes that have exactly one successor */
struct FA_NodePathPart: public FA_Node{
/* The only outgoing edge; NULL while not attached */
FA_Node* nxt_node = NULL;
/* Returns {&nxt_node} if nxt_node is non-NULL, otherwise nothing */
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
/* Always returns {&nxt_node} */
std::vector<FA_Node **> get_all_transitions() override;
};
/* .type == match */
struct FA_NodeOfMatch: public FA_Node{
/* Set to true once apply_lookahead_restriction has been called on this node */
bool ext_filter_added = false;
/* The restriction passed to the latest apply_lookahead_restriction call */
codeset_t pending_filter;
explicit FA_NodeOfMatch();
/* Stores the restriction in pending_filter and raises ext_filter_added */
void apply_lookahead_restriction(const codeset_t &restriction) override;
};
/* .type == one_char_read */
struct FA_NodeOfOneCharRead: public FA_NodePathPart{
/* Set of characters this node lets through */
codeset_t filter;
/* Selects which of the two read-slot namespaces the bytecode compiler uses for this node */
bool second_ns = false;
FA_NodeOfOneCharRead(const codeset_t &filter, bool second_namespace);
/* Replaces filter with its intersection with the restriction */
void apply_lookahead_restriction(const codeset_t &restriction) override;
/* Always returns an empty vector */
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
};
/* .type == forking */
struct FA_NodeOfForking: public FA_Node{
/* Won't be modified after init (in regexp compilation into NFA) */
std::vector<FA_Node*> nxt_options;
/* NOTE(review): not used anywhere in the visible code; meaning to be confirmed */
int64_t stopId = -1;
explicit FA_NodeOfForking();
/* Returns pointers to the non-NULL entries of nxt_options */
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
/* Returns pointers to all entries of nxt_options */
std::vector<FA_Node **> get_all_transitions() override;
};
/* .type == look_one_behind */
struct FA_NodeOfLookOneBehind: public FA_NodePathPart{
/* [0; UINT32_MAX] is equivalent to no filter */
codeset_t filter;
/* Stores the filter and sets type to look_one_behind */
explicit FA_NodeOfLookOneBehind(const codeset_t &filter);
};
/* .type == look_one_ahead
 * Note: compilation_core has no branch for this node type */
struct FA_NodeOfLookOneAhead: public FA_NodePathPart{
/* [0; UINT32_MAX] is equivalent to no restriction */
codeset_t restriction;
/* Stores the restriction and sets type to look_one_ahead */
explicit FA_NodeOfLookOneAhead(const codeset_t &restriction);
};
/* .type == track_array_mov_imm
 * Compiled into: opcode byte, 16-bit key, 64-bit immediate */
struct FA_NodeOfTrackArrayMovImm: public FA_NodePathPart{
/* Opcode written verbatim into the bytecode */
regex024_opcode operation;
/* Track array index */
uint16_t key;
/* Immediate operand */
uint64_t imm_value;
FA_NodeOfTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue);
};
/* .type == track_array_mov_halfinvariant
 * Compiled into: opcode byte, 16-bit key */
struct FA_NodeOfTrackArrayMovHalfinvariant: public FA_NodePathPart{
/* Opcode written verbatim into the bytecode */
regex024_opcode operation;
/* Track array index */
uint16_t key;
FA_NodeOfTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key);
};
/* One outgoing path of a FA_NodeOfDetCharCrossroads: a set of input characters and the node they lead to */
struct DFA_CrossroadPath{
/* Characters that select this path */
codeset_t input;
/* Destination of this path */
FA_Node* nxt_node = NULL;
DFA_CrossroadPath(const codeset_t &input, FA_Node *nxt_node);
DFA_CrossroadPath() = default;
};
/* .type == det_char_crossroads */
struct FA_NodeOfDetCharCrossroads: public FA_Node{
/* Outgoing paths; the compiler emits one filter that dispatches between them */
std::vector<DFA_CrossroadPath> crossroads;
/* When true, the compiler emits MATCH before the read of this node */
bool matching = false;
/* Selects which of the two read-slot namespaces the bytecode compiler uses for this node */
bool second_ns = false;
explicit FA_NodeOfDetCharCrossroads(const std::vector<DFA_CrossroadPath> &crossroads);
/* Not supported for this node type: the implementation calls exitf */
void apply_lookahead_restriction(const codeset_t &restriction) override;
/* Always returns an empty vector */
std::vector<FA_Node **> get_all_empty_valid_transitions() override;
/* Returns pointers to the nxt_node slot of every crossroad path */
std::vector<FA_Node **> get_all_transitions() override;
};
/* Owner of automaton nodes: every registered node is deleted in the destructor. Non-copyable. */
struct FA_Container{
FA_Container(const FA_Container&) = delete;
FA_Container& operator=(const FA_Container&) = delete;
FA_Container() = default;
/* All registered nodes; a node's nodeId is its index here at the moment of registration */
std::vector<FA_Node*> all;
/* Entry node of the automaton (NULL until somebody sets it) */
FA_Node* start = NULL;
/* Takes over `node`: assigns nodeId and appends it to `all`; on std::bad_alloc deletes the node and rethrows */
void registerNew(FA_Node* node);
/* Factories: allocate a node of the given kind, register it, return the raw (container-owned) pointer */
FA_NodeOfMatch* makeMatch();
FA_NodeOfOneCharRead* makeOneCharRead(const codeset_t& filter, bool second_namespace);
FA_NodeOfForking* makeForking();
FA_NodeOfLookOneBehind* makeLookOneBehind(const codeset_t& filter);
FA_NodeOfLookOneAhead* makeLookOneAhead(const codeset_t& filter);
FA_NodeOfTrackArrayMovImm* makeTrackArrayMovImm(regex024_opcode operation, uint16_t key, uint64_t immValue);
FA_NodeOfTrackArrayMovHalfinvariant* makeTrackArrayMovHalfinvariant(regex024_opcode operation, uint16_t key);
/* Created with an empty list of crossroads */
FA_NodeOfDetCharCrossroads* makeDetCharCrossroads();
~FA_Container();
};
#endif //LIBREGEXIS024_FINITE_AUTOMATON_H

View File

@ -0,0 +1,117 @@
#include <libregexis024fa/graph_to_bytecode/core.h>
#include <assert.h>
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
#include <libregexis024fa/graph_to_bytecode/filter.h>
/* Leaves compilation_core with error = -1 when `expr` does not hold (no abort, no exception) */
#define nonthrowing_assert(expr) if (!(expr)) {error = -1; return; }

/* Emits the bytecode for every node reachable from fa.start.
 *
 *   result           - bytecode buffer, appended to
 *   fa               - automaton to compile; fa.start must be non-NULL
 *   bookmark_manager - receives one bookmark per node of fa.all (a node's bookmark is landed where
 *                      the node's code begins); references are resolved later by its finish()
 *   first_read_ns    - out: number of READ slots used in the first namespace
 *   second_read_ns   - out: number of READ slots used in the second namespace
 *   fork_ss_ns       - out: number of FORK slots used
 *   error            - set to -1 on failure (slot counter would overflow, or a node of an unsupported
 *                      type was met); it is NOT reset to 0 here, the caller does that
 *
 * Nodes are laid out chain by chain: after a node is written, compilation continues with one of its
 * successors, the others are pushed to `todo`. Running into an already written node ends the chain
 * with a JUMP to it.
 * Second-namespace READ slots are numbered after all first-namespace ones, so their ids are patched in
 * at the very end.
 */
void compilation_core(std::vector<uint8_t>& result, FA_Container& fa, explicit_bookmarks& bookmark_manager,
                      size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error)
{
    bookmark_id_t node_start_bm_offset = bookmark_manager.new_range_of_bookmarks(fa.all.size());
    /* Positions in `result` of the slot arguments of second-namespace READs (filled at the end) */
    std::vector<size_t> not_yet_dedicated_second_read_ns_ssids;
    first_read_ns = 0;
    second_read_ns = 0;
    fork_ss_ns = 0;
    assert(fa.start);
    std::vector<FA_Node*> todo = {fa.start};

    auto nodesBookmark = [&](FA_Node* node) -> bookmark_id_t {
        assert(node);
        return node_start_bm_offset + node->nodeId;
    };
    auto addBranching = [&](FA_Node* node) {
        todo.push_back(node);
    };
    auto reading_head = [&](bool is_in_second_ns) {
        if (is_in_second_ns) {
            cmd_READ_second_ns(result, not_yet_dedicated_second_read_ns_ssids);
            second_read_ns++;
        } else {
            cmd_READ_first_ns(result, first_read_ns++);
        }
    };

    while (!todo.empty()) {
        FA_Node* node = todo.back(); todo.pop_back();
        if (bookmark_manager.has_landed(nodesBookmark(node))) {
            continue;
        }
        while (true) {
            if (bookmark_manager.has_landed(nodesBookmark(node))) {
                /* Already compiled elsewhere: link to it and finish this chain */
                cmd_JUMP(result, bookmark_manager, nodesBookmark(node));
                break;
            }
            bookmark_manager.land_bookmark(result, nodesBookmark(node));
            if (node->type == match) {
                cmd_MATCH(result);
                cmd_DIE(result);
                break;
            } else if (node->type == one_char_read) {
                FA_NodeOfOneCharRead* ocr = dynamic_cast<FA_NodeOfOneCharRead*>(node);
                nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX);
                reading_head(ocr->second_ns);
                write_filter(result, bookmark_manager, {ocr->filter},{nodesBookmark(ocr->nxt_node)});
                node = ocr->nxt_node;
            } else if (node->type == look_one_behind) {
                FA_NodeOfLookOneBehind* lob = dynamic_cast<FA_NodeOfLookOneBehind*>(node);
                write_filter(result, bookmark_manager, {lob->filter}, {nodesBookmark(lob->nxt_node)});
                node = lob->nxt_node;
            } else if (node->type == forking) {
                FA_NodeOfForking* fn = dynamic_cast<FA_NodeOfForking*>(node);
                std::vector<FA_Node*>& nxt_options = fn->nxt_options;
                if (nxt_options.empty()) {
                    cmd_DIE(result);
                    break;
                }
                if (nxt_options.size() >= 2) {
                    nonthrowing_assert(fork_ss_ns < UINT32_MAX);
                    /* One slot is shared by all FORKs of this node */
                    regex_sslot_id_t sslot = fork_ss_ns++;
                    for (size_t i = 0; i + 1 < nxt_options.size(); i++) {
                        cmd_FORK(result, bookmark_manager, sslot, nodesBookmark(nxt_options[i]));
                        addBranching(nxt_options[i]);
                    }
                }
                /* The last option needs no FORK: the current thread just continues into it */
                node = nxt_options.back();
            } else if (node->type == track_array_mov_imm) {
                FA_NodeOfTrackArrayMovImm* tami = dynamic_cast<FA_NodeOfTrackArrayMovImm*>(node);
                write_byte(result, tami->operation);
                write_tai(result, tami->key);
                write_quadword(result, tami->imm_value);
                node = tami->nxt_node;
            } else if (node->type == track_array_mov_halfinvariant) {
                FA_NodeOfTrackArrayMovHalfinvariant* tamh = dynamic_cast<FA_NodeOfTrackArrayMovHalfinvariant *>(node);
                write_byte(result, tamh->operation);
                write_tai(result, tamh->key);
                node = tamh->nxt_node;
            } else if (node->type == det_char_crossroads) {
                FA_NodeOfDetCharCrossroads* dcc = dynamic_cast<FA_NodeOfDetCharCrossroads*>(node);
                nonthrowing_assert(first_read_ns + second_read_ns < UINT32_MAX);
                if (dcc->matching)
                    cmd_MATCH(result);
                reading_head(dcc->second_ns);
                std::vector<codeset_t> codesets;
                std::vector<bookmark_id_t> branches;
                for (const DFA_CrossroadPath& p: dcc->crossroads) {
                    codesets.push_back(p.input);
                    branches.push_back(nodesBookmark(p.nxt_node));
                    addBranching(p.nxt_node);
                }
                write_filter(result, bookmark_manager, codesets, branches);
                if (dcc->crossroads.empty())
                    break;
                node = dcc->crossroads[0].nxt_node;
            } else {
                /* Node type without a branch above (look_one_ahead has none). With asserts compiled
                 * out we must not go on: the node's bookmark is already landed, so the next iteration
                 * would emit a JUMP to itself. Report the failure instead. */
                assert(false);
                error = -1;
                return;
            }
        }
    }
    for (size_t j = 0; j < not_yet_dedicated_second_read_ns_ssids.size(); j++) {
        belated_sslot_id(result, not_yet_dedicated_second_read_ns_ssids[j], j + first_read_ns);
    }
}

View File

@ -0,0 +1,10 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_CORE_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_CORE_H
#include <libregexis024fa/finite_automaton.h>
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
/* Appends to `result` the bytecode of all nodes reachable from fa.start.
 * first_read_ns, second_read_ns and fork_ss_ns are outputs (number of slots used in each namespace).
 * error is set to -1 on failure and is left untouched on success. */
void compilation_core(std::vector<uint8_t>& result, FA_Container& fa, explicit_bookmarks& bookmark_manager,
size_t& first_read_ns, size_t& second_read_ns, size_t& fork_ss_ns, int& error);
#endif

View File

@ -0,0 +1,102 @@
#include <libregexis024fa/graph_to_bytecode/fa_compiler.h>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
#include <assert.h>
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
#include <libregexis024fa/graph_to_bytecode/core.h>
void write_priority_table_actions(std::vector<uint8_t>& result, RegexPriorityTable &priority_table) {
for (RegexPriorityTableAction& act: priority_table) {
if (act.pos.isForRange()) {
write_byte(result, regex024_opcodes::DDIST_RABX_SELARR);
write_tai(result, act.pos.first);
write_tai(result, act.pos.second);
} else {
write_byte(result, regex024_opcodes::DMOV_RABX_SELARR);
write_tai(result, act.pos.first);
}
write_byte(result, act.minimize ?
regex024_opcodes::SIFTPRIOR_MIN_RABX :
regex024_opcodes::SIFTPRIOR_MAX_RABX);
}
write_byte(result, regex024_opcodes::SIFT_DONE);
}
/* Positions (byte offsets in the bytecode) of three zero placeholders written by
 * write_some_normal_initialization, plus the routine that fills them once the numbers are known. */
struct belate_initialization_parameters {
    /* Argument of PARAM_READ_SS_NUMBER */
    size_t todo_pos_read_ss_n;
    /* Argument of PARAM_FORK_SS_NUMBER */
    size_t todo_pos_fork_ss_n;
    /* Second-namespace size inside MSG_FED_INPUT_EXTENDED */
    size_t todo_pos_second_ns_size;

    /* Patches the placeholders. The total number of read slots must fit into 32 bits. */
    void complete_it(std::vector<uint8_t>& result,
        regex_sslot_id_t first_read_ns, regex_sslot_id_t second_read_ns, regex_sslot_id_t fork_ss_ns)
    {
        assert(static_cast<uint64_t>(first_read_ns) + static_cast<uint64_t>(second_read_ns) <= UINT32_MAX);
        belated_sslot_id(result, this->todo_pos_read_ss_n, first_read_ns + second_read_ns);
        belated_sslot_id(result, this->todo_pos_fork_ss_n, fork_ss_ns);
        belated_sslot_id(result, this->todo_pos_second_ns_size, second_read_ns);
    }
};
/* Emits the initialization part of the program: PARAM_READ_SS_NUMBER, PARAM_FORK_SS_NUMBER,
 * PARAM_SELARR_LEN, MSG_MULTISTART_ALLOWED (1), MSG_FED_INPUT_EXTENDED and INIT.
 *
 * At this point the values of PARAM_READ_SS_NUMBER, PARAM_FORK_SS_NUMBER and the second namespace
 * size (last argument of MSG_FED_INPUT_EXTENDED) are not known yet. Zero placeholders are written
 * instead, and their positions are returned so that the caller can fill them in later
 * (belate_initialization_parameters::complete_it).
 *
 * selarr_size must fit into the 16-bit word that write_tai emits.
 */
belate_initialization_parameters write_some_normal_initialization(std::vector<uint8_t>& result,
    size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1)
{
    /* write_tai forwards to write_word, so a larger value would be silently truncated */
    assert(selarr_size <= UINT16_MAX);
    belate_initialization_parameters todo;

    write_byte(result, regex024_opcodes::PARAM_READ_SS_NUMBER);
    todo.todo_pos_read_ss_n = result.size();
    write_sslot_id(result, 0); // Belated

    write_byte(result, regex024_opcodes::PARAM_FORK_SS_NUMBER);
    todo.todo_pos_fork_ss_n = result.size();
    write_sslot_id(result, 0); // Belated

    write_byte(result, regex024_opcodes::PARAM_SELARR_LEN);
    write_tai(result, selarr_size);

    write_byte(result, regex024_opcodes::MSG_MULTISTART_ALLOWED);
    write_byte(result, 1);

    write_byte(result, regex024_opcodes::MSG_FED_INPUT_EXTENDED);
    write_byte(result, info1.fed_chars_extend_one_left ? 1 : 0);
    write_byte(result, info1.fed_chars_extend_one_right ? 1 : 0);
    todo.todo_pos_second_ns_size = result.size();
    write_sslot_id(result, 0); // Belated

    write_byte(result, regex024_opcodes::INIT);
    return todo;
}
void compile_fa_to_regexis024_bytecode(std::vector<uint8_t>& result,
FA_Container &fa, RegexPriorityTable &priority_table,
size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error)
{
error = 0;
explicit_bookmarks bookmark_manager;
if (!priority_table.empty()) {
bookmark_id_t BM_sift_function = bookmark_manager.new_bookmark();
bookmark_id_t BM_after_sift = bookmark_manager.new_bookmark();
cmd_JUMP(result, bookmark_manager, BM_after_sift);
bookmark_manager.land_bookmark(result, BM_sift_function);
write_priority_table_actions(result, priority_table);
bookmark_manager.land_bookmark(result, BM_after_sift);
write_byte(result, regex024_opcodes::PARAM_COLSIFTFUNC_SET);
bookmark_manager.write_unresolved_reference(result, BM_sift_function);
}
belate_initialization_parameters init_param_todo = write_some_normal_initialization(result, selarr_size, info1);
size_t first_read_ns, second_read_ns, fork_ss_ns;
compilation_core(result, fa, bookmark_manager, first_read_ns, second_read_ns, fork_ss_ns, error);
if (error < 0)
return;
init_param_todo.complete_it(result, first_read_ns, second_read_ns, fork_ss_ns);
bookmark_manager.finish(result);
}

View File

@ -0,0 +1,14 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_FA_COMPILER_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_GRAPH_TO_BYTECODE_FA_COMPILER_H
#include <stdint.h>
#include <vector>
#include <libregexis024fa/finite_automaton.h>
#include <libregexis024fa/selarr_priority_table.h>
#include <libregexis024fa/fa_first_stage_fix.h>
/* Appends to `result` a complete bytecode program for the automaton `fa`.
 * A non-empty priority_table is compiled into a sift function; selarr_size is written as PARAM_SELARR_LEN.
 * error is set to 0 on success and to a negative value on failure (result is then incomplete). */
void compile_fa_to_regexis024_bytecode(std::vector<uint8_t>& result, FA_Container& fa, RegexPriorityTable& priority_table,
size_t selarr_size, const REGEX_IS024_FA_FirstStageFixInfo& info1, int& error);
#endif

View File

@ -0,0 +1,120 @@
#include <libregexis024fa/graph_to_bytecode/filter.h>
#include <assert.h>
#include <algorithm>
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
std::vector<FilterSegment> convert_to_compSeg(const std::vector<codeset_t>& crossroad_codesets)
{
std::vector<FilterSegment> compSeg;
std::vector<FilterSegment> seg;
for (size_t i = 0; i < crossroad_codesets.size(); i++) {
for (auto& p: crossroad_codesets[i]) {
seg.emplace_back(i, p.first, p.second);
}
}
std::sort(seg.begin(), seg.end(),
[](const FilterSegment& a, const FilterSegment& b)->bool{return a.L < b.L;});
if (seg.empty()) {
compSeg.emplace_back(-1, 0, UINT32_MAX);
} else {
if (seg[0].L > 0)
compSeg.emplace_back(-1, 0, seg[0].L - 1);
size_t N = seg.size();
for (size_t i = 0; i + 1 < N; i++) {
compSeg.push_back(seg[i]);
assert(seg[i].R < seg[i + 1].L);
if (seg[i].R + 1 < seg[i + 1].L)
compSeg.emplace_back(-1, seg[i].R + 1, seg[i + 1].L - 1);
}
compSeg.push_back(seg.back());
if (seg.back().R < UINT32_MAX)
compSeg.emplace_back(-1, seg[N - 1].R + 1, UINT32_MAX);
}
assert(!compSeg.empty());
return compSeg;
}
/* Emits the code for one leaf of the filter decision tree.
 *   color < 0                 -> DIE
 *   color == 0 && at_the_end  -> nothing is emitted; execution is meant to fall through to whatever
 *                                follows the filter, so relies_on_proper_ending is set to true
 *                                (the caller must place the [0]'th node right after the filter)
 *   otherwise                 -> JUMP to crossroad_marks[color]
 * relies_on_proper_ending is only ever raised here, never cleared. */
void write_filter_exit(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager,
    const std::vector<bookmark_id_t>& crossroad_marks,
    ssize_t color, bool at_the_end, bool& relies_on_proper_ending)
{
    if (color < 0) {
        cmd_DIE(result);
        return;
    }
    const bool may_fall_through = (color == 0) && at_the_end;
    if (may_fall_through) {
        relies_on_proper_ending = true;
        return;
    }
    cmd_JUMP(result, bookmark_manager, crossroad_marks[color]);
}
// todo: use return value of this function
/* Emits a decision tree that dispatches on a character value: a value inside crossroad_codesets[i]
 * leads to crossroad_marks[i], a value outside of all codesets leads to DIE.
 * crossroad_marks must have an entry for every index of crossroad_codesets.
 * The tree is a binary search over the segments produced by convert_to_compSeg; the recursion is
 * unrolled into an explicit stack of RecFrame.
 * Returns true if some leaf emitted no jump and relies on the [0]'th destination being placed right
 * after the filter (see write_filter_exit). */
bool write_filter(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager,
const std::vector<codeset_t>& crossroad_codesets, const std::vector<bookmark_id_t>& crossroad_marks)
{
bool relies_on_proper_ending = false;
std::vector<FilterSegment> compSeg = convert_to_compSeg(crossroad_codesets);
size_t N = compSeg.size();
/* One pending "call": handle segments compSeg[Li..Ri] (inclusive) */
struct RecFrame {
size_t Li;
size_t Ri;
/* false: left half not emitted yet; true: left half done, right half is next */
bool second_part = false;
/* Bookmark of the right half; assigned when the frame is first processed */
bookmark_id_t to_the_right_part;
RecFrame(size_t li, size_t ri): Li(li),Ri(ri) {}
};
std::vector<RecFrame> call_stack = {RecFrame(0, N - 1)};
/* Three segments A B A where B is a single value: cheaper to test B with one equality check */
auto is_sandwich = [&](size_t Li, size_t Ri) -> bool {
return Li + 2 == Ri && compSeg[Li].color == compSeg[Ri].color && compSeg[Li + 1].L == compSeg[Li + 1].R;
};
while (!call_stack.empty()) {
RecFrame& cur_frame = call_stack.back();
size_t Li = cur_frame.Li;
size_t Ri = cur_frame.Ri;
if (Li == Ri) {
/* Leaf: a single segment */
write_filter_exit(result, bookmark_manager, crossroad_marks, compSeg[Li].color,
Ri + 1 == N, relies_on_proper_ending);
call_stack.pop_back();
} else if (is_sandwich(Li, Ri)){
ssize_t A = compSeg[Li].color;
ssize_t B = compSeg[Li + 1].color;
size_t midVal = compSeg[Li + 1].L;
if (B < 0) {
/* The middle value is a gap: it dies, everything else goes to A */
assert(A >= 0);
bookmark_id_t b_to_end = bookmark_manager.new_bookmark();
cmd_JCEQUAL(result, bookmark_manager, midVal, b_to_end);
cmd_JUMP(result, bookmark_manager, crossroad_marks[A]);
bookmark_manager.land_bookmark(result, b_to_end);
cmd_DIE(result);
} else {
cmd_JCEQUAL(result, bookmark_manager, midVal, crossroad_marks[B]);
write_filter_exit(result, bookmark_manager, crossroad_marks, A,
Ri + 1 == N, relies_on_proper_ending);
}
call_stack.pop_back();
} else {
/* Split in two halves: [Li..m] is emitted first, [m+1..Ri] after it */
size_t m = (Li + Ri) / 2;
if (!cur_frame.second_part) {
cur_frame.to_the_right_part = bookmark_manager.new_bookmark();
cmd_JCGRTR(result, bookmark_manager, compSeg[m].R, cur_frame.to_the_right_part);
cur_frame.second_part = true;
/* emplace_back may reallocate the stack: cur_frame must not be used past this point */
call_stack.emplace_back(Li, m);
} else {
bookmark_manager.land_bookmark(result, cur_frame.to_the_right_part);
/* cur_frame is destroyed by pop_back: it must not be used past this point */
call_stack.pop_back();
call_stack.emplace_back(m + 1, Ri);
}
}
}
return relies_on_proper_ending;
}
/* Plain member-wise constructor: color and the inclusive bounds [l; r]. */
FilterSegment::FilterSegment(ssize_t color, uint32_t l, uint32_t r)
    : color(color),
      L(l),
      R(r)
{
}
//

View File

@ -0,0 +1,21 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_FILTER_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_FILTER_H
#include <stdlib.h>
#include <stdint.h>
#include <libregexis024fa/codeset.h>
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
/* A range of character values [L; R] (both ends inclusive) together with the option it belongs to */
struct FilterSegment {
/* Index of the codeset/bookmark this segment leads to; -1 marks a gap (no option accepts it) */
ssize_t color;
uint32_t L, R;
FilterSegment(ssize_t color, uint32_t l, uint32_t r);
};
/* Writes a filter that leads a value from crossroad_codesets[i] to the bookmark crossroad_marks[i]
 * and kills the thread (DIE) for values outside of all codesets.
 * Returns whether user of function must place [0]'th option after the filter.
 * The filter can end up being written in such a way that the end will never be reached */
bool write_filter(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager,
const std::vector<codeset_t>& crossroad_codesets, const std::vector<bookmark_id_t>& crossroad_marks);
#endif

View File

@ -0,0 +1,115 @@
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
#include <assert.h>
#include <libregexis024vm/vm_opcodes.h>
/* Appends the `width` lowest bytes of `value` to `result`, least significant byte first. */
static void append_little_endian(std::vector<uint8_t>& result, uint64_t value, int width) {
    for (int i = 0; i < width; i++) {
        result.push_back(static_cast<uint8_t>(value & 0xffLU));
        value >>= 8;
    }
}

/* Appends a single byte. */
void write_byte(std::vector<uint8_t>& result, uint8_t x) {
    result.push_back(x);
}

/* Appends a 16-bit value, little-endian. */
void write_word(std::vector<uint8_t>& result, uint16_t x) {
    append_little_endian(result, x, 2);
}

/* Appends a 32-bit value, little-endian. */
void write_doubleword(std::vector<uint8_t>& result, uint32_t x) {
    append_little_endian(result, x, 4);
}

/* Appends a 64-bit value, little-endian. */
void write_quadword(std::vector<uint8_t>& result, uint64_t x) {
    append_little_endian(result, x, 8);
}
/* Overwrites `width` bytes of `result` starting at `pos` with `value`, least significant byte first.
 * Every overwritten byte must still be the zero placeholder (asserted). */
static void patch_little_endian(std::vector<uint8_t>& result, size_t pos, uint64_t value, int width) {
    for (int i = 0; i < width; i++) {
        assert(result[pos] == 0);
        result[pos++] = static_cast<uint8_t>(value & 0xffLU);
        value >>= 8;
    }
}

/* Overwrites one byte at `pos`. Unlike the wider variants it does not require the old byte to be zero. */
void belated_byte(std::vector<uint8_t>& result, size_t pos, uint8_t value) {
    assert(pos < result.size());
    result[pos] = value;
}

/* Fills a 16-bit zero placeholder at `pos` (little-endian). */
void belated_word(std::vector<uint8_t>& result, size_t pos, uint16_t value) {
    assert(pos + 2 <= result.size());
    patch_little_endian(result, pos, value, 2);
}

/* Fills a 32-bit zero placeholder at `pos` (little-endian). */
void belated_doubleword(std::vector<uint8_t>& result, size_t pos, uint32_t value) {
    assert(pos + 4 <= result.size());
    patch_little_endian(result, pos, value, 4);
}

/* Fills a 64-bit zero placeholder at `pos` (little-endian). */
void belated_quadword(std::vector<uint8_t>& result, size_t pos, uint64_t value) {
    assert(pos + 8 <= result.size());
    patch_little_endian(result, pos, value, 8);
}
/* A slot id is stored as a 32-bit doubleword. */
void write_sslot_id(std::vector<uint8_t>& result, regex_sslot_id_t x) {
    const uint32_t raw = x;
    write_doubleword(result, raw);
}
/* A track array index is stored as a 16-bit word. */
void write_tai(std::vector<uint8_t>& result, regex_tai_t x) {
    const uint16_t raw = x;
    write_word(result, raw);
}
/* A near pointer is stored as a 64-bit quadword. */
void write_near_ptr(std::vector<uint8_t>& result, regex_near_ptr_t x) {
    const uint64_t raw = x;
    write_quadword(result, raw);
}
/* Fills a slot id placeholder (32 bits) at `pos`. */
void belated_sslot_id(std::vector<uint8_t>& result, size_t pos, regex_sslot_id_t value) {
    const uint32_t raw = value;
    belated_doubleword(result, pos, raw);
}
/* Fills a track array index placeholder (16 bits) at `pos`. */
void belated_tai(std::vector<uint8_t>& result, size_t pos, regex_tai_t value) {
    const uint16_t raw = value;
    belated_word(result, pos, raw);
}
/* Fills a near pointer placeholder (64 bits) at `pos`. */
void belated_near_ptr(std::vector<uint8_t>& result, size_t pos, regex_near_ptr_t value) {
    const uint64_t raw = value;
    belated_quadword(result, pos, raw);
}
/* Creates one more (not yet landed) bookmark and returns its id. */
bookmark_id_t explicit_bookmarks::new_bookmark() {
    pile.emplace_back();
    const bookmark_id_t fresh_id = free_bid;
    free_bid = fresh_id + 1;
    return fresh_id;
}
void explicit_bookmarks::write_unresolved_reference(std::vector<uint8_t> &result, bookmark_id_t bm) {
size_t where_to_fill_later = result.size();
write_near_ptr(result, 0);
pile[bm].positions_of_belated_refs.push_back(where_to_fill_later);
}
void explicit_bookmarks::land_bookmark(std::vector<uint8_t> &result, bookmark_id_t bm) {
assert(!pile[bm].placed_somewhere);
pile[bm].placed_somewhere = true;
pile[bm].actual_position = result.size();
}
void explicit_bookmarks::finish(std::vector<uint8_t> &result) {
for (explicit_bookmark_info& bmi: pile) {
assert(bmi.positions_of_belated_refs.empty() || bmi.placed_somewhere);
if (bmi.placed_somewhere) {
for (size_t ref_to_mine_belate: bmi.positions_of_belated_refs) {
belated_near_ptr(result, ref_to_mine_belate, bmi.actual_position);
}
}
}
}
/* Creates `n` consecutive (not yet landed) bookmarks and returns the id of the first one;
 * the ids of the range are offset, offset + 1, ..., offset + n - 1.
 * If growing the pile throws, neither the pile nor the id counter is changed. */
bookmark_id_t explicit_bookmarks::new_range_of_bookmarks(size_t n) {
    const bookmark_id_t offset = free_bid;
    /* Grow first, advance the counter second: a failed allocation must not leave ids without entries */
    pile.resize(pile.size() + n);
    free_bid += n;
    return offset;
}
/* Tells whether land_bookmark was already called for `bm`. */
bool explicit_bookmarks::has_landed(bookmark_id_t bm) {
    const explicit_bookmark_info& mark = pile[bm];
    return mark.placed_somewhere;
}
#undef put_belated_to_res

View File

@ -0,0 +1,63 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_NATURAL_COMPILER_UTILS_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_NATURAL_COMPILER_UTILS_H
#include <stdint.h>
#include <libregexis024vm/vm_opcodes_types.h>
#include <vector>
/* write_*: append the value to the end of `result`, least significant byte first */
void write_byte(std::vector<uint8_t>& result, uint8_t x);
void write_word(std::vector<uint8_t>& result, uint16_t x);
void write_doubleword(std::vector<uint8_t>& result, uint32_t x);
void write_quadword(std::vector<uint8_t>& result, uint64_t x);
/* belated_*: overwrite bytes that are already in `result`, starting at `pos` (must be in bounds).
 * The word/doubleword/quadword variants assert that the overwritten bytes are still zero placeholders */
void belated_byte(std::vector<uint8_t>& result, size_t pos, uint8_t value);
void belated_word(std::vector<uint8_t>& result, size_t pos, uint16_t value);
void belated_doubleword(std::vector<uint8_t>& result, size_t pos, uint32_t value);
void belated_quadword(std::vector<uint8_t>& result, size_t pos, uint64_t value);
/* Typed wrappers: slot id = doubleword, track array index = word, near pointer = quadword */
void write_sslot_id(std::vector<uint8_t>& result, regex_sslot_id_t x);
void write_tai(std::vector<uint8_t>& result, regex_tai_t x);
void write_near_ptr(std::vector<uint8_t>& result, regex_near_ptr_t x);
void belated_sslot_id(std::vector<uint8_t>& result, size_t pos, regex_sslot_id_t value);
void belated_tai(std::vector<uint8_t>& result, size_t pos, regex_tai_t value);
void belated_near_ptr(std::vector<uint8_t>& result, size_t pos, regex_near_ptr_t value);
// constexpr uint64_t INSTRUCTION_SZ = REGEX024_BYTECODE_INSTRUCTION_SZ;
// constexpr uint64_t SSLOT_ID_SZ = REGEX024_BYTECODE_SSLOT_ID_SZ;
// constexpr uint64_t TRACK_ARRAY_INDEX_ID_SZ = REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ;
// constexpr uint64_t NEAR_POINTER_SZ = REGEX024_BYTECODE_NEAR_POINTER_SZ;
/* Id of a bookmark: an index into explicit_bookmarks::pile */
typedef size_t bookmark_id_t;
/* State of one bookmark (a named position in the bytecode that may be referenced before it is known) */
struct explicit_bookmark_info {
    /* Positions in the bytecode of near-pointer placeholders that must receive actual_position */
    std::vector<size_t> positions_of_belated_refs;
    /* Whether the bookmark has been bound to a position already */
    bool placed_somewhere = false;
    /* Position in the bytecode; meaningful only when placed_somewhere is true.
     * Initialized so that a default-initialized object never carries an indeterminate value */
    size_t actual_position = 0;
};
/* Keeps track of bookmarks (jump targets) while bytecode is being generated: references may be written
 * before their target is known and are patched by finish() */
struct explicit_bookmarks {
/* Next id to hand out; grows together with `pile` */
bookmark_id_t free_bid = 0;
/* For each named explicit bookmark there is an element in PILE */
std::vector<explicit_bookmark_info> pile;
/* Creates a single bookmark and returns its id */
bookmark_id_t new_bookmark();
/* bm is the bookmark I refer to. Each bookmark has an id. It is like a name, but fits in 8 bytes.
 * A zero near pointer is appended to `result` as a placeholder */
void write_unresolved_reference(std::vector<uint8_t>& result, bookmark_id_t bm);
/* bm is the bookmark I place into program `result` (at its current end). Can be done only once per bookmark */
void land_bookmark(std::vector<uint8_t>& result, bookmark_id_t bm);
/* call it at the very end of bytecode-building */
void finish(std::vector<uint8_t>& result);
/* Returns offset of range of bookmark id's */
bookmark_id_t new_range_of_bookmarks(size_t n);
/* Whether land_bookmark was already called for bm */
bool has_landed(bookmark_id_t bm);
};
#endif

View File

@ -0,0 +1,75 @@
#include <libregexis024fa/graph_to_bytecode/writing_commands.h>
#include <libregexis024vm/vm_opcodes.h>
#include <assert.h>
/* Emits JUMP followed by a (not yet resolved) reference to bookmark `dest`. */
void cmd_JUMP(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest) {
    const uint8_t opcode = regex024_opcodes::JUMP;
    write_byte(result, opcode);
    bookmark_manager.write_unresolved_reference(result, dest);
}
/* Opcode tables for the conditional jumps, indexed by operand width:
 * [0] byte, [1] word, [2] doubleword, [3] quadword (the order cmd_JC relies on) */
constexpr regex024_opcode cmp_EQUAL[4] = {regex024_opcodes::JCEQUAL_B, regex024_opcodes::JCEQUAL_W,
regex024_opcodes::JCEQUAL_DW, regex024_opcodes::JCEQUAL_QW};
constexpr regex024_opcode cmp_LESS[4] = {regex024_opcodes::JCLESS_B, regex024_opcodes::JCLESS_W,
regex024_opcodes::JCLESS_DW, regex024_opcodes::JCLESS_QW};
constexpr regex024_opcode cmp_GRTR[4] = {regex024_opcodes::JCGRTR_B, regex024_opcodes::JCGRTR_W,
regex024_opcodes::JCGRTR_DW, regex024_opcodes::JCGRTR_QW};
/* Emits a conditional jump: the opcode from cmpT that matches the narrowest width able to hold
 * `val` (byte, word, doubleword, quadword), then `val` in that width, then a reference to `dest`. */
void cmd_JC(const regex024_opcode cmpT[4],
    std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest)
{
    if (val > UINT32_MAX) {
        write_byte(result, cmpT[3]);
        write_quadword(result, val);
    } else if (val > UINT16_MAX) {
        write_byte(result, cmpT[2]);
        write_doubleword(result, static_cast<uint32_t>(val));
    } else if (val > UINT8_MAX) {
        write_byte(result, cmpT[1]);
        write_word(result, static_cast<uint16_t>(val));
    } else {
        write_byte(result, cmpT[0]);
        write_byte(result, static_cast<uint8_t>(val));
    }
    bookmark_manager.write_unresolved_reference(result, dest);
}
void cmd_JCEQUAL(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) {
cmd_JC(cmp_EQUAL, result, bookmark_manager, val, dest);
}
void cmd_JCLESS(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) {
cmd_JC(cmp_LESS, result, bookmark_manager, val, dest);
}
void cmd_JCGRTR(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest) {
cmd_JC(cmp_GRTR, result, bookmark_manager, val, dest);
}
/* Emits the one-byte DIE instruction. */
void cmd_DIE(std::vector<uint8_t> &result) {
    const uint8_t opcode = regex024_opcodes::DIE;
    write_byte(result, opcode);
}
/* Emits the one-byte MATCH instruction. */
void cmd_MATCH(std::vector<uint8_t> &result) {
    const uint8_t opcode = regex024_opcodes::MATCH;
    write_byte(result, opcode);
}
/* Emits READ with an already known slot id (must fit into 32 bits). */
void cmd_READ_first_ns(std::vector<uint8_t>& result, size_t slot) {
    assert(!(slot > UINT32_MAX));
    const uint8_t opcode = regex024_opcodes::READ;
    write_byte(result, opcode);
    write_sslot_id(result, slot);
}
/* Emits FORK: opcode, slot id (must fit into 32 bits), then a reference to bookmark `dest`. */
void cmd_FORK(std::vector<uint8_t> &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest) {
    assert(!(slot > UINT32_MAX));
    const uint8_t opcode = regex024_opcodes::FORK;
    write_byte(result, opcode);
    write_sslot_id(result, slot);
    bookmark_manager.write_unresolved_reference(result, dest);
}
/* Emits READ whose slot id is not known yet: a zero placeholder is written and its position is
 * appended to belate_second_read_ns_slot_args so that the caller can fill it in later. */
void cmd_READ_second_ns(std::vector<uint8_t>& result, std::vector<size_t>& belate_second_read_ns_slot_args) {
    write_byte(result, regex024_opcodes::READ);
    const size_t slot_arg_pos = result.size();
    belate_second_read_ns_slot_args.push_back(slot_arg_pos);
    write_sslot_id(result, 0);
}

View File

@ -0,0 +1,20 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_WRITING_COMMANDS_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_WRITING_COMMANDS_H
#include <libregexis024fa/graph_to_bytecode/natural_compiler_utils.h>
#include <libregexis024vm/vm_opcodes.h>
/* Each cmd_* appends one instruction to `result`. `dest` is a bookmark: a placeholder is written and
 * resolved later by explicit_bookmarks::finish */
void cmd_JUMP(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, bookmark_id_t dest);
/* Conditional jumps; the operand width (byte/word/doubleword/quadword) is picked from the value of `val` */
void cmd_JCEQUAL(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest);
void cmd_JCLESS(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest);
void cmd_JCGRTR(std::vector<uint8_t>& result, explicit_bookmarks& bookmark_manager, uint64_t val, bookmark_id_t dest);
void cmd_DIE(std::vector<uint8_t>& result);
void cmd_MATCH(std::vector<uint8_t>& result);
/* READ with a known slot id (asserted to fit into 32 bits) */
void cmd_READ_first_ns(std::vector<uint8_t>& result, size_t slot);
/* READ with a zero placeholder instead of the slot id; the placeholder position is appended to
 * belate_second_read_ns_slot_args */
void cmd_READ_second_ns(std::vector<uint8_t>& result, std::vector<size_t>& belate_second_read_ns_slot_args);
/* FORK with a slot id (asserted to fit into 32 bits) and a destination bookmark */
void cmd_FORK(std::vector<uint8_t> &result, explicit_bookmarks& bookmark_manager, size_t slot, bookmark_id_t dest);
#endif

View File

@ -0,0 +1,71 @@
#include <libregexis024fa/misc_fa_funcs.h>
#include <libregexis024vm/vm_opcodes.h>
#include <assert.h>
#include <libregexis024vm/utils.h>
/* Redirects the edge slot *old_node_ptr to new_node while keeping reference counters in sync:
 * the previous target (if any) loses one reference, the new target (if any) gains one. */
void reattach_fa_node_edge(FA_Node **old_node_ptr, FA_Node *new_node) {
    assert(old_node_ptr);
    FA_Node* previous_target = *old_node_ptr;
    if (previous_target != NULL) {
        assert(previous_target->refs);
        previous_target->refs--;
    }
    if (new_node != NULL)
        new_node->refs++;
    *old_node_ptr = new_node;
}
/* Makes `node` the new start of the automaton and chains the old start after it.
 * Only node->refs is incremented: the old start's counter is left as is, and node->nxt_node is
 * overwritten without touching the counter of whatever it pointed to before. */
void yay_new_start(FA_Container &fa, FA_NodePathPart *node) {
    assert(node);
    FA_Node* former_start = fa.start;
    node->refs++;
    node->nxt_node = former_start;
    fa.start = node;
}
/* Appends one more option to a forking node; a non-NULL destination gains a reference. */
void add_option_to_fork_node(FA_NodeOfForking *fnode, FA_Node *transition_dest) {
    fnode->nxt_options.push_back(transition_dest);
    if (transition_dest == NULL)
        return;
    transition_dest->refs++;
}
/* Redirects node->nxt_node to dest (reference counters are handled by reattach_fa_node_edge). */
void reattach_nxt_node(FA_NodePathPart *node, FA_Node *dest) {
    FA_Node** edge_slot = &node->nxt_node;
    reattach_fa_node_edge(edge_slot, dest);
}
// todo: get rid of exitf in the whole project
/* Allocates a copy of `node` with the same dynamic type (selected by node.type) using the implicitly
 * defined copy constructors. In the copy, refs is reset to 0 and search_mark to -1; everything else
 * (including outgoing edges and nodeId) is copied as is. The copy is not registered in any container:
 * the caller owns the returned raw pointer.
 * An unknown node.type trips the assert; if asserts are compiled out, NULL is returned. */
FA_Node* copy_node_no_container_adjustments(FA_Node& node){
    /* Never left uninitialized, so that a missed case cannot lead to a write through a garbage pointer */
    FA_Node* res = NULL;
    /* Using implicitly defined copy constructors */
#define typeCase(etype, ctype) case etype: res = new ctype(static_cast<ctype&>(node)); break;
    switch (node.type) {
        typeCase(match, FA_NodeOfMatch)
        typeCase(one_char_read, FA_NodeOfOneCharRead)
        typeCase(forking, FA_NodeOfForking)
        typeCase(look_one_behind, FA_NodeOfLookOneBehind)
        typeCase(look_one_ahead, FA_NodeOfLookOneAhead)
        typeCase(track_array_mov_imm, FA_NodeOfTrackArrayMovImm)
        typeCase(track_array_mov_halfinvariant, FA_NodeOfTrackArrayMovHalfinvariant)
        typeCase(det_char_crossroads, FA_NodeOfDetCharCrossroads)
        default:
            assert(false);
            return NULL;
    }
#undef typeCase
    res->refs = 0;
    res->search_mark = -1;
    return res;
}
/* Copies `node` into the container `fa` (the same automaton the node's edges point into) and calls
 * reAdd_references() on the copy.
 * If handing the new raw pointer over to the container fails, the copy is destroyed by registerNew
 * and the exception propagates, so the dangling local pointer is never used. */
FA_Node *copy_fa_node(FA_Node& node, FA_Container &fa) {
    FA_Node* duplicate = copy_node_no_container_adjustments(node);
    fa.registerNew(duplicate);
    duplicate->reAdd_references();
    return duplicate;
}
/* Copies `node` and registers the copy in resultFa. Unlike copy_fa_node, reAdd_references() is NOT
 * called: the copy's edges are left exactly as copied. */
FA_Node *copy_fa_node_to_another_fa(FA_Node& node, FA_Container &resultFa) {
    FA_Node* duplicate = copy_node_no_container_adjustments(node);
    resultFa.registerNew(duplicate);
    return duplicate;
}

View File

@ -0,0 +1,17 @@
#ifndef LIBREGEXIS024_MISC_FA_FUNCS_H
#define LIBREGEXIS024_MISC_FA_FUNCS_H
#include "finite_automaton.h"
#include "fa_first_stage_fix.h"
/* Copies node (same dynamic type), registers the copy in fa and calls reAdd_references() on it */
FA_Node* copy_fa_node(FA_Node& node, FA_Container& fa);
/* Makes node the new fa.start; the old start becomes node->nxt_node */
void yay_new_start(FA_Container& fa, FA_NodePathPart* node);
/* Points *old_node_ptr to new_node, decrementing refs of the old target and incrementing refs of the new one */
void reattach_fa_node_edge(FA_Node** old_node_ptr, FA_Node* new_node);
/* Appends transition_dest to fnode->nxt_options and increments its refs (if it is not NULL) */
void add_option_to_fork_node(FA_NodeOfForking* fnode, FA_Node* transition_dest);
/* reattach_fa_node_edge applied to node->nxt_node */
void reattach_nxt_node(FA_NodePathPart* node, FA_Node* dest);
/* This is one weird operation. The new node in resultFa will still point to nodes in sourceFa,
 * without increasing the refcount of those nodes. YOU HAVE TO FIX IT ASAP */
FA_Node* copy_fa_node_to_another_fa(FA_Node& node, FA_Container& resultFa);
#endif //LIBREGEXIS024_MISC_FA_FUNCS_H

View File

@ -0,0 +1,15 @@
#include <libregexis024fa/selarr_priority_table.h>
#include <assert.h>
/* True when `second` holds an index (is non-negative). */
bool RegexPriorityTableAction_Pos::isForRange() const {
    const bool second_is_unused = second < 0;
    return !second_is_unused;
}
RegexPriorityTableAction_Pos::RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type):
first(first),second(second), type(type) {}
//
RegexPriorityTableAction::RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type):
minimize(minimize), pos(first, second, type) {}
//

View File

@ -0,0 +1,26 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H
#include <stdint.h>
#include <vector>
#include <libregexis024fa/tracking_variables.h>
/* Location of one tracking variable inside selarr, together with the kind of that variable. */
struct RegexPriorityTableAction_Pos{
/* first and second are indexes in selarr (but second can be -1 if it is unused) */
int first;
int second;
tracking_var_type type;
/* Returns true when `second` is non-negative */
bool isForRange() const;
RegexPriorityTableAction_Pos(int first, int second, tracking_var_type type);
};
/* One entry of the priority table: a selarr position plus the `minimize` flag.
 * NOTE(review): presumably minimize == false means "maximize" - confirm against the consumer of the table. */
struct RegexPriorityTableAction{
bool minimize;
RegexPriorityTableAction_Pos pos;
RegexPriorityTableAction(bool minimize, int first, int second, tracking_var_type type);
};
typedef std::vector<RegexPriorityTableAction> RegexPriorityTable;
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024FA_SELARR_PRIORITY_TABLE_H

View File

@ -0,0 +1,53 @@
#include <libregexis024fa/tracking_fa_nodes.h>
#include <assert.h>
/* True for the two opcodes that store an immediate value (colarr or selarr). */
bool isImmMovOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_COLARR_IMM:
        case regex024_opcodes::MOV_SELARR_IMM:
            return true;
        default:
            return false;
    }
}
/* True for the two opcodes that store the current position (byte position for colarr,
 * char position for selarr). */
bool isCurPosMovOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_COLARR_BTPOS:
        case regex024_opcodes::MOV_SELARR_CHPOS:
            return true;
        default:
            return false;
    }
}
/* True for the two opcodes that write into colarr. */
bool isColarrOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_COLARR_IMM:
        case regex024_opcodes::MOV_COLARR_BTPOS:
            return true;
        default:
            return false;
    }
}
/* True for the two opcodes that write into selarr. */
bool isSelarrOpcode(regex024_opcode inst) {
    switch (inst) {
        case regex024_opcodes::MOV_SELARR_IMM:
        case regex024_opcodes::MOV_SELARR_CHPOS:
            return true;
        default:
            return false;
    }
}
/* True when `n` is one of the two track-array node kinds (immediate or halfinvariant). */
bool isTrackingFaNode(const FA_Node *n) {
    switch (n->type) {
        case track_array_mov_imm:
        case track_array_mov_halfinvariant:
            return true;
        default:
            return false;
    }
}
TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value)
: opcode(opcode), key(key), immValue(imm_value) {}
/* Builds an operation that needs no immediate value (halfinvariant operations).
 * immValue is still zero-initialized: the object is copied around by value, and copying
 * an indeterminate uint64_t is undefined behaviour. */
TrackingOperationInFa::TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key)
    : opcode(opcode), key(key), immValue(0) {}
/* Human-readable description of the operation, e.g. "colarr[3] := 7" or
 * "selarr[0] := cur char position". Unknown opcodes yield "wrong collection operation". */
std::string TrackingOperationInFa::toString() const {
    const std::string slot = "[" + std::to_string(key) + "] := ";
    if (opcode == regex024_opcodes::MOV_COLARR_IMM)
        return "colarr" + slot + std::to_string(immValue);
    if (opcode == regex024_opcodes::MOV_SELARR_IMM)
        return "selarr" + slot + std::to_string(immValue);
    if (opcode == regex024_opcodes::MOV_COLARR_BTPOS)
        return "colarr" + slot + "cur byte position";
    if (opcode == regex024_opcodes::MOV_SELARR_CHPOS)
        return "selarr" + slot + "cur char position";
    return "wrong collection operation";
}
/* Creates in `fa` the node that performs `op`: an immediate-mov node for immediate opcodes,
 * a halfinvariant-mov node for current-position opcodes. Any other opcode trips the assert. */
FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa) {
    if (!isImmMovOpcode(op.opcode)) {
        assert(isCurPosMovOpcode(op.opcode));
        return fa.makeTrackArrayMovHalfinvariant(op.opcode, op.key);
    }
    return fa.makeTrackArrayMovImm(op.opcode, op.key, op.immValue);
}

View File

@ -0,0 +1,31 @@
#ifndef LIBREGEXIS024_TRACKING_FA_NODES_H
#define LIBREGEXIS024_TRACKING_FA_NODES_H
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024fa/finite_automaton.h>
#include <string>
/* MOV_COLARR_IMM or MOV_SELARR_IMM */
bool isImmMovOpcode(regex024_opcode inst);
/* MOV_COLARR_BTPOS or MOV_SELARR_CHPOS */
bool isCurPosMovOpcode(regex024_opcode inst);
/* MOV_COLARR_IMM or MOV_COLARR_BTPOS */
bool isColarrOpcode(regex024_opcode inst);
/* MOV_SELARR_IMM or MOV_SELARR_CHPOS */
bool isSelarrOpcode(regex024_opcode inst);
/* Node type is track_array_mov_imm or track_array_mov_halfinvariant */
bool isTrackingFaNode(const FA_Node* n);
/* Value-type description of one tracking operation: which opcode writes to which key,
 * and (for immediate opcodes only) which value is written. */
struct TrackingOperationInFa {
regex024_opcode opcode;
regex_tai_t key;
/* Not needed for halfinvariant operations */
uint64_t immValue;
/* For operations with an immediate value */
TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key, uint64_t imm_value);
/* For operations without an immediate value */
TrackingOperationInFa(regex024_opcode opcode, regex_tai_t key);
/* Textual form such as "colarr[<key>] := <imm>" or "selarr[<key>] := cur char position" */
std::string toString() const;
};
/* Creates in `fa` the node that corresponds to `op` and returns it */
FA_NodePathPart* convert_to_node(const TrackingOperationInFa& op, FA_Container& fa);
#endif

View File

@ -0,0 +1,14 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024FA_TRACKING_VARIABLES_H
/* Kinds of tracking variables. The enumerators live in their own namespace so that they are
 * spelled tracking_var_types::range etc.; `tracking_var_type` names the enum type itself.
 * NOTE(review): judging by the uses in the expression parser, `range` is produced by "#name(...)",
 * `dot_cur_pos` by "#name;" and `dot_immediate` by "#name:value;" - confirm. */
namespace tracking_var_types {
enum tracking_var_type_I {
range,
dot_cur_pos,
dot_immediate
};
}
typedef tracking_var_types::tracking_var_type_I tracking_var_type;
#endif

View File

@ -0,0 +1,62 @@
#include <libregexis024sol/special_terminals.h>
#include <libregexis024sol/sol_misc_base.h>
#include <assert.h>
/* Reads exactly `sz` hexadecimal digits from the input and returns their value.
 * Digits are consumed one by one; on the first character that is not a hex digit
 * ([0-9a-fA-F]) an error is reported through `ctx` and 0 is returned, leaving that
 * character unread. */
uint32_t read_hex(REGEX_IS024_MeaningContext& ctx, int sz){
    uint32_t res = 0;
    for (int i = 0; i < sz; i++){
        const int32_t ch = peep(ctx);
        uint32_t digit;
        if ('0' <= ch && ch <= '9') {
            digit = (uint32_t)ch - '0';
        } else if ('a' <= ch && ch <= 'f') {
            /* Only a..f are hex digits; accepting up to 'z' would silently produce garbage codes */
            digit = (uint32_t)ch - 'a' + 10;
        } else if ('A' <= ch && ch <= 'F') {
            digit = (uint32_t)ch - 'A' + 10;
        } else {
            report(ctx, "escape backslash expression: bad unicode code");
            return 0;
        }
        res = (res << 4) | digit;
        readChar(ctx);
    }
    return res;
}
/* Handles the \u / \U escapes: consumes the leader character, then reads `sz` hex digits
 * and returns the single-character codeset for that code. read_hex may report an error;
 * in that case the codeset is built from 0. */
void unicode_in_bs_case(REGEX_IS024_MeaningContext &ctx, bool &ret_is_multicode, codeset_t &ret_set, int sz){
    ret_is_multicode = false;
    readChar(ctx);
    ret_set = codeset_of_one_char(read_hex(ctx, sz));
}
/* Parses the part of a backslash expression that follows the backslash, for escapes that
 * denote a character or a set of characters.
 *   \s \t \n \r  - single character (space, tab, newline, carriage return)
 *   \e \E        - cc.spaces / its complement
 *   \w \W        - cc.word_constituents / its complement
 *   \uXXXX, \UXXXXXXXX - single character given by hex code
 *   anything else - that character itself
 * ret_is_multicode tells whether ret_set may contain more than one character.
 * On success the whole escape, including the leader, has been consumed. The default branch
 * used to leave the escaped character unread, so it was parsed a second time as ordinary
 * input. Errors are reported through `ctx`. */
void
backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc,
                                         bool &ret_is_multicode, codeset_t &ret_set)
{
    int32_t leader = peep(ctx);
    if (ctx.error)
        return;
    switch (leader) {
        case 's':
            ret_is_multicode = false;
            ret_set = codeset_of_one_char(U' ');
            break;
        case 't':
            ret_is_multicode = false;
            ret_set = codeset_of_one_char(U'\t');
            break;
        case 'n':
            ret_is_multicode = false;
            ret_set = codeset_of_one_char(U'\n');
            break;
        case 'r':
            ret_is_multicode = false;
            ret_set = codeset_of_one_char(U'\r');
            break;
        case 'e':
            ret_is_multicode = true;
            ret_set = cc.spaces;
            break;
        case 'E':
            ret_is_multicode = true;
            ret_set = invert_set(cc.spaces);
            break;
        case 'w':
            ret_is_multicode = true;
            ret_set = cc.word_constituents;
            break;
        case 'W':
            ret_is_multicode = true;
            ret_set = invert_set(cc.word_constituents);
            break;
        case 'u':
            /* Consumes the leader and the hex digits on its own */
            unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 4);
            return;
        case 'U':
            unicode_in_bs_case(ctx, ret_is_multicode, ret_set, 8);
            return;
        default:
            if (leader < 0){
                report(ctx, "backslash in the wrong place");
                return;
            }
            ret_is_multicode = false;
            ret_set = codeset_of_one_char(leader);
    }
    /* Every branch that reaches this point still has the leader in the input */
    readChar(ctx);
}

View File

@ -0,0 +1,143 @@
#include <libregexis024sol/special_terminals.h>
#include <libregexis024vm/utils.h>
#include <libregexis024sol/sol_misc_base.h>
#include <assert.h>
#include <memory>
/* One frame of the explicit call stack used by command_expr_parse.
 * firstTime() runs when the frame is pushed, afterReceive() runs when a child frame it
 * requested has finished. Both return the next child frame to push, or null when this
 * frame is done. The defaults must never be reached; they still return null so that a
 * build with NDEBUG (assert compiled out) does not fall off the end of a non-void function.
 * NOTE(review): another type named ParseCall seems to exist for the regex body parser;
 * if both are visible to the linker this is an ODR hazard - consider an anonymous namespace. */
struct ParseCall{
    virtual ~ParseCall() = default;
    virtual std::unique_ptr<ParseCall> afterReceive(REGEX_IS024_MeaningContext& ctx) { assert(false); return nullptr; }
    virtual std::unique_ptr<ParseCall> firstTime(REGEX_IS024_MeaningContext& ctx) { assert(false); return nullptr; }
};
/* Frame that parses a whole command ("!name;", "!name{...}" or "!~") into `res`. */
struct Top_ParseCall: public ParseCall{
/* Destination of the parse; owned by the caller and must outlive this frame */
Command& res;
explicit Top_ParseCall(Command &res) : res(res) {}
std::unique_ptr<ParseCall> firstTime(REGEX_IS024_MeaningContext &ctx) override;
std::unique_ptr<ParseCall> afterReceive(REGEX_IS024_MeaningContext &ctx) override;
};
/* Frame that parses one "{...}" argument list into `res`. */
struct Bracker_ParseCall: public ParseCall{
/* Destination argument list; owned by the caller and must outlive this frame */
std::vector<CommandArgument>& res;
/* Set when the previous argument was terminated by its own nested "{...}" block;
 * cleared whenever a ';' is consumed */
bool closingBraceEnded = false;
explicit Bracker_ParseCall(std::vector<CommandArgument> &res) : res(res) {}
/* Shared argument-reading loop used by both firstTime and afterReceive */
std::unique_ptr<ParseCall> argReadProc(REGEX_IS024_MeaningContext& ctx);
std::unique_ptr<ParseCall> firstTime(REGEX_IS024_MeaningContext &ctx) override;
std::unique_ptr<ParseCall> afterReceive(REGEX_IS024_MeaningContext &ctx) override;
};
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
#define call_THROW(str) do { report(ctx, "command expression: " str); return NULL; } while (0)
/* Parses the head of a command. The input must be positioned at '!'.
 *   "!~"        - sets res.tilda and finishes (the '~' itself is left unread)
 *   "!name;"    - command without arguments
 *   "!name{"    - returns a Bracker_ParseCall frame that will fill res.arguments
 * Returns null when the command is finished or an error was reported through `ctx`. */
std::unique_ptr<ParseCall> Top_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) {
    /* The read must happen outside of assert(): with NDEBUG the assert expression is not
     * evaluated at all and the '!' would never be consumed. */
    const auto bang = readChar(ctx);
    assert(bang == U'!');
    (void)bang;
    int32_t ch = peep(ctx); call_ERROR_CHECK;
    if (ch == U'~'){
        /* I assume during construction I received reference to newly initialized struct */
        res.tilda = true;
        return NULL;
    }
    res.name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
    if (res.name.empty())
        call_THROW("top lvl: no command name specified");
    ch = peep(ctx); call_ERROR_CHECK;
    if (ch == U';'){
        readChar(ctx);
        return NULL;
    }
    if (ch == U'{'){
        return std::make_unique<Bracker_ParseCall>(res.arguments);
    }
    call_THROW("top lvl: command call should be ended with ';' or '{...}'");
}
/* Nothing is left to do once the argument list has been parsed: the frame simply finishes. */
std::unique_ptr<ParseCall> Top_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) {
    return nullptr;
}
/* Consumes the opening '{' and starts reading arguments. */
std::unique_ptr<ParseCall> Bracker_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx) {
    /* Keep the read outside of assert(): with NDEBUG the expression would not be evaluated
     * and the '{' would stay in the input. */
    const auto opening = readChar(ctx);
    assert(opening == U'{');
    (void)opening;
    return argReadProc(ctx);
}
/* Called after a nested "{...}" of the last argument has been parsed: records that the
 * argument was closed by a brace and continues reading arguments. */
std::unique_ptr<ParseCall> Bracker_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx) {
    this->closingBraceEnded = true;
    return this->argReadProc(ctx);
}
/* Reads arguments until the closing '}' or until a nested "{...}" has to be parsed.
 *   ';'   - an empty argument
 *   '}'   - end of the list (adds an empty argument unless the previous one ended with '}')
 *   name  - a named argument, which must be followed by ';', '{' or '}'
 * Returns a child frame for a nested argument list, or null when the list is finished
 * or an error was reported through `ctx`. */
std::unique_ptr<ParseCall> Bracker_ParseCall::argReadProc(REGEX_IS024_MeaningContext &ctx) {
    for (;;) {
        const int32_t ch = peep(ctx); call_ERROR_CHECK;
        if (ch == U';'){
            res.emplace_back();
            readChar(ctx);
            closingBraceEnded = false;
            continue;
        }
        if (ch == U'}'){
            readChar(ctx);
            if (!closingBraceEnded)
                res.emplace_back();
            return NULL;
        }
        if (!is_REGEX024_nameConstituent(ch))
            call_THROW("brace lvl: argument starts with ';' or it's name");

        res.emplace_back();
        CommandArgument& arg = res.back();
        arg.is_empty = false;
        arg.name = tryRead_REGEX024_name(ctx);
        const int32_t eCh = peep(ctx); call_ERROR_CHECK;
        if (eCh == U'{')
            return std::make_unique<Bracker_ParseCall>(arg.arguments);
        if (eCh == U'}'){
            readChar(ctx);
            return NULL;
        }
        if (eCh != U';')
            call_THROW("brace lvl: argument ends with ';' or {...}");
        readChar(ctx);
        closingBraceEnded = false;
    }
}
/* Parses one command expression starting at '!' and returns it.
 * Recursion is replaced by an explicit stack of ParseCall frames: a frame that returns a
 * child gets suspended, a frame that returns null is popped and its parent is resumed via
 * afterReceive(). If `ctx` carries an error, a default-constructed Command is returned. */
Command command_expr_parse(REGEX_IS024_MeaningContext &ctx) {
    std::vector<std::unique_ptr<ParseCall>> callStack;
    Command res;
    callStack.push_back(std::make_unique<Top_ParseCall>(res));
    for (bool entering = true; !callStack.empty(); ) {
        if (ctx.error)
            return {};
        ParseCall& top = *callStack.back();
        std::unique_ptr<ParseCall> child = entering ? top.firstTime(ctx) : top.afterReceive(ctx);
        entering = static_cast<bool>(child);
        if (entering)
            callStack.push_back(std::move(child));
        else
            callStack.pop_back();
    }
    return res;
}
/* Names of the commands that stand for a predefined codeset (NULL-terminated) */
const char* commands_for_codesets[] = {"word", "space", "digit", "variable", "any", "A", NULL};
/* A command denotes a codeset when it is not "!~", has no arguments and its name is
 * one of commands_for_codesets. */
bool is_command_for_charset(const Command &cmd) {
    if (cmd.tilda)
        return false;
    if (!cmd.arguments.empty())
        return false;
    return is_string_in_stringset(cmd.name.c_str(), commands_for_codesets);
}
/* Writes into `ret` the codeset named by `cmd` (word, space, digit, variable, any / A).
 * The caller is expected to have checked the name; any other name trips the assert. */
void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command &cmd, codeset_t& ret)
{
    const std::string& name = cmd.name;
    if (name == "word") {
        ret = cc.word_constituents;
        return;
    }
    if (name == "space") {
        ret = cc.spaces;
        return;
    }
    if (name == "digit") {
        ret = cc.digits;
        return;
    }
    if (name == "variable") {
        ret = cc.variable_constituents;
        return;
    }
    if (name == "any" || name == "A") {
        ret = codeset_of_all;
        return;
    }
    assert(false);
}

View File

@ -0,0 +1,13 @@
#include <libregexis024sol/common_codesets.h>
/* Fills the predefined codesets:
 *   spaces                - '\n', ' ', '\t', '\r'
 *   word_constituents     - a-z and A-Z
 *   digits                - 0-9
 *   variable_constituents - word_constituents plus '-' plus digits */
CommonCodesets::CommonCodesets() {
    const char32_t space_chars[] = {U'\n', U' ', U'\t', U'\r'};
    for (char32_t ch : space_chars)
        spaces = set_add_char(spaces, ch);

    word_constituents = set_add_range(word_constituents, U'a', U'z');
    word_constituents = set_add_range(word_constituents, U'A', U'Z');

    digits = codeset_t({{'0', '9'}});

    variable_constituents = merge_sets(set_add_char(word_constituents, U'-'), digits);
}

View File

@ -0,0 +1,14 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_COMMON_CODESETS_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_COMMON_CODESETS_H
#include <libregexis024fa/codeset.h>
/* Predefined codesets shared by the parser; all of them are filled by the constructor. */
struct CommonCodesets {
/* '\n', ' ', '\t', '\r' */
codeset_t spaces;
/* a-z and A-Z */
codeset_t word_constituents;
/* 0-9 */
codeset_t digits;
/* word_constituents plus '-' plus digits */
codeset_t variable_constituents;
CommonCodesets();
};
#endif

View File

@ -0,0 +1,280 @@
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024vm/utils.h>
#include <libregexis024fa/codeset.h>
#include <libregexis024fa/finite_automaton.h>
#include <libregexis024fa/fa_first_stage_fix.h>
#include <libregexis024fa/fa_make_deterministic.h>
#include <libregexis024fa/misc_fa_funcs.h>
#include <assert.h>
#include <memory>
#include <libregexis024sol/sol_misc_base.h>
#include <libregexis024sol/special_terminals.h>
#include <libregexis024sol/subexpr_fa_transformed.h>
#include <libregexis024sol/expr_parse_functions/epf.h>
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
#include <libregexis024fa/graph_to_bytecode/fa_compiler.h>
#include <libregexis024sol/common_codesets.h>
/* Temporary debug measures */
#include <debugging_regexis024/debug_through_graphviz.h>
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
/* ****************************** Top */
/* Aliases of the !dfa argument that makes a failed determinization non-fatal (NULL-terminated) */
const char* dfa_arg_aliases_condone[] = {"forgive", "condone", "okay", "optional", "nonimportant", "ifpossible", NULL};
/* Aliases of the !dfa argument that requests the strict mode (NULL-terminated) */
const char* dfa_arg_aliases_acerbic[] = {"acerbic", "angry", "pedantic", "nofork", "pure", "important", "fierce", NULL};
/* Applies a !dfa header command to `pctx`.
 * The command may appear only once. It takes either no arguments or exactly one plain
 * argument from one of the alias tables above; anything else is reported as an error. */
void dfa_command_processing(REGEX_IS024_MeaningContext &ctx, ParsingContext& pctx, const Command& cmdBuf){
    if (pctx.dfa_cmd_activated){
        report(ctx, "repeating !dfa command");
        return;
    }
    pctx.dfa_cmd_activated = true;

    const auto& args = cmdBuf.arguments;
    if (args.empty())
        return;

    const bool single_plain_argument = args.size() == 1 && args[0].arguments.empty();
    if (single_plain_argument){
        const char* alias = args[0].name.c_str();
        if (is_string_in_stringset(alias, dfa_arg_aliases_acerbic)) {
            pctx.dfa_cmd_unforgiving = true;
            return;
        }
        if (is_string_in_stringset(alias, dfa_arg_aliases_condone)) {
            pctx.dfa_cmd_nonimportant = true;
            return;
        }
    }
    report(ctx, "wrong arguments in !dfa command");
}
/* Applies a !select header command.
 * Every argument registers a new named tracking tool stored in the selection array.
 * Sub-arguments of an argument:
 *   ca / col        - additionally store it in the collection array (at most once)
 *   min / max / ign - sifting mode (at most one of the three): min and max mark the tool
 *                     as used in sifting, min also sets `minimizing`
 * The command may appear only once; names must be unique and non-empty. */
void select_command_processing(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, const Command& cmdBuf){
    if (pctx.select_cmd_encountered)
        aux_THROW("repeating !select command");
    pctx.select_cmd_encountered = true;

    auto& tools = ctx.ktr;
    for (const CommandArgument& arg: cmdBuf.arguments){
        if (arg.is_empty)
            aux_THROW("wrong arguments in !select command");
        if (tools.track_names.count(arg.name) != 0)
            aux_THROW("repeated names in !select command");

        /* Ids are handed out sequentially, in the order of appearance */
        const int64_t namedThingId = static_cast<int64_t>(tools.track_names.size());
        tools.track_names.insert({arg.name, namedThingId});
        tools.retrieval_info.emplace_back();
        auto& info = tools.retrieval_info.back();
        info.stored_in_sa = true;
        info.stored_in_ca = false;

        bool sifting_mode_given = false;
        bool collection_given = false;
        for (const CommandArgument& argarg: arg.arguments){
            const std::string& option = argarg.name;
            if (option == "ca" || option == "col") {
                if (collection_given)
                    aux_THROW("bad argument to !select command");
                collection_given = true;
                info.stored_in_ca = true;
                continue;
            }
            if (option != "min" && option != "max" && option != "ign")
                aux_THROW("wrong parameter for prioritized parameter in !select command");
            if (sifting_mode_given)
                aux_THROW("bad argument to !select command");
            sifting_mode_given = true;
            if (option == "ign")
                continue;
            info.used_in_sifting = true;
            if (option == "min")
                info.minimizing = true;
        }
        pctx.is_inside_of_these_sa_subexpressions.assign(tools.retrieval_info.size(), false);
        /* Other info will be filled once a tracking-unit with such name will be actually found in regex */
    }
}
/* Consumes a run of header commands (!~, !dfa, !select) at the current position.
 * `hn` tells which header is being read: !select is accepted only when hn == 1.
 * Stops at the first character that is not '!' or at the first non-header command, in
 * which case the position is rewound to the start of that command. */
void jump_into_madness(ctx_t& ctx, ParsingContext& pctx, FA_Container &fa, int hn){
    for (;;) {
        const int32_t pch = peep(ctx); aux_ERROR_CHECK;
        if (pch != U'!')
            return;

        const size_t before_it = ctx.pos;
        Command cmd = command_expr_parse(ctx); aux_ERROR_CHECK;

        if (cmd.tilda){
            /* Everything after "!~" is a comment tail: remember where it starts and skip to the end */
            ctx.have_comment_tail = true;
            ctx.comment_tail_start = ctx.pos;
            ctx.pos = ctx.input_size;
            continue;
        }
        if (is_header_dfa_cmd(cmd)){
            dfa_command_processing(ctx, pctx, cmd);
            continue;
        }
        if (is_header_select_cmd(cmd)){
            if (hn != 1)
                aux_THROW("!select command at the wrong place");
            select_command_processing(ctx, pctx, cmd);
            continue;
        }
        assert(!is_header_cmd(cmd));
        ctx.pos = before_it;
        return;
    }
}
/* Reads the leading header commands, then (unless a comment tail swallowed the rest of the
 * input) asks for the fork level to parse the body into `result`. */
chekushka TopLvl_ParseCall::firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) {
    result.assertDefault();
    jump_into_madness(ctx, pctx, fa, 1);
    if (!ctx.have_comment_tail)
        return std::make_unique<ForkLvl_ParseCall>(result);
    return NULL;
}
/* Reads the trailing header commands and requires that nothing but the end of input follows. */
chekushka TopLvl_ParseCall::afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) {
    jump_into_madness(ctx, pctx, fa, 2);
    const bool at_end = isEnd(ctx);
    if (!at_end)
        call_THROW("top lvl: EOF expected");
    return NULL;
}
/* ********************************* Bracket */
/* Consumes the opening '(' and asks for the fork level to parse the bracket contents into
 * tmp_ret_buff. For a named subexpression stored in the selection array it also checks
 * that the same subexpression is not already open (nesting is an error) and marks it open. */
chekushka BracketLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    result.assertDefault();
    /* The read must not live inside assert(): with NDEBUG the expression is not evaluated
     * and the '(' would never be consumed. */
    const auto opening = readChar(ctx);
    assert(opening == U'(');
    (void)opening;
    /* sequence lvl already took care about resolving name and configuring SubtrackingNameInfo */
    if (namedSubexpressionId >= 0){
        assert(ctx.ktr.retrieval_info[namedSubexpressionId].type == tracking_var_types::range);
        if (ctx.ktr.retrieval_info[namedSubexpressionId].stored_in_sa){
            assert(namedSubexpressionId < (int64_t)pctx.is_inside_of_these_sa_subexpressions.size());
            if (pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId])
                call_THROW("subexpression that selection array tracks is nested");
            pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = true;
        }
    }
    return std::make_unique<ForkLvl_ParseCall>(tmp_ret_buff);
}
/* Consumes the closing ')' and publishes the parsed contents as `result`.
 * For a named subexpression the contents get wrapped into position-recording nodes:
 * first the collection-array pair (byte positions), then - around that - the
 * selection-array pair (char positions). The selection-array case also marks the
 * subexpression as closed again. */
chekushka BracketLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    const bool closed = (peep(ctx) == U')');
    if (!closed)
        call_THROW("missing ')'");
    readChar(ctx);
    result = tmp_ret_buff;
    if (namedSubexpressionId < 0)
        return NULL;

    SubtrackingNameInfo& tai_slots = ctx.ktr.retrieval_info[namedSubexpressionId];
    if (tai_slots.stored_in_ca){
        assert(tai_slots.colarr_first >= 0 && tai_slots.colarr_first < UINT16_MAX);
        assert(tai_slots.colarr_second >= 0 && tai_slots.colarr_second < UINT16_MAX);
        /* opening marker goes in front ... */
        result = join(
            subexpression_from_path(
                fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_first)),
            result);
        /* ... closing marker goes behind */
        result = join(
            result,
            subexpression_from_path(
                fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS, tai_slots.colarr_second)));
    }
    if (tai_slots.stored_in_sa){
        assert(tai_slots.selarr_first >= 0 && tai_slots.selarr_first < UINT16_MAX);
        assert(tai_slots.selarr_second >= 0 && tai_slots.selarr_second < UINT16_MAX);
        result = join(
            subexpression_from_path(
                fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_first)),
            result);
        result = join(
            result,
            subexpression_from_path(
                fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS, tai_slots.selarr_second)));
        pctx.is_inside_of_these_sa_subexpressions[namedSubexpressionId] = false;
    }
    return NULL;
}
/* ******************************* Fork */
/* Opens a new alternative of the fork and asks the sequence level to parse into it. */
chekushka ForkLvl_ParseCall::firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    result.assertDefault();
    /* The freshly appended option is empty; the sequence level overwrites it */
    options.emplace_back();
    auto& fresh_option = options.back();
    return std::make_unique<Sequence_ParseCall>(fresh_option);
}
/* Runs after one alternative has been parsed. A following '|' starts the next alternative;
 * anything else ends the fork, and all collected options are combined into `result`. */
chekushka ForkLvl_ParseCall::afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) {
    const int32_t end_reason = peep(ctx); call_ERROR_CHECK;
    if (end_reason != U'|'){
        result = forkify(options, fa);
        return NULL;
    }
    readChar(ctx);
    return firstTime(ctx, pctx, fa);
}
/* Parses the whole regular expression into `result`, creating nodes in `fa`, and then
 * builds the priority table in `pctx` from the tracking tools marked as used in sifting.
 * Parsing uses an explicit stack of ParseCall frames instead of recursion. Returns early
 * (leaving the error in `ctx`) as soon as an error has been reported. */
void parseBody(REGEX_IS024_MeaningContext& ctx, FA_Container& fa, SubExprCompiled& result, ParsingContext& pctx){
    std::vector<std::shared_ptr<ParseCall>> callStack;
    callStack.push_back(std::make_unique<TopLvl_ParseCall>(result));
    for (bool entering = true; !callStack.empty(); ) {
        aux_ERROR_CHECK;
        auto nxt = entering ? callStack.back()->firstTime(ctx, pctx, fa)
                            : callStack.back()->afterReceive(ctx, pctx, fa);
        if (nxt){
            /* The frame asked for a child: suspend it and enter the child */
            callStack.push_back(std::move(nxt));
            entering = true;
        } else {
            /* The frame is done: resume its parent */
            callStack.pop_back();
            entering = false;
        }
    }
    /* Generating priority table (sifting program) */
    for (const SubtrackingNameInfo& sni: ctx.ktr.retrieval_info) {
        if (!sni.discovered)
            aux_THROW("tracking tool named in !select is not used anywhere");
        if (!sni.used_in_sifting)
            continue;
        assert(sni.selarr_first >= 0);
        assert((sni.type == tracking_var_types::range) == (sni.selarr_second != -1));
        pctx.priority_table.emplace_back(sni.minimizing, sni.selarr_first, sni.selarr_second, sni.type);
    }
}
/* Compiles the regular expression given by (input, inputSize) right in the constructor.
 * Stages: parse into a graph (fa) -> first-stage fix (fa_1f) -> determinization or the
 * regular second-stage fix (fa_2f) -> bytecode (compiled_program).
 * Failure is signalled through the `error` flag (set by report()); the constructor then
 * returns early. */
REGEX_IS024_MeaningContext::REGEX_IS024_MeaningContext(size_t inputSize, const char *input) : input_size(inputSize),
input(reinterpret_cast<const uint8_t *>(input)) {
CommonCodesets codeset_collection;
/* One container per stage of the pipeline */
FA_Container fa;
FA_Container fa_1f;
FA_Container fa_2f;
SubExprCompiled result;
ParsingContext pctx(codeset_collection);
parseBody(*this, fa, result, pctx);
/* CLion gone crazy here. It thinks error is always false (It doesn't know about such thing as macros) */
if (error)
return;
/* Terminate the parsed expression with a match node: an empty expression starts directly
 * at the match node, otherwise every dangling end is attached to it */
FA_NodeOfMatch* matcher = fa.makeMatch();
if (!result.start){
fa.start = matcher;
} else {
fa.start = result.start;
for (FA_Node** ending: result.ends)
reattach_fa_node_edge(ending, matcher);
}
/* fa.start itself counts as a reference to the start node */
fa.start->refs++;
// show_fa_with_sxiv_after_dot(fa, ktr, pctx.priority_table); // todo debug
REGEX_IS024_FA_FirstStageFixInfo info1 = first_stage_fix_fa(fa, fa_1f);
// show_fa_with_sxiv_after_dot(fa_1f, ktr, pctx.priority_table); // todo debug
if (pctx.dfa_cmd_activated) {
/* NOTE(review): both are presumably output parameters of try_determinize_fa, with negative
 * values meaning failure / "had to fork" - confirm against its declaration */
int det_err;
int had_to_fork;
try_determinize_fa(fa_1f, pctx.priority_table, free_selarr_tai, info1, fa_2f, det_err, had_to_fork);
if (det_err < 0 && !pctx.dfa_cmd_nonimportant) {
report(*this, "Unable to determinize dfa");
return;
}
if (pctx.dfa_cmd_unforgiving && had_to_fork < 0) {
report(*this, "Attempt to determinize dfa was not good enough");
return;
}
} else {
regular_second_stage_fix(fa_1f, fa_2f, info1);
}
// show_fa_with_sxiv_after_dot(fa_2f, ktr, pctx.priority_table); // todo debug
/* Non-zero compilation_error is treated as a failure */
int compilation_error;
compile_fa_to_regexis024_bytecode(compiled_program, fa_2f, pctx.priority_table, free_selarr_tai, info1, compilation_error);
if (compilation_error) {
report(*this, "Failed to compile graph representation to bytecode representation");
return;
}
}

View File

@ -0,0 +1,34 @@
#ifndef LIBREGEXIS024_EXPR_COMPILER_H
#define LIBREGEXIS024_EXPR_COMPILER_H
#include <string>
#include <vector>
#include <stdint.h>
// todo: SUPER HIGH PRIORITY: MOVE all these spaces / digits / variable_constituents codesets out of this class
// todo: also PLEASE write `static` before nearly every single little internal function in this library
#include <libregexis024sol/part_of_expr_that_tracks.h>
/* Result of compiling one regular expression. The constructor does all the work; afterwards
 * either `error` is set (with `error_msg` describing the problem) or `compiled_program`
 * holds the bytecode. */
struct REGEX_IS024_MeaningContext{
    /* The expression being compiled; the buffer is not owned and is only borrowed */
    size_t input_size;
    const uint8_t* input;

    bool error = false;
    std::string error_msg;

    /* Current read position inside `input` */
    size_t pos = 0;

    /* Set when a "!~" command was met; comment_tail_start is meaningful only in that case.
     * It is still zero-initialized so that copying the struct never reads an indeterminate value. */
    bool have_comment_tail = false;
    size_t comment_tail_start = 0;

    std::vector<uint8_t> compiled_program;

    /* Named tracking tools discovered in the expression */
    KnownTrackingTools ktr;

    /* NOTE(review): presumably the next free index in selarr / colarr - confirm against the allocator code */
    uint16_t free_selarr_tai = 0;
    uint16_t free_colarr_tai = 0;

    REGEX_IS024_MeaningContext(size_t inputSize, const char *input);
};
#endif //LIBREGEXIS024_EXPR_COMPILER_H

View File

@ -0,0 +1,34 @@
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
#include <libregexis024vm/utils.h>
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/sol_misc_base.h>
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
/* Accepted spellings of the !dfa header command (NULL-terminated) */
const char* header_command_dfa_names[] = {"dfa", "determinize", NULL};
/* Accepted spellings of the !select header command (NULL-terminated) */
const char* header_command_select_names[] = {"s", "select", "selarr", "selectional", NULL};
bool is_header_cmd(const Command &cmd) {
return cmd.tilda || is_header_dfa_cmd(cmd), is_header_dfa_cmd(cmd);
}
/* True when the command name is one of header_command_dfa_names. */
bool is_header_dfa_cmd(const Command &cmd) {
    const char* name = cmd.name.c_str();
    return is_string_in_stringset(name, header_command_dfa_names);
}
/* True when the command name is one of header_command_select_names. */
bool is_header_select_cmd(const Command &cmd) {
    const char* name = cmd.name.c_str();
    return is_string_in_stringset(name, header_command_select_names);
}
/* Parses `str` as a non-negative decimal integer into `res`.
 * Reports an error through `ctx` when a non-digit is met or when the value exceeds `lim`.
 * The limit is checked after every digit. An empty string yields 0 without an error. */
void int_parse_with_limit_concern(const std::string &str, REGEX_IS024_MeaningContext &ctx, size_t &res, int lim) {
    const size_t limit = (size_t)lim;
    res = 0;
    for (size_t i = 0; i < str.size(); ++i){
        const char ch = str[i];
        if (ch < '0' || ch > '9')
            aux_THROW("bad integer argument");
        res = res * 10 + (ch - '0');
        if (res > limit)
            aux_THROW("integer is too big");
    }
}

View File

@ -0,0 +1,13 @@
/* Internal use only */
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H
#include <libregexis024sol/special_terminals.h>
/* Tells whether `cmd` is a header-level command (see the definition for the exact rule) */
bool is_header_cmd(const Command& cmd);
/* Tells whether the name of `cmd` is one of the spellings of the !dfa command */
bool is_header_dfa_cmd(const Command& cmd);
/* Tells whether the name of `cmd` is one of the spellings of the !select command */
bool is_header_select_cmd(const Command& cmd);
/* Parses `str` as a decimal integer into `res`; reports an error through `ctx` on a non-digit
 * character or when the value exceeds `lim` */
void int_parse_with_limit_concern(const std::string& str, REGEX_IS024_MeaningContext &ctx, size_t& res, int lim);
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_COMMAND_RECOGNITION_H

View File

@ -0,0 +1,222 @@
#include <libregexis024sol/expr_parse_functions/epf.h>
#include <assert.h>
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
#include <libregexis024sol/sol_misc_base.h>
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/special_terminals.h>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024sol/square_bracket_expression.h>
#include <libregexis024sol/expr_parse_functions/command_recognition.h>
#include <libregexis024fa/misc_fa_funcs.h>
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
#define call_THROW(str) do { report(ctx, "regex: " str); return NULL; } while (0)
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
/* **************************** Sequence */
/* Parses one backslash expression at sequence level and stores the resulting subexpression
 * in `backPart`. The input must be positioned at the backslash.
 *   \b  word boundary:     (non-word behind, word ahead) or (word behind, non-word ahead)
 *   \B  not a boundary:    (word behind, word ahead) or (non-word behind, non-word ahead)
 *   \<  start of a word:   non-word behind, word ahead
 *   \>  end of a word:     word behind, non-word ahead
 *   anything else is delegated to backslash_expression_parsing_try_regular and becomes a
 *   charset-reading filter.
 * Errors are reported through `ctx`. */
void in_case_of_backslash(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc, FA_Container &fa, SubExprCompiled& backPart) {
    /* The read must not live inside assert(): with NDEBUG the expression is not evaluated
     * and the backslash would never be consumed. */
    const auto backslash = readChar(ctx);
    assert(backslash == U'\\');
    (void)backslash;
    int32_t leader = peep(ctx); aux_ERROR_CHECK;
    if (leader == U'b'){
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'B'){
        FA_NodeOfForking* n1 = fa.makeForking();
        FA_NodeOfLookOneBehind* n1a = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead* n2a = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1a, n2a);
        FA_NodeOfLookOneBehind* n1b = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead* n2b = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1b, n2b);
        add_option_to_fork_node(n1, n1a);
        add_option_to_fork_node(n1, n1b);
        backPart.start = n1;
        backPart.ends = {&(n2a->nxt_node), &(n2b->nxt_node)};
    } else if (leader == U'<'){
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(invert_set(cc.word_constituents));
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(cc.word_constituents);
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else if (leader == U'>'){
        FA_NodeOfLookOneBehind *n1 = fa.makeLookOneBehind(cc.word_constituents);
        FA_NodeOfLookOneAhead *n2 = fa.makeLookOneAhead(invert_set(cc.word_constituents));
        reattach_nxt_node(n1, n2);
        backPart.start = n1;
        backPart.ends = {&(n2->nxt_node)};
    } else {
        bool ret_is_multicode; codeset_t res_codeset;
        backslash_expression_parsing_try_regular(ctx, cc, ret_is_multicode, res_codeset);
        backPart = subexpr_charset_reading_filter(res_codeset, fa);
        return; // To avoid reading leader again (it gets read in the end)
    }
    /* Consume the leader of the \b, \B, \< and \> forms */
    readChar(ctx);
}
/* Validates a repeat range and applies it to `patient`.
 * Rejected: min > max, min above REGEXIS024_MAX_REPEAT, and an unbounded maximum
 * (above REGEXIS024_MAX_REPEAT) on a subexpression that can match the empty string. */
void repeat_stuff_with_check(REGEX_IS024_MeaningContext& ctx,
                             SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed){
    const bool bounds_inverted = max_allowed < min_allowed;
    if (bounds_inverted)
        aux_THROW("repeat operation: min > max");

    const bool min_over_limit = min_allowed > REGEXIS024_MAX_REPEAT;
    if (min_over_limit)
        aux_THROW("minimum repeat factor is too high");

    const bool unbounded = max_allowed > REGEXIS024_MAX_REPEAT;
    if (unbounded && patient.can_be_empty)
        aux_THROW("safety abortion: possible infinite loop. Если вы считаете, что ваше регулярное "
                  "выражение корректно и не вызвает бесконечного цикла, напишите об этом в жалобную книгу: "
                  "По ссылке: file:///dev/null Ваши предложения по улучшению libregexis024 обязательно будут рассмотрены.");

    apply_repeat_to_subexpression(patient, fa, min_allowed, max_allowed);
}
/* Applies a !repeat (or !r) command to the last parsed part.
 *   no arguments / one empty argument - zero or more times
 *   one argument N                    - exactly N times
 *   two arguments MIN;MAX             - between MIN and MAX times; an empty MIN means 0,
 *                                       an empty MAX means "unbounded"
 * Numbers are limited by REGEXIS024_MAX_REPEAT; "unbounded" is REGEXIS024_MAX_REPEAT + 1. */
void repeat_command_processing(REGEX_IS024_MeaningContext &ctx, FA_Container &fa, std::vector<SubExprCompiled>& parts,
                               const Command& cmd){
    if (parts.empty())
        aux_THROW("no subexpression before !repeat command");

    const auto& args = cmd.arguments;
    size_t min_allowed = 0;
    size_t max_allowed = REGEXIS024_MAX_REPEAT + 1;
    switch (args.size()) {
        case 0:
            break;
        case 1:
            if (!args[0].is_empty) {
                int_parse_with_limit_concern(args[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT);
                aux_ERROR_CHECK;
                max_allowed = min_allowed;
            }
            break;
        case 2:
            if (!args[0].is_empty) {
                int_parse_with_limit_concern(args[0].name, ctx, min_allowed, REGEXIS024_MAX_REPEAT);
                aux_ERROR_CHECK;
            }
            if (!args[1].is_empty) {
                int_parse_with_limit_concern(args[1].name, ctx, max_allowed, REGEXIS024_MAX_REPEAT);
                aux_ERROR_CHECK;
            }
            if (min_allowed > max_allowed)
                aux_THROW("!repeat: min > max");
            break;
        default:
            aux_THROW("too many arguments in !repeat command");
    }
    repeat_stuff_with_check(ctx, parts.back(), fa, min_allowed, max_allowed); aux_ERROR_CHECK;
}
/* Parses a sequence of regex terms (commands, escapes, anchors, repeat operators, tracking
 * units, bracket groups, charsets and plain characters), appending one compiled
 * subexpression per term to `parts`.
 *
 * Returns a new BracketLvl_ParseCall when a `(`-group (plain or named via `#name(`) has to be
 * parsed; this function is entered again through afterReceive() and continues with the
 * already collected `parts`.
 * Returns NULL when the sequence ended (end of input, `)`, `|`, `]` or a header command):
 * in that case all parts are joined into `result`. NULL is also returned after an error was
 * reported into `ctx`; callers must check ctx.error. */
chekushka Sequence_ParseCall::firstTime(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    while (true) {
        int32_t fst = peep(ctx);
        call_ERROR_CHECK;
        if (fst == U'!') {
            Command cmdBuf;
            /* Remembered so that a header command can be "unread" and left to the caller */
            size_t before_cmd = ctx.pos;
            cmdBuf = command_expr_parse(ctx);
            call_ERROR_CHECK;
            if (is_header_cmd(cmdBuf)){
                ctx.pos = before_cmd;
                break;
            } else if (cmdBuf.name == "r" || cmdBuf.name == "repeat"){
                repeat_command_processing(ctx, fa, parts, cmdBuf); call_ERROR_CHECK;
            } else if (is_command_for_charset(cmdBuf)){
                codeset_t cs;
                interpret_command_as_charset_giving(pctx.cc, cmdBuf, cs); call_ERROR_CHECK;
                parts.push_back(subexpr_charset_reading_filter(cs, fa));
            } else {
                call_THROW("unknown command");
            }
        } else if (fst == U'\\') {
            parts.emplace_back();
            in_case_of_backslash(ctx, pctx.cc, fa, parts.back());
            call_ERROR_CHECK;
        } else if (fst == U'^'){
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneBehind(codeset_of_one_char(U'\n'))));
        } else if (fst == U'$'){
            readChar(ctx);
            parts.push_back(subexpression_from_path(fa.makeLookOneAhead(codeset_of_one_char(U'\n'))));
        } else if (fst == U'*'){
/* A postfix repeat operator needs a preceding subexpression; consumes the operator char */
#define vibe_check(sn) if (parts.empty()) { call_THROW("no subexpression before `" sn "` operator"); } readChar(ctx);
            vibe_check("*")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'+'){
            vibe_check("+")
            repeat_stuff_with_check(ctx, parts.back(), fa, 1, REGEXIS024_MAX_REPEAT + 1); call_ERROR_CHECK;
        } else if (fst == U'?'){
            vibe_check("?")
            repeat_stuff_with_check(ctx, parts.back(), fa, 0, 1); call_ERROR_CHECK;
#undef vibe_check
        } else if (fst == U'#'){
            /* Tracking unit: `#name(`...`)`, `#name:value;` or `#name;` */
            readChar(ctx);
            std::string name = tryRead_REGEX024_name(ctx); call_ERROR_CHECK;
            if (name.empty())
                call_THROW("No name provided after #");
            if (ctx.ktr.track_names.count(name) == 0){
                ctx.ktr.track_names[name] = static_cast<int64_t>(ctx.ktr.retrieval_info.size());
                ctx.ktr.retrieval_info.emplace_back();
            }
            int64_t id = ctx.ktr.track_names[name];
            int32_t typeDet = peep(ctx);
            call_ERROR_CHECK;
            if (typeDet == U'('){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::range); call_ERROR_CHECK;
                parts.emplace_back();
                return std::make_unique<BracketLvl_ParseCall>(parts.back(), id);
            } else if (typeDet == U':'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_immediate); call_ERROR_CHECK;
                readChar(ctx);
                std::string value_str = tryRead_REGEX024_name(ctx);
                call_ERROR_CHECK;
                /* Initialised and error-checked: a failed parse must not leak an
                 * indeterminate immediate into the automaton */
                size_t value = 0;
                int_parse_with_limit_concern(value_str, ctx, value, UINT16_MAX);
                call_ERROR_CHECK;
                int32_t cl = peep(ctx);
                if (cl != U';')
                    call_THROW("Missing ; after dot track unit operator");
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_SELARR_IMM,
                            ctx.ktr.retrieval_info[id].selarr_first, value)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovImm(regex024_opcodes::MOV_COLARR_IMM,
                            ctx.ktr.retrieval_info[id].colarr_first, value)));
            } else if (typeDet == U';'){
                ensure_space_for_track_unit(ctx, name, tracking_var_types::dot_cur_pos); call_ERROR_CHECK;
                readChar(ctx);
                if (ctx.ktr.retrieval_info[id].stored_in_sa)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_SELARR_CHPOS,
                            ctx.ktr.retrieval_info[id].selarr_first)));
                if (ctx.ktr.retrieval_info[id].stored_in_ca)
                    parts.emplace_back(subexpression_from_path(
                        fa.makeTrackArrayMovHalfinvariant(regex024_opcodes::MOV_COLARR_BTPOS,
                            ctx.ktr.retrieval_info[id].colarr_first)));
            } else
                call_THROW("Missing ; or ( in the beginning of tracking unit");
        } else if (fst == U'(') {
            /* Plain (unnamed) group: -1 marks "no tracking unit" */
            parts.emplace_back();
            return std::make_unique<BracketLvl_ParseCall>(parts.back(), -1);
        } else if (fst == U'[') {
            codeset_t filter = sq_bracket_expr_parse(ctx, pctx.cc); call_ERROR_CHECK;
            parts.push_back(subexpr_charset_reading_filter(filter, fa));
        } else if (fst >= 0 && fst != U')' && fst != U'|' && fst != U']'){
            /* Ordinary character: matches itself */
            readChar(ctx);
            parts.push_back(subexpr_charset_reading_filter(codeset_of_one_char(fst), fa));
        } else {
            /* End of input or a character that terminates the sequence */
            break;
        }
    }
    for (SubExprCompiled& part: parts)
        result = join(result, part);
    return NULL;
}
/* Re-entry point of the sequence parser. */
chekushka Sequence_ParseCall::afterReceive(REGEX_IS024_MeaningContext &ctx, ParsingContext &pctx, FA_Container &fa) {
    /* The only child call a sequence ever spawns is a bracket expression, so once it has
     * delivered its result we simply keep scanning the rest of the sequence. */
    chekushka follow_up = firstTime(ctx, pctx, fa);
    return follow_up;
}

View File

@ -0,0 +1,74 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H
/* For internal usage only */
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/common_codesets.h>
#include <libregexis024fa/finite_automaton.h>
#include <memory>
#include <libregexis024sol/subexpr_fa_transformed.h>
#include <assert.h>
#include <libregexis024fa/selarr_priority_table.h>
/* State shared by all ParseCall objects while one expression is being parsed. */
struct ParsingContext{
/* Subexpressions that are tracked by the selection array (sa) are forbidden from nesting
 * inside themselves. NOTE(review): presumably indexed by tracking unit id - confirm. */
std::vector<bool> is_inside_of_these_sa_subexpressions;
bool select_cmd_encountered = false;
RegexPriorityTable priority_table;
bool dfa_cmd_activated = false;
/* With this flag on, completely failing to build the dfa does not result in an error */
bool dfa_cmd_nonimportant = false;
/* With this flag on, the dfa must be absolutely pure: no forks are allowed. */
bool dfa_cmd_unforgiving = false;
/* Reference to the active common-codesets object. It is stored here only to lower the
 * number of arguments of the ParseCall methods. */
const CommonCodesets& cc;
explicit ParsingContext(const CommonCodesets& cc_): cc(cc_){}
};
typedef REGEX_IS024_MeaningContext ctx_t;
struct ParseCall;
typedef std::unique_ptr<ParseCall> chekushka;
/* One frame of the explicit parsing call stack.
 * `firstTime` runs when the frame is entered; `afterReceive` is the re-entry point used after a
 * child frame finished (Sequence_ParseCall::afterReceive shows this pattern).
 * Both return the next child frame to run, or a null pointer when this frame is done.
 * `result` refers to the storage the frame writes its compiled subexpression into. */
struct ParseCall{
    SubExprCompiled& result;
    explicit ParseCall(SubExprCompiled &result) : result(result) {}
    virtual ~ParseCall() = default;
    /* The defaults must never be called. The explicit return keeps them well-defined when
     * assert() is compiled out (NDEBUG): a non-void function must not flow off its end. */
    virtual chekushka afterReceive(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); return nullptr; }
    virtual chekushka firstTime(ctx_t& ctx, ParsingContext& pctx, FA_Container& fa) { assert(false); return nullptr; }
};
/* Parse call for the top level of an expression; both hooks are defined out of line.
 * NOTE(review): presumably the root frame of the call stack - confirm against the driver. */
struct TopLvl_ParseCall: public ParseCall{
explicit TopLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {}
chekushka afterReceive(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override;
chekushka firstTime(ctx_t &ctx, ParsingContext &pctx, FA_Container &fa) override;
};
/* Parse call for a `(`...`)` group, either plain or named (`#name(`...`)`). */
struct BracketLvl_ParseCall: public ParseCall{
/* -1 if this is a normal bracket expression. Otherwise, it is an index into the
 * ctx.ktr.retrieval_info vector (the id of the tracking unit this group belongs to) */
int64_t namedSubexpressionId;
/* NOTE(review): presumably the buffer a child call writes its result into - confirm */
SubExprCompiled tmp_ret_buff;
explicit BracketLvl_ParseCall(SubExprCompiled& result, int64_t namedSubexpressionId) :
ParseCall(result), namedSubexpressionId(namedSubexpressionId) {}
chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
};
/* Parse call that collects alternatives into `options`.
 * NOTE(review): presumably one entry per `|`-separated branch - confirm in the implementation. */
struct ForkLvl_ParseCall: public ParseCall{
    std::vector<SubExprCompiled> options;
    explicit ForkLvl_ParseCall(SubExprCompiled &result) : ParseCall(result) {}
    /* `override` makes the compiler verify these really replace the ParseCall hooks */
    chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
    chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
};
/* Parse call for a sequence of consecutive terms. Every parsed term is appended to `parts`;
 * when the sequence ends, firstTime() joins all parts into `result`. */
struct Sequence_ParseCall: public ParseCall{
    std::vector<SubExprCompiled> parts;
    explicit Sequence_ParseCall(SubExprCompiled &result) :ParseCall(result) {}
    /* `override` makes the compiler verify these really replace the ParseCall hooks */
    chekushka afterReceive(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
    chekushka firstTime(REGEX_IS024_MeaningContext& ctx, ParsingContext& pctx, FA_Container& fa) override;
};
/* Some auxilary functions */
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_EXPR_PARSE_FUNCTIONS_EPF_H

View File

@ -0,0 +1,38 @@
#include <libregexis024sol/expr_parse_functions/tracking_units.h>
#include <libregexis024sol/sol_misc_base.h>
#define aux_THROW(str) do { report(ctx, "regex: " str); return; } while (0)
#define aux_ERROR_CHECK do { if (ctx.error) { return; } } while (0)
/* Reserves tracking-array slots for one tracking unit in one array (selection or collection).
 *
 * free_ARR_tai - next free index of that array; advanced for every slot taken
 * ARR_first    - receives the first reserved index
 * ARR_second   - receives the second reserved index (only for tracking_var_types::range)
 * ARR_NAME     - human readable array name, used in the error message
 *
 * When free_ARR_tai has reached UINT16_MAX an error is reported into ctx and the function
 * returns early; a slot taken before the failure stays taken. */
void for_one_type(REGEX_IS024_MeaningContext &ctx, uint16_t& free_ARR_tai, int& ARR_first, int& ARR_second,
                  const std::string& ARR_NAME, tracking_var_type type){
    /* A local lambda replaces the former function-like macro, which was never #undef-ed
     * and therefore leaked into the rest of the translation unit. */
    auto slot_is_available = [&]() -> bool {
        if (free_ARR_tai != UINT16_MAX)
            return true;
        report(ctx, ("regex: " + ARR_NAME + ": key namespace overflow").c_str());
        return false;
    };
    if (!slot_is_available())
        return;
    ARR_first = free_ARR_tai++;
    if (type == tracking_var_types::range){
        if (!slot_is_available())
            return;
        ARR_second = free_ARR_tai++;
    }
}
/* Makes sure the tracking unit `name` owns its tracking-array slots.
 * On first use the unit gets `type` and slots in every array it is stored in (collection
 * array first, then selection array). On later uses only the type is verified; a mismatch
 * is reported into ctx. */
void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type) {
    size_t unit_index = ctx.ktr.track_names[name];
    /* The vector is not resized inside this function, so the reference stays valid */
    SubtrackingNameInfo& unit = ctx.ktr.retrieval_info[unit_index];
    if (unit.discovered){
        if (unit.type != type)
            aux_THROW("tracking tool unit type mismatch");
        return;
    }
    unit.type = type;
    if (unit.stored_in_ca) {
        for_one_type(ctx, ctx.free_colarr_tai, unit.colarr_first, unit.colarr_second, "collection array", type);
        aux_ERROR_CHECK;
    }
    if (unit.stored_in_sa) {
        for_one_type(ctx, ctx.free_selarr_tai, unit.selarr_first, unit.selarr_second, "selection array", type);
        aux_ERROR_CHECK;
    }
    unit.discovered = true;
}

View File

@ -0,0 +1,10 @@
/* For internal use only */
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H
#include <libregexis024sol/expr_compiler.h>
void ensure_space_for_track_unit(REGEX_IS024_MeaningContext &ctx, const std::string& name, tracking_var_type type);
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_TRACKING_UNITS_H

View File

@ -0,0 +1,2 @@
// #include <libregexis024sol/part_of_expr_that_tracks.h>

View File

@ -0,0 +1,31 @@
#ifndef PART_OF_EXPR_THAT_TRACKS_H
#define PART_OF_EXPR_THAT_TRACKS_H
#include <map>
#include <vector>
#include <string>
#include <libregexis024fa/tracking_variables.h>
/* Everything known about one named tracking unit. */
struct SubtrackingNameInfo{
/* Whether the unit is stored in the collection array (ca) / the selection array (sa) */
bool stored_in_ca = true;
bool stored_in_sa = false;
/* Set once the unit's type is fixed and its array slots are reserved */
bool discovered = false;
/* Has no default initializer; it is only assigned when the unit gets discovered */
tracking_var_type type;
/* These fields will be -1 if unused */
int colarr_first = -1;
int colarr_second = -1;
bool used_in_sifting = false;
bool minimizing = false;
/* Selection array indexes; -1 if unused, same as the collection array ones */
int selarr_first = -1;
int selarr_second = -1;
};
/* Registry of all tracking units met so far in an expression. */
struct KnownTrackingTools {
/* Maps a unit name to its index in retrieval_info */
std::map<std::string, int64_t> track_names;
std::vector<SubtrackingNameInfo> retrieval_info;
};
#endif //PART_OF_EXPR_THAT_TRACKS_H

View File

@ -0,0 +1,55 @@
#include <libregexis024sol/sol_misc_base.h>
#include <libregexis024vm/utils.h>
/* Records `error` in the context. Only the first reported error is kept; later reports
 * are silently ignored. */
void report(REGEX_IS024_MeaningContext &ctx, const char *error) {
    if (ctx.error)
        return;
    ctx.error = true;
    ctx.error_msg = error;
}
/* Tells whether the read position has reached the end of the input. */
bool isEnd(REGEX_IS024_MeaningContext &ctx) {
    const bool input_exhausted = (ctx.pos == ctx.input_size);
    return input_exhausted;
}
/* Decodes the code point at the current position without consuming it.
 * Returns -1 at the end of input (this is not an error). If decoding yields a negative
 * value, "encoding error" is reported and that negative value is returned. */
int32_t peep(REGEX_IS024_MeaningContext &ctx) {
    if (isEnd(ctx))
        return -1; // The one place where a negative return value does not imply an error
    int32_t code_point;
    size_t byte_len;
    utf8_string_iterat(code_point, byte_len, ctx.pos, ctx.input, ctx.input_size);
    const bool decoded_fine = (code_point >= 0);
    if (!decoded_fine)
        report(ctx, "encoding error");
    return code_point;
}
/* Decodes the code point at the current position and, when decoding succeeded, advances
 * the position past it. A negative decoding result is reported as an error and returned
 * as is, leaving the position untouched. */
int32_t readChar(REGEX_IS024_MeaningContext &ctx) {
    int32_t code_point;
    size_t byte_len;
    utf8_string_iterat(code_point, byte_len, ctx.pos, ctx.input, ctx.input_size);
    if (code_point < 0)
        report(ctx, "bruh what?? How this even happened");
    else
        ctx.pos += byte_len;
    return code_point;
}
/* A name may consist of ASCII digits and ASCII letters of either case only. */
bool is_REGEX024_nameConstituent(int32_t ch) {
    if (ch >= '0' && ch <= '9')
        return true;
    if (ch >= 'a' && ch <= 'z')
        return true;
    return ch >= 'A' && ch <= 'Z';
}
/* Consumes the longest run of name constituents at the current position and returns it.
 * The result is empty when the first peeped character is not a name constituent. */
std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext &ctx) {
    std::string name;
    for (int32_t ch = peep(ctx); is_REGEX024_nameConstituent(ch); ch = peep(ctx)){
        name += (char)ch;
        readChar(ctx);
    }
    return name;
}

View File

@ -0,0 +1,20 @@
/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H
#include <libregexis024sol/expr_compiler.h>
#include <string>
void report(REGEX_IS024_MeaningContext& ctx, const char* error);
bool isEnd(REGEX_IS024_MeaningContext& ctx);
int32_t peep(REGEX_IS024_MeaningContext& ctx);
int32_t readChar(REGEX_IS024_MeaningContext& ctx);
bool is_REGEX024_nameConstituent(int32_t ch);
/* Name in my library consists of [0-9a-zA-Z]. If the first peeped letter is not name constituent,
* empty string is returned */
std::string tryRead_REGEX024_name(REGEX_IS024_MeaningContext& ctx);
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SOL_MISC_BASE_H

View File

@ -0,0 +1,36 @@
/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/common_codesets.h>
/* This option of backslash usage should be checked last.
* Function can generate error. Always check the error first */
void
backslash_expression_parsing_try_regular(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc,
bool& ret_is_multicode, codeset_t& ret_set);
struct CommandEntity;
struct Command;
struct CommandArgument;
/* Common part of a command and of a command argument: a name plus a list of
 * arguments, which themselves are CommandEntity-based (see CommandArgument). */
struct CommandEntity{
std::string name;
std::vector<CommandArgument> arguments;
};
/* One argument of a command. `is_empty` defaults to true.
 * NOTE(review): presumably cleared by the parser once the argument has content - confirm. */
struct CommandArgument: CommandEntity{
bool is_empty = true;
};
/* A whole parsed command.
 * NOTE(review): `tilda` presumably records a `~` modifier in the command syntax - confirm. */
struct Command: CommandEntity{
bool tilda = false;
};
/* Beware: this function reports its errors into the context, so check ctx.error after
 * every call before using the returned command. */
Command command_expr_parse(REGEX_IS024_MeaningContext& ctx);
bool is_command_for_charset(const Command& cmd);
void interpret_command_as_charset_giving(const CommonCodesets& cc, const Command& cmd, codeset_t& ret);
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SPECIAL_TERMINALS_H

View File

@ -0,0 +1,189 @@
#include <libregexis024sol/square_bracket_expression.h>
#include <libregexis024sol/sol_misc_base.h>
#include <libregexis024sol/special_terminals.h>
#include <vector>
#include <memory>
#include <assert.h>
/* Tells whether a peeped code point may start a single character or a character range.
 * A backslash is accepted here; the caller must later check that the backslash expression
 * is not multi-character or empty. */
bool soundsLikeCharOrRangeStart(int32_t peeped) {
    if (peeped < 0)
        return false;
    switch (peeped) {
        case '[':
        case ']':
        case '!':
        case '^':
        case '&':
        case '-':
            return false;
        default:
            return true;
    }
}
typedef REGEX_IS024_MeaningContext ctx_t;
struct ParseCall;
typedef std::shared_ptr<ParseCall> chekushka;
/* One frame of the explicit call stack used by sq_bracket_expr_parse.
 * `firstTime` runs when the frame is entered, `afterReceive` after a child frame finished.
 * Both return the next child frame to run, or a null pointer when this frame is done.
 * `result` refers to the codeset the frame writes into. */
struct ParseCall{
    codeset_t& result;
    explicit ParseCall(codeset_t &result) : result(result) {}
    virtual ~ParseCall() = default;
    /* The defaults must never be called. The explicit return keeps them well-defined when
     * assert() is compiled out (NDEBUG): a non-void function must not flow off its end. */
    virtual chekushka afterReceive(ctx_t& ctx, const CommonCodesets& cc) { assert(false); return nullptr; }
    virtual chekushka firstTime(ctx_t& ctx, const CommonCodesets& cc) { assert(false); return nullptr; }
};
#define call_ERROR_CHECK do { if (ctx.error) { return NULL; } } while (0)
#define call_THROW(str) do { report(ctx, "square bracket expression: " str); return NULL; } while (0)
/* Level 0: a whole `[...]` expression. Consumes the opening `[`, delegates the content to
 * FirstLvl_ParseCall and then requires the closing `]`. */
struct ZeroLvl_ParseCall: public ParseCall{
explicit ZeroLvl_ParseCall(codeset_t &result) : ParseCall(result) {}
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
};
/* Level 1: `...&...&...`, an intersection of operands separated by `&`. */
struct FirstLvl_ParseCall: public ParseCall{
/* Receives every operand except the first one (the first is written straight to `result`) */
codeset_t ret_buf_for_new;
/* False until the first operand has been received */
bool got_one = false;
explicit FirstLvl_ParseCall(codeset_t& result) : ParseCall(result) {}
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
};
/* Level 2: a union of items written one after another, e.g. `ab[...]cd[...]ef`:
 * characters, ranges, charset commands, nested `[...]` and `^...` items. */
struct SecondLvl_ParseCall: public ParseCall{
/* Holds the item parsed last; child calls write their result here */
codeset_t ret_buf_for_new;
explicit SecondLvl_ParseCall(codeset_t& result) : ParseCall(result) {}
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
};
/* `^...`: the inversion of everything that follows the circumflex. */
struct CircumflexLvl_ParseCall: public ParseCall{
/* Receives the set that is going to be inverted */
codeset_t ret_buf_for_new;
explicit CircumflexLvl_ParseCall(codeset_t& result) : ParseCall(result) {}
chekushka afterReceive(ctx_t &ctx, const CommonCodesets& cc) override;
chekushka firstTime(ctx_t &ctx, const CommonCodesets& cc) override;
};
/* ********* ZeroLvl_ParseCall ********** */
/* Consumes the opening `[` and hands the bracket content over to a FirstLvl_ParseCall
 * that writes into the same `result`. */
chekushka ZeroLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    /* The read must happen outside of assert(): with NDEBUG the whole assert expression
     * disappears and the `[` would never be consumed. */
    const int32_t opening = readChar(ctx);
    assert(opening == U'[');
    (void)opening;
    return std::make_shared<FirstLvl_ParseCall>(result);
}
/* Runs after the bracket content was parsed: requires and consumes the closing `]`. */
chekushka ZeroLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    const int32_t closing = peep(ctx);
    if (closing != U']')
        call_THROW("lvl 0: missing ]");
    readChar(ctx);
    return NULL;
}
/* ********* FirstLvl_ParseCall ********** */
/* Starts parsing the first operand of the intersection; it is written straight to `result`. */
chekushka FirstLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    chekushka first_operand = std::make_shared<SecondLvl_ParseCall>(result);
    return first_operand;
}
/* Runs after an operand of the intersection was parsed. The first operand already sits in
 * `result`; every later one arrives in `ret_buf_for_new` and is intersected into `result`.
 * If a `&` follows, it is consumed and parsing of the next operand is started. */
chekushka FirstLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    if (got_one)
        result = intersect_sets(result, ret_buf_for_new);
    else
        got_one = true;
    if (peep(ctx) == U'&'){
        readChar(ctx);
        /* SecondLvl_ParseCall merges into the set it is given, so the buffer has to be
         * emptied first. Otherwise the previous operand leaks into the next one and
         * `a&b&c` gets intersected with `b` united with `c` instead of with `c`. */
        ret_buf_for_new = codeset_t();
        return std::make_shared<SecondLvl_ParseCall>(ret_buf_for_new);
    }
    return NULL;
}
/* ********* SecondLvl_ParseCall ********** */
/* Parses a run of union items and merges each of them into `result`.
 * Returns a child call for `^...` and nested `[...]` items (their result arrives in
 * `ret_buf_for_new` and is merged by afterReceive(), which then resumes this loop).
 * Returns NULL when the next character cannot start an item, or after an error was
 * reported into ctx. */
chekushka SecondLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    while (true) {
        int32_t ch = peep(ctx); call_ERROR_CHECK;
        if (ch == U'^'){
            return std::make_shared<CircumflexLvl_ParseCall>(ret_buf_for_new);
        } else if (ch == U'!'){
            Command cmd = command_expr_parse(ctx); call_ERROR_CHECK;
            if (!is_command_for_charset(cmd))
                call_THROW("second lvl: illegal command");
            interpret_command_as_charset_giving(cc, cmd, ret_buf_for_new);
            result = merge_sets(result, ret_buf_for_new);
        } else if (ch == U'['){
            return std::make_shared<ZeroLvl_ParseCall>(ret_buf_for_new);
        } else if (soundsLikeCharOrRangeStart(ch)){
            readChar(ctx);
            bool bs_multicode = false;
            codeset_t bs_stuff;
            if (ch == '\\'){
                backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff);
                /* The backslash parser can fail; its outputs must not be touched then */
                call_ERROR_CHECK;
                if (bs_multicode){
                    /* A multi-character class can not start a range: merge it and go on */
                    result = merge_sets(result, bs_stuff);
                    continue;
                }
                ret_buf_for_new = codeset_of_one_char(bs_stuff[0].first);
            } else {
                ret_buf_for_new = codeset_of_one_char(ch);
            }
            int32_t mCh = peep(ctx); call_ERROR_CHECK;
            if (mCh == U'-'){
                readChar(ctx);
                int32_t scnd = peep(ctx); call_ERROR_CHECK;
                /* The range end is consumed only after it was found acceptable, so nothing
                 * is read at the end of input or when a delimiter follows the hyphen */
                if (scnd == U'\\'){
                    readChar(ctx);
                    backslash_expression_parsing_try_regular(ctx, cc, bs_multicode, bs_stuff);
                    call_ERROR_CHECK;
                    if (bs_multicode)
                        call_THROW("second lvl: char range: bad escape expression after hyphen");
                    ret_buf_for_new[0].second = bs_stuff[0].first;
                } else if (soundsLikeCharOrRangeStart(scnd)){
                    readChar(ctx);
                    ret_buf_for_new[0].second = (uint32_t)scnd;
                } else {
                    call_THROW("second lvl: char range: bad value after hyphen");
                }
                if (ret_buf_for_new[0].second < ret_buf_for_new[0].first)
                    call_THROW("second: lvl: char range: invalid range");
            }
            result = merge_sets(result, ret_buf_for_new);
        } else {
            return NULL;
        }
    }
}
/* Runs after a child call (`^...` or nested `[...]`) left its set in `ret_buf_for_new`:
 * merges that set into `result` and carries on with the remaining items. */
chekushka SecondLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    result = merge_sets(result, ret_buf_for_new);
    chekushka follow_up = firstTime(ctx, cc);
    return follow_up;
}
/* ********* CircumflexLvl_ParseCall ********* */
/* Consumes the `^` and starts parsing the set to invert into `ret_buf_for_new`. */
chekushka CircumflexLvl_ParseCall::firstTime(ctx_t &ctx, const CommonCodesets& cc) {
    /* The read must happen outside of assert(): with NDEBUG the whole assert expression
     * disappears and the `^` would never be consumed. */
    const int32_t circumflex = readChar(ctx);
    assert(circumflex == U'^');
    (void)circumflex;
    return std::make_shared<FirstLvl_ParseCall>(ret_buf_for_new);
}
/* Runs after the set following `^` was parsed: stores its inversion in `result`. */
chekushka CircumflexLvl_ParseCall::afterReceive(ctx_t &ctx, const CommonCodesets& cc) {
    result = invert_set(ret_buf_for_new);
    /* Nothing more to parse on this level */
    return chekushka();
}
/* Aaaaaaaaand... The function we have all been waiting for so long! */
/* Parses one `[...]` expression starting at the current position and returns its codeset.
 * The recursion is run on an explicit stack of ParseCall frames: a frame is entered through
 * firstTime(), and re-entered through afterReceive() once the child it spawned has finished.
 * An empty set is returned as soon as an error is seen in ctx at the start of an iteration. */
codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext &ctx, const CommonCodesets& cc) {
    std::vector<std::shared_ptr<ParseCall>> frames;
    codeset_t parsed;
    frames.push_back(std::make_shared<ZeroLvl_ParseCall>(parsed));
    /* True when the top frame was just pushed, false when its child has just returned */
    bool top_is_fresh = true;
    while (!frames.empty()){
        if (ctx.error)
            return {};
        std::shared_ptr<ParseCall> spawned;
        if (top_is_fresh)
            spawned = frames.back()->firstTime(ctx, cc);
        else
            spawned = frames.back()->afterReceive(ctx, cc);
        if (spawned){
            frames.push_back(spawned);
            top_is_fresh = true;
        } else {
            frames.pop_back();
            top_is_fresh = false;
        }
    }
    return parsed;
}

View File

@ -0,0 +1,10 @@
/* DO NOT INCLUDE THIS FILE OUTSIDE libregexis024sol implementation */
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024sol/common_codesets.h>
codeset_t sq_bracket_expr_parse(REGEX_IS024_MeaningContext& ctx, const CommonCodesets& cc);
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SQUARE_BRACKET_EXPRESSION_H

View File

@ -0,0 +1,184 @@
#include <libregexis024sol/subexpr_fa_transformed.h>
#include <libregexis024fa/misc_fa_funcs.h>
#include <assert.h>
#include <stdio.h>
/* Builds a subexpression consisting of one node that reads a single character from `codeset`. */
SubExprCompiled subexpr_charset_reading_filter(const codeset_t &codeset, FA_Container &fa) {
    auto reading_node = fa.makeOneCharRead(codeset, false);
    return subexpression_from_path(reading_node);
}
/* Concatenates two subexpressions: every loose end of A is attached to the start of B.
 * A subexpression without a start node is neutral, the other operand is returned as is. */
SubExprCompiled join(const SubExprCompiled &A, const SubExprCompiled &B) {
    if (!A.start)
        return B;
    if (!B.start)
        return A;
    for (FA_Node** loose_end : A.ends)
        reattach_fa_node_edge(loose_end, B.start);
    SubExprCompiled glued;
    glued.start = A.start;
    glued.ends = B.ends;
    /* The concatenation matches the empty string only if both halves do */
    glued.can_be_empty = A.can_be_empty && B.can_be_empty;
    return glued;
}
/* Wraps a single path node into a subexpression whose only loose end is the node's
 * `nxt_node` link. `node` must not be null. */
SubExprCompiled subexpression_from_path(FA_NodePathPart *node) {
    SubExprCompiled wrapped;
    /* one_char_read is the only path node type that consumes a character */
    wrapped.can_be_empty = (node->type != one_char_read);
    wrapped.ends.push_back(&(node->nxt_node));
    wrapped.start = node;
    return wrapped;
}
/* Clones the part of the automaton reachable from `source.start` and returns the copy.
 * The copy gets its own nodes (created through `fa`), its own loose ends and the same
 * `can_be_empty` flag as the source. An empty source yields a default subexpression.
 *
 * `search_mark` of the original nodes is used as scratch space: a negative mark is treated
 * as "not visited yet" and all marks touched here are reset to -1 before returning.
 * NOTE(review): this relies on every reachable node carrying a negative mark on entry. */
SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa) {
    SubExprCompiled res;
    if (!source.start)
        return res;
    /* A copy matches the empty string exactly when the original does */
    res.can_be_empty = source.can_be_empty;
    struct Marked{
        FA_Node *original = NULL, *clone = NULL;
        explicit Marked(FA_Node *original) : original(original) {}
    };
    /* Breadth-first list of discovered nodes; search_mark of an original is its index here */
    std::vector<Marked> searched;
    searched.push_back(Marked(source.start));
    source.start->search_mark = 0;
    for (size_t done = 0; done < searched.size(); done++){
        FA_Node& v = *searched[done].original;
        searched[done].clone = copy_fa_node(v, fa);
        /* The transitions of the fresh clone still point at original nodes */
        for (FA_Node **nxtN: searched[done].clone->get_all_transitions()){
            if (!(*nxtN))
                res.ends.push_back(nxtN);
            else if ((**nxtN).search_mark < 0){
                (**nxtN).search_mark = (int64_t)searched.size();
                searched.emplace_back(*nxtN);
            }
        }
    }
    res.start = searched[0].clone;
    /* Second pass: redirect every transition of every clone to the clone of its target */
    for (Marked& mrkd: searched){
        for (FA_Node **nxtN: mrkd.clone->get_all_transitions()){
            if (*nxtN){
                assert((**nxtN).search_mark >= 0);
                Marked& proc_nxt = searched[(**nxtN).search_mark];
                reattach_fa_node_edge(nxtN, proc_nxt.clone);
            }
        }
    }
    /* Leave the originals unmarked again */
    for (Marked& mrkd: searched)
        mrkd.original->search_mark = -1;
    return res;
}
/* Attaches every loose end of `patient` to `node`.
 * Both `node` and `patient.start` must be non-null and every end must still be loose. */
void reattach_all_ends_to_one_node(SubExprCompiled& patient, FA_Node* node){
    assert(node);
    assert(patient.start);
    for (FA_Node** end: patient.ends){
        assert(!(*end));
        /* The trace goes to stdout, so it is kept out of builds that did not opt in
         * to loud debugging. */
#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD)
        printf("DEBUG %lu->->->->->%lu\n", patient.start->nodeId, node->nodeId);
#endif
        reattach_fa_node_edge(end, node);
    }
}
/* Rewrites `patient` in place so that it matches between min_allowed and max_allowed
 * repetitions of the original subexpression. Passing max_allowed > REGEXIS024_MAX_REPEAT
 * means "no upper limit". A patient without a start node is left untouched.
 * Requires min_allowed <= max_allowed and min_allowed <= REGEXIS024_MAX_REPEAT. */
void apply_repeat_to_subexpression(SubExprCompiled &patient, FA_Container& fa, size_t min_allowed, size_t max_allowed) {
assert(min_allowed <= max_allowed && min_allowed <= REGEXIS024_MAX_REPEAT);
if (!patient.start)
return;
bool infinite_repeat = max_allowed > REGEXIS024_MAX_REPEAT;
if (min_allowed == 0 && max_allowed == 0){
/* {0,0}: nothing is left of the subexpression */
patient = {};
} else if (min_allowed == 1 && max_allowed == 1){
/* Chill */
} else if (min_allowed == 0 && infinite_repeat){
/* `*`: a fork in front either enters the body (which loops back to the fork) or leaves */
FA_NodeOfForking* fn = fa.makeForking();
add_option_to_fork_node(fn, patient.start);
for (FA_Node** old_end: patient.ends)
reattach_fa_node_edge(old_end, fn);
add_option_to_fork_node(fn, NULL);
patient.start = fn;
/* The address is taken only after both options were added */
patient.ends = {&(fn->nxt_options[1])};
} else if (min_allowed == 1 && infinite_repeat) {
/* `+`: the body runs first, the fork behind it either loops back or leaves */
FA_NodeOfForking* fn = fa.makeForking();
reattach_all_ends_to_one_node(patient, fn);
add_option_to_fork_node(fn, patient.start);
add_option_to_fork_node(fn, NULL);
patient.ends = {&(fn->nxt_options[1])};
} else if (min_allowed == 0 && max_allowed == 1){
/* `?`: a fork in front either enters the body or skips it */
FA_NodeOfForking* fn = fa.makeForking();
add_option_to_fork_node(fn, patient.start);
add_option_to_fork_node(fn, NULL);
patient.start = fn;
patient.ends.push_back(&(fn->nxt_options[1]));
} else if (infinite_repeat) {
/* {min,}: min chained copies, then a fork that loops over the last copy or leaves */
std::vector<SubExprCompiled> Colon(min_allowed);
Colon[0] = patient;
for (size_t i = 1; i < min_allowed; i++)
Colon[i] = RobertAngier(patient, fa);
FA_NodeOfForking* fn = fa.makeForking();
for (size_t i = 0; i + 1 < min_allowed; i++)
reattach_all_ends_to_one_node(Colon[i], Colon[i + 1].start);
reattach_all_ends_to_one_node(Colon[min_allowed - 1], fn);
add_option_to_fork_node(fn, Colon[min_allowed - 1].start);
add_option_to_fork_node(fn, NULL);
/* patient.start is the same (the original is at Colon[0]) */
patient.ends = {&(fn->nxt_options[1])};
} else {
/* {min,max}: max chained copies; the fork in front picks the copy to enter, which
 * decides how many copies get executed (entering copy i runs max - i of them) */
std::vector<SubExprCompiled> Avenue(max_allowed);
Avenue[max_allowed - 1] = patient;
/* All clones are made before any end of the original gets reattached */
for (size_t i = 0; i < max_allowed - 1; i++)
Avenue[i] = RobertAngier(patient, fa);
for (size_t i = 0; i + 1 < max_allowed; i++)
reattach_all_ends_to_one_node(Avenue[i], Avenue[i + 1].start);
FA_NodeOfForking* fn = fa.makeForking();
if (min_allowed > 0){
for (size_t i = 0; i <= max_allowed - min_allowed; i++)
add_option_to_fork_node(fn, Avenue[i].start);
} else {
for (size_t i = 0; i < max_allowed; i++)
add_option_to_fork_node(fn, Avenue[i].start);
/* The extra NULL option skips all copies (zero repetitions) */
add_option_to_fork_node(fn, NULL);
patient.ends.push_back(&(fn->nxt_options[max_allowed]));
}
patient.start = fn;
/* patient.ends is the same (the original is Avenue.back()) */
}
if (min_allowed == 0)
patient.can_be_empty = true;
}
/* Builds the alternation of `options`, keeping their order (earlier options come first
 * in the fork node).
 *
 * An option without a start node stands for the empty alternative. It is represented by
 * a NULL option of the fork node whose slot becomes one more loose end of the result, the
 * same way apply_repeat_to_subexpression wires its "skip" branches. Previously such options
 * were silently dropped, so `(a|)` compiled to the same automaton as `a`.
 *
 * Special cases that need no fork node:
 *  - no option has a start node: the default (empty) subexpression is returned;
 *  - exactly one option and no empty alternative: that option is returned as is. */
SubExprCompiled forkify(const std::vector<SubExprCompiled> &options, FA_Container& fa){
    SubExprCompiled result;
    size_t non_empty = 0;
    bool has_empty_option = false;
    result.can_be_empty = false;
    for (const SubExprCompiled& opt: options){
        result.can_be_empty |= opt.can_be_empty;
        if (opt.start)
            non_empty++;
        else
            has_empty_option = true;
    }
    if (non_empty == 0){
        result.can_be_empty = true;
        return result;
    }
    if (non_empty == 1 && !has_empty_option){
        for (const SubExprCompiled& opt: options)
            if (opt.start){
                result = opt;
                break;
            }
        return result;
    }
    FA_NodeOfForking* n1 = fa.makeForking();
    result.start = n1;
    n1->nxt_options.reserve(non_empty + (has_empty_option ? 1 : 0));
    /* Several empty alternatives are equivalent, only the first one gets a slot */
    bool empty_slot_taken = false;
    size_t empty_slot = 0;
    for (const SubExprCompiled& opt: options){
        if (opt.start){
            add_option_to_fork_node(n1, opt.start);
            for (FA_Node** end: opt.ends)
                result.ends.push_back(end);
        } else if (!empty_slot_taken){
            empty_slot = n1->nxt_options.size();
            add_option_to_fork_node(n1, NULL);
            empty_slot_taken = true;
        }
    }
    if (empty_slot_taken){
        /* The address is taken only now: nxt_options does not grow any more */
        result.ends.push_back(&(n1->nxt_options[empty_slot]));
        result.can_be_empty = true;
    }
    return result;
}
/* Debug check that this object is still in its default state: no start node, no loose
 * ends and can_be_empty set. Does nothing when assert() is compiled out. */
void SubExprCompiled::assertDefault() {
assert(!start && ends.empty() && can_be_empty);
}

View File

@ -0,0 +1,32 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H
#include <libregexis024fa/finite_automaton.h>
/* A compiled piece of a regular expression: a fragment of the automaton with one entry
 * node and a set of loose ends that are still to be attached to whatever follows. */
struct SubExprCompiled{
/* Entry node; NULL means the fragment has no nodes (join() treats it as neutral) */
FA_Node* start = NULL;
/* Addresses of outgoing links that are still NULL. Some of them point into vectors owned
 * by nodes; once such an address is stored here, that vector must not change its size */
std::vector<FA_Node**> ends;
/* Whether the fragment can match without consuming a character */
bool can_be_empty = true;
void assertDefault();
};
SubExprCompiled subexpr_charset_reading_filter(const codeset_t& codeset, FA_Container& fa);
SubExprCompiled join(const SubExprCompiled& A, const SubExprCompiled& B);
SubExprCompiled forkify(const std::vector<SubExprCompiled>& options, FA_Container& fa);
SubExprCompiled subexpression_from_path(FA_NodePathPart* node);
/* And then Robert Angier said `It's prestige time` and prestiged all over the place.
* If you still don't get it, this function copies section of NFA of regexp */
SubExprCompiled RobertAngier(const SubExprCompiled& source, FA_Container& fa);
#define REGEXIS024_MAX_REPEAT 64
/* pass REGEXIS024_MAX_REPEAT + 1 as max_allowed to allow infinite repeat */
void apply_repeat_to_subexpression(SubExprCompiled& patient, FA_Container& fa, size_t min_allowed, size_t max_allowed);
#endif //LIBREGEXIS024_SRC_LIBREGEXIS024SOL_SUBEXPR_FA_TRANSFORMED_H

View File

@ -0,0 +1,141 @@
/* This file is used for testing purposes only. Do not copy this file to the installation prefix.
 * This technique exploits C compiler capabilities to get a regex024 assembler for free. */
#ifndef LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H
#define LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H
#include "vibe_check.h"
#include <libregexis024vm/vm_opcodes.h>
#include <string>
#include <vector>
#include <map>
#include <stdio.h>
/* A declared bookmark (jump target) of the program being assembled. */
struct assembler_context_bookmark{
/* Offset in the output program the bookmark stands for */
regex_near_ptr_t pos_in_r024program;
/* Source line of the declaration, used to report double definitions */
int LINE;
};
/* A reference to a bookmark that is resolved after the whole program was emitted. */
struct pending_bookmark{
/* Offset in the output program of the placeholder that has to be overwritten with the
 * position of the referenced bookmark. In a sense, this is a pointer to a NULL pointer
 * that is yet to become a normal pointer */
regex_near_ptr_t pos_in_r024program;
/* Name of the referenced bookmark; the pointed-to string must outlive the context */
const char* name;
/* LINE of the reference is needed in case of error */
int LINE;
};
/* Accumulates the bytes of an assembled regex024 program together with its bookmarks
 * (jump targets) and the references to them that still have to be patched.
 * All multi-byte values are emitted in little-endian byte order. */
struct assembler_context{
    std::map<std::string, assembler_context_bookmark> bookmarks;
    std::vector<pending_bookmark> unresolved_references;
    std::vector<uint8_t> result;

    /* Declares bookmark `name` at the current end of the program.
     * A second declaration of the same name is fatal: a message is printed and the
     * process exits. */
    void declare_bookmark(const char* name, int LINE_of_this){
        if (bookmarks.count(name)){
            fprintf(stderr, "Double bookmark '%s' definition in lines %d and %d\n", name, bookmarks[name].LINE, LINE_of_this);
            exit(1);
        }
        bookmarks[name] = {result.size(), LINE_of_this};
    }

    /* Overwrites every placeholder created by beg_for_bookmark with the position of the
     * bookmark it refers to. A reference to an unknown bookmark is fatal. */
    void resolve_references(){
        for (pending_bookmark& br: unresolved_references){
            if (bookmarks.count(br.name) == 0){
                fprintf(stderr, "Unknown bookmark '%s' is requested on line %d\n", br.name, br.LINE);
                exit(1);
            }
            /* Pending bookmark requests should be added only with the beg_for_bookmark
             * method, or else out-of-bounds writes will be your frequent guest.
             * The pointer is stored byte by byte, low byte first: this matches put_word and
             * friends on every host and avoids a misaligned store through a casted pointer
             * into the byte vector. */
            const regex_near_ptr_t target = bookmarks[br.name].pos_in_r024program;
            for (size_t i = 0; i < sizeof(regex_near_ptr_t); i++)
                result[br.pos_in_r024program + i] = static_cast<uint8_t>((target >> (8 * i)) & UINT8_MAX);
        }
    }

    void put_byte(uint8_t x){
        result.push_back(x);
    }

    void put_word(uint16_t x){
        put_byte(x & UINT8_MAX);
        put_byte(x >> 8);
    }

    void put_doubleword(uint32_t x){
        put_word(x & UINT16_MAX);
        put_word(x >> 16);
    }

    void put_quadword(uint64_t x){
        put_doubleword(x & UINT32_MAX);
        put_doubleword(x >> 32);
    }

    /* Emits a zeroed quadword placeholder for the position of bookmark `name` and
     * remembers it for resolve_references(). */
    void beg_for_bookmark(const char* name, int LINE_of_this){
        unresolved_references.push_back({result.size(), name, LINE_of_this});
        put_quadword(0);
    }
};
#define msh_put_instr(ename) daCtx.put_byte(regex024_opcodes::ename);
#define msh_put_sslot(ssid) daCtx.put_doubleword(ssid);
#define msh_put_track_arr_ind(i) daCtx.put_word(i);
#define msh_put_x(meth, x) daCtx.meth(x);
/* Here my assembler begs for bookmark to jump on */
#define msh_bookmark_reference(name) daCtx.beg_for_bookmark(name, __LINE__);
#define s_BEGIN_ASSEMBLER_CONTEXT() { assembler_context daCtx{};
#define s_END_ASSEMBLER_CONTEXT(pass_vec) daCtx.resolve_references(); (pass_vec) = std::move(daCtx.result); }
/* Here user declares a bookmark */
#define c_BOOKMARK(name) daCtx.declare_bookmark((name), __LINE__);
#define i_READ(ssid) msh_put_instr(READ) msh_put_sslot(ssid)
#define i_READZ() msh_put_instr(READZ)
#define i_JUMP(bookmark) msh_put_instr(JUMP) msh_bookmark_reference(bookmark)
#define msh_conditional_jump(condition, size_postfix, meth, x, bookmark) \
msh_put_instr(JC ## condition ## _ ## size_postfix) \
msh_put_x(meth, x) msh_bookmark_reference(bookmark)
#define i_JCEQUAL_B(x, bookmark) msh_conditional_jump(EQUAL, B, put_byte, x, bookmark)
#define i_JCEQUAL_W(x, bookmark) msh_conditional_jump(EQUAL, W, put_word, x, bookmark)
#define i_JCEQUAL_DW(x, bookmark) msh_conditional_jump(EQUAL, DW, put_doubleword, x, bookmark)
#define i_JCEQUAL_QW(x, bookmark) msh_conditional_jump(EQUAL, QW, put_quadword, x, bookmark)
#define i_JCLESS_B(x, bookmark) msh_conditional_jump(LESS, B, put_byte, x, bookmark)
#define i_JCLESS_W(x, bookmark) msh_conditional_jump(LESS, W, put_word, x, bookmark)
#define i_JCLESS_DW(x, bookmark) msh_conditional_jump(LESS, DW, put_doubleword, x, bookmark)
#define i_JCLESS_QW(x, bookmark) msh_conditional_jump(LESS, QW, put_quadword, x, bookmark)
#define i_JCGRTR_B(x, bookmark) msh_conditional_jump(GRTR, B, put_byte, x, bookmark)
#define i_JCGRTR_W(x, bookmark) msh_conditional_jump(GRTR, W, put_word, x, bookmark)
#define i_JCGRTR_DW(x, bookmark) msh_conditional_jump(GRTR, DW, put_doubleword, x, bookmark)
#define i_JCGRTR_QW(x, bookmark) msh_conditional_jump(GRTR, QW, put_quadword, x, bookmark)
#define i_FORK(ssid, bookmark) msh_put_instr(FORK) msh_put_sslot(ssid) msh_bookmark_reference(bookmark)
#define i_MATCH() msh_put_instr(MATCH)
#define i_DIE() msh_put_instr(DIE)
#define i_PARAM_READ_SS_NUMBER(ssid) msh_put_instr(PARAM_READ_SS_NUMBER) msh_put_sslot(ssid)
#define i_PARAM_FORK_SS_NUMBER(ssid) msh_put_instr(PARAM_FORK_SS_NUMBER) msh_put_sslot(ssid)
#define i_PARAM_SELARR_LEN(tai) msh_put_instr(PARAM_SELARR_LEN) msh_put_track_arr_ind(tai)
#define i_PARAM_COLSIFTFUNC_SET(bookmark) msh_put_instr(PARAM_COLSIFTFUNC_SET) msh_bookmark_reference(bookmark)
#define i_PARAM_COLSIFTFUNC_WIPE() msh_put_instr(PARAM_COLSIFTFUNC_WIPE)
#define i_MSG_MULTISTART_ALLOWED(is_allowed) msh_put_instr(MSG_MULTISTART_ALLOWED) daCtx.put_byte(is_allowed);
#define i_MSG_FED_INPUT_EXTENDED(left, right, part) msh_put_instr(MSG_FED_INPUT_EXTENDED) \
daCtx.put_byte(left); daCtx.put_byte(right); msh_put_sslot(part)
#define i_DMOV_RABX_SELARR(tai) msh_put_instr(DMOV_RABX_SELARR) msh_put_track_arr_ind(tai)
/* Emits DDIST_RABX_SELARR followed by its two track array indexes.
 * Named with the `i_` prefix like every other instruction macro of this file. */
#define i_DDIST_RABX_SELARR(tai_beg, tai_end) msh_put_instr(DDIST_RABX_SELARR) \
    msh_put_track_arr_ind(tai_beg) msh_put_track_arr_ind(tai_end)
/* Old unprefixed spelling, kept so that existing users keep compiling */
#define DDIST_RABX_SELARR(tai_beg, tai_end) i_DDIST_RABX_SELARR(tai_beg, tai_end)
#define i_SIFTPRIOR_MIN_RABX() msh_put_instr(SIFTPRIOR_MIN_RABX)
#define i_SIFTPRIOR_MAX_RABX() msh_put_instr(SIFTPRIOR_MAX_RABX)
#define i_SIFT_DONE() msh_put_instr(SIFT_DONE)
#define i_MOV_COLARR_IMM(tai, qw_x) msh_put_instr(MOV_COLARR_IMM) msh_put_track_arr_ind(tai) \
daCtx.put_quadword(qw_x);
#define i_MOV_COLARR_BTPOS(tai) msh_put_instr(MOV_COLARR_BTPOS) msh_put_track_arr_ind(tai)
#define i_MOV_SELARR_IMM(tai, qw_x) msh_put_instr(MOV_SELARR_IMM) msh_put_track_arr_ind(tai) \
daCtx.put_quadword(qw_x);
/* Emits MOV_SELARR_CHPOS followed by its track array index. The index used to be accepted
 * but never emitted, unlike in the sibling i_MOV_COLARR_BTPOS. */
#define i_MOV_SELARR_CHPOS(tai) msh_put_instr(MOV_SELARR_CHPOS) msh_put_track_arr_ind(tai)
#define i_INIT() msh_put_instr(INIT)
#define i_THROW() msh_put_instr(THROW)
#define i_DEBUG() msh_put_instr(DEBUG)
#endif //LIBREGEXIS024_BYTE_CODE_ASSEMBLER_H

View File

@ -0,0 +1,205 @@
/* This file is used for testing purposes only. Do not copy this file to installation prefix. */
#ifndef LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H
#define LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H
#include "vibe_check.h"
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024vm/utils.h>
#include <string>
#include <vector>
#include <map>
#include <stdio.h>
#include <inttypes.h>
// TODO: apply here my new change in near pointer size
/* Bookkeeping for one landing place (a program position that some instruction points to). */
struct landing_place_resolvance{
    /* Index of the label name assigned to this position. Zero-initialised so that a
     * default-initialised object never carries an indeterminate value. */
    size_t name_id = 0;
    /* Set once the disassembler has printed the label for this position. */
    bool visited = false;

    landing_place_resolvance() = default;
    landing_place_resolvance(size_t nameId, bool visited) : name_id(nameId), visited(visited) {}
};
void print_disassembly(size_t prgSize, uint8_t* prg){
std::vector<std::string> names = {
"Александр", "Мария", "Иван", "Анна", "Дмитрий", "Екатерина", "Алексей",
"Ольга", "Михаил", "София", "Сергей", "Анастасия", "Артем", "Виктория",
"Андрей", "Елена", "Максим", "Алиса", "Павел", "Наталья", "Денис", "Юлия",
"Владимир", "Маргарита", "Никита", "Дарья", "Илья", "Алина", "Роман", "Евгения",
"Кирилл", "Елизавета", "Антон", "Татьяна", "Владислав", "Валерия", "Георгий",
"Ксения", "Арсений", "Милана", "Даниил", "Вероника", "Тимофей", "Арина",
"Николай", "Кристина", "Степан", "Алёна", "Игорь", "Алла", "Григорий", "Ева",
"Олег", "Яна", "Семен", "Марина", "Федор", "Светлана", "Василий", "Людмила"
};
uint64_t used_names = 0;
/* From program position -> to names[ind] & */
std::map<regex_near_ptr_t, landing_place_resolvance> bookmarks;
regex_near_ptr_t IP = 0;
auto check_inboundness = [&](int region){
if (!vmprog_check_inboundness(prgSize, IP, region))
exitf("This program can't be decomposed into commands in a trivial way");
};
auto extract_b = [&]() -> uint8_t{
check_inboundness(1);
return vmprog_extract_b(&IP, prg);
};
auto extract_w = [&]() -> uint16_t {
check_inboundness(2);
return vmprog_extract_w(&IP, prg);
};
auto extract_dw = [&]() -> uint32_t {
check_inboundness(4);
return vmprog_extract_dw(&IP, prg);
};
auto extract_qw = [&]() -> uint64_t {
check_inboundness(8);
return vmprog_extract_qw(&IP, prg);
};
auto extract_instruction = [&]() -> uint8_t{
return extract_b();
};
auto extract_sslot_id = [&]() -> regex_sslot_id_t{
return extract_dw();
};
auto extract_near_pointer = [&]() -> regex_near_ptr_t{
return extract_qw();
};
auto extract_track_array_index = [&]() -> regex_tai_t{
return extract_w();
};
bool second_phase = false;
auto fph_register_landing = [&](regex_near_ptr_t pos){
if (!second_phase){
if (bookmarks.count(pos) == 0){
if (used_names == names.size())
names.push_back("Закладка_" + std::to_string(used_names));
bookmarks.insert({pos, {used_names, false}});
used_names++;
}
}
};
auto get_bookmark_in_2phase = [&](regex_near_ptr_t pos) -> std::string {
if (bookmarks.count(pos) == 0)
exitf("bruh");
return names[bookmarks[pos].name_id];
};
auto one_reading = [&](){
while (IP < prgSize) {
regex_near_ptr_t start_pos = IP;
if (second_phase){
if (bookmarks.count(IP) != 0){
printf("%s:\n", get_bookmark_in_2phase(IP).c_str());
bookmarks[IP].visited = true;
}
}
uint8_t opcode = extract_instruction();
switch (opcode) {
#define secPrint(fmt, ...) if (second_phase) {printf("% 3lu) " fmt, start_pos, __VA_ARGS__);} } break;
#define secPrintNoArg(str) if (second_phase) {printf("% 3lu) " str, start_pos);} } break;
#define instCase(oper_code) case regex024_opcodes::oper_code: {
#define jcMess(cond, sz_uppercase, x_t, extract_method, printf_sign) \
instCase(JC ## cond ## _ ## sz_uppercase) \
x_t x = extract_method(); \
regex_near_ptr_t dest = extract_near_pointer(); \
fph_register_landing(dest); \
secPrint("JC" #cond "_" #sz_uppercase " %" printf_sign " $%s\n", x, get_bookmark_in_2phase(dest).c_str())
#define jcCacaphony(cond) \
jcMess(cond, B, uint8_t, extract_b, PRIu8) \
jcMess(cond, W, uint16_t, extract_w, PRIu16) \
jcMess(cond, DW, uint32_t, extract_dw, PRIu32) \
jcMess(cond, QW, uint64_t, extract_qw, PRIu64)
#define simpleDimple(name) instCase(name) secPrintNoArg(#name "\n")
instCase(READ)
uint32_t ssid = extract_sslot_id();
secPrint("READ %u\n", ssid)
simpleDimple(READZ)
instCase(JUMP)
uint32_t dest = extract_near_pointer();
fph_register_landing(dest);
secPrint("JUMP $%s\n", get_bookmark_in_2phase(dest).c_str())
jcCacaphony(EQUAL)
jcCacaphony(LESS)
jcCacaphony(GRTR)
instCase(FORK)
uint32_t ssid = extract_sslot_id();
regex_near_ptr_t dest = extract_near_pointer();
fph_register_landing(dest);
secPrint("FORK %u $%s\n", ssid, get_bookmark_in_2phase(dest).c_str())
simpleDimple(MATCH)
simpleDimple(DIE)
instCase(PARAM_READ_SS_NUMBER)
regex_sslot_id_t ssid_max_plus_one = extract_sslot_id();
secPrint("PARAM_READ_SS_NUMBER %u\n", ssid_max_plus_one)
instCase(PARAM_FORK_SS_NUMBER)
regex_sslot_id_t ssid_max_plus_one = extract_sslot_id();
secPrint("PARAM_FORK_SS_NUMBER %u\n", ssid_max_plus_one)
instCase(PARAM_SELARR_LEN)
regex_tai_t tai_max_plus_one = extract_track_array_index();
secPrint("PARAM_SELARR_LEN %hu\n", tai_max_plus_one)
instCase(PARAM_COLSIFTFUNC_SET)
regex_near_ptr_t entry = extract_near_pointer();
fph_register_landing(entry);
secPrint("PARAM_COLSIFTFUNC_SET $%s\n", get_bookmark_in_2phase(entry).c_str())
simpleDimple(PARAM_COLSIFTFUNC_WIPE)
instCase(MSG_MULTISTART_ALLOWED)
uint8_t is_allowed = extract_b();
secPrint("MSG_MULTISTART_ALLOWED %hhu\n", is_allowed)
instCase(MSG_FED_INPUT_EXTENDED)
uint8_t left = extract_b();
uint8_t right = extract_b();
regex_sslot_id_t part = extract_sslot_id();
secPrint("MSG_FED_INPUT_EXTENDED %hhu %hhu %u\n", left, right, part)
instCase(DMOV_RABX_SELARR)
regex_tai_t i = extract_track_array_index();
secPrint("DMOV_RABX_SELARR %hu\n", i)
instCase(DDIST_RABX_SELARR)
regex_tai_t s = extract_track_array_index();
regex_tai_t e = extract_track_array_index();
secPrint("DDIST_RABX_SELARR %hu %hu\n", s, e);
simpleDimple(SIFTPRIOR_MIN_RABX)
simpleDimple(SIFTPRIOR_MAX_RABX)
simpleDimple(SIFT_DONE)
instCase(MOV_COLARR_IMM)
regex_tai_t tai = extract_track_array_index();
uint64_t imm = extract_qw();
secPrint("MOV_COLARR_IMM %hu %lu\n", tai, imm);
instCase(MOV_COLARR_BTPOS)
regex_tai_t tai = extract_track_array_index();
secPrint("MOV_COLARR_BTPOS %hu\n", tai);
instCase(MOV_SELARR_IMM)
regex_tai_t tai = extract_track_array_index();
uint64_t imm = extract_qw();
secPrint("MOV_SELARR_IMM %hu %lu\n", tai, imm);
instCase(MOV_SELARR_CHPOS)
regex_tai_t tai = extract_track_array_index();
secPrint("MOV_SELARR_CHPOS %hu\n", tai);
simpleDimple(INIT)
simpleDimple(THROW)
default:
exitf("Bad opcode\n");
#undef secPrint
#undef secPrintNoArg
#undef instCase
#undef jcMess
#undef jcCacaphony
#undef simpleDimple
}
}
};
one_reading();
second_phase = true;
IP = 0;
one_reading();
}
#endif //LIBREGEXIS024_BYTE_CODE_DISASSEMBLER_H

View File

@ -0,0 +1,52 @@
#include <libregexis024fa/codeset.h>
#include <libregexis024vm/utils.h>
#include <stdio.h>
/* Compares a computed codeset with the expected one.
 * Prints "Test N passed" and advances the running test number on equality;
 * otherwise reports "Test N failed" through exitf(). */
void test_ccs_fnc(const codeset_t &got, const codeset_t &expected){
    static int id = 1;
    const bool same = (got == expected);
    if (!same) {
        exitf("Test %d failed\n", id);
        return;
    }
    printf("Test %d passed\n", id);
    id++;
}
void invert_test(const codeset_t& A, const codeset_t& C){
test_ccs_fnc(invert_set(A), C);
test_ccs_fnc(invert_set(C), A);
}
void merge_test(const codeset_t& A, const codeset_t& B, const codeset_t& C){
test_ccs_fnc(merge_sets(A, A), A);
test_ccs_fnc(merge_sets(B, B), B);
test_ccs_fnc(merge_sets(A, B), C);
test_ccs_fnc(merge_sets(B, A), C);
}
void intersect_test(const codeset_t& A, const codeset_t& B, const codeset_t& C){
test_ccs_fnc(intersect_sets(A, A), A);
test_ccs_fnc(intersect_sets(B, B), B);
test_ccs_fnc(intersect_sets(A, B), C);
test_ccs_fnc(intersect_sets(B, A), C);
}
/* Runs the codeset merge / invert / intersect checks.
 * The order of the calls fixes the test numbers printed by test_ccs_fnc. */
int main(){
    /* Merging with an empty set and merging disjoint singletons */
    merge_test({{34, 111}}, {}, {{34, 111}});
    merge_test({{1, 1}}, {{3, 3}}, {{1, 1}, {3, 3}});

    /* Inversion against the full uint32 range */
    invert_test({{0, 1}}, {{2, UINT32_MAX}});
    invert_test({{32, 34}}, {{0, 31}, {35, UINT32_MAX}});

    /* Merging adjacent and overlapping ranges */
    merge_test({{10, 10}, {20, 20}}, {{19, 19}}, {{10, 10}, {19, 20}});
    merge_test({{0, 5}, {7, 10}}, {{4, 6}}, {{0, 10}});
    merge_test({{1, 10}, {50, 60}}, {{11, 70}}, {{1, 70}});
    merge_test({{23, 23}, {56, 100}}, {{30, 55}}, {{23, 23}, {30, 100}});

    /* Intersections */
    intersect_test({{100, 200}, {300, 400}}, {}, {});
    intersect_test({{2, 30}}, {{15, 50}}, {{15, 30}});
    intersect_test({{10, 30}}, {{15, 25}}, {{15, 25}});
    intersect_test({{10, 20}}, {{21, 30}}, {});
    intersect_test({{1, 100}, {150, 200}}, {{50, 175}}, {{50, 100}, {150, 175}});
    intersect_test({{1, 100}}, {}, {});
    intersect_test({{50, 50}}, {{50, 50}}, {{50, 50}});
    intersect_test({{49, 49}}, {{50, 50}}, {});
    intersect_test({{1, 20}, {50, 80}}, {{10, 55}, {60, 100}}, {{10, 20}, {50, 55}, {60, 80}});

    /* Several small ranges collapsing into one */
    merge_test({{2, 3}, {5, 5}, {7, 7}}, {{1, 1}, {3, 7}}, {{1, 7}});
    return 0;
}

View File

@ -0,0 +1,136 @@
#include <stdio.h>
#include <libregexis024vm/libregexis024vm.h>
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024test/byte_code_assembler.h>
#include <libregexis024test/byte_code_disassembler.h>
#include <assert.h>
static int test_id = 0;
/* Placeholder for a VM test: builds a context for `prg` and reports the test as passed.
 * The actual feeding of `str` and the comparison with `prefix_matching` are not written yet.
 * prefix_matching must hold one flag more than str has bytes. */
void do_test(const std::vector<uint8_t>& prg, const std::string& str, const std::vector<bool>& prefix_matching){
    assert(prefix_matching.size() == str.size() + 1);
    REGEX_IS024_CONTEXT ctx{prg.size(), prg.data(), 0, 0, 1000, 1000, 1000000};
    regex024_error_code ret;
    // todo
    printf("TEST %d passed\n", test_id);
    ++test_id;
}
/* Returns n + 1 flags: the first one is true, all the others are false. */
std::vector<bool> tfff(size_t n){
    std::vector<bool> flags(n + 1);
    flags.front() = true;
    return flags;
}
/* Returns n + 1 flags: the last one is true, all the others are false. */
std::vector<bool> ffft(size_t n){
    std::vector<bool> flags(n + 1);
    flags[n] = true;
    return flags;
}
/* Assembles test program 0 with the byte_code_assembler.h macros and returns its bytes.
 * Layout: a FORK into two branches, one comparing the read input with 'a' and the
 * other with 'b'; both jump to the shared "finish" label on equality. */
std::vector<uint8_t> program_0(){
    std::vector<uint8_t> res;
    s_BEGIN_ASSEMBLER_CONTEXT()
        /* Configuration */
        i_PARAM_READ_SS_NUMBER(2)
        i_PARAM_FORK_SS_NUMBER(1)
        i_INIT()
        i_FORK(0, "wb")
        /* Branch comparing with 'a' */
        i_READ(0)
        i_JCEQUAL_B('a', "finish")
        i_DIE()
        /* Branch comparing with 'b' */
        c_BOOKMARK("wb")
        i_READ(1)
        i_JCEQUAL_B('b', "finish")
        i_DIE()
        c_BOOKMARK("finish")
        i_MATCH()
        i_DIE()
    s_END_ASSEMBLER_CONTEXT(res)
    return res;
}
/* Assembles test program 1 with the byte_code_assembler.h macros and returns its bytes.
 * Layout: a first FORK choosing between the 'a' and 'b' branches, both continuing at
 * "razvilka"; a second FORK there choosing between the 'c' and 'd' branches, both
 * continuing at "finish". */
std::vector<uint8_t> program_1(){
    std::vector<uint8_t> res;
    s_BEGIN_ASSEMBLER_CONTEXT()
        /* Configuration */
        i_PARAM_READ_SS_NUMBER(4)
        i_PARAM_FORK_SS_NUMBER(2)
        // i_PARAM_SELARR_LEN(0)
        i_INIT()
        /* First choice: 'a' or 'b' */
        i_FORK(0, "wb")
        c_BOOKMARK("wa")
        i_READ(0)
        i_JCEQUAL_B('a', "razvilka")
        i_DIE()
        c_BOOKMARK("wb")
        i_READ(1)
        i_JCEQUAL_B('b', "razvilka")
        i_DIE()
        /* Second choice: 'c' or 'd' */
        c_BOOKMARK("razvilka")
        i_FORK(1, "wd")
        c_BOOKMARK("wc")
        i_READ(2)
        i_JCEQUAL_B('c', "finish")
        i_DIE()
        c_BOOKMARK("wd")
        i_READ(3)
        i_JCEQUAL_B('d', "finish")
        i_DIE()
        c_BOOKMARK("finish")
        i_MATCH()
        i_DIE()
    s_END_ASSEMBLER_CONTEXT(res)
    return res;
}
/*
int main(){
auto prg0 = program_0();
auto prg1 = program_1();
// printf("Disassembled program:\n");
// print_disassembly(prg.size(), prg.data());
printf("Testing starts\n");
test_id = 0;
do_test(prg0, "a", {false, true});
do_test(prg0, "b", {false, true});
do_test(prg0, "c", {false, false});
do_test(prg0, "a4", {false, true, false});
do_test(prg0, "b4", {false, true, false});
do_test(prg1, "aa", {false, false, false});
do_test(prg1, "db", {false, false, false});
do_test(prg1, "ac", {false, false, true});
do_test(prg1, "bc", {false, false, true});
do_test(prg1, "ad", {false, false, true});
do_test(prg1, "bd", {false, false, true});
do_test(prg1, "bd12", {false, false, true, false, false});
return 0;
}
*/
/* Assembles a small program that exercises bookmarks, FORK and a backward JUMP,
 * then prints its disassembly. */
int main() {
    std::vector<uint8_t> prg;
    s_BEGIN_ASSEMBLER_CONTEXT()
        c_BOOKMARK("111")
        i_READ(0)
        i_READ(1)
        i_FORK(12, "vdv")
        c_BOOKMARK("vdv")
        i_READ(2)
        i_READ(10000000)
        i_THROW()
        i_THROW()
        i_THROW()
        /* Jump back to the very first instruction */
        i_JUMP("111")
    s_END_ASSEMBLER_CONTEXT(prg)
    print_disassembly(prg.size(), prg.data());
}

View File

@ -0,0 +1,12 @@
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024test/byte_code_disassembler.h>
int main(){
std::string regular_expression = "!selarr{boba{ca}}^a#boba(b)c$";
REGEX_IS024_MeaningContext regex(regular_expression.size(), regular_expression.c_str());
if (regex.error)
fprintf(stderr, "%s\n", regex.error_msg.c_str());
std::vector<uint8_t> res = regex.compiled_program;
print_disassembly(res.size(), res.data());
return 0;
}

View File

@ -0,0 +1,217 @@
#include <libregexis024fa/colored_codeset.h>
#include <assert.h>
#include <string>
#include <algorithm>
#include <map>
#include <exception>
#include <stdexcept>
#include <random>
struct test_id_t {
int test_id;
int subtest_id;
test_id_t(int test_id, int subtest_id) : test_id(test_id),subtest_id(subtest_id) {}
std::string toString() const {
char buf[128];
snprintf(buf, 128, "#%d::%d", test_id + 1, subtest_id + 1);
return buf;
}
};
/* Renders a codeset as "first-second" pairs joined with "; ". */
std::string stringifyCodeset(const codeset_t& CS) {
    std::string text;
    bool is_first = true;
    for (const auto& range: CS) {
        if (!is_first)
            text += "; ";
        is_first = false;
        text += std::to_string(range.first);
        text += "-";
        text += std::to_string(range.second);
    }
    return text;
}
/* Renders a list of request ids as decimal numbers joined with ", ". */
std::string stringifyRequestList(const std::vector<uint64_t>& arr) {
    std::string text;
    bool is_first = true;
    for (const uint64_t request_id: arr) {
        if (!is_first)
            text += ", ";
        is_first = false;
        text += std::to_string(request_id);
    }
    return text;
}
void print_obj(const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer) {
size_t R = answer.size();
for (int i = 0; i < R; i++) {
std::string cs = stringifyCodeset(answer[i].first);
std::string rl = stringifyRequestList(answer[i].second);
printf("{%s} -> {%s}\n", cs.c_str(), rl.c_str());
}
}
/* Prints one "{codeset} -> {request ids}" line per entry of a canonic answer
 * (keyed by the request-id list, so lines come out in key order). */
void print_answer_canonic(const std::map<std::vector<size_t>, codeset_t>& answer) {
    for (const auto& entry: answer) {
        const std::string codeset_text = stringifyCodeset(entry.second);
        const std::string requests_text = stringifyRequestList(entry.first);
        printf("{%s} -> {%s}\n", codeset_text.c_str(), requests_text.c_str());
    }
}
void print_test_details(int dummyN, const std::vector<codeset_t>& requests) {
for (int i = 0; i < requests.size(); i++) {
printf("%d) %s\n", i - dummyN, stringifyCodeset(requests[i]).c_str());
}
}
/* Aborts the current test by throwing std::runtime_error("Test <id> failed"). */
void fail_test(test_id_t my_test_id) {
    char message[1024];
    const std::string id_text = my_test_id.toString();
    snprintf(message, sizeof(message), "Test %s failed", id_text.c_str());
    throw std::runtime_error(message);
}
/* Groups an answer by request-id list, merging the codesets that share a list.
 * Fails the test if two codesets with the same list overlap. */
std::map<std::vector<size_t>, codeset_t> safe_canonificate_answer(test_id_t my_test_id,
    const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer)
{
    std::map<std::vector<size_t>, codeset_t> canonic;
    for (const auto& entry: answer) {
        /* operator[] creates an empty codeset the first time a list is seen */
        codeset_t& collected = canonic[entry.second];
        const bool overlaps = !intersect_sets(collected, entry.first).empty();
        if (overlaps)
            fail_test(my_test_id);
        collected = merge_sets(collected, entry.first);
    }
    return canonic;
}
/* Pairs ti[i] with to[i]. Fails the test if the two vectors differ in length. */
std::vector<std::pair<codeset_t, std::vector<size_t>>> safe_zip_answer(test_id_t my_test_id,
    const std::vector<codeset_t>& ti, const std::vector<std::vector<size_t>>& to)
{
    const size_t count = ti.size();
    if (to.size() != count)
        fail_test(my_test_id);
    std::vector<std::pair<codeset_t, std::vector<size_t>>> zipped;
    zipped.reserve(count);
    for (size_t i = 0; i < count; i++)
        zipped.emplace_back(ti[i], to[i]);
    return zipped;
}
/* Runs one subtest: feeds every request codeset to a ColoredCodeset created with
 * InpDummyN, asks it for the splits of the non-dummy requests and compares them, in
 * canonic form, with answer_right. Prints both answers and fails the test on mismatch. */
void perform_test(test_id_t my_test_id, int InpDummyN, const std::vector<codeset_t>& rqInpCs, const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer_right)
{
    const std::map<std::vector<size_t>, codeset_t> expected_canonic =
        safe_canonificate_answer(my_test_id, answer_right);

    ColoredCodeset coloring(InpDummyN);
    for (const codeset_t& request: rqInpCs)
        coloring.apply_divisor(request);

    std::vector<codeset_t> split_sets;
    std::vector<std::vector<size_t>> split_requests;
    coloring.get_splits_of_non_dummy(split_sets, split_requests);

    const std::vector<std::pair<codeset_t, std::vector<size_t>>> answer_returned =
        safe_zip_answer(my_test_id, split_sets, split_requests);
    const std::map<std::vector<size_t>, codeset_t> returned_canonic =
        safe_canonificate_answer(my_test_id, answer_returned);

    if (expected_canonic != returned_canonic) {
        printf("Test failed!!\n");
        printf("Test details:\n");
        print_test_details(InpDummyN, rqInpCs);
        printf("Right answer:\n");
        print_obj(answer_right);
        printf("Given answer:\n");
        print_obj(answer_returned);
        fail_test(my_test_id);
    }
    printf("Test %s passed\n", my_test_id.toString().c_str());
}
void perform_test_with_shuffle(const std::vector<codeset_t>& rqInpCs,
const std::vector<std::pair<codeset_t, std::vector<size_t>>>& answer)
{
static int cur_test_id = 0;
int my_test_id = cur_test_id++;
int dn = rqInpCs.size();
std::vector<int> shuf(dn);
for (int i = 0; i < dn; i++)
shuf[i] = i;
for (int stid = 0; stid < 14; stid++) {
std::random_device d;
std::mt19937 r_gen(d());
std::shuffle(shuf.begin(), shuf.end(), r_gen);
int InpDummyN = std::uniform_int_distribution<int>(0, dn - 1)(r_gen);
// for (int e: shuf)
// printf("%d ", e);
// printf("\n");
std::vector<codeset_t> new_rqInpCs(dn);
for (int i = 0; i < dn; i++)
new_rqInpCs[shuf[i]] = rqInpCs[i];
std::vector<std::pair<codeset_t, std::vector<size_t>>> new_right_answer;
for (const std::pair<codeset_t, std::vector<size_t>>& p: answer) {
std::vector<uint64_t> new_request_list;
for (size_t oldRid: p.second) {
size_t unabridged_new_rid = shuf[oldRid];
if (unabridged_new_rid >= InpDummyN) {
new_request_list.push_back(unabridged_new_rid - InpDummyN);
}
}
if (!new_request_list.empty()) {
std::sort(new_request_list.begin(), new_request_list.end());
new_right_answer.emplace_back(p.first, new_request_list);
}
}
perform_test(test_id_t(my_test_id, stid), InpDummyN, new_rqInpCs, new_right_answer);
}
}
/* Three test cases for ColoredCodeset. Each call lists the request codesets first and
 * then the expected splits as {codeset, ids of the requests that contain it}. */
int main() {
    /* Case 1: nested and touching ranges */
    perform_test_with_shuffle(
        {
            {{5, 500}},
            {{10, 20}},
            {{5, 10}},
        },
        {
            {{{21, 500}}, {0}},
            {{{11, 20}}, {0, 1}},
            {{{5, 9}}, {0, 2}},
            {{{10, 10}}, {0, 1, 2}},
        });

    /* Case 2: a request reaching up to UINT32_MAX */
    perform_test_with_shuffle(
        {
            {{10, 19}},
            {{10, 15}},
            {{5, 9}},
            {{20, 40}},
            {{16, UINT32_MAX}},
        },
        {
            {{{5, 9}}, {2}},
            {{{10, 15}}, {0, 1}},
            {{{16, 19}}, {0, 4}},
            {{{20, 40}}, {3, 4}},
            {{{41, UINT32_MAX}}, {4}}
        });

    /* Case 3: multi-range requests plus single characters at both ends of the range */
    perform_test_with_shuffle(
        {
            {{10, 19}, {30, 39}, {50, 69}},
            {{20, 29}, {40, 59}},
            {{20, 39}, {70, 79}},
            codeset_of_one_char(UINT32_MAX - 1),
            codeset_of_one_char(UINT32_MAX),
            {{UINT32_MAX - 1, UINT32_MAX}},
            codeset_of_one_char(0),
            codeset_of_one_char(1),
            {{0, 1}},
        },
        {
            {{{10, 19}, {60, 69}}, {0}},
            {{{40, 49}}, {1}},
            {{{70, 79}}, {2}},
            {{{50, 59}}, {0, 1}},
            {{{20, 29}}, {1, 2}},
            {{{30, 39}}, {0, 2}},
            {{{0, 0}}, {6, 8}},
            {{{1, 1}}, {7, 8}},
            {{{UINT32_MAX - 1, UINT32_MAX - 1}}, {3, 5}},
            {{{UINT32_MAX, UINT32_MAX}}, {4, 5}},
        });
    return 0;
}

View File

@ -0,0 +1,43 @@
#include <libregexis024tools/stringmatching.h>
#include <assert.h>
#include <stdexcept>
#include <libregexis024sol/expr_compiler.h>
using namespace regexis024;
using namespace std;
void test(const string& input, const string& pattern, const MatchInfo& right_answer) {
MatchInfo given_answer;
track_var_list retTrackVarList;
string retStatus;
matchStrToRegexp(input, pattern, given_answer, retTrackVarList, retStatus);
if (given_answer != right_answer) {
throw runtime_error("Test failed");
}
printf("Test passed\n");
}
/* End-to-end checks of matchStrToRegexp on a fixed list of inputs and patterns. */
int main() {
    /* Patterns with tracking variables */
    test("b", "#boba(b)", MatchInfo({{0, 0}, {1, 1}}, {}));
    test("abc", "!selarr{boba{ca}}^a#boba(b)c$", MatchInfo({{0, 1}, {1, 2}}, {1, 2}));

    /* Every two-letter string over 'a'..'h' */
    for (int i = 0; i < 64; i++) {
        std::string T;
        T.push_back(static_cast<char>('a' + (i >> 3)));
        T.push_back(static_cast<char>('a' + (i % 8)));
        test(T, "(((a|b)|(c|d))|((e|f)|(g|h)))!r{2}", MatchInfo({}, {}));
    }

    /* Selection-array results, with and without !dfa */
    test("abba", "!select{M{max}}a#M(b*)a", MatchInfo({}, {1, 3}));
    test("abba", "!dfa;!select{M{max}}a#M(b*)a", MatchInfo({}, {1, 3}));
    test("abba", "!select{M{max}}a#M(!any;*)a", MatchInfo({}, {1, 3}));
    test("abba", "!dfa;!select{M{max}}a#M(!any;*)a", MatchInfo({}, {1, 3}));

    /* Plain patterns */
    test("", "", MatchInfo({}, {}));
    test("a", "a", MatchInfo({}, {}));
    test("a3", "[abc]3", MatchInfo({}, {}));
    test("b3", "[abc]3", MatchInfo({}, {}));
    test("c3", "[abc]3", MatchInfo({}, {}));
    test("aa", "aa", MatchInfo({}, {}));
    test("aaaaa", "a*", MatchInfo({}, {}));
    test("bababbaa", "[ab]*", MatchInfo({}, {}));
    test("bababbaa", "!dfa;[ab]*", MatchInfo({}, {}));
    return 0;
}

View File

@ -0,0 +1,14 @@
#ifndef VIBE_CHECK_H
#define VIBE_CHECK_H
/* Refuse to build for anything but a little-endian target.
 * __ORDER_LITTLE_ENDIAN__ is only the constant that names the byte order and is
 * defined on every target; the order actually in use is given by __BYTE_ORDER__. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
#error "All the cool kids use little endian. Get lost, you are forbidden from entering this party"
#endif
/* This header is meant for test builds only, which are expected to define _GLIBCXX_DEBUG. */
#ifndef _GLIBCXX_DEBUG
#error "Kinda stupid to test without _GLIBCXX_DEBUG. Or... Don't tell me you are using this header in non-testing environment. OH MY \
GOD! THIS LUNATIC USES TESTING HEADER IN PRODUCTION CODE. I-I-I am calling 911, COMON, SOMEBODY CATCH HIM AND PUT HIM IN LOONEYBLOCK!!!"
#endif
#endif //VIBE_CHECK_H

View File

@ -0,0 +1,109 @@
#include <algorithm>
#include <libregexis024tools/stringmatching.h>
#include <libregexis024sol/expr_compiler.h>
#include <libregexis024vm/libregexis024vm_interface.h>
#include <libregexis024vm/utils.h>
#include <assert.h>
// using namespace regexis024;
/* Copies the seven same-named fields of a compiler-side SubtrackingNameInfo
 * into the public TrackingVariableInfo. */
void convert(regexis024::TrackingVariableInfo& to, const SubtrackingNameInfo& from) {
    to.type = from.type;
    to.colarr_first = from.colarr_first;
    to.colarr_second = from.colarr_second;
    to.stored_in_ca = from.stored_in_ca;
    to.selarr_first = from.selarr_first;
    to.selarr_second = from.selarr_second;
    to.stored_in_sa = from.stored_in_sa;
}
/* Compiles `pattern`, runs the resulting program over the UTF-8 text `input` and
 * reports the outcome.
 *
 * retMatchInfo    - reset first; filled with the selection array and the ca history
 *                   of the matched thread when a match is found
 * retTrackVarList - reset first; filled with one entry per tracking variable once the
 *                   pattern has compiled
 * retStatus       - reset first; set to a description of the failing step on error
 *
 * Returns 0 when the input matched. Returns -1 on any error (retStatus says which
 * step failed) and also when the input simply did not match (retStatus stays empty). */
int regexis024::matchStrToRegexp(const std::string& input, const std::string& pattern,
    MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus)
{
    retTrackVarList = {};
    retMatchInfo = MatchInfo();
    retStatus = "";

    REGEX_IS024_MeaningContext regexp(pattern.size(), pattern.data());
    if (regexp.error) {
        retStatus = "Pattern compilation. " + regexp.error_msg;
        return -1;
    }
    for (auto& iip: regexp.ktr.track_names) {
        convert(retTrackVarList[iip.first], regexp.ktr.retrieval_info[iip.second]);
    }

    REGEX_IS024_VirtualMachine vm(regexp.compiled_program.size(), regexp.compiled_program.data(),
        UINT64_MAX, UINT16_MAX,
        UINT32_MAX, UINT32_MAX, UINT64_MAX);
    auto getVMErrString = [&]() -> std::string {
        return std::string(regex024_error_code_tostr(vm.getErrno()));
    };
    if (vm.initialize() != regex024_error_codes::stable) {
        retStatus = "Virtual machine initialization. " + getVMErrString();
        return -1;
    }

    /* Only zero or one extra character on each side of the input is supported */
    int left_ext_feed = vm.getInputLeftExtensionSize();
    int right_ext_feed = vm.getInputRightExtensionSize();
    if (left_ext_feed > 1 || right_ext_feed > 1) {
        retStatus = "Unnatural extended input request.";
        return -1;
    }

    if (vm.addNewMatchingThread() != regex024_error_codes::stable) {
        retStatus = "Virtual machine first kick. " + getVMErrString();
        /* Stop here like every other failure path; the machine must not be fed after this */
        return -1;
    }

    if (left_ext_feed) {
        if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) {
            retStatus = "VM left extended input. " + getVMErrString();
            return -1;
        }
    }
    /* Feed the input one decoded code point at a time */
    for (size_t cur_text_pos = 0;cur_text_pos < input.size();) {
        int32_t inp_code;
        size_t adj;
        utf8_string_iterat(inp_code, adj, cur_text_pos, reinterpret_cast<const uint8_t*>(input.data()), input.size());
        if (inp_code < 0) {
            retStatus = "Input string encoding error.";
            return -1;
        }
        if (vm.feedCharacter(static_cast<uint64_t>(inp_code), adj) != regex024_error_codes::stable) {
            retStatus = "VM input. " + getVMErrString();
            return -1;
        }
        cur_text_pos += adj;
    }
    if (right_ext_feed) {
        if (vm.extendedFeedCharacter('\n') != regex024_error_codes::stable) {
            retStatus = "VM right extended input. " + getVMErrString();
            return -1;
        }
    }

    assert(vm.isUsable());
    if (vm.isMatched()) {
        retMatchInfo.have_match = true;
        size_t SN1 = vm.getSelectionArrayLength();
        retMatchInfo.sa.assign(SN1, 0);
        for (size_t i = 0; i < SN1; i++)
            retMatchInfo.sa[i] = vm.getMatchedThreadSAValue(i);
        /* The machine hands the history out in reverse order */
        retMatchInfo.ca_history = vm.getMatchedThreadCABranchReverse();
        std::reverse(retMatchInfo.ca_history.begin(), retMatchInfo.ca_history.end());
        return 0;
    }
    return -1;
}
/* Two results are equal when neither has a match, or when both have one and
 * their selection arrays and ca histories are equal. */
bool regexis024::MatchInfo::operator==(const MatchInfo &other) const {
    if (have_match != other.have_match)
        return false;
    if (!have_match)
        return true;
    return sa == other.sa && ca_history == other.ca_history;
}
/* Negation of operator==. */
bool regexis024::MatchInfo::operator!=(const MatchInfo &other) const {
    const bool equal = (*this == other);
    return !equal;
}
/* Builds a result that has a match with the given ca history and selection array.
 * Initialisers are listed in member declaration order. */
regexis024::MatchInfo::MatchInfo(const std::vector<REGEX_IS024_CAEvent> &ca_history, const std::vector<uint64_t> &sa)
    : have_match(true),
      ca_history(ca_history),
      sa(sa)
{
}

View File

@ -0,0 +1,42 @@
#ifndef LIBREGEXIS024_SRC_LIBREGEXIS024TOOLS_STRINGMATCHING_H
#define LIBREGEXIS024_SRC_LIBREGEXIS024TOOLS_STRINGMATCHING_H
#include <libregexis024fa/tracking_variables.h>
#include <map>
#include <string>
#include <libregexis024vm/libregexis024vm_interface.h>
namespace regexis024 {
    /* Storage description of one named tracking variable. */
    struct TrackingVariableInfo {
        bool stored_in_ca = true;
        bool stored_in_sa = false;
        tracking_var_type type;

        /* These fields will be -1 if unused */
        int colarr_first = -1;
        int colarr_second = -1;
        int selarr_first = -1;
        int selarr_second = -1;
    };

    /* Tracking variables of a pattern, keyed by variable name */
    using track_var_list = std::map<std::string, TrackingVariableInfo>;

    /* Outcome of matching one input against one pattern. */
    struct MatchInfo {
        bool have_match = false;
        /* Values taken from the matched thread; meaningful only when have_match is set */
        std::vector<REGEX_IS024_CAEvent> ca_history;
        std::vector<uint64_t> sa;

        /* Two results without a match are equal whatever the vectors hold */
        bool operator==(const MatchInfo& other) const;
        bool operator!=(const MatchInfo& other) const;

        MatchInfo() = default;
        /* Creates a result that has a match */
        MatchInfo(const std::vector<REGEX_IS024_CAEvent> &ca_history, const std::vector<uint64_t> &sa);
    };

    /* Compiles `pattern`, matches `input` against it and fills the three output
     * parameters. Returns 0 when the input matched and -1 otherwise. */
    int matchStrToRegexp(const std::string& input, const std::string& pattern,
        MatchInfo& retMatchInfo, track_var_list& retTrackVarList, std::string& retStatus);
}
#endif

View File

@ -0,0 +1,491 @@
#include <libregexis024vm/instruction_implementation.h>
#include <stdexcept>
/* Exchanges the active thread with a thread parked in a slot.
 * The slot receives the active thread, marked SLOT_NEW_val; the previously parked
 * thread becomes the active one and keeps the occupied status it had in the slot. */
void swap_old_settled_and_new_active(REGEX_IS024_CONTEXT &ctx, REGEX_IS024_Thread& old_settled){
    ctx_print_debug(ctx);
    assert(old_settled.slot_occupation_status == SLOT_OCCUPIED_val);
    REGEX_IS024_Thread parked = old_settled;
    old_settled = ctx.active_thread;
    old_settled.slot_occupation_status = SLOT_NEW_val;
    ctx.active_thread = parked;
}
/* Handles the active thread running into a slot already taken by `other` during a READ.
 * Without a sift function the active thread is deleted and scheduling goes on.
 * With one, the sift function is entered: the current IP is saved as intruder_IP,
 * RAX and RBX are cleared and `other` is remembered as the opponent. */
void start_noncloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other){
    ctx_print_debug(ctx);
    if (!ctx.have_sift_function){
        ctx.active_thread.delete_thread();
        ctx.try_to_continue_scheduled();
        return;
    }
    ctx.sifting_with = &other;
    ctx.who_started_sift = regex024_opcode::READ;
    ctx.intruder_IP = ctx.active_thread.IP;
    ctx.active_thread.IP = ctx.sift_function;
    ctx.RBX = 0;
    ctx.RAX = 0;
}
/* Handles the active thread running into a slot already taken by `other` during a
 * cloning instruction. The intruder is the thread at the current active_thread.IP.
 * Without a sift function the active thread simply continues at clone_IP.
 * With one, the sift function is entered: the current IP is saved as intruder_IP,
 * clone_IP as child_ret_IP, RAX and RBX are cleared. */
void start_cloning_conflict(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Thread& other, regex_near_ptr_t clone_IP){
    ctx_print_debug(ctx);
    if (!ctx.have_sift_function){
        ctx.active_thread.IP = clone_IP;
        return;
    }
    ctx.sifting_with = &other;
    ctx.who_started_sift = regex024_opcode::FORK;
    ctx.intruder_IP = ctx.active_thread.IP;
    ctx.child_ret_IP = clone_IP;
    ctx.active_thread.IP = ctx.sift_function;
    ctx.RBX = 0;
    ctx.RAX = 0;
}
/* Mode guards used at the top of instruction handlers. Each one expects a variable
 * named `ctx` in scope and, when its condition fails, stores an error code in
 * ctx.error and returns from the enclosing (void) function. */
/* Allowed only before initialization: fails with too_late once ctx.initialized is set. */
#define initialization_phase_check() if (ctx.initialized){ \
ctx.error = regex024_error_codes::too_late; return; }
/* Allowed only after initialization and outside sifting: fails with too_early before
 * initialization and with instruction_not_for_collision_thread while ctx.sifting_with is set. */
#define general_matching_mode_check() if (!ctx.initialized){ \
ctx.error = regex024_error_codes::too_early; return; } if(ctx.sifting_with){ \
ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; }
/* Allowed only while sifting: fails with instruction_not_for_collision_thread when
 * ctx.sifting_with is not set. */
#define sift_mode_check() if (!ctx.sifting_with){ \
ctx.error = regex024_error_codes::instruction_not_for_collision_thread; return; }
/* Records `ssid` on one of the two "new" READ-halted stacks of the context:
 * ids below portion_of_FIRST_read_halt_ns go to the first stack, all others to the second. */
void read_halted_new_type_stacks_append(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid){
    ctx_print_debug(ctx);
    if (ssid >= ctx.portion_of_FIRST_read_halt_ns){
        ctx.READ_halted_stack_new_second.append(ssid);
    } else {
        ctx.READ_halted_stack_new_first.append(ssid);
    }
}
/* Shared body of READ and READZ: parks the active thread in READ slot `ssid`.
 *  - free slot: the active thread moves in (marked new), the slot is recorded and
 *    scheduling continues with another thread;
 *  - slot holding a thread marked new: a non-cloning conflict is started;
 *  - slot holding an older thread: the two threads are exchanged and the slot is
 *    recorded on the "new" stack as well. */
void do_i_read(REGEX_IS024_CONTEXT &ctx, regex_sslot_id_t ssid) {
    ctx_print_debug(ctx);
    general_matching_mode_check()
    if (ssid >= ctx.read_slots_number)
        smitsya(read_sslot_out_of_range);
    REGEX_IS024_Thread& slot = ctx.READ_halted_slots[ssid];

    if (!(slot.slot_occupation_status & SLOT_OCCUPIED)){
        slot = ctx.active_thread;
        slot.slot_occupation_status = SLOT_NEW_val;
        ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val;
        read_halted_new_type_stacks_append(ctx, ssid);
        ctx.try_to_continue_scheduled();
        return;
    }
    if (slot.slot_occupation_status & SLOT_NEW){
        start_noncloning_conflict(ctx, slot);
        return;
    }
    swap_old_settled_and_new_active(ctx, slot);
    /* Even though ssid was registered in the stack for elders, the young stack must track this slot too */
    read_halted_new_type_stacks_append(ctx, ssid);
}
/* READ <sslot id>: decodes the slot id operand and performs the read on that slot. */
void i_READ(REGEX_IS024_CONTEXT &ctx) {
    ctx_print_debug(ctx);
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ)
    const regex_sslot_id_t slot_id = ctx.extract_sslot_id();
    do_i_read(ctx, slot_id);
}
/* READZ: operand-less form of READ that always uses slot 0. */
void i_READZ(REGEX_IS024_CONTEXT &ctx) {
    ctx_print_debug(ctx);
    const regex_sslot_id_t slot_zero = 0;
    do_i_read(ctx, slot_zero);
}
/* JUMP <near pointer>: continues the active thread at the decoded position. */
void i_JUMP(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ)
    const regex_near_ptr_t target = ctx.extract_near_pointer();
    ctx.active_thread.IP = target;
}
/* JC<cond>_<size> <immediate> <near pointer>: jumps when conditionT::call(ctx.INP, immediate)
 * holds. immArgSzT supplies the width of the immediate and the way to decode it. */
template<typename conditionT, typename immArgSzT>
void i_JC(REGEX_IS024_CONTEXT& ctx)
{
    ctx_print_debug(ctx);
    check_available_prg(immArgSzT::byte_sz + REGEX024_BYTECODE_NEAR_POINTER_SZ);
    const uint64_t immediate = immArgSzT::extract(ctx);
    const regex_near_ptr_t target = ctx.extract_near_pointer();
    const uint64_t input_value = ctx.INP;
    if (!conditionT::call(input_value, immediate))
        return;
    ctx.active_thread.IP = target;
}
/* Comparison policies for i_JC: call(A, B) compares A with B. */
struct condEqual{
    static bool call(uint64_t A, uint64_t B){
        return B == A;
    }
};
struct condLess{
    static bool call(uint64_t A, uint64_t B){
        return B > A;
    }
};
struct condGrtr{
    static bool call(uint64_t A, uint64_t B){
        return B < A;
    }
};
/* Immediate-operand policies for i_JC: byte_sz is the operand width in bytes and
 * extract() decodes one operand of that width from the context. */
struct immArgByte{
    static constexpr int byte_sz = 1;
    static uint64_t extract(REGEX_IS024_CONTEXT& ctx){
        return ctx.extract_b();
    }
};
struct immArgWord{
    static constexpr int byte_sz = 2;
    static uint64_t extract(REGEX_IS024_CONTEXT& ctx){
        return ctx.extract_w();
    }
};
struct immArgDoubleWord{
    static constexpr int byte_sz = 4;
    static uint64_t extract(REGEX_IS024_CONTEXT& ctx){
        return ctx.extract_dw();
    }
};
struct immArgQuadWord{
    static constexpr int byte_sz = 8;
    static uint64_t extract(REGEX_IS024_CONTEXT& ctx){
        return ctx.extract_qw();
    }
};
/* Copies an occupied thread into a free slot. The copy shares the CAHptr and SAptr
 * objects of the source, so their counters (CAHptr->refs and SAptr[0]) are incremented. */
void clone_thread_into_slot(REGEX_IS024_Thread& source, REGEX_IS024_Thread& vessel){
    thread_print_debug(source);
    my_assert(!(vessel.slot_occupation_status & SLOT_OCCUPIED));
    my_assert((source.slot_occupation_status & SLOT_OCCUPIED));
    vessel = source;
    if (vessel.CAHptr){
        ++vessel.CAHptr->refs;
    }
    if (vessel.SAptr){
        ++vessel.SAptr[0];
    }
}
/* FORK <sslot id> <near pointer>.
 * One FORK slot governs a single position in the program: the one right after the fork.
 * Free slot: a clone of the active thread is parked there (to continue after the fork)
 * and the active thread goes on at the decoded destination.
 * Occupied slot: a cloning conflict is started. */
void i_FORK(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ + REGEX024_BYTECODE_NEAR_POINTER_SZ);
    const regex_sslot_id_t ssid = ctx.extract_sslot_id();
    const regex_near_ptr_t dest = ctx.extract_near_pointer();
    if (ssid >= ctx.fork_slots_number)
        smitsya(fork_sslot_out_of_range);
    REGEX_IS024_Thread& slot = ctx.FORK_halted_slots[ssid];
    if (!(slot.slot_occupation_status & SLOT_OCCUPIED)){
        clone_thread_into_slot(ctx.active_thread, slot);
        ctx.active_thread.IP = dest;
        ctx.FORK_halted_stack.append(ssid);
        return;
    }
    start_cloning_conflict(ctx, slot, dest);
}
/* MATCH: stores a clone of the active thread as the matched thread. If a matched
 * thread already exists, a cloning conflict with it is started instead, with the
 * current IP as the place to continue. */
void i_MATCH(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    const bool already_matched = ctx.matched_thread.slot_occupation_status & SLOT_OCCUPIED;
    if (!already_matched){
        clone_thread_into_slot(ctx.active_thread, ctx.matched_thread);
        return;
    }
    start_cloning_conflict(ctx, ctx.matched_thread, ctx.active_thread.IP);
}
/* DIE: deletes the active thread and lets the context pick the next scheduled one. */
void i_DIE(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    REGEX_IS024_Thread& dying = ctx.active_thread;
    dying.delete_thread();
    ctx.try_to_continue_scheduled();
}
/* PARAM_READ_SS_NUMBER <sslot id>: sets the number of READ slots (initialization phase only). */
void i_PARAM_READ_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ)
    ctx.read_slots_number = ctx.extract_sslot_id();
}
/* PARAM_FORK_SS_NUMBER <sslot id>: sets the number of FORK slots (initialization phase only). */
void i_PARAM_FORK_SS_NUMBER(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_SSLOT_ID_SZ)
    ctx.fork_slots_number = ctx.extract_sslot_id();
}
/* PARAM_SELARR_LEN <track array index>: sets the selection array length (initialization phase only). */
void i_PARAM_SELARR_LEN(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    ctx.selection_array_len = ctx.extract_track_array_index();
}
/* PARAM_COLSIFTFUNC_SET <near pointer>: registers the entry point of the sift function
 * (initialization phase only). */
void i_PARAM_COLSIFTFUNC_SET(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(REGEX024_BYTECODE_NEAR_POINTER_SZ)
    ctx.sift_function = ctx.extract_near_pointer();
    ctx.have_sift_function = true;
}
/* PARAM_COLSIFTFUNC_WIPE: unregisters the sift function (initialization phase only).
 * Only the flag is cleared; the stored entry point is left as it is. */
void i_PARAM_COLSIFTFUNC_WIPE(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    ctx.have_sift_function = false;
}
/* MSG_MULTISTART_ALLOWED <byte>: any non-zero byte sets allows_multistart
 * (initialization phase only). */
void i_MSG_MULTISTART_ALLOWED(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(1)
    const auto flag_byte = ctx.extract_b();
    ctx.allows_multistart = (flag_byte != 0);
}
/* MSG_FED_INPUT_EXTENDED <byte left> <byte right> <sslot id>: stores the three operands
 * in the context (initialization phase only).
 * NOTE(review): the slot operand goes to portion_of_second_read_halt_ns, while
 * read_halted_new_type_stacks_append tests portion_of_FIRST_read_halt_ns - confirm
 * that these are meant to be two different fields. */
void i_MSG_FED_INPUT_EXTENDED(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    check_available_prg(1 + 1 + REGEX024_BYTECODE_SSLOT_ID_SZ)
    /* Operands are decoded strictly in program order */
    const auto extends_left = ctx.extract_b();
    const auto extends_right = ctx.extract_b();
    const auto second_portion = ctx.extract_sslot_id();
    ctx.fed_input_extends_left = extends_left;
    ctx.fed_input_extends_right = extends_right;
    ctx.portion_of_second_read_halt_ns = second_portion;
}
/* Reads element `ind` of a selection array. Element 0 of the storage is skipped
 * (clone_thread_into_slot treats it as a counter), so the value sits at sa[1 + ind].
 * A null array reads as 0. */
uint64_t get_el_from_selarr(uint64_t* sa, regex_near_ptr_t ind){
    if (!sa)
        return 0;
    return sa[1UL + ind];
}
/* DMOV_RABX_SELARR <track array index>: loads the same selection array
 * element of both conflicting threads: RAX from the active thread, RBX from
 * the thread referenced by ctx.sifting_with. */
void i_DMOV_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    const regex_tai_t index = ctx.extract_track_array_index();
    if (!(index < ctx.selection_array_len))
        smitsya(selection_arr_out_of_range);
    ctx.RAX = get_el_from_selarr(ctx.active_thread.SAptr, index);
    ctx.RBX = get_el_from_selarr(ctx.sifting_with->SAptr, index);
}
/* Distance between two selection array elements: value[end] - value[start],
 * clamped to 0 when the end value does not exceed the start value. */
uint64_t get_selarr_el_dist(uint64_t* sa, uint16_t start, uint16_t end){
    const uint64_t from = get_el_from_selarr(sa, start);
    const uint64_t to = get_el_from_selarr(sa, end);
    if (to <= from)
        return 0;
    return to - from;
}
/* DDIST_RABX_SELARR <start index> <end index>: loads the clamped distance
 * between two selection array elements of both conflicting threads: RAX from
 * the active thread, RBX from the thread referenced by ctx.sifting_with. */
void i_DDIST_RABX_SELARR(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ * 2)
    /* Each operand is validated right after it is read, so a bad first index
     * is reported before the second one is consumed. */
    const regex_tai_t first = ctx.extract_track_array_index();
    if (!(first < ctx.selection_array_len))
        smitsya(selection_arr_out_of_range);
    const regex_tai_t last = ctx.extract_track_array_index();
    if (!(last < ctx.selection_array_len))
        smitsya(selection_arr_out_of_range);
    ctx.RAX = get_selarr_el_dist(ctx.active_thread.SAptr, first, last);
    ctx.RBX = get_selarr_el_dist(ctx.sifting_with->SAptr, first, last);
}
/* Ends a sift conflict in favour of the homesteader (the thread already
 * sitting in the contested slot). The slot is left as it is. */
void finish_conflict_homesteader_wins(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    if (ctx.who_started_sift != regex024_opcodes::READ){
        /* FORK or MATCH (which is also recorded as FORK): the cloning conflict
         * is over and the active thread resumes at the offspring's IP. */
        ctx.active_thread.IP = ctx.child_ret_IP;
    } else {
        /* Conflict started by READ: the active thread lost and is dropped. */
        ctx.active_thread.delete_thread();
        ctx.try_to_continue_scheduled();
    }
    /* Leave sift mode. */
    ctx.sifting_with = NULL;
}
/* Ends a sift conflict in favour of the intruder (the active thread): the
 * homesteader held in the contested slot is released and the intruder's state
 * takes over that slot. Clears ctx.sifting_with on exit. */
void finish_conflict_intruder_wins(REGEX_IS024_CONTEXT& ctx){
ctx_print_debug(ctx);
/* Release the losing homesteader first; its slot is overwritten below. */
ctx.sifting_with->delete_thread();
/* The IP field was borrowed by the sift procedure; put the intruder's IP back
 * before its state is copied into the slot. */
ctx.active_thread.IP = ctx.intruder_IP;
if (ctx.who_started_sift == regex024_opcodes::READ){
/* noncloning conflict won by intruder+ */
/* Plain struct copy: the slot takes over the intruder's references, and the
 * active thread is only marked empty (no delete_thread(), nothing is released). */
*ctx.sifting_with = ctx.active_thread;
ctx.active_thread.slot_occupation_status = SLOT_EMPTY_val;
ctx.try_to_continue_scheduled();
} else {
/* End of cloning conflict (it involved cloning) */
/* clone_thread_into_slot is defined outside this view. The active thread then
 * keeps running from the offspring's IP. */
clone_thread_into_slot(ctx.active_thread, *ctx.sifting_with);
ctx.active_thread.IP = ctx.child_ret_IP;
}
/* Leave sift mode. */
ctx.sifting_with = NULL;
}
/* SIFTPRIOR_MIN_RABX: the thread with the smaller register wins (RAX belongs
 * to the intruder, RBX to the homesteader). On a tie nothing is decided and
 * execution simply continues with the next instruction. */
void i_SIFTPRIOR_MIN_RABX(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    if (ctx.RAX == ctx.RBX)
        return;
    if (ctx.RAX < ctx.RBX)
        finish_conflict_intruder_wins(ctx);
    else
        finish_conflict_homesteader_wins(ctx);
}
/* SIFTPRIOR_MAX_RABX: the thread with the larger register wins (RAX belongs
 * to the intruder, RBX to the homesteader). On a tie nothing is decided and
 * execution simply continues with the next instruction. */
void i_SIFTPRIOR_MAX_RABX(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    if (ctx.RAX == ctx.RBX)
        return;
    if (ctx.RAX > ctx.RBX)
        finish_conflict_intruder_wins(ctx);
    else
        finish_conflict_homesteader_wins(ctx);
}
/* SIFT_DONE: ends the sift procedure with the default outcome, which is that
 * the homesteader keeps its slot. */
void i_SIFT_DONE(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    sift_mode_check()
    finish_conflict_homesteader_wins(ctx);
}
/* Prepends a (key, val) node to the active thread's collection array branch.
 * Sets ctx.error to ca_tree_limit_violation and returns without allocating
 * when the node budget (CA_TREE_LIMIT) is already used up. */
void ca_branch_new_node(REGEX_IS024_CONTEXT& ctx, regex_tai_t key, uint64_t val){
    ctx_print_debug(ctx);
    if (!(ctx.CAN_total < ctx.CA_TREE_LIMIT))
        smitsya(ca_tree_limit_violation);
    /* The new head starts with one reference (the active thread). The previous
     * head's counter is deliberately left alone: the reference the thread held
     * on it is now held by the new node's `prev` link instead. */
    REGEX024_CollectionArrayNode* fresh_head =
        new REGEX024_CollectionArrayNode{key, val, ctx.active_thread.CAHptr, 1};
    ctx.active_thread.CAHptr = fresh_head;
    ctx.CAN_total++;
}
/* MOV_COLARR_IMM <track array index> <8B>: appends an event carrying an
 * immediate value to the active thread's collection array branch. */
void i_MOV_COLARR_IMM(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8)
    /* Operands must be consumed in bytecode order: index first, then value. */
    const regex_tai_t key = ctx.extract_track_array_index();
    const uint64_t value = ctx.extract_qw();
    ca_branch_new_node(ctx, key, value);
}
/* MOV_COLARR_BTPOS <track array index>: appends an event whose value is the
 * number of input bytes passed so far (ctx.passed_bytes). */
void i_MOV_COLARR_BTPOS(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    const regex_tai_t key = ctx.extract_track_array_index();
    ca_branch_new_node(ctx, key, ctx.passed_bytes);
}
/* Writes `val` into element `key` of the active thread's selection array.
 * May set ctx.error (selection_arr_out_of_range) or throw std::bad_alloc, so it
 * should be the last thing a caller does. Call ONLY in general matching mode.
 *
 * The array is reference counted through its first cell. An array owned solely
 * by this thread is edited in place. Otherwise a private copy (or a fresh
 * zeroed array if the thread has none) is allocated. */
void edit_selection_array(REGEX_IS024_CONTEXT& ctx, uint64_t key, uint64_t val){
    ctx_print_debug(ctx);
    const uint64_t len = ctx.selection_array_len;
    if (key >= len)
        smitsya(selection_arr_out_of_range);
    uint64_t* current = ctx.active_thread.SAptr;
    if (current && current[0] == 1){
        /* Sole owner: no copy needed. */
        current[key + 1] = val;
        return;
    }
    /* One extra cell in front for the reference counter. */
    uint64_t* fresh = (uint64_t*)calloc(len + 1, 8);
    if (!fresh)
        throw std::bad_alloc();
    fresh[0] = 1;
    if (current){
        /* Shared array: duplicate the payload and drop our reference. */
        for (uint64_t i = 1; i <= len; i++)
            fresh[i] = current[i];
        current[0]--;
    }
    fresh[key + 1] = val;
    ctx.active_thread.SAptr = fresh;
}
/* MOV_SELARR_IMM <track array index> <8B>: stores an immediate value into the
 * active thread's selection array. */
void i_MOV_SELARR_IMM(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ + 8)
    /* Operands must be consumed in bytecode order: index first, then value. */
    const regex_tai_t key = ctx.extract_track_array_index();
    const uint64_t value = ctx.extract_qw();
    edit_selection_array(ctx, key, value);
}
/* MOV_SELARR_CHPOS <track array index>: stores the number of characters
 * passed so far (ctx.passed_chars) into the active thread's selection array. */
void i_MOV_SELARR_CHPOS(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    general_matching_mode_check()
    check_available_prg(REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ)
    const regex_tai_t key = ctx.extract_track_array_index();
    edit_selection_array(ctx, key, ctx.passed_chars);
}
/* Allocates zeroed backing storage for `nmemb` slot ids and attaches it to an
 * empty, not yet allocated stack.
 * Throws std::bad_alloc when the allocation fails. A NULL result for a
 * zero-sized request is NOT a failure: calloc(0, ...) is allowed to return
 * NULL, and a zero-capacity stack never stores anything. */
void calloc_stack_slots(REGEX_IS024_Stack& stack, regex_sslot_id_t nmemb) {
    assert(stack.sz == 0 && !stack.slots);
    regex_sslot_id_t* storage = static_cast<regex_sslot_id_t *>(calloc(nmemb, sizeof(regex_sslot_id_t)));
    if (!storage && nmemb != 0)
        throw std::bad_alloc();
    stack.slots = storage;
}
/* Allocates a zero-filled array of `nmemb` thread slots. All-zero bytes give
 * each slot slot_occupation_status == 0 (SLOT_EMPTY_val).
 * Throws std::bad_alloc when the allocation fails. A NULL result for a
 * zero-sized request is NOT a failure: calloc(0, ...) is allowed to return
 * NULL, and the caller's later free(NULL) is a no-op. */
REGEX_IS024_Thread* calloc_slots_array(regex_sslot_id_t nmemb) {
    REGEX_IS024_Thread* ptr = static_cast<REGEX_IS024_Thread *>(calloc(nmemb, sizeof(REGEX_IS024_Thread)));
    if (!ptr && nmemb != 0)
        throw std::bad_alloc();
    return ptr;
}
/* INIT: ends the initialization phase. Validates the parameters chosen by the
 * program against the limits configured by the host, allocates all slot
 * arrays and stacks, remembers where additionally started threads begin, and
 * retires the initializing thread. */
void i_INIT(REGEX_IS024_CONTEXT& ctx){
    ctx_print_debug(ctx);
    initialization_phase_check()
    /* Limit checks: the first violated one decides the reported error. */
    if (ctx.SA_LEN_LIMIT < ctx.selection_array_len)
        smitsya(sa_length_limit_violation);
    if (ctx.READ_SS_LIMIT < ctx.read_slots_number)
        smitsya(read_sslot_count_limit_violation);
    if (ctx.FORK_SS_LIMIT < ctx.fork_slots_number)
        smitsya(fork_sslot_count_limit_violation);
    /* NOTE(review): this compares READ slot counts but reports a FORK error
     * code; read_sslot_out_of_range may have been intended. Confirm. */
    if (ctx.read_slots_number < ctx.portion_of_second_read_halt_ns)
        smitsya(fork_sslot_out_of_range);

    /* READ side: one slot array, one "old" stack able to hold every slot, and
     * two "new" stacks that split the slots between them. */
    ctx.READ_halted_slots = calloc_slots_array(ctx.read_slots_number);
    calloc_stack_slots(ctx.READ_halted_stack_old, ctx.read_slots_number);
    ctx.portion_of_FIRST_read_halt_ns = ctx.read_slots_number - ctx.portion_of_second_read_halt_ns;
    calloc_stack_slots(ctx.READ_halted_stack_new_first, ctx.portion_of_FIRST_read_halt_ns);
    calloc_stack_slots(ctx.READ_halted_stack_new_second, ctx.portion_of_second_read_halt_ns);

    /* FORK side. */
    ctx.FORK_halted_slots = calloc_slots_array(ctx.fork_slots_number);
    calloc_stack_slots(ctx.FORK_halted_stack, ctx.fork_slots_number);

    ctx.initialized = true;
    /* The instruction following INIT is where startThread() begins new threads. */
    ctx.unnatural_started_thread_IP = ctx.active_thread.IP;
    ctx.active_thread.delete_thread();
}
/* THROW: the program deliberately raises an error. Only the error field is
 * set here; kick() stops looping once ctx.error is no longer `stable`. */
void i_THROW(REGEX_IS024_CONTEXT& ctx){
    ctx.error = regex024_error_codes::program_throw;
}
/* Fetches one opcode byte at the active thread's IP and dispatches to the
 * matching i_<OPCODE> handler. An unknown opcode sets ctx.error to
 * invalid_opcode. The caller (kick) has already checked that the opcode byte
 * is inside the program. */
void instruction_table(REGEX_IS024_CONTEXT &ctx) {
ctx_print_debug(ctx);
uint8_t opcode = ctx.extract_instruction();
/* rcase(X) expands to `case regex024_opcodes::X: return i_X(ctx);` */
#define rcase(inst) case regex024_opcodes::inst: return i_ ## inst (ctx);
/* jumpC(UN, st) expands to the four operand-width variants (_B, _W, _DW, _QW)
 * of conditional jump JC<UN>, each routed to i_JC<st, width>. */
#define jumpC(UN, st) case regex024_opcodes::JC ## UN ## _B: return i_JC<st, immArgByte>(ctx); \
case regex024_opcodes::JC ## UN ## _W: return i_JC<st, immArgWord>(ctx); \
case regex024_opcodes::JC ## UN ## _DW: return i_JC<st, immArgDoubleWord>(ctx); \
case regex024_opcodes::JC ## UN ## _QW: return i_JC<st, immArgQuadWord>(ctx);
switch (opcode) {
rcase(READ)
rcase(READZ)
rcase(JUMP)
jumpC(EQUAL, condEqual)
jumpC(LESS, condLess)
jumpC(GRTR, condGrtr)
rcase(FORK)
rcase(MATCH)
rcase(DIE)
rcase(PARAM_READ_SS_NUMBER)
rcase(PARAM_FORK_SS_NUMBER)
rcase(PARAM_SELARR_LEN)
rcase(PARAM_COLSIFTFUNC_SET)
rcase(PARAM_COLSIFTFUNC_WIPE)
rcase(MSG_MULTISTART_ALLOWED)
rcase(MSG_FED_INPUT_EXTENDED)
rcase(DMOV_RABX_SELARR)
rcase(DDIST_RABX_SELARR)
rcase(SIFTPRIOR_MIN_RABX)
rcase(SIFTPRIOR_MAX_RABX)
rcase(SIFT_DONE)
rcase(MOV_COLARR_IMM)
rcase(MOV_COLARR_BTPOS)
rcase(MOV_SELARR_IMM)
rcase(MOV_SELARR_CHPOS)
rcase(INIT)
rcase(THROW)
default:
/* Not a known opcode: report it; kick() stops on the non-stable error. */
ctx.error = regex024_error_codes::invalid_opcode;
}
}

View File

@ -0,0 +1,35 @@
#ifndef LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H
#define LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H
/* This file should not be included outside libregex024 virtual machine implementation */
#include <libregexis024vm/libregexis024vm.h>
#include <libregexis024vm/vm_opcodes.h>
#include <assert.h>
/* Records `error_type` in ctx.error and returns from the enclosing function.
 * Requires a variable named `ctx` in scope and a void-returning function. */
#define smitsya(error_type) do {ctx.error = regex024_error_codes::error_type; return; } while (0)
/* Values of REGEX_IS024_Thread::slot_occupation_status. SLOT_OCCUPIED and
 * SLOT_NEW are bit flags; the *_val names are the complete values stored. */
#define SLOT_EMPTY_val 0
#define SLOT_OCCUPIED 1
#define SLOT_OCCUPIED_val SLOT_OCCUPIED
#define SLOT_NEW 2
#define SLOT_NEW_val (SLOT_OCCUPIED | SLOT_NEW)
/* Bails out of the enclosing function with improper_finish unless `regionSz`
 * more bytes of program are available at the active thread's IP.
 * This is a bare `if` block, not a do/while(0): call sites omit the trailing
 * semicolon, and it must not be placed directly before an `else`. */
#define check_available_prg(regionSz) if (!ctx.check_inboundness(regionSz)){ \
ctx.error = regex024_error_codes::improper_finish; return; }
/* Debug tracing is compiled in only when both macros below are defined;
 * otherwise the print macros expand to nothing. my_assert is plain assert()
 * in both configurations. */
#if defined(LIBREGEXIS024_DEBUG) && defined(LIBREGEXIS024_ALLOW_LOUD)
#include <debugging_regexis024/vm/libregexis024vm_debug.h>
#define my_assert(expr) assert(expr)
#define ctx_print_debug(ctx) debug_print_context(ctx, __func__)
#define thread_print_debug(thread) debug_print_thread(thread, __func__)
#else
#define my_assert(expr) assert(expr)
#define ctx_print_debug(ctx)
#define thread_print_debug(thread)
#endif
/* Executes exactly one instruction of the active thread (see kick()). */
void instruction_table(REGEX_IS024_CONTEXT& ctx);
#endif //LIBREGEXIS024_INSTRUCTION_IMPLEMENTATION_H

View File

@ -0,0 +1,47 @@
#include <libregexis024vm/vm_opcodes.h>
#include <libregexis024vm/utils.h>
/* rcase(X) expands to `case regex024_opcodes::X: return "X";` */
#define rcase(name) case regex024_opcodes::name: return #name;
/* Returns the mnemonic of opcode `x` as a string literal (static storage, must
 * not be freed), or "Invalid opcode" for values outside the known set. */
const char *regex024_opcode_tostr(regex024_opcode x) {
switch (x) {
rcase(READ)
rcase(READZ)
rcase(JUMP)
rcase(JCEQUAL_B)
rcase(JCEQUAL_W)
rcase(JCEQUAL_DW)
rcase(JCEQUAL_QW)
rcase(JCLESS_B)
rcase(JCLESS_W)
rcase(JCLESS_DW)
rcase(JCLESS_QW)
rcase(JCGRTR_B)
rcase(JCGRTR_W)
rcase(JCGRTR_DW)
rcase(JCGRTR_QW)
rcase(FORK)
rcase(MATCH)
rcase(DIE)
rcase(PARAM_READ_SS_NUMBER)
rcase(PARAM_FORK_SS_NUMBER)
rcase(PARAM_SELARR_LEN)
rcase(PARAM_COLSIFTFUNC_SET)
rcase(PARAM_COLSIFTFUNC_WIPE)
rcase(MSG_MULTISTART_ALLOWED)
rcase(MSG_FED_INPUT_EXTENDED)
rcase(DMOV_RABX_SELARR)
rcase(DDIST_RABX_SELARR)
rcase(SIFTPRIOR_MIN_RABX)
rcase(SIFTPRIOR_MAX_RABX)
rcase(SIFT_DONE)
rcase(MOV_COLARR_IMM)
rcase(MOV_COLARR_BTPOS)
rcase(MOV_SELARR_IMM)
rcase(MOV_SELARR_CHPOS)
rcase(INIT)
rcase(THROW)
default:
return "Invalid opcode";
}
}

View File

@ -0,0 +1,158 @@
#ifndef LIBREGEXIS024_LIBREGEXIS024VM_H
#define LIBREGEXIS024_LIBREGEXIS024VM_H
/* This thing is bloated. And slow (Because I designed it imperfectly and because it is bloated).
* I could have halven the amount of bloat, but that would require me writing code in headers.
* I am gonna use it for KM, even more bloated project. So I thought that this design is on the spot.
* C++ is such a funny language. Code is divided into .cpp and .h files. But it only makes problems.
* All of my work on this C++ project was not serious from the beginning. It's all funny stuff. */
/* Also, please, consider using libregexis024vm/libregexis024vm_interface.h
* Naming in this project is super inconsistent. I don't want it to trash your namespace */
#include <libregexis024vm/vm_errno.h>
#include <libregexis024vm/utils.h>
#include <libregexis024vm/vm_opcodes_types.h>
#include <stdlib.h>
#include <stdint.h>
/* Stack of slot ids over an externally allocated buffer. The struct never
 * allocates or grows the buffer itself; append() performs no capacity check.
 * Non-copyable. */
struct REGEX_IS024_Stack{
/* Backing storage; NULL until someone attaches a buffer. free()d by the destructor. */
regex_sslot_id_t* slots = NULL;
/* Number of ids currently stored. */
regex_sslot_id_t sz = 0;
/* Removes and returns the top id. Asserts that the stack is not empty. */
regex_sslot_id_t pop();
/* Pushes `x`. Asserts that a buffer is attached. */
void append(regex_sslot_id_t x);
bool empty() const;
bool non_empty() const;
REGEX_IS024_Stack(const REGEX_IS024_Stack&) = delete;
REGEX_IS024_Stack& operator=(const REGEX_IS024_Stack&) = delete;
REGEX_IS024_Stack() = default;
/* Asserts that the stack has been emptied, then frees the buffer. */
~REGEX_IS024_Stack();
};
/* One (key, value) event of a collection array. Nodes form singly linked
 * chains through `prev`, and each node is reference counted. */
struct REGEX024_CollectionArrayNode{
/* Key is small for historical reasons I do not remember. Who cares anyway */
regex_tai_t key;
uint64_t value;
/* Link to the previously recorded node; NULL for the first node of a chain */
REGEX024_CollectionArrayNode* prev;
/* Reference counting: the node is deleted when this drops to zero
 * (see REGEX_IS024_Thread::delete_thread) */
uint64_t refs = 0;
};
/* State of one VM thread: an instruction pointer plus references to its
 * collection array branch and its selection array. */
struct REGEX_IS024_Thread{
/* First byte field is used only when thread is located in slot */
uint8_t slot_occupation_status = 0;
/* Offset of the next instruction inside the program. */
regex_near_ptr_t IP = 0;
/* Head (most recent node) of this thread's collection array branch, or NULL. */
REGEX024_CollectionArrayNode* CAHptr = NULL;
/* Pointer to the selection array. SA's are reference counted. Because of that every SA
 * is elongated by one meta element in the beginning - reference counter. So the actual elements
 * are enumerated starting from one. */
uint64_t* SAptr = NULL;
/* Marks the thread unoccupied and drops its references to the CA branch and the SA. */
void delete_thread() noexcept;
void debug_print(const char* place);
};
/* Complete state of one regex virtual machine run: the program, the limits set
 * by the host, the parameters chosen by the program during initialization, all
 * halted thread slots, the active thread and the sift registers.
 * Every data member has a defined initial value, so no member is indeterminate
 * between construction and the INIT instruction or the first sift. */
struct REGEX_IS024_CONTEXT{
    REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, regex_tai_t saLenLimit,
                        regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit, uint64_t timeTickLimit);

    /* Runs the program from its start (the initialization phase). */
    regex024_error_code feedSOF();
    /* You can safely pile up calls to this command, nothing bad will happen */
    regex024_error_code startThread();
    regex024_error_code extendedFeedCharacter(uint64_t input);
    regex024_error_code feedCharacter(uint64_t INP, uint64_t corresponding_byte_amount);

    ~REGEX_IS024_CONTEXT();

    /* Program size larger than 2^62 is forbidden */
    size_t program_size = 0;
    const uint8_t* prg = NULL;

    /* Max allowed index of CA is 2^16 - 1
     * Max allowed index of SA is 2^16 - 1. VM can be configured to allow even less */
    /* CA = Collection array. */
    uint64_t CA_TREE_LIMIT;
    /* SA = Selection array */
    regex_tai_t SA_LEN_LIMIT;
    regex_sslot_id_t READ_SS_LIMIT;
    regex_sslot_id_t FORK_SS_LIMIT;

    /* If time_tick_limit is non-zero, regex virtual machine will stop with error
     * after this many ticks. This parameter sets the timeout. */
    uint64_t time_tick_limit;

    /* This context is used only for one FA match session. This field measures each tick
     * timer <= time_tick_limit */
    uint64_t timer = 0;
    /* CAN_total <= CA_TREE_LIMIT */
    uint64_t CAN_total = 0;

    /* Program selects it */
    regex_tai_t selection_array_len = 0;
    regex_sslot_id_t read_slots_number = 0;
    regex_sslot_id_t fork_slots_number = 0;

    bool have_sift_function = false;
    /* Meaningful only while have_sift_function is true. */
    regex_near_ptr_t sift_function = 0;

    bool allows_multistart = false;
    uint8_t fed_input_extends_left = 0, fed_input_extends_right = 0;
    regex_sslot_id_t portion_of_second_read_halt_ns = 0, portion_of_FIRST_read_halt_ns = 0;

    bool initialized = false;
    regex_near_ptr_t unnatural_started_thread_IP = 1337;
    regex024_error_code error = regex024_error_codes::stable;

    /* Slot arrays are allocated by the INIT instruction; NULL before that. */
    REGEX_IS024_Thread* READ_halted_slots = NULL;
    REGEX_IS024_Stack READ_halted_stack_old;
    REGEX_IS024_Stack READ_halted_stack_new_first;
    REGEX_IS024_Stack READ_halted_stack_new_second;
    REGEX_IS024_Thread* FORK_halted_slots = NULL;
    REGEX_IS024_Stack FORK_halted_stack;

    REGEX_IS024_Thread active_thread;

    /* Environment for sifting stuff */
    REGEX_IS024_Thread* sifting_with = NULL;
    /* specifies the type of operation vm should do after sift (there are only two distinct options) */
    uint8_t who_started_sift = 0;
    /* Sifting process uses IP field of active thread. Other data of thread is not modified or used during collision
     * procedure. Old IP is stored there, if needed */
    regex_near_ptr_t child_ret_IP = 0;
    regex_near_ptr_t intruder_IP = 0;
    /* RAX corresponds to intruder. Its data is stored in active thread field */
    uint64_t RAX = 0;
    /* RBX corresponds to homesteader. Its data is accessible by `REGEX_IS024_Thread* sifting_with` pointer */
    uint64_t RBX = 0;

    /* Will be unoccupied if no threads matched. After each feed of character this field will be wiped
     * User should take care of intermediate success himself */
    REGEX_IS024_Thread matched_thread;

    /* Most recently fed input value and running input counters. */
    uint64_t INP = 0;
    uint64_t passed_chars = 0;
    uint64_t passed_bytes = 0;

    void try_to_continue_scheduled();

    /* Bytecode access helpers; all of them work at the active thread's IP. */
    bool check_inboundness(int region);
    uint8_t extract_b();
    uint16_t extract_w();
    uint32_t extract_dw();
    uint64_t extract_qw();
    uint8_t extract_instruction();
    regex_sslot_id_t extract_sslot_id();
    regex_near_ptr_t extract_near_pointer();
    regex_tai_t extract_track_array_index();

    void debug_print(const char* place);
};
#endif //LIBREGEXIS024_LIBREGEXIS024VM_H

View File

@ -0,0 +1,197 @@
#include <libregexis024vm/libregexis024vm.h>
#include <libregexis024vm/instruction_implementation.h>
#include <utility>
/* Removes the most recently appended id and returns it. The stack must not
 * be empty (asserted). */
regex_sslot_id_t REGEX_IS024_Stack::pop() {
    assert(sz != 0);
    sz--;
    return slots[sz];
}
/* Pushes `x` on top. A buffer must be attached (asserted); capacity is not
 * checked here. */
void REGEX_IS024_Stack::append(regex_sslot_id_t x) {
    assert(slots);
    slots[sz++] = x;
}
bool REGEX_IS024_Stack::empty() const {
return !non_empty();
}
/* True when at least one id is stored. */
bool REGEX_IS024_Stack::non_empty() const {
    return sz != 0;
}
/* Releases the backing buffer. The owner is expected to have drained the
 * stack beforehand (asserted). */
REGEX_IS024_Stack::~REGEX_IS024_Stack() {
    assert(!non_empty());
    free(slots);
}
/* Stores the program and the host-imposed limits, and marks the active thread
 * occupied so that feedSOF() has a thread to run the initialization phase.
 * Terminates the process through exitf() if the program exceeds 2^62 bytes. */
REGEX_IS024_CONTEXT::REGEX_IS024_CONTEXT(size_t programSize, const uint8_t *data,
                                         uint64_t caTreeLimit, regex_tai_t saLenLimit,
                                         regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit,
                                         uint64_t timeTickLimit) :
    program_size(programSize), prg(data), CA_TREE_LIMIT(caTreeLimit), SA_LEN_LIMIT(saLenLimit),
    READ_SS_LIMIT(readSsLimit), FORK_SS_LIMIT(forkSsLimit), time_tick_limit(timeTickLimit)
{
    /* 1ULL (at least 64 bits) rather than 1UL: shifting a 32-bit unsigned long
     * by 62 is undefined behaviour on platforms where long is 32 bits wide. */
    if (program_size > (1ULL << 62))
        exitf("Program is too huge\n");
    active_thread.slot_occupation_status = SLOT_OCCUPIED;
}
/* No only will it launch a wave of deallocation in CA tree, but as a nice bonus it's
* gonna deoccupy slot_occupation_status*/
void REGEX_IS024_Thread::delete_thread() noexcept {
thread_print_debug(*this);
my_assert(slot_occupation_status & SLOT_OCCUPIED);
slot_occupation_status = SLOT_EMPTY_val;
REGEX024_CollectionArrayNode* cur_CAptr = CAHptr;
while (cur_CAptr){
assert(cur_CAptr->refs > 0);
if (--(cur_CAptr->refs) == 0){
REGEX024_CollectionArrayNode* next_CAptr = cur_CAptr->prev;
delete cur_CAptr;
cur_CAptr = next_CAptr;
} else
break;
}
if (SAptr){
if (--(SAptr[0]) == 0)
free(SAptr);
}
}
/* Drains one of the "new" READ halted stacks, deleting the thread held in
 * every slot it lists. */
void emptify_one_of_new_read_halted_stacks(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& type_new_stack){
    while (!type_new_stack.empty()){
        const regex_sslot_id_t ssid = type_new_stack.pop();
        REGEX_IS024_Thread& doomed = ctx.READ_halted_slots[ssid];
        assert(doomed.slot_occupation_status & SLOT_OCCUPIED);
        doomed.delete_thread();
    }
}
/* First it will try to pop pending thread from FORK_halted_stack
* Then it will try popping thread from READ_halted_stack_old (checking if top
* thread here is not actually SLOT_NEW). If something succeded, corresponding slot will be deoccupied, and
* active slot will be occupied with it.
*
* try_to_continue_scheduled() assumes that active thread is unoccupied.*/
void REGEX_IS024_CONTEXT::try_to_continue_scheduled(){
ctx_print_debug(*this);
my_assert(!(active_thread.slot_occupation_status & SLOT_OCCUPIED));
if (FORK_halted_stack.sz){
regex_sslot_id_t ssid = FORK_halted_stack.pop();
active_thread = FORK_halted_slots[ssid];
FORK_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val;
return;
}
while (READ_halted_stack_old.sz){
regex_sslot_id_t ssid = READ_halted_stack_old.pop();
if (READ_halted_slots[ssid].slot_occupation_status & SLOT_NEW){
/* This is the case when old thread was silently replaced by settled new thread */
continue;
}
active_thread = READ_halted_slots[ssid];
READ_halted_slots[ssid].slot_occupation_status = SLOT_EMPTY_val;
return;
}
/* Failure here will be detected. We started with unoccupied active thread. iterator inside kick will see it */
}
/* Runs instructions for as long as there is an occupied active thread and no
 * error has been recorded. Each executed instruction costs one timer tick.
 *
 * A time_tick_limit of zero means "no timeout", as documented on the field in
 * REGEX_IS024_CONTEXT. Only a non-zero limit can produce the `timeout` error. */
void kick(REGEX_IS024_CONTEXT& ctx) {
    ctx_print_debug(ctx);
    while ((ctx.active_thread.slot_occupation_status & SLOT_OCCUPIED)
           && ctx.error == regex024_error_codes::stable){
        if (ctx.time_tick_limit != 0 && ctx.timer >= ctx.time_tick_limit)
            smitsya(timeout);
        ctx.timer++;
        /* May return from kick(ctx) with improper_finish when IP ran off the program. */
        check_available_prg(REGEX024_BYTECODE_INSTRUCTION_SZ)
        /* Errors raised by the instruction are seen by the loop condition. */
        instruction_table(ctx);
    }
}
/* Runs the thread that the constructor marked occupied and reports the
 * resulting error state. */
regex024_error_code REGEX_IS024_CONTEXT::feedSOF() {
    ctx_print_debug(*this);
    kick(*this);
    const regex024_error_code outcome = error;
    return outcome;
}
/* Starts a fresh thread with empty collection and selection arrays at the IP
 * recorded by INIT, runs it and reports the resulting error state. */
regex024_error_code REGEX_IS024_CONTEXT::startThread() {
    ctx_print_debug(*this);
    active_thread.CAHptr = NULL;
    active_thread.SAptr = NULL;
    active_thread.IP = unnatural_started_thread_IP;
    active_thread.slot_occupation_status = SLOT_OCCUPIED;
    kick(*this);
    return error;
}
/* Exchanges the contents (size and buffer pointer) of two stacks.
 * REGEX_IS024_Stack is non-copyable and has no move operations, so the two
 * fields are exchanged by hand. */
void swap_stacks(REGEX_IS024_Stack& A, REGEX_IS024_Stack& B) {
    const regex_sslot_id_t size_of_a = A.sz;
    A.sz = B.sz;
    B.sz = size_of_a;

    regex_sslot_id_t* const buffer_of_a = A.slots;
    A.slots = B.slots;
    B.slots = buffer_of_a;
}
/* Promotes a "new" READ halted stack to be the "old" one. The old stack must
 * be empty on entry. The two stacks are swapped, then every promoted slot
 * loses its SLOT_NEW qualifier. */
void fill_empty_old_read_halted_stack(REGEX_IS024_CONTEXT& ctx, REGEX_IS024_Stack& read_halted_stack_new){
    ctx_print_debug(ctx);
    my_assert(!ctx.READ_halted_stack_old.non_empty());
    assert(ctx.READ_halted_stack_old.empty());
    swap_stacks(ctx.READ_halted_stack_old, read_halted_stack_new);

    REGEX_IS024_Stack& promoted = ctx.READ_halted_stack_old;
    for (uint32_t pos = 0; pos < promoted.sz; pos++){
        REGEX_IS024_Thread& slot = ctx.READ_halted_slots[promoted.slots[pos]];
        assert(slot.slot_occupation_status & SLOT_OCCUPIED);
        if (!(slot.slot_occupation_status & SLOT_OCCUPIED))
            continue;
        /* Plain "occupied": drops the NEW bit. */
        slot.slot_occupation_status = SLOT_OCCUPIED;
    }
}
/* Feeds one regular input character. The previous match (if any) is
 * discarded, threads waiting in the second "new" stack are dropped, the first
 * "new" stack becomes the old one, the input counters advance, and the
 * scheduled threads are run. Returns the resulting error state. */
regex024_error_code REGEX_IS024_CONTEXT::feedCharacter(uint64_t input, uint64_t corresponding_byte_amount) {
    ctx_print_debug(*this);
    const bool had_match = (matched_thread.slot_occupation_status & SLOT_OCCUPIED) != 0;
    if (had_match)
        matched_thread.delete_thread();
    emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second);
    fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_first);

    INP = input;
    passed_chars++;
    passed_bytes += corresponding_byte_amount;

    try_to_continue_scheduled();
    kick(*this);
    return error;
}
/* Feeds one "extended" input value. The previous match (if any) is discarded
 * and the second "new" stack becomes the old one. Unlike feedCharacter(), the
 * passed_chars / passed_bytes counters are not advanced. Returns the
 * resulting error state. */
regex024_error_code REGEX_IS024_CONTEXT::extendedFeedCharacter(uint64_t input) {
    ctx_print_debug(*this);
    const bool had_match = (matched_thread.slot_occupation_status & SLOT_OCCUPIED) != 0;
    if (had_match)
        matched_thread.delete_thread();
    fill_empty_old_read_halted_stack(*this, READ_halted_stack_new_second);
    INP = input;
    try_to_continue_scheduled();
    kick(*this);
    return error;
}
/* Releases every thread still parked in a slot, the slot arrays and the
 * matched thread. Nothing was allocated unless INIT completed, so an
 * uninitialized context has nothing to release.
 *
 * The "old" READ stack is drained BEFORE the "new" ones. A slot id can be on
 * the old stack and on a new stack at the same time (the old thread was
 * replaced by a newly settled one, flagged SLOT_NEW). Draining the new stacks
 * first would clear that slot's status, so the old-stack pass would then trip
 * its assert or, with asserts disabled, release the same thread a second time. */
REGEX_IS024_CONTEXT::~REGEX_IS024_CONTEXT() {
    ctx_print_debug(*this);
    if (initialized){
        while (READ_halted_stack_old.non_empty()){
            REGEX_IS024_Thread& thread = READ_halted_slots[READ_halted_stack_old.pop()];
            assert(thread.slot_occupation_status & SLOT_OCCUPIED);
            /* SLOT_NEW slots belong to a "new" stack and are released below. */
            if (!(thread.slot_occupation_status & SLOT_NEW))
                thread.delete_thread();
        }
        emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_first);
        emptify_one_of_new_read_halted_stacks(*this, READ_halted_stack_new_second);
        free(READ_halted_slots);

        while (FORK_halted_stack.non_empty())
            FORK_halted_slots[FORK_halted_stack.pop()].delete_thread();
        free(FORK_halted_slots);

        if (matched_thread.slot_occupation_status & SLOT_OCCUPIED){
            matched_thread.delete_thread();
        }
        /* NOTE(review): an active thread that is still occupied (e.g. after a
         * timeout) is not released here - confirm whether that is intended. */
    }
}

View File

@ -0,0 +1,38 @@
#include <libregexis024vm/libregexis024vm.h>
#include <libregexis024vm/vm_opcodes.h>
/* Asks vmprog_check_inboundness whether `region` bytes starting at the active
 * thread's IP lie inside the program. */
bool REGEX_IS024_CONTEXT::check_inboundness(int region){
    const bool fits = vmprog_check_inboundness(program_size, active_thread.IP, region);
    return fits;
}
/* Reads a 1-byte operand at the active thread's IP. The helper receives the
 * address of IP, presumably so that it can advance it past the operand. */
uint8_t REGEX_IS024_CONTEXT::extract_b() {
    const uint8_t operand = vmprog_extract_b(&active_thread.IP, prg);
    return operand;
}
/* Reads a 2-byte operand at the active thread's IP. The helper receives the
 * address of IP, presumably so that it can advance it past the operand. */
uint16_t REGEX_IS024_CONTEXT::extract_w() {
    const uint16_t operand = vmprog_extract_w(&active_thread.IP, prg);
    return operand;
}
/* Reads a 4-byte operand at the active thread's IP. The helper receives the
 * address of IP, presumably so that it can advance it past the operand. */
uint32_t REGEX_IS024_CONTEXT::extract_dw() {
    const uint32_t operand = vmprog_extract_dw(&active_thread.IP, prg);
    return operand;
}
/* Reads an 8-byte operand at the active thread's IP. The helper receives the
 * address of IP, presumably so that it can advance it past the operand. */
uint64_t REGEX_IS024_CONTEXT::extract_qw() {
    const uint64_t operand = vmprog_extract_qw(&active_thread.IP, prg);
    return operand;
}
/* Reads an opcode; opcodes are encoded as a single byte. */
uint8_t REGEX_IS024_CONTEXT::extract_instruction() {
    const uint8_t opcode = extract_b();
    return opcode;
}
/* Reads a slot id operand; slot ids are encoded as a double word. */
regex_sslot_id_t REGEX_IS024_CONTEXT::extract_sslot_id() {
    const uint32_t raw = extract_dw();
    return raw;
}
/* Reads a near pointer operand; near pointers are encoded as a quad word. */
regex_near_ptr_t REGEX_IS024_CONTEXT::extract_near_pointer() {
    const uint64_t raw = extract_qw();
    return raw;
}
/* Reads a track array index operand; these are encoded as a word. */
regex_tai_t REGEX_IS024_CONTEXT::extract_track_array_index() {
    const uint16_t raw = extract_w();
    return raw;
}

View File

@ -0,0 +1,105 @@
#include <libregexis024vm/libregexis024vm_interface.h>
#include <libregexis024vm/libregexis024vm.h>
#include <libregexis024vm/instruction_implementation.h>
/* Two events are equal when both their keys and their values are equal. */
bool REGEX_IS024_CAEvent::operator==(const REGEX_IS024_CAEvent &other) const {
    if (key != other.key)
        return false;
    return value == other.value;
}
/* The interface class hides the context behind a void pointer; `reveal`
 * gives it its real type back. */
#define reveal ((REGEX_IS024_CONTEXT*)opaque)
/* Creates the hidden VM context. All arguments are passed on unchanged to
 * the REGEX_IS024_CONTEXT constructor. */
REGEX_IS024_VirtualMachine::REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data,
                                                       uint64_t caTreeLimit, regex_tai_t saLenLimit,
                                                       regex_sslot_id_t readSsLimit, regex_sslot_id_t forkSsLimit,
                                                       uint64_t timeTickLimit) {
    REGEX_IS024_CONTEXT* context = new REGEX_IS024_CONTEXT(programSize, data, caTreeLimit, saLenLimit,
                                                           readSsLimit, forkSsLimit, timeTickLimit);
    opaque = context;
}
/* Runs the program's initialization phase. May be called only once per
 * machine; a second call terminates the process through exitf(). */
regex024_error_code REGEX_IS024_VirtualMachine::initialize() {
    if (gave_SOF){
        exitf("double feedSOF\n");
    }
    gave_SOF = true;
    const regex024_error_code outcome = reveal->feedSOF();
    return outcome;
}
bool REGEX_IS024_VirtualMachine::isInitialized() {
return reveal->initialized;
}
bool REGEX_IS024_VirtualMachine::isUsable() {
return isInitialized() && reveal->error == regex024_error_codes::stable;
}
/* Destroys the hidden context created by the constructor. */
REGEX_IS024_VirtualMachine::~REGEX_IS024_VirtualMachine() {
    REGEX_IS024_CONTEXT* context = reveal;
    delete context;
}
/* Selection array length chosen by the program, or 0 while the machine is
 * not usable. */
regex_tai_t REGEX_IS024_VirtualMachine::getSelectionArrayLength() {
    if (!isUsable())
        return 0;
    return reveal->selection_array_len;
}
/* Whether the program declared multistart as allowed; false while the machine
 * is not usable. */
bool REGEX_IS024_VirtualMachine::isAllowMultistart() {
    if (!isUsable())
        return false;
    return reveal->allows_multistart;
}
/* Left input extension size declared by the program, or 0 while the machine
 * is not usable. */
uint8_t REGEX_IS024_VirtualMachine::getInputLeftExtensionSize() {
    if (!isUsable())
        return 0;
    return reveal->fed_input_extends_left;
}
/* Right input extension size declared by the program, or 0 while the machine
 * is not usable. */
uint8_t REGEX_IS024_VirtualMachine::getInputRightExtensionSize() {
    if (!isUsable())
        return 0;
    return reveal->fed_input_extends_right;
}
/* Current error state of the machine (`stable` when no error is recorded). */
regex024_error_code REGEX_IS024_VirtualMachine::getErrno() {
    const regex024_error_code current = reveal->error;
    return current;
}
/* Tells whether some thread is waiting to read the next regular input
 * character, i.e. the first "new" READ halted stack is non-empty. Always
 * false while the machine is not usable. */
bool REGEX_IS024_VirtualMachine::haveSurvivors() {
    if (!isUsable())
        return false;
    return reveal->READ_halted_stack_new_first.non_empty();
}
/* True when the machine is usable and currently holds a matched thread. */
bool REGEX_IS024_VirtualMachine::isMatched() {
    if (!isUsable())
        return false;
    return (reveal->matched_thread.slot_occupation_status & SLOT_OCCUPIED) != 0;
}
std::vector<REGEX_IS024_CAEvent> REGEX_IS024_VirtualMachine::getMatchedThreadCABranchReverse() {
if (!isMatched())
return {};
std::vector<REGEX_IS024_CAEvent> res;
REGEX024_CollectionArrayNode* cur = reveal->matched_thread.CAHptr;
while (cur != NULL){
res.push_back({cur->key, cur->value});
cur = cur->prev;
}
return res;
}
/* Value of selection array element `key` of the matched thread. Returns 0
 * when the key is out of range, nothing is matched, or the matched thread has
 * no selection array. */
uint64_t REGEX_IS024_VirtualMachine::getMatchedThreadSAValue(uint16_t key) {
    if (key >= getSelectionArrayLength() || !isMatched())
        return 0;
    const uint64_t* array = reveal->matched_thread.SAptr;
    if (!array)
        return 0;
    /* Cell 0 is the reference counter; elements start at index 1. */
    return array[key + 1];
}
/* Starts one more matching thread. Calling this on an unusable machine
 * terminates the process through exitf(). Whether the program allows
 * multistart is not enforced here. */
regex024_error_code REGEX_IS024_VirtualMachine::addNewMatchingThread() {
    if (!isUsable()){
        exitf("unusable\n");
    }
    const regex024_error_code outcome = reveal->startThread();
    return outcome;
}
/* Feeds one "extended" input value to the machine. Calling this on an
 * unusable machine terminates the process through exitf(). */
regex024_error_code REGEX_IS024_VirtualMachine::extendedFeedCharacter(uint64_t input) {
    if (!isUsable()){
        exitf("unusable\n");
    }
    const regex024_error_code outcome = reveal->extendedFeedCharacter(input);
    return outcome;
}
/* Feeds one regular input character together with the number of input bytes
 * it stands for. Calling this on an unusable machine terminates the process
 * through exitf(). */
regex024_error_code REGEX_IS024_VirtualMachine::feedCharacter(uint64_t input, uint64_t bytesResembled) {
    if (!isUsable()){
        exitf("unusable\n");
    }
    const regex024_error_code outcome = reveal->feedCharacter(input, bytesResembled);
    return outcome;
}

View File

@ -0,0 +1,46 @@
#ifndef LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
#define LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H
#include <stdint.h>
#include <vector>
#include <libregexis024vm/vm_errno.h>
#include <libregexis024vm/vm_opcodes_types.h>
/* One collection array event as handed out to users of the virtual machine:
 * a copy of the (key, value) pair of a collection array node. */
struct REGEX_IS024_CAEvent{
regex_tai_t key;
uint64_t value;
/* Equal when both key and value are equal. */
bool operator==(const REGEX_IS024_CAEvent& other) const;
};
/* Public facade of the regex virtual machine. The real state lives in a
 * REGEX_IS024_CONTEXT kept behind an opaque pointer. Non-copyable. */
class REGEX_IS024_VirtualMachine{
public:
/* All arguments are forwarded to the REGEX_IS024_CONTEXT constructor. */
REGEX_IS024_VirtualMachine(size_t programSize, const uint8_t *data, uint64_t caTreeLimit, uint16_t saLenLimit,
uint32_t readSsLimit, uint32_t forkSsLimit, uint64_t timeTickLimit);
REGEX_IS024_VirtualMachine(const REGEX_IS024_VirtualMachine& ) = delete;
REGEX_IS024_VirtualMachine& operator=(const REGEX_IS024_VirtualMachine&) = delete;
/* Runs the initialization phase. A second call terminates the process. */
regex024_error_code initialize();
bool isInitialized();
/* Initialized and no error recorded. */
bool isUsable();
virtual ~REGEX_IS024_VirtualMachine();
/* The following four getters return 0 / false while the machine is not usable. */
regex_tai_t getSelectionArrayLength();
bool isAllowMultistart();
uint8_t getInputLeftExtensionSize();
uint8_t getInputRightExtensionSize();
regex024_error_code getErrno();
/* True when some thread is waiting for the next regular input character. */
bool haveSurvivors();
bool isMatched();
/* Events of the matched thread, newest first; empty when nothing is matched. */
std::vector<REGEX_IS024_CAEvent> getMatchedThreadCABranchReverse();
/* Returns 0 when the key is out of range or nothing is matched. */
uint64_t getMatchedThreadSAValue(uint16_t key);
/* The three calls below terminate the process when the machine is not usable. */
regex024_error_code addNewMatchingThread();
regex024_error_code extendedFeedCharacter(uint64_t input);
regex024_error_code feedCharacter(uint64_t input, uint64_t bytesResembled);
private:
/* Set by the first initialize() call. */
bool gave_SOF = false;
/* Actually a REGEX_IS024_CONTEXT*, owned by this object. */
void* opaque;
};
#endif //LIBREGEXIS024_LIBREGEXIS024VM_INTERFACE_H

View File

@ -0,0 +1,69 @@
#include <libregexis024vm/utils.h>
#include <string.h>
#include <stdio.h>
#include <stdarg.h>
#include <stdlib.h>
#include <assert.h>
/* Refuse to build for anything but little endian targets.
 * __ORDER_LITTLE_ENDIAN__ alone cannot be used for this: GCC and Clang define
 * it as a constant on EVERY target. The target's byte order is the value of
 * __BYTE_ORDER__. Compilers that define neither macro are rejected, exactly
 * as before. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
#error "Big endian is currently unsupported"
#endif
/* printf-style fatal error: writes the formatted message to stderr and
 * terminates the process with exit status 1. Never returns. */
void exitf(const char *fmt, ...) {
    va_list args;
    va_start(args, fmt);
    vfprintf(stderr, fmt, args);
    va_end(args);
    exit(1);
}
/* Length in bytes (1..4) of the UTF-8 sequence introduced by `firstByte`, or
 * -1 when the byte cannot start a sequence (continuation byte or 0xF8..0xFF). */
int utf8_retrieve_size(uint8_t firstByte) {
    if (!(firstByte & 0b10000000))
        return 1;
    /* `a` is the expected prefix (110, 1110, 11110) and `b` is the bit right
     * after it, which has to be zero. */
    uint8_t a = 0b11000000;
    uint8_t b = 0b00100000;
    for (int i = 2; i <= 4; i++){
        if ((firstByte & (a | b)) == a)
            return i;
        a |= b;
        b >>= 1;
    }
    return -1;
}

/* Decodes the `sz`-byte sequence starting at string[pos]. `sz` must be a
 * positive value returned by utf8_retrieve_size, and the caller guarantees
 * that all `sz` bytes are readable. Returns the code point, or -1 when one of
 * the trailing bytes is not a continuation byte (10xxxxxx). */
int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t *string) {
    if (sz == 1)
        return string[pos];
    /* Payload bits of the lead byte: 5, 4 or 3 of them for sz = 2, 3, 4. */
    uint32_t v = string[pos] & (0b01111111 >> sz);
    pos++;
    for (int i = 1; i < sz; i++){
        uint32_t th = string[pos];
        if ((th & 0b11000000) != 0b10000000)
            return -1;
        v <<= 6;
        v |= (th & 0b00111111);
        pos++;
    }
    assert(v <= INT32_MAX);
    return static_cast<int32_t>(v);
}

#define AAAAAA {cp = -1; return;}
/* Decodes the character starting at string[pos] with all safety checks.
 * On success `cp` is the code point and `adj` its size in bytes.
 * On failure `cp` is negative. `adj` is then left untouched when pos is out of
 * range, set to 0 when string[pos] cannot start a sequence, and set to the
 * announced size when the sequence is truncated or malformed.
 *
 * The size is kept in a signed int until it has been validated: storing the
 * -1 error value straight into the unsigned `adj` made the error check dead
 * and let invalid lead bytes through as if they were characters. */
void utf8_string_iterat(int32_t &cp, size_t &adj, size_t pos, const uint8_t *string, size_t string_size) {
    if (pos >= string_size) AAAAAA
    const int sz = utf8_retrieve_size(string[pos]);
    if (sz < 0){
        adj = 0;
        AAAAAA
    }
    adj = static_cast<size_t>(sz);
    /* pos < string_size here, so the subtraction cannot wrap around. */
    if (adj > string_size - pos) AAAAAA
    if ((cp = utf8_retrieve_character(sz, pos, string)) < 0) AAAAAA
}
/* Tells whether `strSample` equals (strcmp) one of the strings in `strSet`,
 * an array of C strings terminated by a NULL entry. */
bool is_string_in_stringset(const char *strSample, const char **strSet) {
    for (const char** candidate = strSet; *candidate != NULL; ++candidate){
        if (strcmp(strSample, *candidate) == 0)
            return true;
    }
    return false;
}

View File

@ -0,0 +1,21 @@
#ifndef LIBREGEXIS024_UTILS_H
#define LIBREGEXIS024_UTILS_H
#include <stdint.h>
#include <stdlib.h>
/* printf-style fatal error: prints the message to stderr and exits with status 1 */
void exitf(const char* fmt, ...);
/* 1, 2, 3, 4 on success; -1 on error */
int utf8_retrieve_size(uint8_t firstByte);
/* sz is a positive value returned by utf8_retrieve_size. Returns negative on error */
int32_t utf8_retrieve_character(int sz, size_t pos, const uint8_t* string);
/* cp is negative on error. adj is the size of letter in bytes. Can be used to adjust pos.
 * All safety checks will be performed */
void utf8_string_iterat(int32_t& cp, size_t& adj, size_t pos, const uint8_t* string, size_t string_size);
/* strSet is an array of C strings terminated by a NULL entry; comparison is done with strcmp */
bool is_string_in_stringset(const char* strSample, const char* strSet[]);
#endif //LIBREGEXIS024_UTILS_H

View File

@ -0,0 +1,26 @@
#include <libregexis024vm/vm_errno.h>
/* Returns the name of error code `x` as a string literal (static storage, must
 * not be freed), or "unknown_error_code" for values outside the enum.
 * Every enumerator of regex024_error_codes is listed, including
 * program_throw, the code recorded by the THROW instruction. */
const char *regex024_error_code_tostr(regex024_error_code x) {
/* rcase(X) expands to `case regex024_error_codes::X: return "X";` */
#define rcase(name) case regex024_error_codes::name: return #name;
    switch (x) {
        rcase(stable)
        rcase(ca_tree_limit_violation)
        rcase(sa_length_limit_violation)
        rcase(read_sslot_count_limit_violation)
        rcase(fork_sslot_count_limit_violation)
        rcase(timeout)
        rcase(improper_finish)
        rcase(too_early)
        rcase(too_late)
        rcase(selection_arr_out_of_range)
        rcase(read_sslot_out_of_range)
        rcase(fork_sslot_out_of_range)
        rcase(invalid_opcode)
        rcase(invalid_register_code)
        rcase(instruction_not_for_general_thread)
        rcase(instruction_not_for_collision_thread)
        rcase(program_throw)
        rcase(bad_alloc)
        default:
            return "unknown_error_code";
    }
}

View File

@ -0,0 +1,45 @@
#ifndef LIBREGEXIS024_VM_ERRNO_H
#define LIBREGEXIS024_VM_ERRNO_H
#include <stdint.h>
/* Error states of the regex virtual machine. `stable` (0) means no error; all
 * real errors are negative. */
namespace regex024_error_codes {
enum regex024_error_code_I: int {
stable = 0,
ca_tree_limit_violation = -1,
sa_length_limit_violation = -2,
read_sslot_count_limit_violation = -3,
fork_sslot_count_limit_violation = -4,
timeout = -5,
/* Threads should be either abandoned by user of virtual machine after MATCH,
 * or be stopped by DIE instruction. Out of bound jump is disallowed */
improper_finish = -6,
/* Operation for general phase is executed in init phase */
too_early = -7,
/* Operation for init phase is executed in general phase */
too_late = -8,
/* Used selection array index is out of range */
selection_arr_out_of_range = -9,
/* Used read slot is out of range */
read_sslot_out_of_range = -10,
/* Used fork slot is out of range */
fork_sslot_out_of_range = -11,
invalid_opcode = -12,
invalid_register_code = -13,
/* Next operation scheduled for execution is forbidden in general thread */
instruction_not_for_general_thread = -14,
/* Next operation scheduled for execution is forbidden in collision thread */
instruction_not_for_collision_thread = -15,
/* Program willingly threw exception */
program_throw = -16,
/* O_o */
bad_alloc = -17,
};
}
/* Short alias so that users do not have to spell the namespace-qualified enum type. */
typedef regex024_error_codes::regex024_error_code_I regex024_error_code;
/* Name of the error code as a string literal; see vm_errno.cpp */
const char* regex024_error_code_tostr(regex024_error_code x);
#endif //LIBREGEXIS024_VM_ERRNO_H

View File

@ -0,0 +1,99 @@
#ifndef LIBREGEXIS024_VM_OPCODES_H
#define LIBREGEXIS024_VM_OPCODES_H
#include <libregexis024vm/vm_opcodes_types.h>
/* Opcodes of the virtual machine bytecode. The underlying type is uint8_t, so an opcode fits in one byte.
 * The comment above each enumerator (or group of enumerators) gives the mnemonic followed by its operands. */
namespace regex024_opcodes {
enum regex024_opcode_I: uint8_t{
/* READ <Settlement ID> */
READ = 0,
/* READZ = READ 0 */
READZ = 1,
/* JUMP <Near pointer> */
JUMP = 2,
/* JCEQUAL - jump conditional (equal): JCEQUAL <s1> <Near pointer> */
JCEQUAL_B = 3,
JCEQUAL_W = 4,
JCEQUAL_DW = 5,
JCEQUAL_QW = 6,
/* JCLESS - jump conditional (less): JCLESS <s1> <Near pointer> */
JCLESS_B = 7,
JCLESS_W = 8,
JCLESS_DW = 9,
JCLESS_QW = 10,
/* JCGRTR - jump conditional (greater): JCGRTR <s1> <Near pointer> */
JCGRTR_B = 11,
JCGRTR_W = 12,
JCGRTR_DW = 13,
JCGRTR_QW = 14,
/* FORK <Settlement ID> <Near pointer> */
FORK = 15,
/* MATCH | */
MATCH = 16,
/* DIE | */
DIE = 17,
/* PARAM_READ_SS_NUMBER <Settlement ID (length)> */
PARAM_READ_SS_NUMBER = 18,
/* PARAM_FORK_SS_NUMBER <Settlement ID (length)> */
PARAM_FORK_SS_NUMBER = 19,
/* PARAM_SELARR_LEN <Track array index (length)> */
PARAM_SELARR_LEN = 20,
/* PARAM_COLSIFTFUNC_SET <Near pointer> */
PARAM_COLSIFTFUNC_SET = 21,
/* PARAM_COLSIFTFUNC_WIPE */
PARAM_COLSIFTFUNC_WIPE = 22,
/* MSG_MULTISTART_ALLOWED <1B> */
MSG_MULTISTART_ALLOWED = 23,
/* MSG_FED_INPUT_EXTENDED <1B> <1B> <Settlement ID (length of suffix)> */
MSG_FED_INPUT_EXTENDED = 24,
/* DMOV_RABX_SELARR <Track array index> */
DMOV_RABX_SELARR = 25,
/* DDIST_RABX_SELARR <Track array index> <Track array index> */
DDIST_RABX_SELARR = 26,
/* SIFTPRIOR_MIN_RABX */
SIFTPRIOR_MIN_RABX = 27,
/* SIFTPRIOR_MAX_RABX */
SIFTPRIOR_MAX_RABX = 28,
/* SIFT_DONE */
SIFT_DONE = 29,
/* MOV_COLARR_IMM <Track array index> <8B> */
MOV_COLARR_IMM = 30,
/* MOV_COLARR_BTPOS <Track array index> */
MOV_COLARR_BTPOS = 31,
/* MOV_SELARR_IMM <Track array index> <8B> */
MOV_SELARR_IMM = 32,
/* MOV_SELARR_CHPOS <Track array index> */
MOV_SELARR_CHPOS = 33,
/* INIT */
INIT = 34,
/* THROW */
THROW = 35,
/* Not an instruction: one greater than the largest opcode value listed above */
regex024_opcode_greaterMax = 36
};
}
/* Public alias for the opcode enumeration */
typedef regex024_opcodes::regex024_opcode_I regex024_opcode;
/* NOTE(review): presumably returns the mnemonic of opcode `x`; its definition is not visible here - confirm */
const char* regex024_opcode_tostr(regex024_opcode x);
/* Sizes, in bytes, of the pieces a bytecode instruction is encoded from */
constexpr uint64_t REGEX024_BYTECODE_INSTRUCTION_SZ = 1;
constexpr uint64_t REGEX024_BYTECODE_SSLOT_ID_SZ = 4;
constexpr uint64_t REGEX024_BYTECODE_TRACK_ARRAY_INDEX_ID_SZ = 2;
constexpr uint64_t REGEX024_BYTECODE_NEAR_POINTER_SZ = 8;
/* True when `region` bytes starting at offset IP fit inside a program of prgSize bytes */
bool vmprog_check_inboundness(regex_near_ptr_t prgSize, regex_near_ptr_t IP, regex_near_ptr_t region);
/* The extractors below read a value from `prg` at offset *IPptr and advance *IPptr past it
 * (by 1, 2, 4 and 8 bytes for b, w, dw and qw). None of them checks bounds itself. */
uint8_t vmprog_extract_b(regex_near_ptr_t* IPptr, const uint8_t* prg);
uint16_t vmprog_extract_w(regex_near_ptr_t* IPptr, const uint8_t* prg);
uint32_t vmprog_extract_dw(regex_near_ptr_t* IPptr, const uint8_t* prg);
uint64_t vmprog_extract_qw(regex_near_ptr_t* IPptr, const uint8_t* prg);
/* Typed extractors: instruction = 1 byte, sslot id = 4 bytes, near pointer = 8 bytes, track array index = 2 bytes */
uint8_t vmprog_extract_instruction(regex_near_ptr_t* IPptr, const uint8_t* prg);
regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t* IPptr, const uint8_t* prg);
regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t* IPptr, const uint8_t* prg);
/* NOTE(review): the name is spelled "extrack"; it is kept because renaming would break callers */
regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t* IPptr, const uint8_t* prg);
#endif //LIBREGEXIS024_VM_OPCODES_H

View File

@ -0,0 +1,47 @@
#include <libregexis024vm/vm_opcodes.h>
/* Only little-endian targets are supported.
 * __ORDER_LITTLE_ENDIAN__ is a constant that GCC and Clang define on every target,
 * so its mere presence proves nothing; the target byte order is __BYTE_ORDER__. */
#if !defined(__BYTE_ORDER__) || !defined(__ORDER_LITTLE_ENDIAN__) || (__BYTE_ORDER__ != __ORDER_LITTLE_ENDIAN__)
#error "Big endian is currently unsupported"
#endif
/* Tells whether `region` bytes starting at offset IP lie inside a program of prgSz bytes.
 * The sum IP + region is deliberately never formed: both values are 64-bit unsigned,
 * so a huge operand could wrap the sum around and pass the check. */
bool vmprog_check_inboundness(regex_near_ptr_t prgSz, regex_near_ptr_t IP, regex_near_ptr_t region) {
    if (IP > prgSz)
        return false;
    /* prgSz - IP cannot underflow here; it is the number of bytes left after IP */
    return region <= prgSz - IP;
}
/* Reads the byte at offset *IPptr of `prg` and moves *IPptr one byte forward.
 * No bounds check is performed here. */
uint8_t vmprog_extract_b(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t value = prg[*IPptr];
    *IPptr += 1;
    return value;
}
/* Reads a 16-bit little-endian value at offset *IPptr of `prg` and moves *IPptr two bytes forward.
 * The value is assembled byte by byte: operands sit at arbitrary offsets, so reading them
 * through a casted uint16_t pointer would be an unaligned, aliasing-violating access.
 * No bounds check is performed here. */
uint16_t vmprog_extract_w(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t *src = prg + *IPptr;
    const uint16_t answer = static_cast<uint16_t>(src[0] | (src[1] << 8));
    *IPptr += 2;
    return answer;
}
/* Reads a 32-bit little-endian value at offset *IPptr of `prg` and moves *IPptr four bytes forward.
 * The value is assembled byte by byte: operands sit at arbitrary offsets, so reading them
 * through a casted uint32_t pointer would be an unaligned, aliasing-violating access.
 * No bounds check is performed here. */
uint32_t vmprog_extract_dw(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t *src = prg + *IPptr;
    uint32_t answer = 0;
    for (unsigned i = 0; i < 4; i++)
        answer |= static_cast<uint32_t>(src[i]) << (8 * i);
    *IPptr += 4;
    return answer;
}
/* Reads a 64-bit little-endian value at offset *IPptr of `prg` and moves *IPptr eight bytes forward.
 * The value is assembled byte by byte: operands sit at arbitrary offsets, so reading them
 * through a casted uint64_t pointer would be an unaligned, aliasing-violating access.
 * No bounds check is performed here. */
uint64_t vmprog_extract_qw(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t *src = prg + *IPptr;
    uint64_t answer = 0;
    for (unsigned i = 0; i < 8; i++)
        answer |= static_cast<uint64_t>(src[i]) << (8 * i);
    *IPptr += 8;
    return answer;
}
/* Fetches the one-byte instruction code at *IPptr and advances *IPptr past it */
uint8_t vmprog_extract_instruction(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const uint8_t opcode = vmprog_extract_b(IPptr, prg);
    return opcode;
}
/* Fetches the four-byte slot id at *IPptr and advances *IPptr past it */
regex_sslot_id_t vmprog_extract_sslot_id(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const regex_sslot_id_t slot = vmprog_extract_dw(IPptr, prg);
    return slot;
}
/* Fetches the eight-byte near pointer at *IPptr and advances *IPptr past it */
regex_near_ptr_t vmprog_extract_near_pointer(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const regex_near_ptr_t target = vmprog_extract_qw(IPptr, prg);
    return target;
}
/* Fetches the two-byte track array index at *IPptr and advances *IPptr past it */
regex_tai_t vmprog_extrack_track_array_index(regex_near_ptr_t *IPptr, const uint8_t *prg) {
    const regex_tai_t index = vmprog_extract_w(IPptr, prg);
    return index;
}

View File

@ -0,0 +1,11 @@
#ifndef VM_OPCODES_TYPES_H
#define VM_OPCODES_TYPES_H
#include <stdint.h>
/* Slot id operand (32 bits wide) */
typedef uint32_t regex_sslot_id_t;
/* Near pointer operand (64 bits wide); also the type used for program sizes and instruction pointers */
typedef uint64_t regex_near_ptr_t;
/* Track array index operand (16 bits wide) */
typedef uint16_t regex_tai_t;
#endif //VM_OPCODES_TYPES_H