From 80d7935974e0d6a221fe459fb16493e50979e7cf Mon Sep 17 00:00:00 2001 From: VaclavT Date: Tue, 26 Oct 2021 22:02:40 +0200 Subject: [PATCH] string regex function added --- debug.lsp | 7 ++++++- doc/Doc.md | 4 +++- ml.cpp | 27 +++++++++++++++++++++++++++ ml_string.cpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ ml_string.h | 2 ++ tests/test.lsp | 9 ++++++--- 6 files changed, 93 insertions(+), 5 deletions(-) diff --git a/debug.lsp b/debug.lsp index 89d9f9b..b1b934a 100644 --- a/debug.lsp +++ b/debug.lsp @@ -1,5 +1,10 @@ +(print (string-regex-list "12" "(.*?)" "match" "ignore")) +(print (string-regex-list "12" "(.*?)" "token")) + + ;; (include "/usr/local/var/mlisp/ut.lsp") -;; (ut::define-test "result of (get-localtime-offset)" '(ut::assert-equal 7200 (get-localtime-offset))) +;; (ut::define-test "result of (string-regex-list ... \"match\")" (ut::assert-equal '(("1" "2")) (string-regex-list "12" "(.*?)" "match" "ignore"))) +;; (ut::define-test "result of (string-regex-list ... \"token\")" '(ut::assert-equal '(("1") ("2")) (string-regex-list "12" "(.*?)" "token"))) ;; (ut::run-tests) diff --git a/doc/Doc.md b/doc/Doc.md index d010238..f1e2749 100644 --- a/doc/Doc.md +++ b/doc/Doc.md @@ -5,6 +5,7 @@ |-b|Skips loadin of std lib| |-c code|Runs given code| |-f source_file ..|Executes code in files| +|-run source_file ..|Executes code in file; if the first line of the file is a shebang, it is skipped| |-i|Runs REPL| |-d|Turns on better stacktrace| |-p|Prints profile info at the end| @@ -103,10 +104,11 @@ |`start-of-year datetime`||`>>> (start-of-year (str-to-date "2021-05-13 10:32:12" "%Y-%m-%d %H:%M:%S")) => 1609459200`| |`end-of-year datetime`||`>>> (end-of-year (str-to-date "2021-05-13 10:32:12" "%Y-%m-%d %H:%M:%S")) => 1640995199`| |`(debug ..)`||| -|`(display ..)`||| +|`(display ..)`|Displays passed parameters|`>>> (display '(1 2 3)) => "(1 2 3)"`| |`(string-replace source substr replacement)`|Replace a substring with a replacement string in a source string|`>>> 
(string-replace "abcdefg" "de" "DE") => "abcDEfg"`| |`(string-replace-re source substr replacement)`|Replace a substring regex with a replacement string in a source string|`>>> (string-replace-re "there is a subsequence in the string" "\\b(sub)([^ ]*)" "sub-$2") => "there is a sub-sequence in the string"`| |`(string-regex? where regex)`| Returns true if where contains regex|`>>> (string-regex? "aba123cdefg" "[0-9]+") => 1`| +|`(string-regex-list where regex [mode] [ignorecase])`| Returns a list of the substrings of where captured by regex, in mode match or token|`>>> (string-regex-list "12" "(.*?)" "match" "ignore") => (("1" "2"))` `>>> (string-regex-list "12" "(.*?)" "token") => (("1") ("2"))`| |`(string-pad str len char rpad_lpad)`||| |`(string-lpad str len char)`|Pad string from start with char to length len|`>>> (string-lpad "0" 10 "x") => "xxxxxxxxx0"`| |`(string-rpad str len char)`|Pad string from righ with char to length len|`>>> (string-rpad "0" 10 "x") => "0xxxxxxxxx"`| diff --git a/ml.cpp b/ml.cpp index 4865daa..9b38143 100644 --- a/ml.cpp +++ b/ml.cpp @@ -1660,6 +1660,32 @@ MlValue string_regex(std::vector<MlValue> args, MlEnvironment &env) { return MlValue(regexp_search(args[0].as_string(), args[1].as_string())); } + +// Returns a list of lists of substrings of args[0] found by regex args[1]; optional args[2] picks the mode ("match" is the default, any other value means token mode) and optional args[3] equal to "ignore" requests case-insensitive matching +MlValue string_regex_list(std::vector<MlValue> args, MlEnvironment &env) { + eval_args(args, env); + + bool match_mode = true; + bool ignore_case = false; + + if (args.size() < 2 || args.size() > 4) + throw MlError(MlValue("string-regex-list", string_regex_list), env, args.size() > 4 ? 
TOO_MANY_ARGS : TOO_FEW_ARGS); + + if (args.size() >= 3) match_mode = args[2].as_string() == "match"; + if (args.size() == 4) ignore_case = args[3].as_string() == "ignore"; + + auto found_matches = regexp_search2(args[0].as_string(), args[1].as_string(), match_mode, ignore_case); + std::vector<MlValue> list; + for(auto &l : found_matches) { + std::vector<MlValue> sublist; + for(auto &v : l) { + sublist.push_back(MlValue::string(v)); + } + list.push_back(sublist); + } + return MlValue(list); +} + // Splits string by regexp and returns list containing splited parts MlValue string_split(std::vector<MlValue> args, MlEnvironment &env) { eval_args(args, env); @@ -2087,6 +2113,7 @@ MlValue MlEnvironment::get(const std::string &name) const { if (name == "string-replace") return MlValue("string-replace", builtin::string_replace); if (name == "string-replace-re") return MlValue("string-replace-re", builtin::string_replace_re); if (name == "string-regex?") return MlValue("string-regex?", builtin::string_regex); + if (name == "string-regex-list") return MlValue("string-regex-list", builtin::string_regex_list); if (name == "string-split") return MlValue("string-split", builtin::string_split); if (name == "string-pad") return MlValue("string-pad", builtin::string_pad); if (name == "string-rltrim") return MlValue("string-rltrim", builtin::string_rltrim); diff --git a/ml_string.cpp b/ml_string.cpp index dbc8569..fe73ff2 100644 --- a/ml_string.cpp +++ b/ml_string.cpp @@ -49,6 +49,55 @@ std::vector<std::string> regexp_strsplit(const std::string &string_to_split, con return elems; } +// Returns every whole match of reg_ex found in str; ignore_case makes the matching case-insensitive +std::vector<std::string> regexp_match(const std::string &str, const std::string &reg_ex, bool ignore_case) { + std::vector<std::string> matches; + + std::regex pars_regex(reg_ex, ignore_case ? std::regex_constants::ECMAScript | std::regex_constants::icase : std::regex_constants::ECMAScript); 
+ auto words_begin = std::sregex_iterator(str.begin(), str.end(), pars_regex); + auto words_end = std::sregex_iterator(); + + for (std::sregex_iterator i = words_begin; i != words_end; ++i) { + std::smatch match = *i; + matches.push_back(match.str()); + } + + return matches; +} +// Returns one list per match of reg_ex in str holding its captured groups, or the whole match when the regex has no groups; ignore_case makes the matching case-insensitive +std::vector<std::vector<std::string>> regexp_tokens(const std::string &str, const std::string &reg_ex, bool ignore_case) { + std::vector<std::vector<std::string>> captured_groups; + std::vector<std::string> captured_subgroups; + + std::smatch res; + std::regex exp(reg_ex, ignore_case ? std::regex_constants::ECMAScript | std::regex_constants::icase : std::regex_constants::ECMAScript); + std::string::const_iterator searchStart(str.cbegin()); + while (std::regex_search(searchStart, str.cend(), res, exp)) { + captured_subgroups.clear(); + if (res.size() == 1) { // no subgroups + captured_subgroups.push_back(res[0]); + } else { + for (size_t i = 1; i < res.size(); ++i) { // [0] is whole match + captured_subgroups.push_back(res[i]); + } + } + if (captured_subgroups.size() > 0) + captured_groups.push_back(captured_subgroups); + if (res.length() == 0 && res[0].second == str.cend()) break; // an empty match at the end of input cannot advance any further + searchStart = res[0].second + (res.length() == 0 ? 1 : 0); // continue after the match; step one character past an empty match so the loop always terminates + } + return captured_groups; +} +// Match mode returns all whole matches wrapped in a single list, otherwise one list of captured groups per match +std::vector<std::vector<std::string>> regexp_search2(const std::string &string_to_split, const std::string &rgx_str, bool match_mode, bool ignore_case) { + if (match_mode) { + std::vector<std::vector<std::string>> matches{regexp_match(string_to_split, rgx_str, ignore_case)}; + return matches; + } else { + return regexp_tokens(string_to_split, rgx_str, ignore_case); + } +} + std::string string_lucase(std::string s, const std::string &strcase) { if (strcase == "upper") std::transform(s.begin(), s.end(),s.begin(), ::toupper); diff --git a/ml_string.h b/ml_string.h index 5e51e0a..3e25998 100644 --- a/ml_string.h +++ b/ml_string.h @@ -9,6 +9,8 @@ std::string replace_substring_regexp(const std::string &src, const std::string & // Returns true if where contains regex bool regexp_search(const std::string &where, const std::string &regex_str); +// Returns list of contained regex 
patterns +std::vector<std::vector<std::string>> regexp_search2(const std::string &where, const std::string &regex_str, bool match_mode, bool ignore_case); std::vector<std::string> regexp_strsplit(const std::string &string_to_split, const std::string &rgx_str); diff --git a/tests/test.lsp b/tests/test.lsp index ce32ea5..2b8518b 100644 --- a/tests/test.lsp +++ b/tests/test.lsp @@ -62,18 +62,21 @@ (ut::define-test "result of (string-rtrim \"abc \")" '(ut::assert-equal "abc" (string-rtrim "abc "))) (ut::define-test "result of (string-ltrim \" abc\")" '(ut::assert-equal "abc" (string-ltrim " abc"))) (ut::define-test "result of (string-trim \" abc \")" '(ut::assert-equal "abc" (string-trim " abc "))) -(ut::define-test "result of (string-regex? \"test.lsp\" \"^.*\.l(i)?sp$\")" '(ut::assert-true (string-regex? "test.lsp" "^.*\.l(i)?sp$"))) -(ut::define-test "result of (string-split \"split me by space\" \"\\s+\")" '(ut::assert-equal '("split" "me" "by" "space") (string-split "split me by space" "\\s+"))) (ut::define-test "result of (string-upcase \"abcABCD\")" '(ut::assert-equal "ABCABCD" (string-upcase "abcABCD"))) (ut::define-test "result of (string-downcase \"abcABCD\")" '(ut::assert-equal "abcabcd" (string-downcase "abcABCD"))) (ut::define-test "result of (string-len \"abcdef\")" '(ut::assert-equal 6 (string-len "abcdef"))) +(ut::define-test "result of (string-split \"split me by space\" \"\\s+\")" '(ut::assert-equal '("split" "me" "by" "space") (string-split "split me by space" "\\s+"))) + +(ut::define-test "result of (string-regex? \"test.lsp\" \"^.*\.l(i)?sp$\")" '(ut::assert-true (string-regex? "test.lsp" "^.*\.l(i)?sp$"))) +(ut::define-test "result of (string-regex-list ... \"match\")" '(ut::assert-equal '(("1" "2")) (string-regex-list "12" "(.*?)" "match" "ignore"))) +(ut::define-test "result of (string-regex-list ... 
\"token\")" '(ut::assert-equal '(("1") ("2")) (string-regex-list "12" "(.*?)" "token"))) + (ut::define-test "result of (string-replace \"abcdef\" \"de\" \"DE\")" '(ut::assert-equal "abcDEfg" (string-replace "abcdefg" "de" "DE"))) (ut::define-test "result of (string-replace-re \"there is a subsequence in the string\" \"\\b(sub)([^ ]*)\" \"sub-$2\")" '(ut::assert-equal "there is a sub-sequence in the string" (string-replace-re "there is a subsequence in the string" "\\b(sub)([^ ]*)" "sub-$2"))) (ut::define-test "result of (string-replace-re \"XXYYZZ\" \"\" \"\")" '(ut::assert-equal "XXYYZZ" (string-replace-re "XXYYZZ" "" ""))) - (ut::define-test "result of (string-substr \"ABCDEF\")" '(ut::assert-equal "ABCDEF" (string-substr "ABCDEF"))) (ut::define-test "result of (string-substr \"ABCDEF\" 1)" '(ut::assert-equal "BCDEF" (string-substr "ABCDEF" 1))) (ut::define-test "result of (string-substr \"ABCDEF\" 2 3)" '(ut::assert-equal "CDE" (string-substr "ABCDEF" 2 3)))