# HG changeset patch
# User Leonard Ritter
# Date 1720458518 -7200
# Mon Jul 08 19:08:38 2024 +0200
# Node ID 0b950672b9ac05a74c080a6f1e0654c69da23c0c
# Parent 873cb16db33e6701fc8e21e2fc21493c763ecf6f
* more work on dl lexer

diff --git a/testing/various/lexer.dl b/testing/various/lexer.dl
--- a/testing/various/lexer.dl
+++ b/testing/various/lexer.dl
@@ -134,7 +134,6 @@
 // no longer applies, it delegates the job back to LexAny, for any takers.
 
 #define RE_SINGLE_TOKEN "[\\(\\)\\[\\]\\{\\}\\'\\`\\;\\,]"
-#define RE_SYMBOL_TERMINATOR "[ \f\r\t\v\n\\(\\)\\[\\]\\{\\}\\'\\`\\;\\,\"#]"
 #define RE_SPACE "[ \f\r\t\v\n]"
 
 // all possible escape characters
@@ -191,23 +190,34 @@
 maybe_space_token(id, i) :-
     source_map(str, id, sz),
     i = range(0, sz),
-    match(RE_SPACE, substr(str, i, 1)).
+    match(RE_SPACE, substr(str, i, 1)),
+    !escape(id, i - 1). // NOTE(review): escape is declared outside this patch; presumably it marks escape-character offsets - confirm
 
-// build largest space groups using connected-component labeling
-// the label is the index of the first space, so the set doubles as range
-.decl maybe_space(id:SourceId, group:number, index:number)
-maybe_space(id, i, i) :- maybe_space_token(id, i).
-maybe_space(id, x0, x2) :-
-    maybe_space(id, g, i),
-    maybe_space(id, g, i),
-    maybe_space(id, x1, x2).
-
-// possible single-token symbols
+// possible single-character symbols: offsets i whose character matches RE_SINGLE_TOKEN and for which escape(id, i - 1) does not hold
 .decl maybe_single(id:SourceId, offset:number)
 maybe_single(id, i) :-
     source_map(str, id, sz),
     i = range(0, sz),
-    match(RE_SINGLE_TOKEN, substr(str, i, 1)).
+    match(RE_SINGLE_TOKEN, substr(str, i, 1)),
+    !escape(id, i - 1).
+
+// possible multi-character symbols: every offset not matched by maybe_space_token, maybe_single, maybe_comment, maybe_blockstring or maybe_stringquote_token
+.decl maybe_symbol_token(id:SourceId, offset:number)
+maybe_symbol_token(id, i) :-
+    source_map(_, id, sz),
+    i = range(0, sz),
+    !maybe_space_token(id, i),
+    !maybe_single(id, i),
+    !maybe_comment(id, i, _),
+    !maybe_blockstring(id, i, _),
+    !maybe_stringquote_token(id, i).
+
+// find symbol ranges through connected-component labeling: start is the first offset of a run of symbol tokens, end is any later offset of the same run (inclusive)
+.decl maybe_symbol_cc(id:SourceId, start:number, end:number)
+maybe_symbol_cc(id, i, i) :-
+    maybe_symbol_token(id, i),
+    !maybe_symbol_token(id, i - 1).
+maybe_symbol_cc(id, x0, x1 + 1) :- maybe_symbol_cc(id, x0, x1), maybe_symbol_token(id, x1 + 1).
 
 // possible string quotes
 .decl maybe_stringquote_token(id:SourceId, offset:number)
@@ -254,43 +264,27 @@
 
 region(id, kind, x1, x2) :- region(id, _, _, x1), region_token(id, kind, x1, x2).
 
-#define R_ANY_SYMBOL 0
-#define R_ANY_SINGLE 1
-#define R_ANY_SPACE 2
-
-.decl any_token(id:SourceId, kind:number, x0:number, x1:number)
-any_token(id, R_ANY_SINGLE, x, x + 1) :-
-    region(id, R_ANY, x0, x1),
-    maybe_single(id, x),
-    x >= x0,
-    x < x1.
-
-/*
-// tokenize symbols
-.decl any(id:SourceId, kind:number, x0:number, x1:number)
-// if there is some distance to the next specialized region, insert an ANY
-any(id, R_BEGIN, x0, x0), any(id, R_END, x1, x1) :- region(id, R_ANY, x0, x1).
-any(id, R_, x0, x0) :-
-    region(id, R_ANY, x0, x1),
-    z0 = min y0 : {
-        maybe_space(id, x2),
-        x1 <= x2
-    },
-    x1 < z0.
-*/
+.decl symbol(id:SourceId, start:number, end:number) // longest symbol run per start offset that begins at or after the start of an R_ANY region and stops before its end; end is exclusive
+symbol(id, x0, x2 + 1) :-
+    region(id, R_ANY, r0, r1),
+    maybe_symbol_cc(id, x0, x0),
+    x0 >= r0,
+    x2 = max x1 : { maybe_symbol_cc(id, x0, x1), x1 < r1 },
+    maybe_symbol_cc(id, x0, x2).
 
 .decl source(str:symbol)
-source("xy(the) \nqui\"ck bro\\\"wn\n\nfox j\"ump;ed o\\\"ver the\n lazy dog.\n").
+source("xy(the) \nqui\"ck bro\\\"wn\n\nfox j\"ump;ed o\\\"ver the\n lazy dog.\nxy").
 
 .decl debugout(str:symbol)
 //debugout(cat("'",s,"'")) :- L8(id, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0 + 1).
 //debugout(cat("'",s,"'")) :- source_map(str, id, sz), x = 33, y = 40, s = substr(str, x, y - x).
-debugout(cat(to_string(kind),"'",s,"'")) :- region(id, kind, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
+//debugout(cat(to_string(kind),"'",s,"'")) :- region(id, kind, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
+debugout(cat("'",s,"'")) :- symbol(id, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0). // quote the source text of every symbol range (x1 is exclusive)
 
 .output source_map (IO=stdout)
 .output maybe_stringquote (IO=stdout)
 .output maybe_blockstring (IO=stdout)
-.output maybe_space (IO=stdout)
+.output symbol (IO=stdout)
 .output region (IO=stdout)
 .output debugout (IO=stdout)
 