0b950672b9ac — Leonard Ritter 9 months ago
* more work on dl lexer
1 files changed, 34 insertions(+), 40 deletions(-)

M testing/various/lexer.dl
M testing/various/lexer.dl +34 -40
@@ 134,7 134,6 @@ location(id, offset + 1, line, column + 
 // no longer applies, it delegates the job back to LexAny, for any takers.
 
 #define RE_SINGLE_TOKEN "[\\(\\)\\[\\]\\{\\}\\'\\`\\;\\,]"
-#define RE_SYMBOL_TERMINATOR "[ \f\r\t\v\n\\(\\)\\[\\]\\{\\}\\'\\`\\;\\,\"#]"
 #define RE_SPACE "[ \f\r\t\v\n]"
 
 // all possible escape characters

          
@@ 191,23 190,34 @@ maybe_blockstring(id, i, j) :-
 maybe_space_token(id, i) :-
     source_map(str, id, sz),
     i = range(0, sz),
-    match(RE_SPACE, substr(str, i, 1)).
+    match(RE_SPACE, substr(str, i, 1)),
+    !escape(id, i - 1).
 
-// build largest space groups using connected-component labeling
-// the label is the index of the first space, so the set doubles as range
-.decl maybe_space(id:SourceId, group:number, index:number)
-maybe_space(id, i, i) :- maybe_space_token(id, i).
-maybe_space(id, x0, x2) :-
-    maybe_space(id, g, i),
-    maybe_space(id, g, i),
-    maybe_space(id, x1, x2).
-
-// possible single-token symbols
+// possible single-character symbols
 .decl maybe_single(id:SourceId, offset:number)
 maybe_single(id, i) :-
     source_map(str, id, sz),
     i = range(0, sz),
-    match(RE_SINGLE_TOKEN, substr(str, i, 1)).
+    match(RE_SINGLE_TOKEN, substr(str, i, 1)),
+    !escape(id, i - 1).
+
+// offsets that may belong to a multi-character symbol (not claimed by any other token kind)
+.decl maybe_symbol_token(id:SourceId, offset:number)
+maybe_symbol_token(id, i) :-
+    source_map(_, id, sz),
+    i = range(0, sz),
+    !maybe_space_token(id, i),
+    !maybe_single(id, i),
+    !maybe_comment(id, i, _),
+    !maybe_blockstring(id, i, _),
+    !maybe_stringquote_token(id, i).
+
+// find symbol ranges: label each run of adjacent symbol tokens with its first offset (connected-component labeling)
+.decl maybe_symbol_cc(id:SourceId, start:number, end:number)
+maybe_symbol_cc(id, i, i) :-
+    maybe_symbol_token(id, i),
+    !maybe_symbol_token(id, i - 1).
+maybe_symbol_cc(id, x0, x1 + 1) :- maybe_symbol_cc(id, x0, x1), maybe_symbol_token(id, x1 + 1).
 
 // possible string quotes
 .decl maybe_stringquote_token(id:SourceId, offset:number)

          
@@ 254,43 264,27 @@ region(id, R_ANY, x1, z0) :- region(id, 
 region(id, kind, x1, x2) :- region(id, _, _, x1),
     region_token(id, kind, x1, x2).
 
-#define R_ANY_SYMBOL 0
-#define R_ANY_SINGLE 1
-#define R_ANY_SPACE 2
-
-.decl any_token(id:SourceId, kind:number, x0:number, x1:number)
-any_token(id, R_ANY_SINGLE, x, x + 1) :-
-    region(id, R_ANY, x0, x1),
-    maybe_single(id, x),
-    x >= x0,
-    x < x1.
-
-/*
-// tokenize symbols
-.decl any(id:SourceId, kind:number, x0:number, x1:number)
-// if there is some distance to the next specialized region, insert an ANY
-any(id, R_BEGIN, x0, x0), any(id, R_END, x1, x1) :- region(id, R_ANY, x0, x1).
-any(id, R_, x0, x0) :-
-    region(id, R_ANY, x0, x1),
-    z0 = min y0 : {
-        maybe_space(id, x2),
-        x1 <= x2
-    },
-    x1 < z0.
-*/
+.decl symbol(id:SourceId, start:number, end:number)
+symbol(id, x0, x2 + 1) :-
+    region(id, R_ANY, r0, r1),
+    maybe_symbol_cc(id, x0, x0),
+    x0 >= r0,
+    x2 = max x1 : { maybe_symbol_cc(id, x0, x1), x1 < r1 },
+    maybe_symbol_cc(id, x0, x2).
 
 .decl source(str:symbol)
-source("xy(the)  \nqui\"ck bro\\\"wn\n\nfox j\"ump;ed o\\\"ver the\n    lazy dog.\n").
+source("xy(the)  \nqui\"ck bro\\\"wn\n\nfox j\"ump;ed o\\\"ver the\n    lazy dog.\nxy").
 
 .decl debugout(str:symbol)
 //debugout(cat("'",s,"'")) :- L8(id, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0 + 1).
 //debugout(cat("'",s,"'")) :- source_map(str, id, sz), x = 33, y = 40, s = substr(str, x, y - x).
-debugout(cat(to_string(kind),"'",s,"'")) :- region(id, kind, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
+//debugout(cat(to_string(kind),"'",s,"'")) :- region(id, kind, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
+debugout(cat("'",s,"'")) :- symbol(id, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
 
 .output source_map (IO=stdout)
 .output maybe_stringquote (IO=stdout)
 .output maybe_blockstring (IO=stdout)
-.output maybe_space (IO=stdout)
+.output symbol (IO=stdout)
 .output region (IO=stdout)
 
 .output debugout (IO=stdout)