@@ 134,7 134,6 @@ location(id, offset + 1, line, column +
// no longer applies, it delegates the job back to LexAny, for any takers.
#define RE_SINGLE_TOKEN "[\\(\\)\\[\\]\\{\\}\\'\\`\\;\\,]"
-#define RE_SYMBOL_TERMINATOR "[ \f\r\t\v\n\\(\\)\\[\\]\\{\\}\\'\\`\\;\\,\"#]"
#define RE_SPACE "[ \f\r\t\v\n]"
// all possible escape characters
@@ 191,23 190,34 @@ maybe_blockstring(id, i, j) :-
maybe_space_token(id, i) :-
source_map(str, id, sz),
i = range(0, sz),
- match(RE_SPACE, substr(str, i, 1)).
+ match(RE_SPACE, substr(str, i, 1)),
+ !escape(id, i - 1). // reject whitespace at i when offset i - 1 is in escape (presumably an escaping backslash; confirm against the escape relation)
-// build largest space groups using connected-component labeling
-// the label is the index of the first space, so the set doubles as range
-.decl maybe_space(id:SourceId, group:number, index:number)
-maybe_space(id, i, i) :- maybe_space_token(id, i).
-maybe_space(id, x0, x2) :-
- maybe_space(id, g, i),
- maybe_space(id, g, i),
- maybe_space(id, x1, x2).
-
-// possible single-token symbols
+// possible single-character symbols: offsets whose character matches RE_SINGLE_TOKEN and whose previous offset is not in escape
.decl maybe_single(id:SourceId, offset:number)
maybe_single(id, i) :-
source_map(str, id, sz),
i = range(0, sz),
- match(RE_SINGLE_TOKEN, substr(str, i, 1)).
+ match(RE_SINGLE_TOKEN, substr(str, i, 1)),
+ !escape(id, i - 1). // same escape guard as maybe_space_token: offset i - 1 must not be in escape
+
+// possible multi-character symbols: catch-all for every offset that none of the relations negated below has claimed
+.decl maybe_symbol_token(id:SourceId, offset:number)
+maybe_symbol_token(id, i) :-
+ source_map(_, id, sz), // only the size is needed here; the text itself is never inspected
+ i = range(0, sz), // enumerate every offset 0 .. sz - 1
+ !maybe_space_token(id, i),
+ !maybe_single(id, i),
+ !maybe_comment(id, i, _), // the wildcard ignores the third column; only offset i as first position is tested
+ !maybe_blockstring(id, i, _),
+ !maybe_stringquote_token(id, i).
+
+// find symbol ranges through connected-component labeling: each run of adjacent symbol characters is keyed by its first offset, and end is inclusive
+.decl maybe_symbol_cc(id:SourceId, start:number, end:number)
+maybe_symbol_cc(id, i, i) :- // seed rule: a run starts at i when i - 1 is not a symbol character
+ maybe_symbol_token(id, i),
+ !maybe_symbol_token(id, i - 1).
+maybe_symbol_cc(id, x0, x1 + 1) :- maybe_symbol_cc(id, x0, x1), maybe_symbol_token(id, x1 + 1). // extend rule: grow the run by one offset; every prefix of a run stays in the relation, not only the longest
// possible string quotes
.decl maybe_stringquote_token(id:SourceId, offset:number)
@@ 254,43 264,27 @@ region(id, R_ANY, x1, z0) :- region(id,
region(id, kind, x1, x2) :- region(id, _, _, x1),
region_token(id, kind, x1, x2).
-#define R_ANY_SYMBOL 0
-#define R_ANY_SINGLE 1
-#define R_ANY_SPACE 2
-
-.decl any_token(id:SourceId, kind:number, x0:number, x1:number)
-any_token(id, R_ANY_SINGLE, x, x + 1) :-
- region(id, R_ANY, x0, x1),
- maybe_single(id, x),
- x >= x0,
- x < x1.
-
-/*
-// tokenize symbols
-.decl any(id:SourceId, kind:number, x0:number, x1:number)
-// if there is some distance to the next specialized region, insert an ANY
-any(id, R_BEGIN, x0, x0), any(id, R_END, x1, x1) :- region(id, R_ANY, x0, x1).
-any(id, R_, x0, x0) :-
- region(id, R_ANY, x0, x1),
- z0 = min y0 : {
- maybe_space(id, x2),
- x1 <= x2
- },
- x1 < z0.
-*/
+.decl symbol(id:SourceId, start:number, end:number) // symbol ranges found inside R_ANY regions; end is exclusive (one past the last character)
+symbol(id, x0, x2 + 1) :-
+ region(id, R_ANY, r0, r1),
+ maybe_symbol_cc(id, x0, x0), // x0 must be the first offset of a run (only the seed rule derives start = end)
+ x0 >= r0, // the run has to start at or after the region start
+ x2 = max x1 : { maybe_symbol_cc(id, x0, x1), x1 < r1 }, // last inclusive offset of the run that is still below r1, so the symbol is clipped at the region end
+ maybe_symbol_cc(id, x0, x2). // already implied by the aggregate above; kept as an explicit binding of x2
.decl source(str:symbol)
-source("xy(the) \nqui\"ck bro\\\"wn\n\nfox j\"ump;ed o\\\"ver the\n lazy dog.\n").
+source("xy(the) \nqui\"ck bro\\\"wn\n\nfox j\"ump;ed o\\\"ver the\n lazy dog.\nxy"). // NOTE(review): the input now ends in xy with no newline after it, presumably to exercise a symbol that ends exactly at end of input; confirm
.decl debugout(str:symbol)
//debugout(cat("'",s,"'")) :- L8(id, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0 + 1).
//debugout(cat("'",s,"'")) :- source_map(str, id, sz), x = 33, y = 40, s = substr(str, x, y - x).
-debugout(cat(to_string(kind),"'",s,"'")) :- region(id, kind, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
+//debugout(cat(to_string(kind),"'",s,"'")) :- region(id, kind, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0).
+debugout(cat("'",s,"'")) :- symbol(id, x0, x1), source_map(str, id, _), s = substr(str, x0, x1 - x0). // emit the quoted text of each symbol; x1 is exclusive, so the length is x1 - x0
.output source_map (IO=stdout)
.output maybe_stringquote (IO=stdout)
.output maybe_blockstring (IO=stdout)
-.output maybe_space (IO=stdout)
+.output symbol (IO=stdout)
.output region (IO=stdout)
.output debugout (IO=stdout)