1627a2350aa0 — Leonard Ritter 19 days ago
* prototype logsuffix tree
2 files changed, 245 insertions(+), 1 deletions(-)

M lib/tukan/logregion.sc
A => testing/logsuffix.sc
M lib/tukan/logregion.sc +9 -1
@@ 36,13 36,21 @@ case (x : u64)
     x + 1
 
 """"extract the highest bit from x (round x to the next lowest power of 2)
-inline msbbit (x)
+inline... msbbit (x : u32)
     x := x | x >> 1
     x := x | x >> 2
     x := x | x >> 4
     x := x | x >> 8
     x := x | x >> 16
     x & ((x >> 1) + 1)
+case (x : u64)
+    x := x | x >> 1
+    x := x | x >> 2
+    x := x | x >> 4
+    x := x | x >> 8
+    x := x | x >> 16
+    x := x | x >> 32
+    x & ((x >> 1) + 1)
 
 inline alignoffsetu (offset align)
     """"align `offset` up to `align`, which must be a power of 2

          
A => testing/logsuffix.sc +236 -0
@@ 0,0 1,236 @@ 
+
+using import struct
+using import Array
+using import String
+using import Map
+using import Set
+
+import ..lib.tukan.use
+using import tukan.logregion
+
+#
+    track active segments (which are still growing)
+    every time segment reaches log2 size, register hash up to that point
+    always check against known hashes
+        if there is a duplicate, and it's shorter, remove it
+        when it's longer, only remove growing segment when it's complete
+
+################################################################################
+
+""""extract the highest bit from x (round x to the next lowest power of 2)
+inline msbbit (x)
+    x := x | x >> 1
+    x := x | x >> 2
+    x := x | x >> 4
+    x := x | x >> 8
+    x := x | x >> 16
+    x & ((x >> 1) + 1)
+
+
+#struct Node
+    let Rc = (Rc this-type)
+
+    offset : u32
+    sub : (Array Rc)
+
+    fn... size (self)
+        offset := self.offset
+        if (offset == 0) -1:u32
+        else (offset & -offset)
+
+    fn... complete? (self, strlen : u32)
+        offset := self.offset
+        ('size self) <= (msbbit (strlen - offset))
+
+    fn... maxlen (self, strlen : u32)
+        offset := self.offset
+        min
+            msbbit (strlen - offset)
+            'size self
+
+# list of node children that are still being expanded, and their depth
+#local actives : (Array (tuple Node.Rc i32 u32))
+#fn insert (self n1 str depth actives)
+    viewing self n1
+    strlen := ((countof str) as u32)
+    let o1 = (copy n1.offset)
+    let s1 = ('size n1)
+    let m1 = ('maxlen n1 strlen)
+    loop (self depth = (copy self) depth)
+        maxoffset := 1:u32 << depth
+        minoffset := maxoffset >> 1
+        if (minoffset >= s1)
+            # would be empty
+            break;
+        label _repeat
+            for idx2 n2 in (enumerate self.sub)
+                let o2 = n2.offset
+                let m2 = ('maxlen n2 strlen)
+                let same? =
+                    for i in (range minoffset maxoffset)
+                        if (i >= m1)
+                            break true
+                        if (i >= m2)
+                            break true
+                        if (str @ (o1 + i) != str @ (o2 + i))
+                            break false
+                    else true
+                if same?
+                    merge _repeat (copy n2) (depth + 1)
+            idx := (countof self.sub) as i32
+            'append self.sub (copy n1)
+            if (not ('complete? n1 strlen))
+                'append actives (tupleof (copy self) (copy idx) depth)
+            break;
+
+#fn printnode (node str depth)
+    returning void
+    let strlen = ((countof str) as u32)
+    maxoffset := 1:u32 << depth
+    minoffset := maxoffset >> 1
+    for n in node.sub
+        printnode1 n str depth
+        this-function n str (depth + 1)
+
+#fn checkdup (self idx1 str depth)
+    strlen := ((countof str) as u32)
+    n1 := self.sub @ idx1
+    let o1 = (copy n1.offset)
+    let s1 = ('size n1)
+    let m1 = ('maxlen n1 strlen)
+    maxoffset := 1:u32 << depth
+    minoffset := maxoffset >> 1
+    # check against root first
+    let o2 = self.offset
+    let m2 = ('maxlen self strlen)
+    let same? =
+        for i in (range minoffset maxoffset)
+            if (i >= m1)
+                break true
+            if (i >= m2)
+                break true
+            if (str @ (o1 + i) != str @ (o2 + i))
+                break false
+        else true
+    if same?
+        'remove self.sub idx1
+        return;
+    for idx2 n2 in (enumerate self.sub)
+        if (idx2 == idx1)
+            break;
+        let o2 = n2.offset
+        let m2 = ('maxlen n2 strlen)
+        let same? =
+            for i in (range minoffset maxoffset)
+                if (i >= m1)
+                    break true
+                if (i >= m2)
+                    break true
+                if (str @ (o1 + i) != str @ (o2 + i))
+                    break false
+            else true
+        if same?
+            return;
+
+#local root : Node.Rc
+
+#local srcstr = "ABCDABCDBCDABDEF"
+#local srcstr = "ABCDABCDBCDABDEFABCDABCDABCDABCDABCDABCDABCFABCDABCE"
+#               001100110011001100110011001100110011001100110011
+local srcstr = "ABCDBCDACDABDABCDABCCDABBCDAABCDABCDABCDBCDABDEABCDBCDACDABDABCDABCCDABBCDAABCAABCDABCDBCDABDEF"
+
+local str : String
+
+local tiles : (Map hash u32)
+local result : (Set u32)
+local active : (Array u32)
+
+fn... size (offset : u32)
+    if (offset == 0) -1:u32
+    else (offset & -offset)
+
+fn... complete? (offset : u32, strlen : u32)
+    (size offset) <= (msbbit (strlen - offset))
+
+fn... maxlen (offset : u32, strlen : u32)
+    min
+        msbbit (strlen - offset)
+        size offset
+
+fn nodehash (offset str)
+    let strlen = ((countof str) as u32)
+    let maxlen = (maxlen offset strlen)
+    local s : String
+    for k in (range offset (offset + maxlen))
+        'append s (str @ k)
+    hash s
+
+fn nodestr (offset str)
+    let strlen = ((countof str) as u32)
+    #maxoffset := 1:u32 << depth
+    #minoffset := maxoffset >> 1
+    let maxlen = (maxlen offset strlen)
+    local s : String
+    for k in (range offset (offset + maxlen))
+        'append s (str @ k)
+    sz := (size offset)
+    if (sz != -1:u32)
+        for k in (range maxlen sz)
+            'append s "_"
+        'append s "$"
+    else
+        'append s "..."
+    s
+
+for offset c in (enumerate srcstr u32)
+    'append str c
+    let strlen = ((countof str) as u32)
+    'append active offset
+
+    # track actives
+    let activecount = (countof active)
+    for k in (rrange 0:usize activecount)
+        let offset = (copy (active @ k))
+        let delta = (strlen - offset)
+        if ((msbbit delta) == delta) # log2 level completed
+            completed? := (complete? offset strlen)
+            if completed?
+                'remove active k
+            h := (nodehash offset str)
+            let existing_offset =
+                try ('get tiles h)
+                else
+                    'set tiles h offset
+                    'insert result offset
+                    continue;
+            assert (existing_offset != offset)
+            if (complete? existing_offset strlen)
+                sz := (size existing_offset)
+                ml := (maxlen offset strlen)
+                if ((sz < ml) | ((not completed?) & (sz == ml)))
+                    #print (nodestr existing_offset str) (nodestr offset str)
+                        \ (size existing_offset) (size offset)
+                        \ existing_offset offset
+                    'discard result existing_offset
+                    'insert result offset
+                    'set tiles h offset
+
+    let activecount = (countof active)
+    print (activecount as i32) "active"
+
+do
+    let strlen = ((countof str) as u32)
+    local outp : (Array u32)
+    for offset in result
+        do  #if (offset == 0 or (complete? offset strlen))
+            'append outp offset
+    'sort outp
+        inline (offset str)
+            nodestr offset str
+        str
+    for offset in outp
+        print (nodestr offset str) offset
+
+################################################################################
+
+;
  No newline at end of file