64b295863214 — Leonard Ritter a month ago
* added support for weak references
1 files changed, 116 insertions(+), 137 deletions(-)

M testing/test_node4.sc
M testing/test_node4.sc +116 -137
@@ 2,15 2,17 @@ using import struct
 using import Array
 using import String
 using import Map
+using import Set
 using import property
 
 import ..lib.tukan.use
 using import tukan.logregion
 using import tukan.SHA256
 
+# print debug output while deduplicating
+let DEBUG_DEDUP = true
 # print debug output while scanning
-let DEBUG_SCAN = false
-
+let DEBUG_SCAN = true
 
 inline firstbit (mask bit)
     mask & (~ (mask - bit))

          
@@ 60,6 62,8 @@ inline... ritermaskbits (value : integer
 ################################################################################
 
 WordType := u32
+# references use this bit to indicate that they are weak
+WeakRefBit := (1 as WordType) << 31
 
 type Region : u32
 

          
@@ 184,7 188,8 @@ type+ Region
 
 ################################################################################
 
-
+inline unpack_weakref (value)
+    _ (bitcast (value & (~ WeakRefBit)) Region) (value & WeakRefBit)
 
 ################################################################################
 

          
@@ 227,140 232,123 @@ struct Module
         assert ((o == offset) & (s == size))
         bitcast id Region
 
-    fn... setref (self, targetid : Id, sourceid : Region)
-        assert (sourceid < targetid)
+    fn... internalsetref (self, targetid : Id, sourceid : u32)
         targetid as:= integer
-        self.p @ targetid = (storagecast sourceid)
+        self.p @ targetid = sourceid
         bitwordofs := (targetid // BitWordWidth)
         bitofs := targetid & (BitWordWidth - 1)
         self.ref @ bitwordofs |= (1 as BitWordType) << bitofs
         ;
 
-    fn... dedup (self, root : Region)
-        #   1. Iterate and hashmap the largest unique logregions
-            (using SHA256 hashes), while also aliasing addresses
-            (unique addresses are aliased to themselves). Then repeatedly
-            subdivide unique regions and repeat the same steps with those,
-            until we arrive at unit size.
+    inline... setref (self, targetid : Id, sourceid : Region)
+        assert (sourceid < targetid)
+        internalsetref self targetid (storagecast sourceid)
 
-            2. Scan content for references (using the ref bitmap) and rewrite
-            each reference according to the alias map. The hashmap is no longer
-            necessary for this step. The alias map will not contain addresses of
-            subregions of redundant blocks, but we can search upwards in the
-            alias map until we find an alias for the parent block, and
-            recalculate its new offset and try again.
+    fn... setweakref (self, targetid : Id, sourceid : Region)
+        assert (sourceid < targetid)
+        internalsetref self targetid ((storagecast sourceid) | WeakRefBit)
 
-            TODO: with this technique, we'd need to dedup multiple times, because
-            references are aliased after the deduplication, permitting to
-            deduplicate blocks further. in fact, we'd need in the worst case
-            log2(k) dedups for a tree depth of k.
+    fn... dedup (self, root : Region)
+        #   incremental deduplication
+
+        let report =
+            static-if DEBUG_DEDUP report
+            else (inline (...))
 
         let u256 = (integer 256)
 
-        # map hash to region
-        local map :
-            Map u256 Region
-                inline (key)
-                    key as u64 as hash
-        # for already aliased regions
-        local aliases : (Map Region Region)
+        inline... size (offset : u32)
+            if (offset == 0) -1:u32
+            else (offset & -offset)
+
+        inline... complete? (offset : u32, strlen : u32)
+            (size offset) <= (alignsized (strlen - offset))
+
+        inline... maxlen (offset : u32, strlen : u32)
+            min
+                alignsized (strlen - offset)
+                size offset
 
         let p = self.p
+        local hashedwords = 0:u32
 
-        inline hashblock (offset size)
-            let hcode = (sha256 (bitcast (& (p @ offset)) rawstring) size)
-            #report offset size
-                sha256-digest-string hcode
-            deref (@ (bitcast (& hcode) (@ u256)))
+        struct HashJob plain
+            sha : SHA256
+            offset : u32
+            processed : u32 = 0
+
+            inline process (self strlen)
+                # incremental hashing
+                offset := (copy self.offset)
+                maxlen := (maxlen offset strlen)
+                processed := (copy self.processed)
+                hoffset := offset + processed
+                hsize := maxlen - processed
+                sha := self.sha
+                if (hsize != 0)
+                    hashedwords += hsize
+                    self.processed = maxlen
+                    'hash sha (bitcast (& (p @ hoffset)) rawstring) (hsize * (sizeof WordType))
+                # requesting the digest alters sha's state, so make a copy
+                local sha_copy = (copy sha)
+                let hcode = ('digest sha_copy)
+                #report offset size
+                    sha256-digest-string hcode
+                deref (@ (bitcast (& hcode) (@ u256)))
+
+        local tiles : (Map u256 u32)
+        local active : (Array HashJob)
+
+        local aliases : (Map Region Region)
+
+        local maxactive = 0:usize
 
-        # TODO: one of the cases where recursive processing might be better
-        # start with largest complete blocksize
-        let sz = ((countof p) as u32)
-        let blocksz = (alignsizeu sz)
-        local blocks : (Array Region)
-        'append blocks (bitcast (encaddr 0:u32 blocksz) Region)
-        local nextblocks : (Array Region)
-        loop ()
-            for region in blocks
-                let offset size = (unpack region)
-                # halve size
-                size := size >> 1
-                for i in (range 2:u32)
-                    let offset = (offset + i * size)
-                    let region = (bitcast (encaddr offset size) Region)
-                    if (offset >= sz) # after end of buffer
-                        #report "skipping" region
-                        continue;
-                    if ((offset + size) > sz) # partially unhashable
-                        #report "deferring" region
-                        if (size != 1)
-                            'append nextblocks region
-                        continue;
-                    let hcode = (hashblock offset size)
-                    let newregion =
-                        try (deref ('get map hcode))
-                        else
-                            'set map hcode region
-                            region
-                    'set aliases region newregion
-                    if (region == newregion)
-                        #report "unique region" region
-                        if (size != 1)
-                            'append nextblocks region
+        let ref = self.ref
+        for offset word in (enumerate p u32)
+
+            let o b = (id2bitpos offset)
+            if ((ref @ o) & ((1 as BitWordType) << b))
+                let p@ = (p @ offset)
+                let oldptr weakbit = (unpack_weakref p@)
+                p@ =
+                    | weakbit
+                        storagecast
+                            try (copy ('get aliases oldptr))
+                            else oldptr
+
+            let strlen = ((offset + 1) as u32)
+            'append active (HashJob (offset = offset))
+
+            # track actives
+            let activecount = (countof active)
+            for k in (rrange 0:usize activecount)
+                let job = (active @ k)
+                let offset = (copy job.offset)
+                let delta = (strlen - offset)
+                if ((alignsized delta) == delta) # log2 level completed
+                    completed? := (complete? offset strlen)
+                    if completed?
+                        'remove active k
+                    h := ('process job strlen)
+                    try
+                        let bestoffset = (deref ('get tiles h))
+                        let region = (Region.from-aligned-offset offset delta)
+                        let bestregion = (Region.from-aligned-offset bestoffset delta)
+                        'set aliases region bestregion
                     else
-                        #report region "->" newregion
+                        'set tiles h offset
                         ;
-            if (empty? nextblocks)
-                break;
-            else
-                swap (view blocks) (view nextblocks)
-                'clear nextblocks
 
-        drop map
+            maxactive = (max maxactive (countof active))
 
-        fn alias-region (aliases region)
-            try (copy ('get aliases region))
-            else
-                # region is in redundant range
-                let original-region = region
-                loop (region)
-                    # find first unique parent
-                    let oldparent newparent =
-                        loop (oldparent = ('parent region))
-                            assert (oldparent != Region.Empty)
-                            try
-                                break oldparent
-                                    deref ('get aliases oldparent)
-                            else
-                                repeat ('parent oldparent)
-                    let offset size = (unpack region)
-                    let oldofs = (unpack oldparent)
-                    let newofs = (unpack newparent)
-                    # compute offset relative to new parent block
-                    let newregion = (Region.from-aligned-offset
-                        (offset - oldofs + newofs) size)
-                    try
-                        let newregion = (deref ('get aliases newregion))
-                        # write new alias for subsequent lookups
-                        'set aliases original-region newregion
-                        break (copy newregion)
-                    else
-                        # new region is also redundant; offset once more
-                        repeat newregion
+        report (hashedwords as i32) "words hashed"
+        report ((countof tiles) as i32) "tiles"
+        report ((countof aliases) as i32) "aliases"
+        report "max" (maxactive as i32) "active"
 
-        for wordword refmask in (enumerate self.ref u32)
-            for bit in (itermaskbits (copy refmask))
-                let id = (bitpos2id wordword (findlsb bit))
-                let addrptr = (p @ id)
-                let addr = (copy addrptr)
-                let region = (bitcast addr Region)
-                let newregion = (alias-region aliases region)
-                if (region != newregion)
-                    #report region "->" newregion
-                    # rewrite reference
-                    addrptr = (storagecast newregion)
+        try (copy ('get aliases root))
+        else root
 
-        alias-region aliases root
 
     fn... scan (self, root : Region)
         """"build a reverse ordered array of regions (in)directly connected

          
@@ 439,9 427,10 @@ struct Module
                 let word = (? (wordword == lhso) (word & lmask) word)
                 for bit in (ritermaskbits word)
                     let id = (bitpos2id wordword (findlsb bit))
-                    let target = (bitcast (copy (p @ id)) Region)
-                    report "ref" id "->" target
-                    mark self regions target
+                    let target weakbit = (unpack_weakref (copy (p @ id)))
+                    if (not weakbit)
+                        report "ref" id "->" target
+                        mark self regions target
 
         local visited : (Array Region)
         # iterate marked regions back to front

          
@@ 503,7 492,9 @@ struct Module
                 # find existing block
                 let oldparent newparent =
                     loop (oldparent = ('parent region))
-                        assert (oldparent != Region.Empty)
+                        if (oldparent == Region.Empty)
+                            # weak reference
+                            return Region.Empty
                         try
                             break oldparent
                                 deref ('get aliases oldparent)

          
@@ 544,9 535,9 @@ struct Module
                 let value = (copy srcval)
                 let refbit = ((1 as BitWordType) << bitofs)
                 if ((ref @ bitwordofs) & refbit)
-                    let oldref = (bitcast value Region)
+                    let oldref weakbit = (unpack_weakref value)
                     let newref = (find-alias aliases oldref)
-                    destval = (storagecast newref)
+                    destval = (| weakbit (storagecast newref))
                     if move?
                         # clear old bit
                         ref @ bitwordofs &= (~ bitofs)

          
@@ 588,30 579,18 @@ do
     'setref module (n2 @ 3) n01
     #'setref module (n5 @ 2) (n3 @ 0)
     'setref module (n5 @ 2) n4
-    'setref module (n5 @ 39) n3
+    'setweakref module (n5 @ 39) n3
     'setref module (n3 @ 0) (n1 @ 2)
-    'setref module (n3 @ 1) (n2 @ 3)
+    #'setref module (n3 @ 1) (n2 @ 3)
     'setref module (n2 @ 2) n0
     let n5 = ('dedup module n5)
 
-    fold (n5) for i in (range 3)
+    fold (n5) for i in (range 2)
         print "size:" (countof module.p)
         print "connect n5=" n5
         'collect module n5
     print "size:" (countof module.p)
 
-    #try
-        print n5
-        let n5 = (deref ('get addrmap n5))
-        print n5
-        let n5 = ('collect module n5)
-        print n5
-        print;
-        let n5 = ('collect module n5)
-        print n5
-        ;
-    #else;
-
 
 #static-if main-module?
     local module : Module