24b1be62fe5a — Leonard Ritter 17 days ago
* improved deduplication
1 files changed, 100 insertions(+), 77 deletions(-)

M testing/test_node4.sc
M testing/test_node4.sc +100 -77
@@ 117,6 117,11 @@ type+ Region
         assert (index < size)
         bitcast (offset + index) Id
 
+    let Empty = (bitcast 0:u32 this-type)
+
+    fn parent (self)
+        bitcast (parentaddr (storagecast self)) this-type
+
     fn endpoints (self)
         let offset size = (unpack self)
         _ (bitcast offset Id) (bitcast (offset + size - 1) Id)

          
@@ 142,6 147,9 @@ type+ Region
                 let offset size = (unpack self)
                 other < offset
 
+    inline... from-offset (offset : u32, size : u32 = 1:u32)
+        bitcast (encaddr offset size) Region
+
 ################################################################################
 
 

          
@@ 172,7 180,7 @@ struct Module
     fn alloc (self wordcount)
         count := (countof self.p)
         offset := (count as u32)
-        size := (alignsize (wordcount as u32))
+        size := (alignsizeu (wordcount as u32))
         offset := (alignoffsetu offset size)
         newcount := offset + size
         'resize self.p newcount (nullof WordType)

          
@@ 192,28 200,29 @@ struct Module
         self.ref @ bitwordofs |= (1 as BitWordType) << bitofs
         ;
 
-    fn... dedup (self, deep = false)
-        #
-            simple (fast): walk all references front to back; hash words in region, map
-                new hash to region, fix references whose hashes are derivative.
+    fn... dedup (self)
+        #   1. Iterate and hashmap the largest unique logregions
+            (using SHA256 hashes), while also aliasing addresses
+            (unique addresses are aliased to themselves). Then repeatedly
+            subdivide unique regions and repeat the same steps with those,
+            until we arrive at unit size.
 
-            thorough (slower, more memory intensive):
-                walk largest regionsize and register hashes for regions that are unique
-                then walk only blocks of registered unique hashes (in order) at half
-                blocksize, and repeat, down to the smallest blocks
-
-                then use this map to resolve references like in simple step
-
-        # map hash to address
+            2. Scan content for references (using the ref bitmap) and rewrite
+            each reference according to the alias map. The hashmap is no longer
+            necessary for this step. The alias map will not contain addresses of
+            subregions of redundant blocks, but we can search upwards in the
+            alias map until we find an alias for the parent block, and
+            recalculate its new offset and try again.
 
         let u256 = (integer 256)
 
+        # map hash to region
         local map :
             Map u256 Region
                 inline (key)
                     key as u64 as hash
         # for already aliased regions
-        local addr2addr : (Map Region Region)
+        local aliases : (Map Region Region)
 
         let p = self.p
 

          
@@ 223,53 232,79 @@ struct Module
                 sha256-digest-string hcode
             deref (@ (bitcast (& hcode) (@ u256)))
 
-        if deep
-            # TODO: one of the cases where recursive processing might be better
-            # start with largest complete blocksize
-            let sz = ((countof p) as u32)
-            let blocksz = (alignsize sz)
-            local blocks : (Array Region)
-            'append blocks (bitcast (encaddr 0:u32 blocksz) Region)
-            local nextblocks : (Array Region)
-            loop ()
-                for region in blocks
+        # TODO: one of the cases where recursive processing might be better
+        # start with largest complete blocksize
+        let sz = ((countof p) as u32)
+        let blocksz = (alignsizeu sz)
+        local blocks : (Array Region)
+        'append blocks (bitcast (encaddr 0:u32 blocksz) Region)
+        local nextblocks : (Array Region)
+        loop ()
+            for region in blocks
+                let offset size = (unpack region)
+                # halve size
+                size := size >> 1
+                for i in (range 2:u32)
+                    let offset = (offset + i * size)
+                    let region = (bitcast (encaddr offset size) Region)
+                    if (offset >= sz) # after end of buffer
+                        #report "skipping" region
+                        continue;
+                    if ((offset + size) > sz) # partially unhashable
+                        #report "deferring" region
+                        if (size != 1)
+                            'append nextblocks region
+                        continue;
+                    let hcode = (hashblock offset size)
+                    let newregion =
+                        try (deref ('get map hcode))
+                        else
+                            'set map hcode region
+                            region
+                    'set aliases region newregion
+                    if (region == newregion)
+                        #report "unique region" region
+                        if (size != 1)
+                            'append nextblocks region
+                    else
+                        #report region "->" newregion
+                        ;
+            if (empty? nextblocks)
+                break;
+            else
+                swap (view blocks) (view nextblocks)
+                'clear nextblocks
+
+        drop map
+
+        fn alias-region (aliases region)
+            try (deref ('get aliases region))
+            else
+                let original-region = region
+                loop (region)
+                    # find first unique parent
+                    let oldparent newparent =
+                        loop (oldparent = ('parent region))
+                            assert (oldparent != Region.Empty)
+                            try
+                                break oldparent
+                                    deref ('get aliases oldparent)
+                            else
+                                repeat ('parent oldparent)
                     let offset size = (unpack region)
-                    # halve size
-                    size := size >> 1
-                    for i in (range 2:u32)
-                        let offset = (offset + i * size)
-                        let region = (bitcast (encaddr offset size) Region)
-                        if (offset >= sz) # after end of buffer
-                            #report "skipping" region
-                            continue;
-                        if ((offset + size) > sz) # partially unhashable
-                            #report "deferring" region
-                            if (size != 1)
-                                'append nextblocks region
-                            continue;
-                        let hcode = (hashblock offset size)
-                        let newregion =
-                            try (deref ('get map hcode))
-                            else
-                                'set map hcode region
-                                region
-                        'set addr2addr region newregion
-                        if (region == newregion)
-                            #report "unique region" region
-                            if (size != 1)
-                                'append nextblocks region
-                        else
-                            #report region "->" newregion
-                            ;
-                if (empty? nextblocks)
-                    break;
-                else
-                    swap (view blocks) (view nextblocks)
-                    'clear nextblocks
-
-                #'append blocks (bitcast (encaddr (i * blocksz) blocksz) Region)
-
-            #report "done with deep hashing"
+                    let oldofs = (unpack oldparent)
+                    let newofs = (unpack newparent)
+                    # compute offset relative to new parent block
+                    let newregion = (Region.from-offset
+                        (offset - oldofs + newofs) size)
+                    try
+                        let newregion = (deref ('get aliases newregion))
+                        # write new alias for subsequent lookups
+                        'set aliases original-region newregion
+                        break newregion
+                    else
+                        # new region is also redundant; offset once more
+                        repeat newregion
 
         for wordword refmask in (enumerate self.ref u32)
             for bit in (itermaskbits (copy refmask))

          
@@ 277,25 312,13 @@ struct Module
                 let addrptr = (p @ id)
                 let addr = (copy addrptr)
                 let region = (bitcast addr Region)
-                let newregion =
-                    try (deref ('get addr2addr region))
-                    else
-                        let offset size = (decaddr addr)
-                        #print "ref!" id "->" offset size
-                        let hcode = (hashblock offset size)
-                        let newregion =
-                            try (deref ('get map hcode))
-                            else
-                                'set map hcode region
-                                region
-                        'set addr2addr region newregion
-                        newregion
+                let newregion = (alias-region aliases region)
                 if (region != newregion)
                     #report region "->" newregion
                     # rewrite reference
                     addrptr = (storagecast newregion)
 
-        addr2addr
+        aliases
 
 
     let DEBUG_COLLECT = true

          
@@ 336,7 359,7 @@ struct Module
             lw := (usedl @ wordword)
             rw := (usedr @ wordword)
             refw := (copy (ref @ wordword))
-            fold (rc = rc) for bit in (ritermaskbits (| lw rw refw))
+            fold (rc) for bit in (ritermaskbits (| lw rw refw))
                 rc := rc + (? ((rw & bit) == 0) 0 1)
                 if ((rc > 0) & ((refw & bit) != 0))
                     let id = (bitpos2id wordword (findlsb bit))

          
@@ 358,7 381,7 @@ struct Module
             \ for wordword in (range bitwordcount)
             lw := (copy (usedl @ wordword))
             rw := (copy (usedr @ wordword))
-            fold (rc startpos = rc startpos)
+            fold (rc startpos)
                 \ for bit in (itermaskbits (| lw rw))
                 let started? = ((lw & bit) != 0)
                 let stopped? = ((rw & bit) != 0)

          
@@ 433,7 456,7 @@ do
     'setref module (n3 @ 0) (n1 @ 2)
     'setref module (n3 @ 1) (n2 @ 3)
     'setref module (n2 @ 2) n0
-    let addrmap = ('dedup module true)
+    let addrmap = ('dedup module)
 
     try
         print n5