b70277fc0e8a — Leonard Ritter 18 days ago
* dedup implementation
3 files changed, 138 insertions(+), 9 deletions(-)

M lib/tukan/logregion.sc
M testing/logsuffix.sc
M testing/test_node4.sc
M lib/tukan/logregion.sc +1 -1
@@ 165,7 165,7 @@ if main-module?
 
 do
     let encaddr encaddrun encaddrrangeun decaddr alignoffsetu alignoffsetd
-        \ alignsize
+        \ alignsize msbbit
     locals;
 
 

          
M testing/logsuffix.sc +2 -2
@@ 134,10 134,10 @@ inline msbbit (x)
 
 #local root : Node.Rc
 
-#local srcstr = "ABCDABCDBCDABDEF"
+local srcstr = "ABCDABCDBCDABDEF"
 #local srcstr = "ABCDABCDBCDABDEFABCDABCDABCDABCDABCDABCDABCFABCDABCE"
 #               001100110011001100110011001100110011001100110011
-local srcstr = "ABCDBCDACDABDABCDABCCDABBCDAABCDABCDABCDBCDABDEABCDBCDACDABDABCDABCCDABBCDAABCAABCDABCDBCDABDEF"
+#local srcstr = "ABCDBCDACDABDABCDABCCDABBCDAABCDABCDABCDBCDABDEABCDBCDACDABDABCDABCCDABBCDAABCAABCDABCDBCDABDEF"
 
 local str : String
 

          
M testing/test_node4.sc +135 -6
@@ 1,10 1,12 @@ 
 using import struct
 using import Array
 using import String
+using import Map
 using import property
 
 import ..lib.tukan.use
 using import tukan.logregion
+using import tukan.SHA256
 
 inline firstbit (mask bit)
     mask & (~ (mask - bit))

          
@@ 97,6 99,15 @@ type+ Region
     fn __unpack (self)
         decaddr (storagecast self)
 
+    fn __hash (self)
+        hash (storagecast self)
+
+    @@ memo
+    inline __== (cls T)
+        static-if (cls == T)
+            inline (self other)
+                (storagecast self) == (storagecast other)
+
     fn __repr (self)
         let offset size = (unpack self)
         .. "%" (hex offset) ":" (hex (offset + size))

          
@@ 181,7 192,113 @@ struct Module
         self.ref @ bitwordofs |= (1 as BitWordType) << bitofs
         ;
 
-    let DEBUG_COLLECT = false
+    fn... dedup (self, deep = false)
+        #
+            simple (fast): walk all references front to back; hash words in region, map
+                new hash to region, fix references whose hashes are derivative.
+
+            thorough (slower, more memory intensive):
+                walk largest regionsize and register hashes for regions that are unique
+                then walk only blocks of registered unique hashes (in order) at half
+                blocksize, and repeat, down to the smallest blocks
+
+                then use this map to resolve references like in simple step
+
+        # map hash to address
+
+        let u256 = (integer 256)
+
+        local map :
+            Map u256 Region
+                inline (key)
+                    key as u64 as hash
+        # for already aliased regions
+        local addr2addr : (Map Region Region)
+
+        let p = self.p
+
+        inline hashblock (offset size)
+            let hcode = (sha256 (bitcast (& (p @ offset)) rawstring) size)
+            #report offset size
+                sha256-digest-string hcode
+            deref (@ (bitcast (& hcode) (@ u256)))
+
+        if deep
+            # TODO: one of the cases where recursive processing might be better
+            # start with largest complete blocksize
+            let sz = ((countof p) as u32)
+            let blocksz = (alignsize sz)
+            local blocks : (Array Region)
+            'append blocks (bitcast (encaddr 0:u32 blocksz) Region)
+            local nextblocks : (Array Region)
+            loop ()
+                for region in blocks
+                    let offset size = (unpack region)
+                    # halve size
+                    size := size >> 1
+                    for i in (range 2:u32)
+                        let offset = (offset + i * size)
+                        let region = (bitcast (encaddr offset size) Region)
+                        if (offset >= sz) # after end of buffer
+                            #report "skipping" region
+                            continue;
+                        if ((offset + size) > sz) # partially unhashable
+                            #report "deferring" region
+                            if (size != 1)
+                                'append nextblocks region
+                            continue;
+                        let hcode = (hashblock offset size)
+                        let newregion =
+                            try (deref ('get map hcode))
+                            else
+                                'set map hcode region
+                                region
+                        'set addr2addr region newregion
+                        if (region == newregion)
+                            #report "unique region" region
+                            if (size != 1)
+                                'append nextblocks region
+                        else
+                            #report region "->" newregion
+                            ;
+                if (empty? nextblocks)
+                    break;
+                else
+                    swap (view blocks) (view nextblocks)
+                    'clear nextblocks
+
+                #'append blocks (bitcast (encaddr (i * blocksz) blocksz) Region)
+
+            #report "done with deep hashing"
+
+        for wordword refmask in (enumerate self.ref u32)
+            for bit in (itermaskbits (copy refmask))
+                let id = (bitpos2id wordword (findlsb bit))
+                let addrptr = (p @ id)
+                let addr = (copy addrptr)
+                let region = (bitcast addr Region)
+                let newregion =
+                    try (deref ('get addr2addr region))
+                    else
+                        let offset size = (decaddr addr)
+                        #print "ref!" id "->" offset size
+                        let hcode = (hashblock offset size)
+                        let newregion =
+                            try (deref ('get map hcode))
+                            else
+                                'set map hcode region
+                                region
+                        'set addr2addr region newregion
+                        newregion
+                if (region != newregion)
+                    #report region "->" newregion
+                    # rewrite reference
+                    addrptr = (storagecast newregion)
+
+        addr2addr
+
+
+    let DEBUG_COLLECT = true
 
     fn... collect (self, root : Region)
         let report =

          
@@ 297,6 414,8 @@ do
     local module : Module
     print "n0="
         n0 := ('alloc module 4)
+    print "n0="
+        n01 := ('alloc module 4)
     print "n1="
         n1 := ('alloc module 5)
     print "n2="

          
@@ 307,16 426,26 @@ do
         n4 := ('alloc module 16)
     print "n5="
         n5 := ('alloc module 40)
-    'setref module (n2 @ 3) n0
+    'setref module (n2 @ 3) n01
     #'setref module (n5 @ 2) (n3 @ 0)
     'setref module (n5 @ 2) n4
     'setref module (n5 @ 35) n3
     'setref module (n3 @ 0) (n1 @ 2)
     'setref module (n3 @ 1) (n2 @ 3)
-    'setref module (n2 @ 2) n1
-    let n5 = ('collect module n5)
-    print;
-    'collect module n5
+    'setref module (n2 @ 2) n0
+    let addrmap = ('dedup module true)
+
+    try
+        print n5
+        let n5 = (deref ('get addrmap n5))
+        print n5
+        let n5 = ('collect module n5)
+        print n5
+        print;
+        let n5 = ('collect module n5)
+        print n5
+        ;
+    else;
 
 
 #static-if main-module?