#!/usr/bin/env python # -*- coding: utf-8 -*- """ This is a pure Python implementation of the [rsync algorithm](TM96). [TM96] Andrew Tridgell and Paul Mackerras. The rsync algorithm. Technical Report TR-CS-96-05, Canberra 0200 ACT, Australia, 1996. http://samba.anu.edu.au/rsync/. ### Example Use Case: ### # On the system containing the file that needs to be patched >>> unpatched = open("unpatched.file", "rb") >>> hashes = blockchecksums(unpatched) # On the remote system after having received `hashes` >>> patchedfile = open("patched.file", "rb") >>> delta = rsyncdelta(patchedfile, hashes) # System with the unpatched file after receiving `delta` >>> unpatched.seek(0) >>> save_to = open("locally-patched.file", "wb") >>> patchstream(unpatched, save_to, delta) """ import collections import hashlib if not ( hasattr ( __builtins__ , "bytes" )) or str is bytes : # Python 2.x compatibility def bytes ( var , * args ): try : return '' . join ( map ( chr , var )) except TypeError : return map ( ord , var ) __all__ = [ "rollingchecksum" , "weakchecksum" , "patchstream" , "rsyncdelta" , "blockchecksums" ] def rsyncdelta ( datastream , remotesignatures , blocksize = 4096 ): """ Generates a binary patch when supplied with the weak and strong hashes from an unpatched target and a readable stream for the up-to-date data. The blocksize must be the same as the value used to generate remotesignatures. """ remote_weak , remote_strong = remotesignatures match = True matchblock = - 1 deltaqueue = collections . deque () while True : if match and datastream is not None : # Whenever there is a match or the loop is running for the first # time, populate the window using weakchecksum instead of rolling # through every single byte which takes at least twice as long. window = collections . deque ( bytes ( datastream . read ( blocksize ))) checksum , a , b = weakchecksum ( window ) try : # If there are two identical weak checksums in a file, and the # matching strong hash does not occur at the first match, it will # be missed and the data sent over. May fix eventually, but this # problem arises very rarely. matchblock = remote_weak . index ( checksum , matchblock + 1 ) stronghash = hashlib . md5 ( bytes ( window )) . hexdigest () matchblock = remote_strong . index ( stronghash , matchblock ) match = True deltaqueue . append ( matchblock ) if datastream . closed : break continue except ValueError : # The weakchecksum did not match match = False try : if datastream : # Get the next byte and affix to the window newbyte = ord ( datastream . read ( 1 )) window . append ( newbyte ) except TypeError : # No more data from the file; the window will slowly shrink. # newbyte needs to be zero from here on to keep the checksum # correct. newbyte = 0 tailsize = datastream . tell () % blocksize datastream = None if datastream is None and len ( window ) <= tailsize : # The likelihood that any blocks will match after this is # nearly nil so call it quits. deltaqueue . append ( window ) break # Yank off the extra byte and calculate the new window checksum oldbyte = window . popleft () checksum , a , b = rollingchecksum ( oldbyte , newbyte , a , b , blocksize ) # Add the old byte the file delta. This is data that was not found # inside of a matching block so it needs to be sent to the target. try : deltaqueue [ - 1 ] . append ( oldbyte ) except ( AttributeError , IndexError ): deltaqueue . append ([ oldbyte ]) # Return a delta that starts with the blocksize and converts all iterables # to bytes. deltastructure = [ blocksize ] for element in deltaqueue : if isinstance ( element , int ): deltastructure . append ( element ) elif element : deltastructure . append ( bytes ( element )) return deltastructure def blockchecksums ( instream , blocksize = 4096 ): """ Returns a list of weak and strong hashes for each block of the defined size for the given data stream. """ weakhashes = list () stronghashes = list () read = instream . read ( blocksize ) while read : weakhashes . append ( weakchecksum ( bytes ( read ))[ 0 ]) stronghashes . append ( hashlib . md5 ( read ) . hexdigest ()) read = instream . read ( blocksize ) return weakhashes , stronghashes def patchstream ( instream , outstream , delta ): """ Patches instream using the supplied delta and write the resultantant data to outstream. """ blocksize = delta [ 0 ] for element in delta [ 1 :]: if isinstance ( element , int ) and blocksize : instream . seek ( element * blocksize ) element = instream . read ( blocksize ) outstream . write ( element ) def rollingchecksum ( removed , new , a , b , blocksize = 4096 ): """ Generates a new weak checksum when supplied with the internal state of the checksum calculation for the previous window, the removed byte, and the added byte. """ a -= removed - new b -= removed * blocksize - a return ( b << 16 ) | a , a , b def weakchecksum ( data ): """ Generates a weak checksum from an iterable set of bytes. """ a = b = 0 l = len ( data ) for i in range ( l ): a += data [ i ] b += ( l - i ) * data [ i ] return ( b << 16 ) | a , a , b