From 6d323bbbdc4d3e36e2757e501ad1d2df9ef02d60 Mon Sep 17 00:00:00 2001 From: Florent Rougon <f.rougon@free.fr> Date: Fri, 26 Jan 2018 19:07:30 +0100 Subject: [PATCH] terrasync.py: prepare the terrain for --mode and --report - Add computeHash() utility function that can work with any file-like object (e.g., a connected socket). - Rename hash_of_file() to hashForFile(), and of course implement it using our new computeHash(). - Add class HTTPSocketRequest derived from HTTPGetCallback. It allows one to process data from the network without storing it to a file (it uses the file-like interface provided by http.client.HTTPResponse). The callback returns the http.client.HTTPResponse object, which can be conveniently used in a 'with' statement. - Simplify the API of TerraSync.updateDirectory(): its 'dirIndexHash' argument must now be a hash (a string); the None object is not allowed anymore (with the soon-to-come addition of --mode=check, having to deal with this special case in updateDirectory() would make the logic too difficult to follow, or we would have to completely separate check-only mode from update mode, which would entail code duplication). Since TerraSync.updateDirectory() must now always have a hash to work with, compute the hash of the root '.dirindex' file from the server in TerraSync.start(), using our new HTTPSocketRequest class, which was written for this purpose, since this has to work in all modes, including check-only mode, where we don't want to write any file to disk. - TerraSync.updateFile(): correctly handle the case where a directory inside the TerraSync repository is (now) a file according to the server: the directory must be recursively removed before the file can be downloaded in the place formerly occupied by the directory. - Add stub class Report. Its methods do nothing for now, but are already called in a couple of appropriate places. The class will be completed in a future commit, of course. 
--- scripts/python/terrasync.py | 101 +++++++++++++++++++++++++++++++----- 1 file changed, 87 insertions(+), 14 deletions(-) diff --git a/scripts/python/terrasync.py b/scripts/python/terrasync.py index 63123d9e2..9375b22f5 100755 --- a/scripts/python/terrasync.py +++ b/scripts/python/terrasync.py @@ -121,6 +121,20 @@ def removeDirectoryTree(base, whatToRemove): shutil.rmtree(absPath) +def computeHash(fileLike): + hash = hashlib.sha1() + + for chunk in iter(lambda: fileLike.read(4096), b""): + hash.update(chunk) + + return hash.hexdigest() + + +def hashForFile(fname): + with open(fname, "rb") as f: + return computeHash(f) + + # ***************************************************************************** # * Network-related classes * # ***************************************************************************** @@ -247,16 +261,32 @@ class HTTPDownloadRequest(HTTPGetCallback): if self.mycallback != None: self.mycallback(self) -################################################################################################################################# -def hash_of_file(fname): - hash = hashlib.sha1() +class HTTPSocketRequest(HTTPGetCallback): + """HTTPGetCallback class whose callback returns a file-like object. - with open(fname, "rb") as f: - for chunk in iter(lambda: f.read(4096), b""): - hash.update(chunk) + The file-like object returned by the callback, and thus by + HTTPGetter.get(), is a socket or similar. This allows one to read + the data obtained from the network without necessarily storing it + to a file. - return hash.hexdigest() + """ + def __init__(self, src): + """Initialize an HTTPSocketRequest object. + + src -- path to the resource on the server (no protocol, no + server name, just the path starting with a '/'). 
+ + """ + HTTPGetCallback.__init__(self, src, self.callback) + + def callback(self, url, httpResponse): + # Same comment as for HTTPDownloadRequest.callback() + if httpResponse.status != 200: + raise NetworkError("HTTP callback got status {status} for URL {url}" + .format(status=httpResponse.status, url=url)) + + return httpResponse ################################################################################################################################# @@ -300,6 +330,17 @@ def parse_terrasync_coordinate(coordinate): lat *= -1 return Coordinate(lat, lon) + +class Report: + """Gather and format data about the state of a TerraSync mirror.""" + + def addDirIndexWithIncorrectHash(self, localDirIndex): + pass + + def addMissingDirIndex(self, localDirIndex): + pass + + class TerraSync: def __init__(self, url, target, quick, removeOrphan, downloadBoundaries): @@ -308,6 +349,9 @@ class TerraSync: self.removeOrphan = removeOrphan self.httpGetter = None self.downloadBoundaries = downloadBoundaries + # Status of the local repository (as compared to what the server says), + # before any update we might do to it. 
+ self.report = Report() def setUrl(self, url): self.url = url.rstrip('/').strip() @@ -319,14 +363,32 @@ class TerraSync: def start(self): self.httpGetter = HTTPGetter(self.url) - self.updateDirectory("", "", None ) + + # Get the hash of the root .dirindex file + try: + request = HTTPSocketRequest("/.dirindex") + with self.httpGetter.get(request) as fileLike: + rootDirIndexHash = computeHash(fileLike) + except HTTPException as exc: + raise NetworkError("for the root .dirindex file: {errMsg}" + .format(errMsg=exc)) from exc + + # Process the root (TerraSync) directory + self.updateDirectory("", "", rootDirIndexHash) def updateFile(self, serverPath, localPath, fileHash ): localFullPath = join(self.target, localPath) - if fileHash != None and hash_of_file(localFullPath) == fileHash: + + if (os.path.isfile(localFullPath) and + hashForFile(localFullPath) == fileHash): #print("hash of file matches, not downloading") return + if os.path.isdir(localFullPath): + # 'localFullPath' is a directory (locally), but on the server it is + # a file -> remove the dir so that we can store the file. 
+ removeDirectoryTree(self.target, localFullPath) + print("Downloading '{}'".format(serverPath)) request = HTTPDownloadRequest(self, serverPath, localFullPath ) @@ -343,15 +405,26 @@ class TerraSync: return localFullPath = join(self.target, localPath) - if not os.path.exists( localFullPath ): - os.makedirs( localFullPath ) - localDirIndex = join(localFullPath, ".dirindex") - if dirIndexHash != None and hash_of_file(localDirIndex) == dirIndexHash: - # print("hash of dirindex matches, not downloading") + localDirIndexPresent = localDirIndexHasCorrectHash = False + + if os.path.isfile(localDirIndex): + localDirIndexPresent = True + + if hashForFile(localDirIndex) == dirIndexHash: + localDirIndexHasCorrectHash = True + else: + self.report.addDirIndexWithIncorrectHash(localDirIndex) + else: + self.report.addMissingDirIndex(localDirIndex) + + if localDirIndexPresent and localDirIndexHasCorrectHash: if not self.quick: self.handleDirindexFile( localDirIndex ) else: + if not os.path.exists(localFullPath): + os.makedirs(localFullPath) + request = HTTPDownloadRequest(self, serverPath + "/.dirindex", localDirIndex,