#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# main.py --- Main module for terrasync.py
#
# Copyright (C) 2016 Torsten Dreyer
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
# published by the Free Software Foundation; either version 2 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

import argparse
import enum
import hashlib
import os
import re
import shutil
import sys
import time
import urllib

from urllib.parse import urlparse, urljoin
from http.client import HTTPConnection, _CS_IDLE, HTTPException
from os import listdir
from os.path import isfile, isdir, join

from .exceptions import UserError, NetworkError, RepoDataError, \
                        InvalidDirIndexFile
from .virtual_path import VirtualPath


PROGNAME = os.path.basename(sys.argv[0])


class ExitStatus(enum.Enum):
    """Exit codes passed to sys.exit() by this module."""

    SUCCESS = 0
    # The program exit status is 1 when an exception isn't caught.
    ERROR = 1
    CHECK_MODE_FOUND_MISMATCH = 2


# *****************************************************************************
# *                             Utility functions                             *
# *****************************************************************************

# If a path matches this regexp, we really don't want to delete it recursively
# (“cre” stands for “compiled regexp”).
#
# The Windows alternative accepts both kinds of separators after the drive
# letter: inside a raw string, the class must be written [\\/] in order to
# contain a backslash ([\/] would only match a forward slash).
_removeDirectoryTree_dangerous_cre = re.compile(
    r"""^(/ (home (/ [^/]*)? )? /* |    # for Unix-like systems
         [a-zA-Z]: [\\/]*               # for Windows
      )$""", re.VERBOSE)


def removeDirectoryTree(base, whatToRemove):
    r"""Recursively remove directory 'whatToRemove', with safety checks.

    This function ensures that 'whatToRemove' does not resolve to a
    directory such as /, /home, /home/foobar, C:\, d:\, etc. It is also
    an error if 'whatToRemove' does not literally start with the value
    of 'base' (IOW, this function refuses to erase anything that is not
    under 'base').

    'whatToRemove' is *not* interpreted relatively to 'base' (this would
    be doable, just a different API).

    Raise AssertionError if 'base' is not a directory or if
    'whatToRemove' is not of the form <base><os.sep>..., and UserError
    if the absolute path of 'whatToRemove' looks like one of the
    dangerous directories mentioned above.

    """
    # These checks protect user data, therefore they are raised explicitly
    # instead of relying on 'assert' statements (which are stripped when
    # Python runs with -O). The exception type is kept for compatibility.
    if not os.path.isdir(base):
        raise AssertionError("Not a directory: {!r}".format(base))

    if not (base and whatToRemove.startswith(base) and
            whatToRemove[len(base):].startswith(os.sep)):
        raise AssertionError(
            "Unexpected base path for removeDirectoryTree(): {!r}"
            .format(base))

    absPath = os.path.abspath(whatToRemove)

    if _removeDirectoryTree_dangerous_cre.match(absPath):
        raise UserError("in order to protect your data, refusing to "
                        "recursively delete '{}'".format(absPath))

    shutil.rmtree(absPath)


def computeHash(fileLike):
    """Return the hexadecimal SHA-1 digest of the data read from 'fileLike'.

    'fileLike' must have a read(size) method returning bytes; it is read
    in chunks of 4096 bytes until read() returns b"".

    """
    sha1 = hashlib.sha1()

    for chunk in iter(lambda: fileLike.read(4096), b""):
        sha1.update(chunk)

    return sha1.hexdigest()


def hashForFile(fname):
    """Return the hexadecimal SHA-1 digest of the contents of file 'fname'."""
    with open(fname, "rb") as f:
        return computeHash(f)


# *****************************************************************************
# *                          Network-related classes                          *
# *****************************************************************************

class HTTPGetCallback:
    """Pair made of a server path and a callback to process its response."""

    def __init__(self, src, callback):
        """Initialize an HTTPGetCallback instance.

        src      -- a VirtualPath instance (corresponding to the path on
                    the server for which a GET request is to be issued)
        callback -- a function taking two parameters: the URL (string)
                    and an http.client.HTTPResponse instance. When
                    invoked, the callback return value will be returned
                    by HTTPGetter.get().

        """
        self.callback = callback
        self.src = src


class HTTPGetter:
    """Issue HTTP GET requests relative to a base URL on one connection."""

    def __init__(self, baseUrl, maxPending=10):
        """Initialize an HTTPGetter instance.

        baseUrl    -- URL of the repository root; its path component must
                      not end with a '/' (see assemblePath())
        maxPending -- stored in self.maxPending; like self.requests and
                      self.pendingRequests, it is not read anywhere else
                      in this module

        """
        self.baseUrl = baseUrl
        self.parsedBaseUrl = urlparse(baseUrl)
        self.maxPending = maxPending
        self.requests = []
        self.pendingRequests = []
        self.httpConnection = HTTPConnection(self.parsedBaseUrl.netloc)
        # Headers sent with every GET request
        self.httpRequestHeaders = {
            'Host': self.parsedBaseUrl.netloc,
            'Content-Length': 0,
            'Connection': 'Keep-Alive',
            'User-Agent': 'FlightGear terrasync.py',
        }

    def assemblePath(self, httpGetCallback):
        """Return the path-on-server for the file to download.

        Example: '/scenery/Airports/N/E/4/.dirindex'

        """
        assert not self.parsedBaseUrl.path.endswith('/'), \
            repr(self.parsedBaseUrl)
        return self.parsedBaseUrl.path + str(httpGetCallback.src)

    def assembleUrl(self, httpGetCallback):
        """Return the URL of the file to download."""
        baseUrl = self.parsedBaseUrl.geturl()
        assert not baseUrl.endswith('/'), repr(baseUrl)

        return urljoin(baseUrl + '/', httpGetCallback.src.asRelative())

    def doGet(self, httpGetCallback):
        """Issue one GET request and hand the response to the callback.

        Return whatever httpGetCallback.callback() returns. No retry is
        attempted here; see get() for that.

        """
        conn = self.httpConnection
        pathOnServer = self.assemblePath(httpGetCallback)
        conn.request("GET", pathOnServer, None, self.httpRequestHeaders)
        httpResponse = conn.getresponse()

        # 'httpResponse' is an http.client.HTTPResponse instance
        return httpGetCallback.callback(self.assembleUrl(httpGetCallback),
                                        httpResponse)

    def get(self, httpGetCallback):
        """Like doGet(), but retry when an HTTPException is raised.

        After a failed attempt, the connection is closed and reopened
        following a one-second pause. Once the initial attempt and 5
        retries have all failed, NetworkError is raised (chained to the
        last HTTPException).

        """
        nbRetries = nbRetriesLeft = 5

        while True:
            try:
                return self.doGet(httpGetCallback)
            except HTTPException as exc:
                if nbRetriesLeft == 0:
                    raise NetworkError(
                        "after {nbRetries} retries for URL {url}: {errMsg}"
                        .format(nbRetries=nbRetries,
                                url=self.assembleUrl(httpGetCallback),
                                errMsg=exc)) from exc

            # Try to reconnect
            self.httpConnection.close()
            time.sleep(1)
            self.httpConnection.connect()
            nbRetriesLeft -= 1


#################################################################################################################################

class DirIndex:
    """In-memory representation of a .dirindex file."""

    def __init__(self, dirIndexFile):
        """Read and check the .dirindex file whose path is 'dirIndexFile'.

        Raise InvalidDirIndexFile if the file has no 'path' field.

        """
        self.d = []             # entries from 'd:' lines (directories)
        self.f = []             # entries from 'f:' lines (files)
        self.t = []             # entries from 't:' lines (tarballs)
        self.version = 0
        self.path = None        # will be a VirtualPath instance when set

        # readFrom() stores the raw contents of the .dirindex file in this
        # attribute. This is useful for troubleshooting.
        self._rawContents = None

        with open(dirIndexFile) as f:
            self.readFrom(f)

        self._sanityCheck()

    def readFrom(self, readable):
        """Parse .dirindex data obtained from readable.read().

        Each line is a ':'-separated record whose first token tells the
        kind of record ('version', 'path', 'd', 'f' or 't'). Lines that
        start with '#', blank lines and records of any other kind are
        ignored.

        """
        self._rawContents = readable.read()

        for line in self._rawContents.split('\n'):
            line = line.strip()
            if not line or line.startswith('#'):
                continue

            tokens = line.split(':')
            kind = tokens[0]

            if kind == "version":
                self.version = int(tokens[1])
            elif kind == "path":
                # This is relative to the repository root
                self.path = VirtualPath(tokens[1])
            elif kind == "d":
                self.d.append({'name': tokens[1], 'hash': tokens[2]})
            elif kind == "f":
                self.f.append({'name': tokens[1], 'hash': tokens[2],
                               'size': tokens[3]})
            elif kind == "t":
                self.t.append({'name': tokens[1], 'hash': tokens[2],
                               'size': tokens[3]})

    def _sanityCheck(self):
        """Raise InvalidDirIndexFile if no 'path' field has been read."""
        if self.path is None:
            assert self._rawContents is not None

            firstLines = self._rawContents.split('\n')[:5]
            raise InvalidDirIndexFile(
                "no 'path' field found; the first lines of this .dirindex file "
                "follow:\n\n" + '\n'.join(firstLines))

    def getVersion(self):
        """Return the value of the 'version' field (0 if there was none)."""
        return self.version

    def getPath(self):
        """Return the 'path' field as a VirtualPath instance."""
        return self.path

    def getDirectories(self):
        """Return the list of directory entries (dicts: 'name', 'hash')."""
        return self.d

    def getTarballs(self):
        """Return the list of tarball entries (dicts: 'name', 'hash', 'size')."""
        return self.t

    def getFiles(self):
        """Return the list of file entries (dicts: 'name', 'hash', 'size')."""
        return self.f


#################################################################################################################################

class HTTPDownloadRequest(HTTPGetCallback):
    """HTTPGetCallback class whose callback saves the response to a file."""

    def __init__(self, terrasync, src, dst, callback = None ):
        """Initialize an HTTPDownloadRequest instance.

        terrasync -- a TerraSync instance
        src       -- a VirtualPath instance (corresponding to the path
                     on the server for which a GET request is to be
                     issued)
        dst       -- file path (or whatever open() accepts) where the
                     downloaded data is to be stored
        callback  -- a function that will be called if the download is
                     successful, or None if no such callback is desired.
                     The function must take one parameter: when invoked,
                     it will be passed this HTTPDownloadRequest
                     instance. Its return value is ignored.

        """
        super().__init__(src, self.callback)
        self.terrasync = terrasync
        self.dst = dst
        self.mycallback = callback

    # 'httpResponse' is an http.client.HTTPResponse instance
    def callback(self, url, httpResponse):
        """Write the body of 'httpResponse' to self.dst.

        Raise NetworkError if the HTTP status is not 200 or if an
        HTTPException occurs while the data is being read. On success,
        self.mycallback (if any) is called with this instance.

        """
        # I suspect this doesn't handle HTTP redirects and things like that. As
        # mentioned at <https://docs.python.org/3/library/http.client.html>,
        # http.client is a low-level interface that should normally not be used
        # directly!
        if httpResponse.status != 200:
            raise NetworkError("HTTP callback got status {status} for URL {url}"
                               .format(status=httpResponse.status, url=url))

        try:
            with open(self.dst, 'wb') as f:
                # Copy by chunks rather than loading the whole response
                # in memory.
                shutil.copyfileobj(httpResponse, f)
        except HTTPException as exc:
            raise NetworkError("for URL {url}: {error}"
                               .format(url=url, error=exc)) from exc

        if self.mycallback is not None:
            self.mycallback(self)


class HTTPSocketRequest(HTTPGetCallback):
    """HTTPGetCallback class whose callback returns a file-like object.

    The file-like object returned by the callback, and thus by
    HTTPGetter.get(), is a socket or similar. This allows one to read
    the data obtained from the network without necessarily storing it
    to a file.

    """

    def __init__(self, src):
        """Initialize an HTTPSocketRequest object.

        src -- VirtualPath instance for the resource on the server
               (presumably a file)

        """
        HTTPGetCallback.__init__(self, src, self.callback)

    def callback(self, url, httpResponse):
        """Return 'httpResponse'; raise NetworkError if its status isn't 200."""
        # Same comment as for HTTPDownloadRequest.callback()
        if httpResponse.status != 200:
            raise NetworkError("HTTP callback got status {status} for URL {url}"
                               .format(status=httpResponse.status, url=url))

        return httpResponse


#################################################################################################################################

class Coordinate:
    """Latitude/longitude pair."""

    def __init__(self, lat, lon):
        self.lat = lat
        self.lon = lon


class DownloadBoundaries:
    """Rectangle, in latitude and longitude, restricting what is processed."""

    def __init__(self, top, left, bottom, right):
        """Initialize a DownloadBoundaries instance.

        top, bottom -- maximum and minimum latitudes, in [-90, 90]
        left, right -- minimum and maximum longitudes, with left >= -180
                       and right < 180. 'right' may be less than 'left'
                       only when left >= 0 and right < 0 (region
                       wrapping across the antimeridian).

        Raise ValueError if these constraints are not satisfied.

        """
        if top < bottom:
            raise ValueError("top cannot be less than bottom")

        if right < left:
            # right may be less than left when wrapping across the antimeridian
            if not (left >= 0 and right < 0):
                raise ValueError("right cannot be less than left")

        if top > 90 or bottom < -90:
            raise ValueError("top and bottom must be a valid latitude")

        if left < -180 or right >= 180:
            raise ValueError("left and right must be a valid longitude")

        self.top = top
        self.left = left
        self.bottom = bottom
        self.right = right

    def is_coordinate_inside_boundaries(self, coordinate, isOuterBucket):
        """Tell whether a tile overlaps the boundaries.

        coordinate    -- Coordinate instance used as the bottom left
                         corner of the tile
        isOuterBucket -- if true, the tile is 10 units wide and high;
                         otherwise, it is 1 unit wide and high

        Return False if the tile and the boundaries don't overlap, True
        otherwise.

        """
        tileSize = 10 if isOuterBucket else 1
        bigTileBottom = coordinate.lat
        bigTileTop = bigTileBottom + tileSize
        bigTileLeft = coordinate.lon
        bigTileRight = bigTileLeft + tileSize

        # if the two regions do not overlap then we are done
        if bigTileTop <= self.bottom or bigTileBottom > self.top:
            return False

        if bigTileRight <= self.left or bigTileLeft > self.right:
            # check for spanning across the antimeridian
            if self.left >= 0 and self.right < 0:
                # determine which side we are on and check of region overlap
                if bigTileLeft >= 0:
                    if bigTileRight <= self.left:
                        return False
                elif bigTileLeft > self.right:
                    return False
            else:
                return False

        # at least a partial overlap exists, so more processing will be needed
        return True


def parse_terrasync_coordinate(coordinate):
    """Convert a string such as 'w123n45' to a Coordinate instance.

    The string must start with 'w' or 'e' followed by three digits
    (longitude), then 'n' or 's' followed by two digits (latitude);
    anything after that is ignored. West longitudes and south latitudes
    are returned as negative numbers.

    Return None if 'coordinate' doesn't start this way.

    """
    matches = re.match(r"(w|e)(\d{3})(n|s)(\d{2})", coordinate)
    if not matches:
        return None

    lon = int(matches.group(2))
    if matches.group(1) == "w":
        lon *= -1

    lat = int(matches.group(4))
    if matches.group(3) == "s":
        lat *= -1

    return Coordinate(lat, lon)


class Report:
    """Gather and format data about the state of a TerraSync mirror."""

    def __init__(self, targetDir):
        self.targetDir = targetDir

        self.dirsWithMissingIndex = set()
        self.dirsWithMismatchingDirIndexHash = set()
        self.missingFiles = set()
        self.filesWithMismatchingHash = set()
        self.dirsSkippedDueToBoundaries = set()
        self.orphanFiles = set()
        self.orphanDirs = set()

    def addMissingDirIndex(self, directoryVirtualPath):
        self.dirsWithMissingIndex.add(directoryVirtualPath)

    def addDirIndexWithMismatchingHash(self, directoryVirtualPath):
        self.dirsWithMismatchingDirIndexHash.add(directoryVirtualPath)

    def addMissingFile(self, virtualPath):
        self.missingFiles.add(virtualPath)

    def addFileWithMismatchingHash(self, virtualPath):
        self.filesWithMismatchingHash.add(virtualPath)

    def addSkippedDueToBoundaries(self, virtualPath):
        self.dirsSkippedDueToBoundaries.add(virtualPath)

    def addOrphanFile(self, virtualPath):
        self.orphanFiles.add(virtualPath)

    def addOrphanDir(self, virtualPath):
        self.orphanDirs.add(virtualPath)

    def summaryString(self):
        """Return the report as a string, with one section per category.

        A non-empty category is rendered as its heading followed by the
        sorted list of its elements; an empty one as '<heading>: none'.

        """
        reportElements = [
            ("Directories with missing index", self.dirsWithMissingIndex),
            ("Directories whose .dirindex file had a mismatching hash",
             self.dirsWithMismatchingDirIndexHash),
            ("Missing files", self.missingFiles),
            ("Files with a mismatching hash", self.filesWithMismatchingHash),
            ("Directories skipped because of the specified boundaries",
             self.dirsSkippedDueToBoundaries),
            ("Orphan files", self.orphanFiles),
            ("Orphan directories", self.orphanDirs)]

        lines = []
        for heading, setOfFilesOrDirs in reportElements:
            if setOfFilesOrDirs:
                lines.append(heading + ":\n")
                lines.extend(" " + str(f) for f in sorted(setOfFilesOrDirs))
                lines.append('')    # ensure a blank line follows the list
            else:
                lines.append(heading + ": none")

        return '\n'.join(lines)

    def printReport(self):
        """Print an underlined title followed by summaryString()."""
        title = "{prg} report".format(prg=PROGNAME)
        print("\n" + title + '\n' + len(title)*"=", end="\n\n")
        print(self.summaryString())


@enum.unique
class FailedCheckReason(enum.Enum):
    """Reasons that can cause 'check' mode to report a mismatch.

    Note that network errors and things like that do *not* belong here.

    """

    missingDirIndexFile, mismatchingHashForDirIndexFile, \
        missingNormalFile, mismatchingHashForNormalFile, \
        orphanFile, orphanDirectory = range(6)

    # 'path': VirtualPath instance for a file or directory
    def explain(self, path):
        """Return a user-oriented sentence fragment for this reason."""
        if self is FailedCheckReason.missingDirIndexFile:
            res = ".dirindex file '{}' is missing locally".format(path)
        elif self is FailedCheckReason.mismatchingHashForDirIndexFile:
            res = ".dirindex file '{}' doesn't have the hash it " \
                  "should have according to the server".format(path)
        elif self is FailedCheckReason.missingNormalFile:
            res = "file '{}' is present on the server but missing locally" \
                  .format(path)
        elif self is FailedCheckReason.mismatchingHashForNormalFile:
            res = "file '{}' doesn't have the hash given in the " \
                  ".dirindex file of its containing directory".format(path)
        elif self is FailedCheckReason.orphanFile:
            res = "file '{}' was found locally but is not present on the " \
                  "server".format(path)
        elif self is FailedCheckReason.orphanDirectory:
            res = "directory '{}' was found locally but is not present " \
                  "on the server".format(path)
        else:
            assert False, "Unhandled enum value: {!r}".format(self)

        return res


class TerraSync:
    """Compare a local directory with a TerraSync server, or update it."""

    @enum.unique
    class Mode(enum.Enum):
        """Main modes of operation for the TerraSync class."""

        # Using lower case for the member names, because this way
        # enumMember.name is exactly the mode string passed to --mode on the
        # command line (can be useful for messages destined to users).
        check, sync = range(2)

    def __init__(self, mode, doReport, url, target, quick, removeOrphan,
                 downloadBoundaries):
        """Initialize a TerraSync instance.

        mode               -- name of a TerraSync.Mode member ('check'
                              or 'sync')
        doReport           -- whether abortCheckMode() prints the report
                              before exiting
        url                -- URL of the repository root on the server
        target             -- local directory mirroring the repository
        quick              -- if true, a directory whose local .dirindex
                              file has the expected hash is not
                              processed any further
        removeOrphan       -- in 'sync' mode, whether local files and
                              directories that are absent from the
                              server are deleted
        downloadBoundaries -- DownloadBoundaries instance

        """
        self.mode = self.Mode[mode]
        self.doReport = doReport
        self.setUrl(url).setTarget(target)
        self.quick = quick
        self.removeOrphan = removeOrphan
        self.httpGetter = None
        self.downloadBoundaries = downloadBoundaries
        # Status of the local repository (as compared to what the server says),
        # before any update we might do to it.
        self.report = Report(self.target)

    def inSyncMode(self):
        """Tell whether this instance operates in 'sync' mode."""
        return self.mode == self.Mode.sync

    def setUrl(self, url):
        """Set the server URL, without surrounding spaces nor trailing '/'.

        Return self.

        """
        # Strip the whitespace first: otherwise, a '/' followed by trailing
        # whitespace would be left in place.
        self.url = url.strip().rstrip('/')
        return self

    def setTarget(self, target):
        """Set the local target directory (made absolute); return self."""
        # Using os.path.abspath() here is safer in case the process later uses
        # os.chdir(), which would change the meaning of the "." directory.
        self.target = os.path.abspath(target)
        return self

    def start(self, virtualSubdir=VirtualPath('/')):
        """Start the 'sync' or 'check' process.

        The 'virtualSubdir' argument must be a VirtualPath instance and
        allows one to start the 'sync' or 'check' process in a chosen
        subdirectory of the TerraSync repository, instead of at its
        root.

        Return the Report instance filled during the process.

        """
        # Remove the leading '/' from 'virtualSubdir' and convert to native
        # separators ('/' or '\' depending on the platform).
        localSubdir = os.path.normpath(virtualSubdir.asRelative())
        if localSubdir == ".":          # just ugly, but it wouldn't hurt
            localSubdir = ""

        assert not os.path.isabs(localSubdir), repr(localSubdir)
        self.httpGetter = HTTPGetter(self.url)

        # Get the hash of the .dirindex file for 'virtualSubdir'
        try:
            request = HTTPSocketRequest(virtualSubdir / ".dirindex")
            with self.httpGetter.get(request) as fileLike:
                dirIndexHash = computeHash(fileLike)
        except HTTPException as exc:
            raise NetworkError("for the root .dirindex file: {errMsg}"
                               .format(errMsg=exc)) from exc

        # Process the chosen part of the repository (recursive)
        self.processDirectoryEntry(virtualSubdir, localSubdir, dirIndexHash)

        return self.report

    def processFileEntry(self, virtualPath, localPath, fileHash):
        """Process a file entry from a .dirindex file.

        virtualPath -- VirtualPath instance for the file on the server
        localPath   -- path of the file relative to self.target
        fileHash    -- hash of the file according to the .dirindex file

        Nothing is done if the local file exists and has this hash.
        Otherwise, the problem is recorded in the report; then the file
        is downloaded ('sync' mode) or the program exits ('check' mode).

        """
        localFullPath = join(self.target, localPath)
        failedCheckReason = None

        if not os.path.isfile(localFullPath):
            self.report.addMissingFile(virtualPath)
            failedCheckReason = FailedCheckReason.missingNormalFile
        elif hashForFile(localFullPath) != fileHash:
            self.report.addFileWithMismatchingHash(virtualPath)
            failedCheckReason = FailedCheckReason.mismatchingHashForNormalFile
        else:
            # The file exists and has the hash mentioned in the .dirindex file
            return

        assert failedCheckReason is not None

        if self.inSyncMode():
            if os.path.isdir(localFullPath):
                # 'localFullPath' is a directory (locally), but on the server
                # it is a file -> remove the dir so that we can store the file.
                removeDirectoryTree(self.target, localFullPath)

            print("Downloading '{}'".format(virtualPath))
            request = HTTPDownloadRequest(self, virtualPath, localFullPath)
            self.httpGetter.get(request)
        else:
            self.abortCheckMode(failedCheckReason, virtualPath)

    def processDirectoryEntry(self, virtualPath, localPath, dirIndexHash):
        """Process a directory entry from a .dirindex file.

        virtualPath  -- VirtualPath instance for the directory on the
                        server
        localPath    -- path of the directory relative to self.target
        dirIndexHash -- expected hash of the .dirindex file of this
                        directory

        A directory whose name is recognized by
        parse_terrasync_coordinate() and which doesn't overlap
        self.downloadBoundaries is skipped (and recorded in the report).

        """
        print("Processing '{}'...".format(virtualPath))
        isOuterBucket = len(virtualPath.parts) <= 3
        coord = parse_terrasync_coordinate(virtualPath.name)

        if (coord and
            not self.downloadBoundaries.is_coordinate_inside_boundaries(
                coord, isOuterBucket)):
            self.report.addSkippedDueToBoundaries(virtualPath)
            return

        localFullPath = join(self.target, localPath)
        localDirIndex = join(localFullPath, ".dirindex")
        failedCheckReason = None

        if not os.path.isfile(localDirIndex):
            failedCheckReason = FailedCheckReason.missingDirIndexFile
            self.report.addMissingDirIndex(virtualPath)
        elif hashForFile(localDirIndex) != dirIndexHash:
            failedCheckReason = FailedCheckReason.mismatchingHashForDirIndexFile
            self.report.addDirIndexWithMismatchingHash(virtualPath)

        if failedCheckReason is None:
            # The local .dirindex file is up-to-date; in quick mode, this is
            # enough to consider the whole directory as up-to-date.
            if not self.quick:
                self.handleDirindexFile(localDirIndex)
        elif self.inSyncMode():
            if not os.path.exists(localFullPath):
                os.makedirs(localFullPath)

            # The callback processes the .dirindex file once downloaded.
            request = HTTPDownloadRequest(self,
                                          virtualPath / ".dirindex",
                                          localDirIndex,
                                          self.handleDirindexRequest)
            self.httpGetter.get(request)
        else:
            self.abortCheckMode(failedCheckReason, virtualPath / ".dirindex")

    def handleDirindexRequest(self, dirindexRequest):
        """Process the .dirindex file saved by an HTTPDownloadRequest."""
        self.handleDirindexFile(dirindexRequest.dst)

    def handleDirindexFile(self, dirindexFile):
        """Process all entries of the local .dirindex file 'dirindexFile'.

        Files, subdirectories and tarballs listed in the file are
        processed first (recursively for subdirectories). Then, local
        files and subdirectories that are not listed ("orphans") are
        recorded in the report; in 'sync' mode, they are deleted if
        self.removeOrphan is true, whereas in 'check' mode, the first
        one found causes the program to exit.

        """
        dirIndex = DirIndex(dirindexFile)
        virtualBase = dirIndex.getPath()            # VirtualPath instance
        relativeBase = virtualBase.asRelative()     # string, doesn't start with '/'
        # Sets, because they are only used for membership tests
        serverFiles = set()
        serverDirs = set()

        for file in dirIndex.getFiles():
            f = file['name']
            self.processFileEntry(virtualBase / f,
                                  join(relativeBase, f),
                                  file['hash'])
            serverFiles.add(f)

        for subdir in dirIndex.getDirectories():
            d = subdir['name']
            self.processDirectoryEntry(virtualBase / d,
                                       join(relativeBase, d),
                                       subdir['hash'])
            serverDirs.add(d)

        for tarball in dirIndex.getTarballs():
            # Tarballs are handled the same as normal files.
            f = tarball['name']
            self.processFileEntry(virtualBase / f,
                                  join(relativeBase, f),
                                  tarball['hash'])
            serverFiles.add(f)

        localFullPath = join(self.target, relativeBase)
        localFiles = [ f for f in listdir(localFullPath)
                       if isfile(join(localFullPath, f)) ]

        for f in localFiles:
            if f != ".dirindex" and f not in serverFiles:
                virtualPath = virtualBase / f
                self.report.addOrphanFile(virtualPath)

                if self.inSyncMode():
                    if self.removeOrphan:
                        os.remove(join(self.target, virtualPath.asRelative()))
                else:
                    self.abortCheckMode(FailedCheckReason.orphanFile,
                                        virtualPath)

        localDirs = [ f for f in listdir(localFullPath)
                      if isdir(join(localFullPath, f)) ]

        for d in localDirs:
            if d not in serverDirs:
                virtualPath = virtualBase / d
                self.report.addOrphanDir(virtualPath)

                if self.inSyncMode():
                    if self.removeOrphan:
                        removeDirectoryTree(self.target,
                                            join(self.target,
                                                 virtualPath.asRelative()))
                else:
                    self.abortCheckMode(FailedCheckReason.orphanDirectory,
                                        virtualPath)

    # 'reason' is a member of the FailedCheckReason enum
    def abortCheckMode(self, reason, fileOrDirVirtualPath):
        """Explain why 'check' mode found a mismatch, then exit.

        The report is printed before exiting if self.doReport is true.
        The exit status is ExitStatus.CHECK_MODE_FOUND_MISMATCH.

        """
        assert self.mode == self.Mode.check, repr(self.mode)

        print("{prg}: exiting from 'check' mode because {explanation}."
              .format(prg=PROGNAME,
                      explanation=reason.explain(fileOrDirVirtualPath)))

        if self.doReport:
            self.report.printReport()

        sys.exit(ExitStatus.CHECK_MODE_FOUND_MISMATCH.value)


#################################################################################################################################

def parseCommandLine():
    """Parse and check the command line; return the argparse namespace.

    In addition to the attributes defined by the options, the returned
    object has a 'virtualSubdir' attribute (VirtualPath instance built
    from the value of --only-subdir).

    The program exits with status ExitStatus.ERROR if the arguments are
    inconsistent.

    """
    parser = argparse.ArgumentParser()

    parser.add_argument("-u", "--url", dest="url", metavar="URL",
                        default="http://flightgear.sourceforge.net/scenery",
                        help="server URL [default: %(default)s]")

    parser.add_argument("-t", "--target", dest="target", metavar="DIR",
                        default=".", help="""\
      directory where to store the files [default: the current directory]""")

    parser.add_argument("--only-subdir", dest="onlySubdir", metavar="SUBDIR",
                        default="", help="""\
      restrict processing to this subdirectory of the TerraSync repository. Use
      a path relative to the repository root, for instance 'Models/Residential'
      [default: process the whole repository]""")

    parser.add_argument("-q", "--quick", dest="quick", action="store_true",
                        default=False, help="enable quick mode")

    parser.add_argument("-r", "--remove-orphan", dest="removeOrphan",
                        action="store_true",
                        default=False, help="remove old scenery files")

    parser.add_argument("--mode", default="sync", choices=("check", "sync"),
                        help="""\
      main mode of operation (default: '%(default)s'). In 'sync' mode, contents
      is downloaded from the server to the target directory. On the other hand,
      in 'check' mode, {progname} compares the contents of the target directory
      with the remote repository without writing nor deleting anything on
      disk.""".format(progname=PROGNAME))

    parser.add_argument("--report", dest="report", action="store_true",
                        default=False, help="""\
      before normal exit, print a report of what was found""")

    parser.add_argument("--top", dest="top", type=int, default=90, help="""\
      maximum latitude to include in download [default: %(default)d]""")

    parser.add_argument("--bottom", dest="bottom", type=int, default=-90,
                        help="""\
      minimum latitude to include in download [default: %(default)d]""")

    parser.add_argument("--left", dest="left", type=int, default=-180,
                        help="""\
      minimum longitude to include in download [default: %(default)d]""")

    parser.add_argument("--right", dest="right", type=int, default=179,
                        help="""\
      maximum longitude to include in download [default: %(default)d]""")

    args = parser.parse_args()

    # Perform consistency checks on the arguments
    if args.mode == "check" and args.removeOrphan:
        print("{prg}: 'check' mode is read-only and thus doesn't make sense "
              "with\noption --remove-orphan (-r)".format(prg=PROGNAME),
              file=sys.stderr)
        sys.exit(ExitStatus.ERROR.value)

    # Replace backslashes with forward slashes, remove leading and trailing
    # slashes, collapse consecutive slashes. Yes, this implies that we tolerate
    # leading slashes for --only-subdir (which makes sense because virtual
    # paths are printed like that by this program, therefore it is natural for
    # users to copy & paste such paths in order to use them for --only-subdir).
    args.virtualSubdir = VirtualPath(args.onlySubdir.replace('\\', '/'))

    # Be nice to our user in case the path starts with '\', 'C:\', etc.
    if os.path.isabs(args.virtualSubdir.asRelative()):
        print("{prg}: option --only-subdir expects a *relative* path, but got "
              "'{subdir}'".format(prg=PROGNAME, subdir=args.onlySubdir),
              file=sys.stderr)
        sys.exit(ExitStatus.ERROR.value)

    return args


def main():
    """Program entry point: parse the command line and run TerraSync.

    The report is printed if --report was given; the process then exits
    with status ExitStatus.SUCCESS.

    """
    args = parseCommandLine()
    terraSync = TerraSync(args.mode, args.report, args.url, args.target,
                          args.quick, args.removeOrphan,
                          DownloadBoundaries(args.top, args.left, args.bottom,
                                             args.right))
    report = terraSync.start(args.virtualSubdir)

    if args.report:
        report.printReport()

    sys.exit(ExitStatus.SUCCESS.value)