Much better terrasync.py

- tortellini instead of spaghetti code (use oop) - reuse connection
2016-05-18 12:51:29 +02:00 · 2016-05-18 12:51:29 +02:00 · 6921c98933
commit 6921c98933
parent a4a3659ee9
1 changed files with 176 additions and 129 deletions
--- a/scripts/python/terrasync.py
+++ b/scripts/python/terrasync.py
@ -19,27 +19,113 @@
 #
 # terrasync.py - synchronize terrascenery data to your local disk
 # needs dnspython (pip install dnspython)
 #
-import os 
+import urllib, os, hashlib
-import hashlib
+from urllib.parse import urlparse
-import urllib.request
+from http.client import HTTPConnection, _CS_IDLE
 from os import listdir
 from os.path import isfile, join
-dirindex = ".dirindex"
+#################################################################################################################################
-DIRINDEXVERSION = 1
+class HTTPGetCallback:
    def __init__(self, src, callback):
        self.callback = callback
        self.src = src
        self.result = None
-URL="http://flightgear.sourceforge.net/scenery"
class HTTPGetter:
    """Serve GET requests below one base URL over a single reused connection.

    Requests are executed synchronously: ``get()`` returns after the
    request's callback has run.
    """

    def __init__(self, baseUrl, maxPending=10):
        """Prepare (but do not yet open) the connection to ``baseUrl``.

        baseUrl    -- http URL every request path is appended to
        maxPending -- stored on the instance; not consulted by ``get()``
        """
        self.baseUrl = baseUrl
        self.parsedBaseUrl = urlparse(baseUrl)
        self.maxPending = maxPending
        self.requests = []
        self.pendingRequests = []
        # Pass only the netloc: HTTPConnection splits "host:port" itself and
        # falls back to port 80.  A third positional argument would be taken
        # as the socket timeout on current Python versions, so none is given.
        self.httpConnection = HTTPConnection(self.parsedBaseUrl.netloc)
        self.httpRequestHeaders = {'Host':self.parsedBaseUrl.netloc,'Content-Length':0,'Connection':'Keep-Alive'}

    def get(self, httpGetCallback):
        """Fetch ``httpGetCallback.src`` and hand the response to the request.

        The response object is stored in ``httpGetCallback.result`` and then
        ``httpGetCallback.callback()`` is invoked.  If the first attempt
        fails on the connection level, the connection is re-opened and the
        request is retried once; a second failure propagates to the caller.
        """
        from http.client import HTTPException

        conn = self.httpConnection
        url = self.parsedBaseUrl.path + httpGetCallback.src
        try:
            conn.request("GET", url, None, self.httpRequestHeaders)
            response = conn.getresponse()
        except (HTTPException, OSError):
            # A kept-alive connection may have been dropped by the server:
            # reconnect and try exactly once more.
            conn.close()
            conn.connect()
            conn.request("GET", url, None, self.httpRequestHeaders)
            response = conn.getresponse()

        httpGetCallback.result = response
        httpGetCallback.callback()
 #################################################################################################################################
 class DirIndex:
    """Parsed content of one ``.dirindex`` file.

    Each line is a colon-separated record whose first field selects its
    meaning: ``version:<n>``, ``path:<path>``, ``d:<name>:<hash>`` or
    ``f:<name>:<hash>[:<size>]``.  Lines starting with ``#`` and blank
    lines are ignored, as are records of any other kind.
    """

    def __init__(self, dirIndexFile):
        """Read and parse the file named ``dirIndexFile``."""
        self.d = []
        self.f = []
        self.version = 0
        self.path = ""
        with open(dirIndexFile) as f:
            self.readFrom(f)

    def readFrom(self, readable):
        """Parse records from ``readable``, an iterable of text lines.

        Entries are appended to what is already stored, so the method may
        be called more than once.  Raises ValueError when the version is
        not an integer and IndexError when a record lacks its name or hash.
        """
        for line in readable:
            line = line.strip()
            # str.split() never yields an empty list, so blank lines have to
            # be recognised before splitting.
            if not line or line.startswith('#'):
                continue

            tokens = line.split(':')
            kind = tokens[0]
            if kind == "version":
                self.version = int(tokens[1])
            elif kind == "path":
                self.path = tokens[1]
            elif kind == "d":
                self.d.append({ 'name': tokens[1], 'hash': tokens[2] })
            elif kind == "f":
                # The size is informational only; tolerate entries without it
                # instead of aborting the whole directory.
                size = tokens[3] if len(tokens) > 3 else None
                self.f.append({ 'name': tokens[1], 'hash': tokens[2], 'size': size })

    def getVersion(self):
        """Return the index format version (0 if the file declared none)."""
        return self.version

    def getPath(self):
        """Return the directory path declared by the index ("" if none)."""
        return self.path

    def getDirectories(self):
        """Return the sub-directory entries as dicts with 'name' and 'hash'."""
        return self.d

    def getFiles(self):
        """Return the file entries as dicts with 'name', 'hash' and 'size'."""
        return self.f
 #################################################################################################################################
 class HTTPDownloadRequest(HTTPGetCallback):
    """GET request whose response body is stored in a local file."""

    def __init__(self, terrasync, src, dst, callback = None ):
        """Describe one download.

        terrasync -- the object issuing the request (kept for reference)
        src       -- server path to fetch, relative to the getter's base URL
        dst       -- local file the body is written to
        callback  -- optional callable, invoked with this request after the
                     file has been written
        """
        # The base class stores the bound method below as the completion hook.
        super().__init__(src, self.callback)
        self.terrasync = terrasync
        self.dst = dst
        self.mycallback = callback

    def callback(self):
        """Write the received body to ``dst`` and notify the optional callback.

        Raises IOError when the server did not answer with status 200, so an
        error page is never stored in place of the requested file.
        """
        response = self.result
        # Read the body in every case: the shared keep-alive connection
        # cannot issue its next request before the response is consumed.
        body = response.read()
        if response.status != 200:
            raise IOError("HTTP {} {} while downloading {}".format(
                response.status, response.reason, self.src))

        with open(self.dst, 'wb') as f:
            f.write(body)

        if self.mycallback is not None:
            self.mycallback(self)
 #################################################################################################################################
 def hash_of_file(fname):
    if not os.path.exists( fname ):
      return None
@ -53,85 +139,92 @@ def fn_hash_of_file(fname):
    return hash.hexdigest()
-########################################################################
+#################################################################################################################################
-def do_download_file( _url, _path, _localfile, _hash, _force ):
class TerraSync:
    """Mirror a scenery tree from an HTTP server into a local directory.

    Every server directory carries a ``.dirindex`` file listing its files
    and sub-directories together with their hashes; only entries whose local
    hash differs are downloaded.
    """

    def __init__(self, url="http://flightgear.sourceforge.net/scenery", target=".", quick=False, removeOrphan=False):
        """Configure a synchronisation run; nothing is transferred yet.

        url          -- base URL of the scenery server
        target       -- local directory receiving the data
        quick        -- if true, a directory whose .dirindex hash matches is
                        not descended into
        removeOrphan -- if true, local files not listed by the server are
                        deleted
        """
        self.setUrl(url).setTarget(target)
        self.quick = quick
        self.removeOrphan = removeOrphan
        self.httpGetter = None

    def setUrl(self, url):
        """Set the server base URL (whitespace and trailing '/' removed)."""
        # Strip whitespace first, otherwise a slash followed by blanks survives.
        self.url = url.strip().rstrip('/')
        return self

    def setTarget(self, target):
        """Set the local target directory (whitespace and trailing '/' removed)."""
        target = target.strip()
        # Keep the filesystem root "/" instead of collapsing it to "".
        self.target = target.rstrip('/') or target[:1]
        return self

    def start(self):
        """Create the HTTP getter and synchronise the tree from its root."""
        self.httpGetter = HTTPGetter(self.url)
        self.updateDirectory("", "", None )

    def updateFile(self, serverPath, localPath, fileHash ):
        """Download ``serverPath`` unless the local copy already has ``fileHash``."""
        localFullPath = join(self.target, localPath)
        if fileHash is not None and hash_of_file(localFullPath) == fileHash:
            return

        print("downloading ", serverPath )

        request = HTTPDownloadRequest(self, serverPath, localFullPath )
        self.httpGetter.get(request)

    def updateDirectory(self, serverPath, localPath, dirIndexHash):
        """Synchronise one directory, creating it locally if necessary.

        When the local .dirindex already has ``dirIndexHash`` it is reused
        (and in quick mode the directory is skipped altogether); otherwise a
        fresh .dirindex is downloaded and processed.
        """
        print("processing ", serverPath)

        localFullPath = join(self.target, localPath)
        os.makedirs( localFullPath, exist_ok=True )

        localDirIndex = join(localFullPath, ".dirindex")
        if dirIndexHash is not None and hash_of_file(localDirIndex) == dirIndexHash:
            if not self.quick:
                self.handleDirindexFile( localDirIndex )
        else:
            request = HTTPDownloadRequest(self, serverPath + "/.dirindex", localDirIndex, self.handleDirindexRequest )
            self.httpGetter.get(request)

    def handleDirindexRequest(self, dirindexRequest):
        """Completion hook of a .dirindex download: process the stored file."""
        self.handleDirindexFile(dirindexRequest.dst)

    def handleDirindexFile(self, dirindexFile):
        """Update all files, then all sub-directories, listed in ``dirindexFile``.

        Afterwards, if orphan removal is enabled, regular local files that
        the index does not list (other than .dirindex itself) are deleted.
        """
        dirIndex = DirIndex(dirindexFile)
        indexPath = dirIndex.getPath()
        serverFiles = set()

        for entry in dirIndex.getFiles():
            name = entry['name']
            self.updateFile( "/" + indexPath + "/" + name, join(indexPath, name), entry['hash'] )
            serverFiles.add(name)

        for entry in dirIndex.getDirectories():
            name = entry['name']
            self.updateDirectory( "/" + indexPath + "/" + name, join(indexPath, name), entry['hash'] )

        if self.removeOrphan:
            localFullPath = join(self.target, indexPath)
            for name in listdir(localFullPath):
                candidate = join(localFullPath, name)
                if name != ".dirindex" and name not in serverFiles and isfile(candidate):
                    os.remove( candidate )

    def isReady(self):
        """Return True once started and the getter has no outstanding work."""
        getter = self.httpGetter
        if getter is None:
            return False
        # A getter without an isReady() method is treated as ready: it is
        # assumed to have completed every request by the time get() returned.
        probe = getattr(getter, "isReady", None)
        return bool(probe()) if callable(probe) else True

    def update(self):
        """Let the getter make progress, if it offers an update() method."""
        pump = getattr(self.httpGetter, "update", None)
        if callable(pump):
            pump()

# Import line retained from the replaced span.
import getopt, sys, random, re
 #################################################################################################################################
 import getopt, sys
 try:
  opts, args = getopt.getopt(sys.argv[1:], "u:t:qr", [ "url=", "target=", "quick", "remove-orphan" ])
@ -139,64 +232,18 @@ except getopt.GetoptError:
  print("terrasync.py [--url=http://some.server.org/scenery] [--target=/some/path] [-q|--quick] [-r|--remove-orphan]")
  sys.exit(2)
 terraSync = TerraSync()
 for opt, arg in opts:
-  if opt in( "-u", "--url"):
+  if opt in("-u", "--url"):
-    URL = arg
+    terraSync.url = arg
-  elif opt in ( "-t", "--target"):
+  elif opt in ("-t", "--target"):
-    TARGET= arg
+    terraSync.target = arg
  elif opt in ("-q", "--quick"):
-    QUICK = True
+    terraSync.quick = True
  elif opt in ("-r", "--remove-orphan"):
-    REMOVE_ORPHAN = True
+    terraSync.removeOrphan = True
-# automatic URL lookup from DNS NAPTR
+terraSync.start()
 # - lookup terrasync.flightgear.org, type=NAPTR, service="ws20", flags="U"
 # - sort by order,preference ascending
 # - pick entries with lowest order and preference
 # - randomly pick one of those
 # - use regexp fields URL
 if URL == "automatic":
  import dns.resolver
  dnsResolver = dns.resolver.Resolver()
  order = -1
  preference = -1
  # find lowest preference/order for service 'ws20' and flags 'U'
  dnsAnswer = dnsResolver.query("terrasync.flightgear.org", "NAPTR" )
  for naptr in dnsAnswer:
    if naptr.service != b'ws20' or naptr.flags != b'U':
      continue
    if order == -1 or naptr.order < order:
      order = naptr.order
      preference = naptr.preference
    if order == naptr.order:
      if naptr.preference < preference:
        preference = naptr.preference
  # grab candidates
  candidates = []
  for naptr in dnsAnswer:
    if naptr.service != b'ws20' or naptr.flags != b'U' or naptr.preference != preference or naptr.order != order:
      continue
    candidates.append( naptr.regexp.decode('utf-8') )
  if not candidates:
    print("sorry, no terrascenery URLs found. You may specify one with --url=http://some.url.org/foo")
    sys.exit(3)
  _url  = random.choice(candidates)
  _subst = _url.split(_url[0]) # split string, first character is separator <sep>regex<sep>replacement<sep>
  URL = re.sub(_subst[1], _subst[2], "" ) # apply regex substitude on empty string
 print( "terrasyncing from ", URL, "to ", TARGET )
 do_terrasync( URL, "", TARGET, None )
 ########################################################################