fgdata_checkers: add text encoding checker,

add more options to create_reduced_fgdata
2014-07-28 09:10:12 +01:00 · 2014-07-28 09:10:12 +01:00 · 841a58097b
commit 841a58097b
parent 1514eaee73
1 changed files with 205 additions and 75 deletions
--- a/fgdata_checkers.py
+++ b/fgdata_checkers.py
@ -1,6 +1,8 @@
 #!/usr/bin/env python3
 from __future__ import print_function#defaults to Python 3, but should also work in 2.7
+"""Functions for checking fgdata for various problems (and one for creating smaller/split versions of it)

+By Rebecca Palmer"""
 import os
 import os.path
 import re
@ -8,21 +10,44 @@ from collections import defaultdict
 import subprocess
 import math
 import tarfile
-
+import gzip
+def path_join(*args):
+    """Join path components, then normalise the result
+
+    Unlike plain os.path.join, this doesn't leave a trailing / if the last component is empty (os.path.normpath removes it)"""
+    joined=os.path.join(*args)
+    return os.path.normpath(joined)
 def rfilelist(path,exclude_dirs=[]):
    """Dict of files/sizes in path, including those in any subdirectories (as relative paths)"""
    files=defaultdict(int)
    dirs=[""]
    while dirs:
        cdir=dirs.pop()
-        cdirfiles=os.listdir(os.path.join(path,cdir))
+        cdirfiles=os.listdir(path_join(path,cdir))
        for file in cdirfiles:
-            if os.path.isdir(os.path.join(path,cdir,file)):
-                if os.path.join(cdir,file) not in exclude_dirs:
-                    dirs.append(os.path.join(cdir,file))
+            if os.path.isdir(path_join(path,cdir,file)):
+                if path_join(cdir,file) not in exclude_dirs:
+                    dirs.append(path_join(cdir,file))
            else:
-                files[os.path.join(cdir,file)]=os.path.getsize(os.path.join(path,cdir,file))
+                files[path_join(cdir,file)]=os.path.getsize(path_join(path,cdir,file))
    return files
+def strip_comments(text,comment_types=None,filename=None):
+    """Remove comments from text
+
+    text: str, bytes or bytearray
+    comment_types: sequence of (start marker, end marker, replacement for the end marker) triples, applied in order; if None, chosen from the extension of filename (<!-- --> for .xml/.eff, // and /* */ for C/C++/shader sources, # for .nas, nothing for any other extension)
+    filename: only used to choose comment_types when that is None
+    Raises TypeError if both comment_types and filename are None
+
+    Everything from a start marker up to and including the next end marker is replaced by the replacement, so line comments keep their newline; an unterminated comment is dropped up to the next start marker (or the end of the text)
+    Assumes comments don't nest (including different types of comments: will be wrong for e.g. /* aaa // bbb */ will-remove-this in C++ if // are removed first)
+    Doesn't check for being inside a string literal, and doesn't check for line-start * in C /* ... */"""
+    if comment_types is None:
+        if filename is None:
+            raise TypeError("must give either filename or comment_types")
+        ext=os.path.splitext(filename)[1]
+        if ext in (".xml",".eff"):
+            comment_types=(("<!--","-->",""),)
+        elif ext in (".c",".cpp",".cxx",".h",".hpp",".hxx",".frag",".vert"):
+            comment_types=(("//","\n","\n"),("/*","*/",""))
+        elif ext in (".nas",):
+            comment_types=(("#","\n","\n"),)
+        else:
+            comment_types=[]
+    if isinstance(text,(bytes,bytearray)):
+        #markers must be the same type as the text; ones already given as bytes are kept as they are
+        comment_types=[[c if isinstance(c,(bytes,bytearray)) else bytes(c,encoding="ascii") for c in ct] for ct in comment_types]
+    for comment_type in comment_types:
+        pieces=text.split(comment_type[0])
+        kept=[pieces[0]]#text before the first start marker
+        for piece in pieces[1:]:
+            if comment_type[1] in piece:
+                #the replacement goes where each comment ended (join() would only put it between comments, losing the newline after a line comment)
+                kept.append(comment_type[2])
+                kept.append(piece.split(comment_type[1],1)[1])
+        text=text[:0].join(kept)#text[:0] is an empty str/bytes/bytearray matching the input type
+    return text
 def files_used(pattern,path,exclude_dirs=[],filelist=None,filetypes=None,relative_path=False):
    """Files used by an element matching pattern, in a file in path or filelist"""
    textures=[]
@ -33,18 +58,18 @@ def files_used(pattern,path,exclude_dirs=[],filelist=None,filetypes=None,relativ
    texfind=re.compile(pattern)
    for file in filelist:
        try:
-            f=open(os.path.join(path,file),'r',errors='replace')
+            f=open(path_join(path,file),'r',errors='replace')
        except FileNotFoundError:
            continue
        for line in f:
            tex=texfind.search(line)
            if tex:
                if relative_path:
-                    textures.append(os.path.normpath(os.path.join(os.path.dirname(file),tex.group(1).replace('\\','/'))))
+                    textures.append(os.path.normpath(path_join(os.path.dirname(file),tex.group(1).replace('\\','/'))))
                else:
                    textures.append(os.path.normpath(tex.group(1).replace('\\','/')))
    return textures
-def find_unused_textures(basedir,output_lists=True,grep_check=False,output_rsync_rules=False,output_comparison_strips=False,output_removal_commands=False,return_used_noregions=False):
+def find_unused_textures(basedir,output_lists=True,grep_check=False,output_rsync_rules=False,output_comparison_strips=False, output_removal_commands=False,return_used_noregions=False):
    """Checks if any textures are unused (wasting space), and if any textures are only available as .dds (not recommended in the source repository, as it is a lossy-compressed format)

 Set basedir to your fg-root, and enable the kind(s) of output you want:
@ -55,13 +80,13 @@ output_comparison_strips creates thumbnail strips, unused_duplicate.png/unused_d
 output_removal_commands creates another script, delete_unused_textures.sh, which will remove unused textures when run in a Unix shell"""

    false_positives=set(['buildings-lightmap.png','buildings.png','Credits','Globe/00README.txt', 'Globe/01READMEocean_depth_1png.txt', 'Globe/world.topo.bathy.200407.3x4096x2048.png','Trees/convert.pl','Splash1.png','Splash2.png','Splash3.png','Splash4.png','Splash5.png'])#these either aren't textures, or are used where we don't check; 'unknown.rgb','Terrain/unknown.rgb' are also referenced, but already don't exist
-    used_textures=set(files_used(path=os.path.join(basedir,'Materials'),pattern=r'<(?:texture|object-mask|tree-texture).*?>(\S+?)</(texture|object-mask|tree-texture)'))|false_positives
-    used_textures_noregions=set(files_used(path=os.path.join(basedir,'Materials'),exclude_dirs=['regions'],pattern=r'<(?:texture|object-mask|tree-texture).*?>(\S+?)</(texture|object-mask|tree-texture)'))|false_positives#this pattern matches a <texture> (possibly with number), <tree-texture> or <object-mask> element
-    used_effectslow=set(files_used(path=os.path.join(basedir,'Effects'),pattern=r'image.*?>[\\/]?Textures[\\/](\S+?)</.*?image'))|set(files_used(path=os.path.join(basedir,'Materials'),pattern=r'<building-(?:texture|lightmap).*?>Textures[\\/](\S+?)</building-(?:texture|lightmap)'))#Effects (<image>), and Materials <building-texture>/<building-lightmap>, explicitly includes the Textures/ or Textures.high/
-    used_effectshigh=set(files_used(path=os.path.join(basedir,'Effects'),pattern=r'image.*?>[\\/]?Textures.high[\\/](\S+?)</.*?image'))|set(files_used(path=os.path.join(basedir,'Materials'),pattern=r'<building-(?:texture|lightmap).*?>Textures.high[\\/](\S+?)</building-(?:texture|lightmap)'))
-    high_tsizes=rfilelist(os.path.join(basedir,'Textures.high'))
+    used_textures=set(files_used(path=path_join(basedir,'Materials'),pattern=r'<(?:texture|object-mask|tree-texture).*?>(\S+?)</(texture|object-mask|tree-texture)'))|false_positives
+    used_textures_noregions=set(files_used(path=path_join(basedir,'Materials'),exclude_dirs=['regions'],pattern=r'<(?:texture|object-mask|tree-texture).*?>(\S+?)</(texture|object-mask|tree-texture)'))|false_positives#this pattern matches a <texture> (possibly with number), <tree-texture> or <object-mask> element
+    used_effectslow=set(files_used(path=path_join(basedir,'Effects'),pattern=r'image.*?>[\\/]?Textures[\\/](\S+?)</.*?image'))|set(files_used(path=path_join(basedir,'Materials'),pattern=r'<building-(?:texture|lightmap).*?>Textures[\\/](\S+?)</building-(?:texture|lightmap)'))#Effects (<image>), and Materials <building-texture>/<building-lightmap>, explicitly includes the Textures/ or Textures.high/
+    used_effectshigh=set(files_used(path=path_join(basedir,'Effects'),pattern=r'image.*?>[\\/]?Textures.high[\\/](\S+?)</.*?image'))|set(files_used(path=path_join(basedir,'Materials'),pattern=r'<building-(?:texture|lightmap).*?>Textures.high[\\/](\S+?)</building-(?:texture|lightmap)'))
+    high_tsizes=rfilelist(path_join(basedir,'Textures.high'))
    high_textures=set(high_tsizes.keys())
-    low_tsizes=rfilelist(os.path.join(basedir,'Textures'),exclude_dirs=['Sky','Unused'])#sky textures are used where we don't check
+    low_tsizes=rfilelist(path_join(basedir,'Textures'),exclude_dirs=['Sky','Unused'])#sky textures are used where we don't check
    low_textures=set(low_tsizes.keys())
    only_high=high_textures-low_textures
    used_noreg_onlyhigh=(only_high&used_textures_noregions)|used_effectshigh
@ -93,23 +118,23 @@ output_removal_commands creates another script, delete_unused_textures.sh, which
    def image_check_strip(basedir,index_fname,ilist1,ilist2=None,size=128):
        """Generate two rows of thumbnails, for easy visual comparison (between the two lists given, or if a single list is given, between low and high resolution)"""
        if ilist2 is None:
-            ipairs=[[os.path.join(basedir,'Textures',f),os.path.join(basedir,'Textures.high',f)] for f in ilist1]
+            ipairs=[[path_join(basedir,'Textures',f),path_join(basedir,'Textures.high',f)] for f in ilist1]
        else:
            ipairs=[]
            for f1,f2 in zip(ilist1,ilist2):
                if f1 in low_textures:
-                    ipairs.append([os.path.join(basedir,'Textures',f1),os.path.join(basedir,'Textures',f2) if f2 in low_textures else os.path.join(basedir,'Textures.high',f2)])
+                    ipairs.append([path_join(basedir,'Textures',f1),path_join(basedir,'Textures',f2) if f2 in low_textures else path_join(basedir,'Textures.high',f2)])
                if f1 in high_textures:
-                    ipairs.append([os.path.join(basedir,'Textures.high',f1),os.path.join(basedir,'Textures.high',f2) if f2 in high_textures else os.path.join(basedir,'Textures',f2)])
+                    ipairs.append([path_join(basedir,'Textures.high',f1),path_join(basedir,'Textures.high',f2) if f2 in high_textures else path_join(basedir,'Textures',f2)])
        ilist_f=[f[0] for f in ipairs]+[f[1] for f in ipairs]
        subprocess.call(['montage','-label',"'%f'"]+ilist_f+['-tile','x2','-geometry',str(size)+'x'+str(size)]+[index_fname])
    def rsync_rules(basedir,flist,include=False,high=None):
        """Output rsync rules to exclude/include the specified textures from high/low/both (high=True/False/None) resolutions"""
        for f in flist:
            if high!=True and f in low_textures:
-                print("+" if include else "-",os.path.join('/fgdata/Textures',f))
+                print("+" if include else "-",path_join('/fgdata/Textures',f))
            if high!=False and f in high_textures:
-                print("+" if include else "-",os.path.join('/fgdata/Textures.high',f))
+                print("+" if include else "-",path_join('/fgdata/Textures.high',f))
    def removal_command(basedir,flist,high=None):
        """Return command to delete the specified textures from high/low/both (high=True/False/None) resolutions"""
        if not flist:
@ -117,9 +142,9 @@ output_removal_commands creates another script, delete_unused_textures.sh, which
        a="rm"
        for f in flist:
            if high!=True and f in low_textures:
-                a=a+" "+os.path.join('Textures',f)
+                a=a+" "+path_join('Textures',f)
            if high!=False and f in high_textures:
-                a=a+" "+os.path.join('Textures.high',f)
+                a=a+" "+path_join('Textures.high',f)
        a=a+"\n"
        return a
    def move_command(basedir,flist,high=None,comment=False):
@ -130,9 +155,9 @@ output_removal_commands creates another script, delete_unused_textures.sh, which
        dirset_high=set() if high==False else set(os.path.dirname(f) for f in set(flist)&high_textures)
        a=""
        for d in dirset_low:
-            a=a+("#" if comment else "")+"mv --target-directory="+os.path.join("Textures/Unused",d)+" "+(" ".join(os.path.join("Textures",f) for f in flist if (os.path.dirname(f)==d and f in low_textures)))+"\n"
+            a=a+("#" if comment else "")+"mv --target-directory="+path_join("Textures/Unused",d)+" "+(" ".join(path_join("Textures",f) for f in flist if (os.path.dirname(f)==d and f in low_textures)))+"\n"
        for d in dirset_high:
-            a=a+("#" if comment else "")+"mv --target-directory="+os.path.join("Textures/Unused",d+".high")+" "+(" ".join(os.path.join("Textures.high",f) for f in flist if (os.path.dirname(f)==d and f in high_textures)))+"\n"
+            a=a+("#" if comment else "")+"mv --target-directory="+path_join("Textures/Unused",d+".high")+" "+(" ".join(path_join("Textures.high",f) for f in flist if (os.path.dirname(f)==d and f in high_textures)))+"\n"
        return a
    if output_comparison_strips:
        image_check_strip(basedir,"unused_duplicate.png",unused_duplicate,["Terrain"+f[14:] for f in unused_duplicate])
@ -180,13 +205,13 @@ output_removal_commands creates another script, delete_unused_textures.sh, which
        r_script.write(move_command(basedir,[f for f in (unused_other-high_textures)|low_unneeded_nondup|unused_dds_matchhigh if (f[-4:]==".dds" and f[:5]!="Signs" and f[:6]!="Runway")],high=False,comment=True))
        r_script.close()
    if return_used_noregions:
-        return used_noregions|set([os.path.join('Sky',f) for f in rfilelist(os.path.join(basedir,'Textures/Sky'))])
+        return used_noregions|set([path_join('Sky',f) for f in rfilelist(path_join(basedir,'Textures/Sky'))])
 def find_locally_unused_models(basedir):
    """Find models not used in the base scenery (these do need to be in Terrasync as they may well be used in other locations, but don't need to be in the base flightgear-data package)
    Known bug: doesn't search everywhere: check /Nasal,.eff <image>,<inherits-from>,/(AI/)Aircraft not referenced in AI scenarios, unusual tags in Aircraft/Generic/Human/Models/walker.xml,HLA/av-aircraft.xml,/Environment,MP/aircraft_types.xml,preferences.xml"""
-    models_allfiles={os.path.join('Models',f):s for f,s in rfilelist(os.path.join(basedir,'Models')).items()}
+    models_allfiles={path_join('Models',f):s for f,s in rfilelist(path_join(basedir,'Models')).items()}
    t_size=lambda flist: sum(models_allfiles[f] for f in flist if f in models_allfiles)
-    used_models=set(files_used(path=os.path.join(basedir,'Scenery'),filetypes=".stg",pattern=r'OBJECT_SHARED (\S+?) '))|set(files_used(path=os.path.join(basedir,'AI'),exclude_dirs=["Aircraft","Traffic"],pattern=r'<model>[\\/]?(\S+?)</model>'))|set(f for f in files_used(path=os.path.join(basedir,'Materials'),filetypes=".xml",pattern=r'<path>[\\/]?(\S+?)</path>') if f[-4:]==".xml")
+    used_models=set(files_used(path=path_join(basedir,'Scenery'),filetypes=".stg",pattern=r'OBJECT_SHARED (\S+?) '))|set(files_used(path=path_join(basedir,'AI'),exclude_dirs=["Aircraft","Traffic"],pattern=r'<model>[\\/]?(\S+?)</model>'))|set(f for f in files_used(path=path_join(basedir,'Materials'),filetypes=".xml",pattern=r'<path>[\\/]?(\S+?)</path>') if f[-4:]==".xml")
    n=0
    while n!=len(used_models):
        n=len(used_models)
@ -207,12 +232,12 @@ def find_locally_unused_models(basedir):
                print("non-unique/not found:",f1,f2,p2)
                continue
            try:
-                used_textures=used_textures|set(os.path.normpath(os.path.join(p2[0],f)) for f in files_used(path=basedir,filelist=[f2],filetypes=".ac",pattern=r'texture "(\S+?)"'))
+                used_textures=used_textures|set(os.path.normpath(path_join(p2[0],f)) for f in files_used(path=basedir,filelist=[f2],filetypes=".ac",pattern=r'texture "(\S+?)"'))
            except (IOError,OSError):
                print("not found",f1,f2,p2)
    used_models=used_models|extra_used_models
    unused=set(models_allfiles.keys())-(used_models|used_textures)
-    missing=set(f for f in (used_models|used_textures) if ((f.startswith('Models') and f not in models_allfiles.keys()) or not os.path.isfile(os.path.join(basedir,f))))
+    missing=set(f for f in (used_models|used_textures) if ((f.startswith('Models') and f not in models_allfiles.keys()) or not os.path.isfile(path_join(basedir,f))))
    print("used\n",sorted(used_models),"\nsize=",t_size(used_models),"\n\n",sorted(used_textures),"\nsize=",t_size(used_textures),"\n\nunused\n",sorted(unused),"\nsize=",t_size(unused),"\n\nmissing\n",sorted(missing),"\nsize=",t_size(missing))

 def size_by_type(path,exclude_dirs=[]):
@ -236,9 +261,9 @@ def size_by_size(path,exclude_dirs=[],exts=[".png",".dds",".rgb"]):
    return size_totals
 def fgdata_size(path,dirs_to_list=["AI/Aircraft","AI/Traffic","Aircraft","Models","Scenery","Textures","Textures.high"],exclude_dirs=None,compressed_size=False,num_types=3):
    if dirs_to_list is None:
-        dirs_to_list=[d for d in os.listdir(path) if os.path.isdir(os.path.join(path,d))]
+        dirs_to_list=[d for d in os.listdir(path) if os.path.isdir(path_join(path,d))]
    if exclude_dirs is None:
-        if os.path.exists(os.path.join(path,".git")):
+        if os.path.exists(path_join(path,".git")):
            exclude_dirs=[".git","Aircraft"]
        else:
            exclude_dirs=[]
@ -246,71 +271,176 @@ def fgdata_size(path,dirs_to_list=["AI/Aircraft","AI/Traffic","Aircraft","Models
    exclude_list=[[]]*len(dirs_to_list)+[dirs_to_list+exclude_dirs]+[exclude_dirs]
    names_list=dirs_to_list+["other","all"]
    for n,dir1 in enumerate(dirs_to_list+["",""]):
-        size_totals=size_by_type(os.path.join(path,dir1),exclude_list[n])
+        size_totals=size_by_type(path_join(path,dir1),exclude_list[n])
        print(names_list[n],sorted(size_totals.items(),key=lambda x:-x[1])[:num_types],"total",sum(size_totals.values()))
        if compressed_size:
            if names_list[n]=="all":
                print("compressed size",total_compressed_size)
                continue
            targz=tarfile.open("fgdata_sizetest_temp.tar.gz",mode="w:gz")
-            for file in rfilelist(os.path.join(path,dir1),exclude_list[n]):
-                targz.add(os.path.join(path,dir1,file))
+            for file in rfilelist(path_join(path,dir1),exclude_list[n]):
+                targz.add(path_join(path,dir1,file))
            targz.close()
            print("compressed size",os.path.getsize("fgdata_sizetest_temp.tar.gz"))
            total_compressed_size=total_compressed_size+os.path.getsize("fgdata_sizetest_temp.tar.gz")

-def create_reduced_fgdata(input_path,output_path,exclude_ai=False,exclude_reg=False,dirs_to_downsample=("Textures.high/Terrain","Textures.high/Trees","Textures.high/Terrain.winter","AI/Aircraft","Models"),downsample_min_filesize=30000):
+def create_reduced_fgdata(input_path,output_path,split_textures=True,exclude_parts=[],include_aircraft=['UIUC','777','777-200','b1900d','CitationX','ZLT-NT','dhc2','Cub','sopwithCamel','f-14b','ASK13','bo105','Dragonfly','SenecaII','A6M2'],dirs_to_downsample=(),downsample_min_filesize=30000):
    """Create a smaller, reduced-quality flightgear-data package
-Can downsample textures 50% (selected by dirs_to_downsample/downsample_min_filesize), omit AI aircraft (no background traffic, but tankers etc do still work), and/or omit regional textures
-Requires Unix shell, imagemagick or graphicsmagick (for convert), and libnvtt-bin (for nvcompress)"""
+Can downsample textures 50% and/or omit sections
+Requires Unix shell; downsampling requires imagemagick or graphicsmagick (for convert) and libnvtt-bin (for nvcompress)
+
+Optional parts, use exclude_parts to omit:
+ai: no background traffic, but tankers etc do still work
+extra-textures (requires split_textures=True): no region-specific textures
+
+The c172p and ufo are always included; other aircraft are added by include_aircraft
+
+Texture downsampling: textures in dirs_to_downsample and larger than downsample_min_filesize downsampled 50%
+Example: dirs_to_downsample=("Textures.high/Terrain","Textures.high/Trees","Textures.high/Terrain.winter","AI/Aircraft","Models"),downsample_min_filesize=30000
+
+To put each section in its own directory use {0} in output_path, e.g.
+python3 -c "import fgdata_checkers; fgdata_checkers.create_reduced_fgdata(input_path='~/fs_dev/git/fgdata',output_path='~/fs_dev/flightgear/data_split/debian/flightgear-data-{0}/usr/share/games/flightgear',include_aircraft=['UIUC','b1900d','CitationX','ZLT-NT','dhc2','Cub','sopwithCamel','f-14b','ASK13','bo105','Dragonfly','SenecaII','A6M2'])"
+This creates separate preferences-regions.xml and preferences-noregions.xml files for with and without regional textures; you need to handle symlinking preferences.xml to the correct one
+"""
    texture_filetypes={".png":"PNG",".dds":"DDS"}#,".rgb":"SGI" loses cloud transparency
    exclude_dirs=[".git","Textures/Unused"]
-    exclude_unnamed_subdirs=["Aircraft"]
-    exclude_unnamed_files=[]
-    include_subdirs=["Aircraft/c172p","Aircraft/Generic","Aircraft/Instruments","Aircraft/Instruments-3d","Aircraft/ufo"]
+    exclude_unnamed_subdirs=["Aircraft"]#these are a separate mechanism from subtree_class/exclude_parts mostly to save time (subtree_class still fully scans excluded directories because the class may change again further down the tree, e.g. AI/Aircraft ai -> performancedb.xml base; these don't)
+    subtree_class={"Aircraft/c172p":"base","Aircraft/Generic":"base","Aircraft/Instruments":"base","Aircraft/Instruments-3d":"base","Aircraft/ufo":"base","Textures":"textures","Textures.high":"textures","AI/Aircraft":"ai","AI/Traffic":"ai","AI/Aircraft/performancedb.xml":"base","Scenery":"scenery","Models":"models"}
+    for aircraft in include_aircraft:
+        if "Aircraft/"+aircraft not in subtree_class:
+            subtree_class["Aircraft/"+aircraft]="aircraft"
    include_files=[]
-    if exclude_ai:
-        exclude_unnamed_subdirs.extend(["AI/Aircraft","AI/Traffic"])
-    if exclude_reg:
-        exclude_unnamed_files.extend(["Textures","Textures.high"])
-        used_textures=find_unused_textures(input_path,return_used_noregions=True)
-        for t in used_textures:
-            include_files.extend([os.path.join("Textures",t),os.path.join("Textures.high",t)])
-    subprocess.call(["mkdir","-p",output_path])
-    if os.path.exists(os.path.join(input_path,".git")):
-        print(input_path,"appears to be a git clone; this will work, but the result will be larger than starting from a standard flightgear-data package.\nTo create this use (adjusting paths as necessary) rsync -av --filter=\"merge /home/palmer/fs_dev/git/fgmeta/base-package.rules\" ~/fs_dev/git/fgdata ~/fs_dev/flightgear/data_full")
-    if os.listdir(output_path):
+    if split_textures:
+        base_texture_files=[]
+        for t in find_unused_textures(input_path,return_used_noregions=True):
+            base_texture_files.extend([path_join("Textures",t),path_join("Textures.high",t)])
+    #if os.path.exists(path_join(input_path,".git")):
+        #print(input_path,"appears to be a git clone; this will work, but the result will be larger than starting from a standard flightgear-data package.\nTo create this use (adjusting paths as necessary) rsync -av --filter=\"merge /home/palmer/fs_dev/git/fgmeta/base-package.rules\" ~/fs_dev/git/fgdata ~/fs_dev/flightgear/data_full")
+    if os.path.exists(output_path.format("base")) and os.listdir(output_path.format("base")):
        print("output path",output_path,"non-empty, aborting to avoid data loss\nIf you did want to lose its previous contents, run:\nrm -r",output_path,"\nthen re-run this script")
        return
-    dirs=[""]
+    dirs={"":"base"}
    while dirs:
-        cdir=dirs.pop()
-        cdirfiles=os.listdir(os.path.join(input_path,cdir))
+        cdir,cclass=dirs.popitem()
+        cdirfiles=os.listdir(path_join(input_path,cdir))
        for file in cdirfiles:
-            if os.path.isdir(os.path.join(input_path,cdir,file)):
-                if (os.path.join(cdir,file) not in exclude_dirs) and (cdir not in exclude_unnamed_subdirs or os.path.join(cdir,file) in include_subdirs):
-                    subprocess.call(["mkdir","-p",os.path.join(output_path,cdir,file)])
-                    dirs.append(os.path.join(cdir,file))
-            else:
-                if (cdir.startswith(tuple(exclude_unnamed_files))) and (os.path.join(cdir,file) not in include_files):
+            fclass=subtree_class.get(path_join(cdir,file),cclass)
+            if os.path.isdir(path_join(input_path,cdir,file)):
+                if (path_join(cdir,file) not in exclude_dirs) and (cdir not in exclude_unnamed_subdirs or path_join(cdir,file) in subtree_class):
+                    dirs[path_join(cdir,file)]=fclass
+            else:#file
+                if split_textures and fclass=="textures":
+                    if path_join(cdir,file) in base_texture_files:
+                        fclass="base-textures"
+                    else:
+                        fclass="extra-textures"
+                if fclass in exclude_parts:
                    continue
-                if (cdir.startswith(dirs_to_downsample)) and (os.path.splitext(file)[1] in texture_filetypes) and (os.path.getsize(os.path.join(input_path,cdir,file))>downsample_min_filesize):
+                if not os.path.exists(path_join(output_path.format(fclass),cdir)):
+                    subprocess.call(["mkdir","-p",path_join(output_path.format(fclass),cdir)])
+                if (cdir.startswith(dirs_to_downsample)) and (os.path.splitext(file)[1] in texture_filetypes) and (os.path.getsize(path_join(input_path,cdir,file))>downsample_min_filesize):
                    image_type=texture_filetypes[os.path.splitext(file)[1]]
                    if image_type=="DDS":# in Ubuntu, neither imagemagick nor graphicsmagick can write .dds
-                        #doesn't work subprocess.call(["nvzoom","-s","0.5","-f","box",os.path.join(input_path,cdir,file),os.path.join(output_path,cdir,file)])
-                        if subprocess.call(["convert",image_type+":"+os.path.join(input_path,cdir,file),"-sample","50%","temp_reduced_size.png"]):#fails on normal maps, so just copy them
-                            subprocess.call(["cp",os.path.join(input_path,cdir,file),os.path.join(output_path,cdir,file)])
+                        #doesn't work subprocess.call(["nvzoom","-s","0.5","-f","box",path_join(input_path,cdir,file),path_join(output_path.format(fclass),cdir,file)])
+                        if subprocess.call(["convert",image_type+":"+path_join(input_path,cdir,file),"-sample","50%","temp_reduced_size.png"]):#fails on normal maps, so just copy them
+                            subprocess.call(["cp",path_join(input_path,cdir,file),path_join(output_path.format(fclass),cdir,file)])
                        else:
-                            subprocess.call(["nvcompress","-bc3","temp_reduced_size.png",os.path.join(output_path,cdir,file)])
+                            subprocess.call(["nvcompress","-bc3","temp_reduced_size.png",path_join(output_path.format(fclass),cdir,file)])
                    else:
-                        subprocess.call(["convert",image_type+":"+os.path.join(input_path,cdir,file),"-sample","50%",image_type+":"+os.path.join(output_path,cdir,file)])#we use sample rather than an averaging filter to not break mask/rotation/... maps
-                else:
-                    subprocess.call(["cp",os.path.join(input_path,cdir,file),os.path.join(output_path,cdir,file)])
-    prefs_in=open(os.path.join(input_path,"preferences.xml"),'r')
-    prefs_out=open(os.path.join(output_path,"preferences.xml"),'w')
-    prefs_str=prefs_in.read(None)
-    prefs_in.close()
-    if exclude_reg:
+                        subprocess.call(["convert",image_type+":"+path_join(input_path,cdir,file),"-sample","50%",image_type+":"+path_join(output_path.format(fclass),cdir,file)])#we use sample rather than an averaging filter to not break mask/rotation/... maps
+                else:#not to be downsampled
+                    subprocess.call(["cp",path_join(input_path,cdir,file),path_join(output_path.format(fclass),cdir,file)])
+    if "{0}" in output_path:
+        #split output: rename the copied preferences.xml to preferences-regions.xml (the original line was missing its closing "])", a syntax error)
+        subprocess.call(["mv",path_join(output_path.format("base"),"preferences.xml"),path_join(output_path.format("base"),"preferences-regions.xml")])
+    if "extra-textures" in exclude_parts or "{0}" in output_path:
+        prefs_in=open(path_join(input_path,"preferences.xml"),'r')
+        prefs_out=open(path_join(output_path.format("base"),"preferences-noregions.xml" if "{0}" in output_path else "preferences.xml"),'w')
+        prefs_str=prefs_in.read(None)
+        prefs_in.close()
        prefs_str=prefs_str.replace("Materials/regions/materials.xml","Materials/default/materials.xml")#turn off regional textures
-    prefs_out.write(prefs_str)
-    prefs_out.close()
+        prefs_out.write(prefs_str)
+        prefs_out.close()
+def check_text_encoding(path,filelist=None,binary_types=(".png",".dds",".rgb",".RGB",".jpg",".wav",".WAV",".btg.gz",".xcf.gz",".xcf",".XCF","Thumbs.db",".blend",".bmp",".gif", ".3ds",".3DS",".pdf",".ttf",".txf",".htsvoice",".odt",".ods",".xls",".mp3",".zip",".tar.gz"),exclude_dirs=[".git","Timezone"]):
+    """Report files under path that contain non-ASCII bytes, grouped by what they decode as
+
+    path: directory to scan; names in filelist are relative to it
+    filelist: files to check; if None, everything found by rfilelist(path,exclude_dirs) whose name doesn't end in one of binary_types. Giving it explicitly is intended for quick testing: see fgdata_nonascii_filelist.py
+    binary_types: file name endings to leave out when building the file list
+    exclude_dirs: directories to leave out when building the file list
+
+    Files ending in .gz are read through gzip. Files containing a null byte are listed separately and not examined further
+    For .xml/.svg/.xhtml files the encoding declared on the first line is compared with what the content actually decodes as
+    Each non-ASCII file is checked twice: as-is, and (except for readme-type files) again after strip_comments and removal of a leading UTF-8 BOM
+    Prints the results; returns None"""
+    def err_context(err):
+        """Up to 30 bytes either side of the position of a decode error, without crossing a line break"""
+        start=max(err.object.rfind(b'\n',0,err.start)+1,err.start-30,0)
+        end=min(err.object.find(b'\n',err.start),err.start+30,len(err.object))
+        if end<0:#not found
+            end=err.start+30
+        return err.object[start:end]
+    def dict_print(d):
+        """One entry per file: name, then the value raw and (for bytes values) decoded as UTF-8 and as Latin-1"""
+        return "".join(i[0]+"\n\t"+str(i[1])+"\n\t"+(str(i[1],encoding="utf-8",errors="replace")+"\n\t"+str(i[1],encoding="latin-1") if isinstance(i[1],bytes) else "")+"\n" for i in sorted(d.items()))
+    if filelist is None:
+        filelist=[f for f in rfilelist(path,exclude_dirs) if not f.endswith(tuple(binary_types))]
+    utf8_files={}
+    withnulls_files=[]
+    othertext_files={}
+    mislabeled_xml={}
+    mislabeled_xml_nocomments={}
+    xml_encoding_pattern=re.compile(r'<\?xml.*?encoding="(\S+?)".*?\?>')
+    xml_noencoding_pattern=re.compile(r'<\?xml.*?\?>')
+    utf8_files_nocomments={}
+    othertext_files_nocomments={}
+    for fname in filelist:
+        opener=gzip.open if os.path.splitext(fname)[1]==".gz" else open
+        with opener(path_join(path,fname),mode='rb') as fobj:#with: don't leave one open handle per scanned file
+            fdata=fobj.read()
+        if b"\0" in fdata:
+            withnulls_files.append(fname)#two look like corrupted files: Aircraft/p51d/Resources/WIP/P-51D-25NA.ac (hangs gedit,large block of nulls in middle) Docs/Serial/nmeafaq.txt (block of nulls at end), rest are probably-binary types
+            continue
+        if os.path.splitext(fname)[1] in (".xml",".svg",".xhtml"):
+            #errors="replace": a first line that isn't valid UTF-8 (the kind of file this function exists to find) must not abort the whole scan
+            first_line=str(fdata.split(b'\n',maxsplit=1)[0],encoding="utf-8",errors="replace")
+            encoding_mark=xml_encoding_pattern.search(first_line)
+            if encoding_mark:
+                encoding_mark=encoding_mark.group(1)
+                if encoding_mark not in ("utf-8","UTF-8","ISO-8859-1"):
+                    mislabeled_xml_nocomments[fname]="unrecognised encoding "+encoding_mark
+                    encoding_mark=None
+            else:
+                if xml_noencoding_pattern.search(first_line):
+                    encoding_mark="utf-8"#XML standard allows either UTF-8 or UTF-16 (with BOM) in unlabeled files, but we only use -8
+                else:
+                    encoding_mark=None
+                    #mislabeled_xml_nocomments[fname]="no xml header"
+        else:
+            encoding_mark=None
+        #first pass: the file exactly as stored
+        try:
+            fdata.decode(encoding="ascii")
+            continue#pure ASCII, nothing to report
+        except UnicodeError as err:
+            errline=err_context(err)
+        try:
+            fdata.decode(encoding="utf-8")
+            utf8_files[fname]=errline
+            if encoding_mark not in ("utf-8","UTF-8",None):
+                mislabeled_xml[fname]=bytes(encoding_mark,encoding="ascii")+errline
+        except UnicodeError as err:
+            errline=err_context(err)
+            othertext_files[fname]=errline
+            if encoding_mark not in ("ISO-8859-1",None):
+                mislabeled_xml[fname]=bytes(encoding_mark,encoding="ascii")+errline
+        if os.path.basename(fname) in ("Read-Me.txt","README.txt","Readme.txt","readme.txt","LIS-MOI_GNU-GPL"):
+            continue
+        #second pass: the same checks with comments (and a leading BOM) removed
+        fdata_nocomments=strip_comments(fdata,filename=fname)
+        if fdata_nocomments.startswith(bytes([0xef,0xbb,0xbf])) and fname not in mislabeled_xml:#UTF-8 BOM
+            fdata_nocomments=fdata_nocomments[3:]
+        try:
+            fdata_nocomments.decode(encoding="ascii")
+            continue
+        except UnicodeError as err:
+            errline=err_context(err)
+        try:
+            fdata_nocomments.decode(encoding="utf-8")
+            if encoding_mark is None:
+                utf8_files_nocomments[fname]=errline
+            if encoding_mark not in ("utf-8","UTF-8",None):
+                mislabeled_xml_nocomments[fname]=bytes(encoding_mark,encoding="ascii")+errline
+        except UnicodeError as err:
+            errline=err_context(err)
+            if encoding_mark is None:
+                othertext_files_nocomments[fname]=errline
+            if encoding_mark not in ("ISO-8859-1",None):
+                mislabeled_xml_nocomments[fname]=bytes(encoding_mark,encoding="ascii")+errline
+    print("non-ASCII valid UTF-8:",dict_print(utf8_files),"\n\nother:",dict_print(othertext_files),"\n\nmislabeled/unrecognised",dict_print(mislabeled_xml),"\n\nwith nulls (binary or UTF-16/32):",sorted(withnulls_files),"\n\nnon-ASCII valid UTF-8 (outside BOM/comments):",dict_print(utf8_files_nocomments),"\n\nother (outside comments):",dict_print(othertext_files_nocomments),"\n\nmislabeled/unrecognised (outside comments)",dict_print(mislabeled_xml_nocomments))
+