Fix the Unicode workaround in wscript

The previous code was supposed to ensure that VERSION and PROGRAM_VERSION were *not* Unicode objects: although Unicode objects are the correct type to represent strings in Python 2, too many libraries have strange issues dealing with them, and UnicodeErrors can creep in through implicit conversions from/to Unicode objects. In fact, it did exactly the opposite, since str.decode() always returns a Unicode object, whose type corresponds to the str class in Python 3. Fix it so that in both Python 2 and Python 3 the constants end up as instances of str.
author: Julien "_FrnchFrgg_" RIVAUD <frnchfrgg@free.fr> 2017-09-03 20:39:02 +0200
committer: Julien "_FrnchFrgg_" RIVAUD <frnchfrgg@free.fr> 2017-09-03 20:39:10 +0200
commit: 5d0b00589c7c61b1eae2f4670b50fa8defea1a70 (patch)
tree: cbb062cb0c5715e5ddfc22a92fee8b120259d942 /wscript
parent: bfb62d2cb3e38e857e0944adbe2dd851bf6003a1 (diff)
1 files changed, 24 insertions, 7 deletions
diff --git a/wscript b/wscript
index c7e4245024..4b5acb0ad7 100644
--- a/wscript
+++ b/wscript
@@ -157,7 +157,7 @@ def fetch_tarball_revision ():
     if not os.path.exists ('libs/ardour/revision.cc'):
         print ('This tarball was not created correctly - it is missing libs/ardour/revision.cc')
         sys.exit (1)
-    with open('libs/ardour/revision.cc') as f:
+    with open('libs/ardour/revision.cc', 'rb') as f:
         content = f.readlines()
         remove_punctuation_map = dict((ord(char), None) for char in '";')
         return content[1].decode('utf-8').strip().split(' ')[7].translate (remove_punctuation_map)
@@ -182,12 +182,29 @@ else:
     MICRO = '0'
 
 V = MAJOR + '.' + MINOR + '.' + MICRO
-# Ensure that these are not unicode, which
-# can cause odd problems elsewhere. Note that
-# in python3, encode and decode do not return
-# strings, so we have to force the type.
-VERSION = V.encode ('ascii', 'ignore').decode ("utf-8")
-PROGRAM_VERSION = MAJOR.encode ('ascii', 'ignore').decode ("utf-8")
+
+def sanitize(s):
+    # round-trip to remove anything in the string that is not encodable in
+    # ASCII, yet still keep a real (utf8-encoded internally) string.
+    s = s.encode ('ascii', 'ignore').decode ("utf-8")
+    # In Python3, bytes is the class of binary content and encode() returns
+    # bytes to transform a string according to a text encoding; str is the
+    # class of normal strings (utf8-encoded internally) and decode() returns
+    # that type.
+    # Python 2 did not initially cater for encoding problems and can use str
+    # for both binary content and for (decoded) strings. The Unicode type was
+    # added to correspond to Python 3 str, and the Python 2 str type should
+    # only correspond to bytes. Alas, almost everything in the Python 2
+    # ecosystem has been written with str in mind and doesn't handle Unicode
+    # objects correctly. If Python 2 is in use, s will be a Unicode object and
+    # to avoid strange problems later we convert back to str, but in utf-8
+    # nonetheless.
+    if not isinstance(s, str):
+        s = s.encode("utf-8")
+    return s
+VERSION = sanitize(V)
+PROGRAM_VERSION = sanitize(MAJOR)
+del sanitize
 
 if len (sys.argv) > 1 and sys.argv[1] == 'dist':
         if not 'APPNAME' in os.environ:
author	Julien "_FrnchFrgg_" RIVAUD <frnchfrgg@free.fr>	2017-09-03 20:39:02 +0200
committer	Julien "_FrnchFrgg_" RIVAUD <frnchfrgg@free.fr>	2017-09-03 20:39:10 +0200
commit	5d0b00589c7c61b1eae2f4670b50fa8defea1a70 (patch)
tree	cbb062cb0c5715e5ddfc22a92fee8b120259d942 /wscript
parent	bfb62d2cb3e38e857e0944adbe2dd851bf6003a1 (diff)