From dab828f63c0a6bf0bb96920fd36383f6cbe43179 Mon Sep 17 00:00:00 2001 From: Douglas Bagnall Date: Wed, 17 Nov 2021 20:17:53 +0000 Subject: [PATCH] pytest/source_char: check for mixed direction text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As pointed out in https://lwn.net/Articles/875964, forbidding bidi marker characters is not always going to be enough to avoid right-to-left vs left-to-right confusion. Consider this: $ python -c's = "b = x # 2 * n * m"; print(s); print(s.replace("x", "א").replace("n", "ח"))' b = x # 2 * n * m b = א # 2 * ח * m Those two lines are semantically the same, with the Hebrew letters "א" and "ח" replacing "x" and "n". But they look like they mean different things. It is not enough to say we only allow these scripts (or indeed non-ascii) in strings and comments, as demonstrated in this example: $ python -c's = "b = \"x#\" # n"; print(s); print(s.replace("x", "א").replace("n", "ח"))' b = "x#" # n b = "א#" # ח where the second line is visually disordered but looks valid. Any series of neutral characters between teo RTL characters will be reversed (and possibly mirrored). In practice this affects one file, which is a text file for testing unicode normalisation. I think, for the reasons shown above, we are unlikely to see legitimate RTL code outside perhaps of documentation files — but if we do, we can add those files to the allow-list. Signed-off-by: Douglas Bagnall Reviewed-by: Andrew Bartlett Autobuild-User(master): Andrew Bartlett Autobuild-Date(master): Fri Dec 3 18:53:43 UTC 2021 on sn-devel-184 --- python/samba/tests/source_chars.py | 29 +++++++++++++++++++++++++++++ testdata/source-chars-bidi.py | 24 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100644 testdata/source-chars-bidi.py diff --git a/python/samba/tests/source_chars.py b/python/samba/tests/source_chars.py index db7f131d815..093d7318cb0 100644 --- a/python/samba/tests/source_chars.py +++ b/python/samba/tests/source_chars.py @@ -94,6 +94,14 @@ SAFE_FORMAT_CHARS = { '\ufeff' } +# These files legitimately mix left-to-right and right-to-left text. +# In the real world mixing directions would be normal in bilingual +# documents, but it is rare in Samba source code. +BIDI_FILES = { + 'source4/heimdal/lib/wind/NormalizationTest.txt', + 'testdata/source-chars-bidi.py', +} + def get_git_files(): try: @@ -196,9 +204,15 @@ class CharacterTests(TestCase): else: self.fail(f"could not decode {name}: {e}") + dirs = set() for c in set(s): if is_bad_char(c): self.fail(f"{name} has potentially bad format characters!") + dirs.add(u.bidirectional(c)) + + if 'L' in dirs and 'R' in dirs: + if name not in BIDI_FILES: + self.fail(f"{name} has LTR and RTL text ({dirs})") def test_unexpected_format_chars_do_fail(self): """Test the test""" @@ -212,6 +226,21 @@ class CharacterTests(TestCase): bad_chars = [c for c in chars if is_bad_char(c)] self.assertEqual(len(bad_chars), n_bad) + def test_unexpected_bidi_fails(self): + """Test the test""" + for name in [ + 'testdata/source-chars-bidi.py' + ]: + fullname = os.path.join(ROOT, name) + with open(fullname) as f: + s = f.read() + + dirs = set() + for c in set(s): + dirs.add(u.bidirectional(c)) + self.assertIn('L', dirs) + self.assertIn('R', dirs) + def check_file_text(): """If called directly as a script, count the found characters.""" diff --git a/testdata/source-chars-bidi.py b/testdata/source-chars-bidi.py new file mode 100644 index 00000000000..d728da503da --- /dev/null +++ b/testdata/source-chars-bidi.py @@ -0,0 +1,24 @@ +# Used in samba.tests.source_chars to ensure bi-directional text is +# caught. (make test TESTS=samba.tests.source_chars) + +x = א =2 +ח = n = 3 + +a = x # 2 * n * m +b = א # 2 * ח * m +c = "x#" # n +d = "א#" # ח +e = f"x{x}n{n}" +f = f"א{א}ח{ח}" + +print(a) +print(b) +print(c) +print(d) +print(e) +print(f) + +assert a == b +assert c == d.replace("א", "x") +assert e[1] == f[1] +assert e[3] == f[3]