Add improved version of generateTranslationDiffs

A Python script that performs better and does not have the bug in which the number '0' (zero) is ignored when sorting and removing duplicates.
2024-07-06 22:24:32 -07:00
parent 64dce38c33
commit b10d3e36ea
1 changed files with 630 additions and 0 deletions
--- a/tools/generateTranslationDiffs.py
+++ b/tools/generateTranslationDiffs.py
@@ -0,0 +1,630 @@
+#!/usr/bin/env python
+
+# This script iterates through interface translation files, moves comments to
+# the front, puts translated strings next, and finally looks for
+# untranslated/missing strings by matching against "default" which it then adds
+# to the translation, each line prepended by "!".
+#
+# Developers should run it after receiving a translation file from a
+# translator:
+# cp /tmp/new_japanese_translation rtdata/languages/Japanese
+# ./tools/generateTranslationDiffs.py "Japanese"
+#
+# Running the script without an argument iterates through all files.
+#
+# Locale files are generated automatically:
+# - English (UK)
+
+import argparse
+from collections import defaultdict
+from datetime import datetime
+from functools import cmp_to_key, reduce
+import logging
+import os
+from pathlib import Path
+import re
+from sys import stdout
+from typing import Dict, Iterable, List, Mapping, Optional
+
+
+# Name of the reference language file that translations are compared against.
+FILE_DEFAULT = 'default'
+# Name of the US English file; the UK English locale is generated from it.
+FILE_ENGLISH_US = 'English (US)'
+# Name of the UK English locale file, which this script generates.
+FILE_ENGLISH_UK = 'English (UK)'
+
+
+class SortHelper:
+    """
+    String sorting utilities.
+    """
+    char_indices: Optional[Dict[str, str]] = None
+
+    @staticmethod
+    def get_char_index(char: str):
+        """
+        Return the sort order of a character.
+        """
+        if SortHelper.char_indices is None:
+            # Printable characters sorted using the `sort -V` command.
+            characters = (
+                '.~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
+                'abcdefghijklmnopqrstuvwxyz\t\x0b\x0c\n '
+                '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}'
+            )
+            SortHelper.char_indices = {}
+            for i, char in enumerate(characters):
+                SortHelper.char_indices[char] = i
+        return SortHelper.char_indices.get(char)
+
+
+class TranslationEntry:
+    """
+    An entry in a translation file, consisting of a key and value.
+    """
+
+    def __init__(self, line: str, key: str, value: str):
+        """
+        :param line: The entire line containing the entry.
+        :param key: The key.
+        :param value: The value.
+        """
+        self.line = line
+        self.key = key
+        self.value = value
+
+    def __repr__(self):
+        return (
+            f'TranslationEntry(line={self.line}, key={self.key},'
+            f' value={self.value})'
+        )
+
+    def __str__(self):
+        return repr(self)
+
+    @staticmethod
+    def create_from_line(line: str):
+        """
+        Create an instance of this class from a line containing the entry
+        definition.
+
+        :raises ValueError: If the line does not contain a valid definition
+        consisting of a key and value separated by a ';' character.
+        """
+        split_line = line.split(';', maxsplit=1)
+        if len(split_line) != 2:
+            raise ValueError()
+        key, value = split_line
+        return TranslationEntry(line, key, value)
+
+
+class FileLines:
+    """
+    Lines of a translation file categorized by type.
+
+    The types are:
+    - Comment: Comments, which start with the '#' character.
+    - Changed: Translation entries consisting of a key and value.
+    Other lines are ignored.
+    """
+
+    def __init__(
+            self,
+            all: List[str],
+            comments: List[str],
+            changed: List[TranslationEntry]
+    ):
+        self.all = all
+        self.comments = comments
+        self.changed = changed
+
+    def __repr__(self):
+        return (
+            f'FileLines(all={self.all}, comments={self.comments},'
+            f' changed={self.changed})'
+        )
+
+    def __str__(self):
+        return repr(self)
+
+
+def main():
+    args = parse_args()
+    configure_logger()
+    start_time = datetime.now()
+    file_paths = get_file_paths(args.file_names)
+    default_file_lines = format_default()
+    format_files(file_paths, default_file_lines)
+    generate_locale_files(default_file_lines, file_paths)
+    end_time = datetime.now()
+    log_duration(start_time, end_time, len(file_paths))
+
+
+def parse_args():
+    """
+    Return the arguments passed to this program.
+    """
+    parser = argparse.ArgumentParser(
+        prog='generateTranslationDiffs',
+        description='Formats translation files in rtdata/languages.',
+    )
+    parser.add_argument(
+        'file_names',
+        nargs='*',
+        help='list of language files to format, or leave empty to format all',
+    )
+    return parser.parse_args()
+
+
+def configure_logger():
+    """
+    Set up the logger.
+    """
+    logger = get_logger()
+    handler = logging.StreamHandler(stdout)
+    formatter = logging.Formatter('%(levelname)s: %(message)s')
+    handler.setFormatter(formatter)
+    logger.addHandler(handler)
+    logger.setLevel(logging.INFO)
+
+
+def get_logger():
+    """
+    Return the logger.
+    """
+    return logging.getLogger('generateTranslationDiffs')
+
+
+def get_file_paths(file_names: List[str]):
+    """
+    Return the paths for all the given file names if they exist and are
+    writable. If no files names are given, return paths of all translation
+    files.
+
+    :param file_names: List of file names, as path strings and/or name.
+    """
+    if not file_names:
+        return get_default_file_paths()
+    return list(filter(lambda p: p, (get_file_path(n) for n in file_names)))
+
+
+def get_languages_path():
+    """
+    Return the path of the languages directory.
+    """
+    return Path(__file__).parent.parent / 'rtdata' / 'languages'
+
+
+def get_file_path(file_name):
+    """
+    Return the path for the translation file, or None if it doesn't exist or is
+    not writable.
+
+    :param file_name: The file name as a path string or name.
+    """
+    file_path = None
+    if Path(file_name).exists():
+        file_path = Path(file_name)
+    elif (get_languages_path() / file_name).exists():
+        file_path = get_languages_path() / file_name
+
+    if not is_writable_file(file_path):
+        get_logger().warning('File "%s" not found or not writable.', file_name)
+        return None
+    return file_path
+
+
+def get_default_file_paths():
+    """
+    Return a list of paths for all translation files excluding "default" and
+    locale translations.
+    """
+    ignored_files = [
+        FILE_DEFAULT, FILE_ENGLISH_UK, 'LICENSE', 'README', 'temp_file'
+    ]
+    ignored_files_regex = '|'.join(re.escape(file) for file in ignored_files)
+    ignore_pattern = re.compile(rf'({ignored_files_regex}|.*\.sh|\..*)')
+    return list(filter(
+        lambda p: p.is_file() and not ignore_pattern.fullmatch(p.name),
+        get_languages_path().iterdir()
+    ))
+
+
+def is_writable_file(path: Path):
+    """
+    Return if the file is writable with the current permissions.
+    """
+    return path and path.is_file() and os.access(path, os.W_OK)
+
+
+def format_default():
+    """
+    Format the default language file.
+
+    :return: File lines of "default".
+    """
+    get_logger().info('Formatting %s.', FILE_DEFAULT)
+    path = get_languages_path() / FILE_DEFAULT
+    file_lines = read_file(path)
+    changed_lines = [key_line.line for key_line in file_lines.changed]
+    file_lines.all = file_lines.comments + changed_lines + ['']
+    new_contents = '\n'.join(file_lines.all)
+    get_logger().debug(
+        'New contents for file %s:\n%s',
+        FILE_DEFAULT,
+        new_contents
+    )
+    path.write_text(new_contents)
+    return file_lines
+
+
+def format_files(file_paths: List[Path], default_file_lines: FileLines):
+    """
+    Format the translation files.
+
+    :param file_paths: Files to format.
+    :param default_file_lines: File lines of the default language file.
+    """
+    if not file_paths:
+        get_logger().info('No language files to format.')
+    for file_path in file_paths:
+        format_file(file_path, default_file_lines)
+
+
+def generate_locale_files(
+        default_file_lines: FileLines,
+        file_paths: List[Path]
+):
+    """
+    Generate locale translation files.
+
+    :param default_file_lines: File lines of the default language file.
+    :param file_paths: Paths of files to generate locale translations of.
+    """
+    file_name_to_generator = {
+        FILE_ENGLISH_US: generate_english_locales,
+    }
+    for path in file_paths:
+        generator = file_name_to_generator.get(path.name)
+        if generator:
+            generator(default_file_lines, path)
+
+
+def generate_english_locales(default_file_lines: FileLines, us_path: Path):
+    """
+    Generate English locale files.
+    """
+    us_file_lines: FileLines = read_file(us_path)
+    generate_english_locale_uk(default_file_lines, us_file_lines)
+
+
+def generate_english_locale_uk(
+        default_file_lines: FileLines,
+        us_file_lines: FileLines
+):
+    """
+    Generate the UK English locale file.
+    """
+    get_logger().info('Creating %s file', FILE_ENGLISH_UK)
+
+    new_lines = list(us_file_lines.comments)
+    new_lines.extend(get_english_uk_translations(default_file_lines))
+    new_lines.extend(get_english_uk_untranslated(us_file_lines.all))
+    new_lines.append('')
+
+    new_contents = '\n'.join(new_lines)
+    get_logger().debug(
+        'New contents for file %s:\n%s',
+        FILE_ENGLISH_UK,
+        new_contents
+    )
+    path = get_languages_path() / FILE_ENGLISH_UK
+    path.write_text(new_contents)
+
+
+def get_english_uk_translations(default_file_lines: FileLines):
+    """
+    Return a list of lines with translated entries for UK English.
+
+    :param default_file_lines: File lines of "default".
+    """
+    line_pattern = re.compile(r'(color|behavior|center)', flags=re.IGNORECASE)
+    replacements = {
+        'olor': 'olour',
+        'ehavior': 'ehaviour',
+        'center': 'centre',
+        'Center': 'Centre',
+    }
+    entries_to_translate = filter(
+        lambda entry: line_pattern.search(entry.value),
+        default_file_lines.changed
+    )
+
+    translations = []
+    for default_entry in entries_to_translate:
+        new_value = reduce(
+            lambda value, replacement: value.replace(*replacement),
+            replacements.items(),
+            default_entry.value
+        )
+        translations.append(f'{default_entry.key};{new_value}')
+    return translations
+
+
+def get_english_uk_untranslated(us_lines: List[str]):
+    """
+    Return a list of lines from the US English file excluding comments and
+    those translated for UK English.
+    """
+    pattern = re.compile(
+        r'.+;.*(color|behavior|center).*',
+        flags=re.IGNORECASE
+    )
+    return list(filter(
+        lambda line: not pattern.search(line) and not line.startswith('#'),
+        us_lines
+    ))
+
+
+def format_file(path: Path, default_file_lines: FileLines):
+    """
+    Format a translation file.
+
+    :param path: Path of the translation file.
+    :param default_file_lines: File lines of "default".
+    """
+    get_logger().info(f'Formatting {path.name}.')
+    file_lines = read_file(path)
+    translated_lines, untranslated_lines = get_translated_untranslated_lines(
+        file_lines,
+        default_file_lines
+    )
+    warn_removed_entry(file_lines, default_file_lines)
+    new_lines = list(file_lines.comments)
+    new_lines.append('')
+    new_lines.extend(translated_lines)
+    new_lines.append('')
+    new_lines.extend(untranslated_lines)
+    new_lines.append('')
+    file_lines.all = new_lines
+    new_contents = '\n'.join(file_lines.all)
+    get_logger().debug(
+        'New contents for file %s:\n%s',
+        path.name,
+        new_contents
+    )
+    path.write_text(new_contents)
+
+
+def get_translated_untranslated_lines(
+        file_lines: FileLines,
+        default_file_lines: FileLines
+):
+    """
+    Return a tuple containing a list of translated lines and a list of
+    untranslated lines.
+    """
+    key_to_entry = {
+        entry.key: entry for entry in file_lines.changed
+    }
+    translated_lines = []
+    untranslated_lines = []
+    for default_key_line in default_file_lines.changed:
+        key = default_key_line.key
+        if key in key_to_entry:
+            translated_lines.append(key_to_entry[key].line)
+        else:
+            untranslated_lines.append(f'!{default_key_line.line}')
+    if untranslated_lines:
+        header = [
+            '!!!!!!!!!!!!!!!!!!!!!!!!!',
+            (
+                '! Untranslated keys follow;'
+                ' remove the ! prefix after an entry is translated.'
+            ),
+            '!!!!!!!!!!!!!!!!!!!!!!!!!',
+            '',
+        ]
+        untranslated_lines = header + untranslated_lines
+    return translated_lines, untranslated_lines
+
+
+def warn_removed_entry(file_lines: FileLines, default_file_lines: FileLines):
+    """
+    Emit a warning for any translation entries in the translation file but not
+    in the default file, if any.
+    """
+    default_keys = set(entry.key for entry in default_file_lines.changed)
+    removed_lines = [
+        entry.line for entry in file_lines.changed if entry.key not in
+        default_keys
+    ]
+    if removed_lines:
+        warning_lines = ['Removed entry/entries']
+        warning_lines.extend([f'\t{line}' for line in removed_lines])
+        get_logger().warning('\n'.join(warning_lines))
+
+
+def read_file(path: Path):
+    """
+    Return the file lines from a language file.
+    """
+    # Begins with '#' followed by 1+ characters.
+    comment_pattern = re.compile(r'^#.+')
+    # Does not begin with '!', '#', or end of line.
+    changed_pattern = re.compile(r'^[^!#$]')
+    file_lines = FileLines(path.read_text().splitlines(), [], [])
+    for line_num, line in enumerate(file_lines.all):
+        if comment_pattern.match(line):
+            file_lines.comments.append(line)
+        elif changed_pattern.match(line):
+            try:
+                translation_entry = TranslationEntry.create_from_line(line)
+            except ValueError:
+                get_logger().warning(
+                    'Malformed translation entry in "%s" on line %d: %s',
+                    path.name,
+                    line_num,
+                    line
+                )
+            else:
+                file_lines.changed.append(translation_entry)
+    sort_file_lines(file_lines)
+    return file_lines
+
+
+def sort_file_lines(file_lines: FileLines):
+    """
+    Sort the comments and changed entries of a file lines.
+    """
+    comments = sort_comment_lines(file_lines.comments)
+    changed = sort_changed_unchanged_lines(file_lines.changed)
+    file_lines.comments.clear()
+    file_lines.changed.clear()
+    file_lines.comments.extend(comments)
+    file_lines.changed.extend(changed)
+
+
+def sort_comment_lines(lines: List[str]):
+    """
+    Sort comment lines using natural sort.
+    """
+    return sorted(set(lines), key=cmp_to_key(compare_string_natural))
+
+
+def sort_changed_unchanged_lines(entries: List[TranslationEntry]):
+    """
+    Sort changed or unchanged lines using natural sort of the translation entry
+    keys.
+    """
+    key_to_lines = defaultdict(set)
+    for entry in entries:
+        key_to_lines[entry.key].add(entry.line)
+
+    warn_duplicate_entries(key_to_lines)
+
+    sort_key = cmp_to_key(lambda a, b: compare_string_natural(a[0], b[0]))
+    return list(map(
+        lambda item: TranslationEntry.create_from_line(item[1].pop()),
+        sorted(key_to_lines.items(), key=sort_key)
+    ))
+
+
+def warn_duplicate_entries(key_to_lines: Mapping[str, Iterable[str]]):
+    """
+    Emit a warning if there are duplicate translation entries.
+
+    :param key_to_lines: Mapping from entry key to iterable containing all
+    values.
+    """
+    duplicate_key_to_lines = {
+        k: v for k, v in key_to_lines.items() if len(v) > 1
+    }
+    if duplicate_key_to_lines:
+        warning_lines = ['Duplicate key(s)']
+        for key, lines in duplicate_key_to_lines.items():
+            warning_lines.append(f'\t{key}')
+            warning_lines.extend(f'\t\t{line}' for line in lines)
+        get_logger().warning('\n'.join(warning_lines))
+
+
+def compare_string_natural(a: str, b: str):
+    """
+    Compare two strings using natural ordering.
+
+    :return: Negative integer if a comes before b, positive integer if b comes
+    before a, or zero if a and b are identical.
+    """
+    ia = 0  # Character index for a.
+    ib = 0  # Character index for b.
+    while ia < len(a) and ib < len(b):
+        if a[ia].isdigit():
+            if b[ib].isdigit():
+                # Compare numbers
+                a_number, ia = read_int(a, ia)
+                b_number, ib = read_int(b, ib)
+                if a_number != b_number:
+                    return a_number - b_number
+            else:
+                # Compare number with character.
+                return cmp_chars(a[ia], b[ib])
+        else:
+            if b[ib].isdigit():
+                # Compare character with number.
+                return cmp_chars(a[ia], b[ib])
+            else:
+                # Compare character with character.
+                if a[ia] != b[ib]:
+                    return cmp_chars(a[ia], b[ib])
+                ia += 1
+                ib += 1
+    if ia < len(a):
+        # a is "longer".
+        return 1
+    if ib < len(b):
+        # b is "longer".
+        return -1
+    # a and b are equivalent.
+    return cmp_string(a, b)
+
+
+def read_int(string: str, index: int):
+    """
+    Read an integer from the string starting at the index.
+
+    :param string: The string.
+    :param index: Index in the string where the number starts.
+    :return: A tuple containing the number and the index after the end of where
+    the number was extracted in the string. If there is no number, returns zero
+    and the original index.
+    """
+    i = index
+    while i < len(string) and string[i].isdigit():
+        i += 1
+    number = 0 if i == index else int(string[index:i])
+    return number, i
+
+
+def cmp_chars(a: str, b: str):
+    """
+    Compare two characters according to the `sort -v` command.
+
+    :return: Negative integer if a comes before b, positive integer if b comes
+    before a, or zero if a and b are identical.
+    """
+    a_index = SortHelper.get_char_index(a)
+    b_index = SortHelper.get_char_index(b)
+    if a_index is not None and b_index is not None:
+        return a_index - b_index
+    if a == b:
+        return 0
+    return -1 if a < b else 1
+
+
+def cmp_string(a: str, b: str):
+    """
+    Compare two strings according to a character-by-character comparison using
+    the `sort -v` command.
+
+    :return: Negative integer if a comes before b, positive integer if b comes
+    before a, or zero if a and b are identical.
+    """
+    for a_char, b_char in zip(a, b):
+        cmp_result = cmp_chars(a_char, b_char)
+        if cmp_result != 0:
+            return cmp_result
+    return len(a) - len(b)
+
+
+def log_duration(start_time: datetime, end_time: datetime, file_count):
+    """
+    Log the time it took to format the files.
+    """
+    duration = end_time - start_time
+    get_logger().info(
+        'Finished updating %d files.\nTotal time: %fs',
+        file_count,
+        duration.total_seconds()
+    )
+
+
+# Run the formatter only when this file is executed as a script.
+if __name__ == '__main__':
+    main()