#!/usr/bin/env python

# Provenance: this script was added as tools/generateTranslationDiffs.py by
# the patch "Add improved version of generateTranslationDiffs" (commit
# b10d3e36ea034daa374dfa64fc276f12d6f02502, Lawrence Lee, 2024-07-06). Per
# the commit message it is a Python script with better performance that does
# not have the bug in which the '0' (zero) number is ignored when sorting and
# removing duplicates.
#
# This script iterates through interface translation files, moves comments to
# the front, puts translated strings next, and finally looks for
# untranslated/missing strings by matching against "default" which it then adds
# to the translation, each line prepended by "!".
#
# Developers should run it after receiving a translation file from a
# translator:
#     cp /tmp/new_japanese_translation rtdata/languages/Japanese
#     ./tools/generateTranslationDiffs.py "Japanese"
#
# Running the script without an argument iterates through all files.
#
# Locale files are generated automatically:
# - English (UK)

import argparse
from collections import defaultdict
from datetime import datetime
from functools import cmp_to_key, reduce
import logging
import os
from pathlib import Path
import re
from sys import stdout
from typing import Dict, Iterable, List, Mapping, Optional, Tuple


FILE_DEFAULT = 'default'
FILE_ENGLISH_US = 'English (US)'
FILE_ENGLISH_UK = 'English (UK)'

# Translation files are read and written with an explicit encoding so the
# result does not depend on the platform's locale settings.
FILE_ENCODING = 'utf-8'


class SortHelper:
    """
    String sorting utilities.
    """
    # Lazily built mapping from character to its sort position.
    char_indices: Optional[Dict[str, int]] = None

    @staticmethod
    def get_char_index(char: str) -> Optional[int]:
        """
        Return the sort order of a character.

        :param char: A single character.
        :return: The position of the character in the sort order, or None if
        the character is not part of the known ordering.
        """
        if SortHelper.char_indices is None:
            # Printable characters sorted using the `sort -V` command.
            characters = (
                '.~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz\t\x0b\x0c\n '
                '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}'
            )
            SortHelper.char_indices = {
                char_: i for i, char_ in enumerate(characters)
            }
        return SortHelper.char_indices.get(char)


class TranslationEntry:
    """
    An entry in a translation file, consisting of a key and value.
    """

    def __init__(self, line: str, key: str, value: str):
        """
        :param line: The entire line containing the entry.
        :param key: The key.
        :param value: The value.
        """
        self.line = line
        self.key = key
        self.value = value

    def __repr__(self):
        return (
            f'TranslationEntry(line={self.line}, key={self.key},'
            f' value={self.value})'
        )

    def __str__(self):
        return repr(self)

    @staticmethod
    def create_from_line(line: str):
        """
        Create an instance of this class from a line containing the entry
        definition.

        Only the first ';' separates the key from the value, so the value may
        itself contain ';' characters.

        :param line: The line to parse.
        :raises ValueError: If the line does not contain a valid definition
        consisting of a key and value separated by a ';' character.
        """
        key, separator, value = line.partition(';')
        if not separator:
            raise ValueError(
                f'Translation entry has no ";" separator: {line!r}'
            )
        return TranslationEntry(line, key, value)


class FileLines:
    """
    Lines of a translation file categorized by type.

    The types are:
    - Comment: Comments, which start with the '#' character.
    - Changed: Translation entries consisting of a key and value.
    Other lines are ignored.
    """

    def __init__(
        self,
        all: List[str],
        comments: List[str],
        changed: List[TranslationEntry]
    ):
        """
        :param all: Every line of the file.
        :param comments: The comment lines.
        :param changed: The translation entries.
        """
        # The parameter name shadows the builtin but is kept because callers
        # may pass it by keyword.
        self.all = all
        self.comments = comments
        self.changed = changed

    def __repr__(self):
        return (
            f'FileLines(all={self.all}, comments={self.comments},'
            f' changed={self.changed})'
        )

    def __str__(self):
        return repr(self)


def main():
    """
    Format "default", format the requested translation files, regenerate the
    locale files and log how long it took.
    """
    args = parse_args()
    configure_logger()
    start_time = datetime.now()
    file_paths = get_file_paths(args.file_names)
    default_file_lines = format_default()
    format_files(file_paths, default_file_lines)
    generate_locale_files(default_file_lines, file_paths)
    end_time = datetime.now()
    log_duration(start_time, end_time, len(file_paths))


def parse_args():
    """
    Return the arguments passed to this program.
    """
    parser = argparse.ArgumentParser(
        prog='generateTranslationDiffs',
        description='Formats translation files in rtdata/languages.',
    )
    parser.add_argument(
        'file_names',
        nargs='*',
        help='list of language files to format, or leave empty to format all',
    )
    return parser.parse_args()


def configure_logger():
    """
    Set up the logger to print INFO and above to standard output.
    """
    logger = get_logger()
    handler = logging.StreamHandler(stdout)
    formatter = logging.Formatter('%(levelname)s: %(message)s')
    handler.setFormatter(formatter)
    logger.addHandler(handler)
    logger.setLevel(logging.INFO)


def get_logger():
    """
    Return the logger.
    """
    return logging.getLogger('generateTranslationDiffs')


def get_file_paths(file_names: List[str]) -> List[Path]:
    """
    Return the paths for all the given file names if they exist and are
    writable. If no files names are given, return paths of all translation
    files.

    :param file_names: List of file names, as path strings and/or name.
    """
    if not file_names:
        return get_default_file_paths()
    candidates = (get_file_path(name) for name in file_names)
    # Names that could not be resolved come back as None and are dropped.
    return [path for path in candidates if path]


def get_languages_path() -> Path:
    """
    Return the path of the languages directory.
    """
    return Path(__file__).parent.parent / 'rtdata' / 'languages'


def get_file_path(file_name) -> Optional[Path]:
    """
    Return the path for the translation file, or None if it doesn't exist or is
    not writable.

    The name is first tried as a path as given and then as a file name inside
    the languages directory.

    :param file_name: The file name as a path string or name.
    """
    file_path = None
    if Path(file_name).exists():
        file_path = Path(file_name)
    elif (get_languages_path() / file_name).exists():
        file_path = get_languages_path() / file_name

    if not is_writable_file(file_path):
        get_logger().warning('File "%s" not found or not writable.', file_name)
        return None
    return file_path


def get_default_file_paths() -> List[Path]:
    """
    Return a list of paths for all translation files excluding "default" and
    locale translations.

    Shell scripts, hidden files and a few known non-translation files are
    skipped as well. The result is sorted by file name so that runs are
    reproducible regardless of directory listing order.
    """
    ignored_files = [
        FILE_DEFAULT, FILE_ENGLISH_UK, 'LICENSE', 'README', 'temp_file'
    ]
    ignored_files_regex = '|'.join(re.escape(file) for file in ignored_files)
    ignore_pattern = re.compile(rf'({ignored_files_regex}|.*\.sh|\..*)')
    return sorted(
        (
            path for path in get_languages_path().iterdir()
            if path.is_file() and not ignore_pattern.fullmatch(path.name)
        ),
        key=lambda path: path.name
    )


def is_writable_file(path: Optional[Path]) -> bool:
    """
    Return if the file is writable with the current permissions.

    :param path: Path to check. None is accepted and reported as not
    writable.
    """
    return (
        path is not None
        and path.is_file()
        and os.access(path, os.W_OK)
    )


def _write_lines(path: Path, lines: List[str]):
    """
    Join the lines with newlines and write them to the file.

    :param path: File to overwrite.
    :param lines: Lines to write, without line terminators.
    """
    new_contents = '\n'.join(lines)
    get_logger().debug(
        'New contents for file %s:\n%s',
        path.name,
        new_contents
    )
    path.write_text(new_contents, encoding=FILE_ENCODING)


def format_default() -> FileLines:
    """
    Format the default language file.

    The file is rewritten with its sorted comments first, followed by its
    sorted translation entries.

    :return: File lines of "default".
    """
    get_logger().info('Formatting %s.', FILE_DEFAULT)
    path = get_languages_path() / FILE_DEFAULT
    file_lines = read_file(path)
    changed_lines = [key_line.line for key_line in file_lines.changed]
    # The trailing empty string makes the file end with a newline.
    file_lines.all = file_lines.comments + changed_lines + ['']
    _write_lines(path, file_lines.all)
    return file_lines


def format_files(file_paths: List[Path], default_file_lines: FileLines):
    """
    Format the translation files.

    :param file_paths: Files to format.
    :param default_file_lines: File lines of the default language file.
    """
    if not file_paths:
        get_logger().info('No language files to format.')
    for file_path in file_paths:
        format_file(file_path, default_file_lines)


def generate_locale_files(
    default_file_lines: FileLines,
    file_paths: List[Path]
):
    """
    Generate locale translation files.

    A locale file is only regenerated when the file it is derived from is
    among the given paths.

    :param default_file_lines: File lines of the default language file.
    :param file_paths: Paths of files to generate locale translations of.
    """
    file_name_to_generator = {
        FILE_ENGLISH_US: generate_english_locales,
    }
    for path in file_paths:
        generator = file_name_to_generator.get(path.name)
        if generator:
            generator(default_file_lines, path)


def generate_english_locales(default_file_lines: FileLines, us_path: Path):
    """
    Generate English locale files.

    :param default_file_lines: File lines of "default".
    :param us_path: Path of the US English translation file.
    """
    us_file_lines: FileLines = read_file(us_path)
    generate_english_locale_uk(default_file_lines, us_file_lines)


def generate_english_locale_uk(
    default_file_lines: FileLines,
    us_file_lines: FileLines
):
    """
    Generate the UK English locale file.

    The file consists of the US English comments, the entries of "default"
    respelled for UK English, and the remaining non-comment lines of the US
    English file.

    :param default_file_lines: File lines of "default".
    :param us_file_lines: File lines of the US English file.
    """
    get_logger().info('Creating %s file', FILE_ENGLISH_UK)

    new_lines = list(us_file_lines.comments)
    new_lines.extend(get_english_uk_translations(default_file_lines))
    new_lines.extend(get_english_uk_untranslated(us_file_lines.all))
    new_lines.append('')

    _write_lines(get_languages_path() / FILE_ENGLISH_UK, new_lines)


def get_english_uk_translations(default_file_lines: FileLines) -> List[str]:
    """
    Return a list of lines with translated entries for UK English.

    :param default_file_lines: File lines of "default".
    """
    line_pattern = re.compile(r'(color|behavior|center)', flags=re.IGNORECASE)
    # 'olor' and 'ehavior' omit the first letter so that both capitalizations
    # are covered by a single case-sensitive replacement.
    replacements = {
        'olor': 'olour',
        'ehavior': 'ehaviour',
        'center': 'centre',
        'Center': 'Centre',
    }

    translations = []
    for default_entry in default_file_lines.changed:
        if not line_pattern.search(default_entry.value):
            continue
        new_value = reduce(
            lambda value, replacement: value.replace(*replacement),
            replacements.items(),
            default_entry.value
        )
        translations.append(f'{default_entry.key};{new_value}')
    return translations


def get_english_uk_untranslated(us_lines: List[str]) -> List[str]:
    """
    Return a list of lines from the US English file excluding comments and
    those translated for UK English.

    :param us_lines: All lines of the US English file.
    """
    pattern = re.compile(
        r'.+;.*(color|behavior|center).*',
        flags=re.IGNORECASE
    )
    return [
        line for line in us_lines
        if not pattern.search(line) and not line.startswith('#')
    ]


def format_file(path: Path, default_file_lines: FileLines):
    """
    Format a translation file.

    The file is rewritten as three sections separated by empty lines: the
    comments, the translated entries, and the untranslated entries.

    :param path: Path of the translation file.
    :param default_file_lines: File lines of "default".
    """
    get_logger().info('Formatting %s.', path.name)
    file_lines = read_file(path)
    translated_lines, untranslated_lines = get_translated_untranslated_lines(
        file_lines,
        default_file_lines
    )
    warn_removed_entry(file_lines, default_file_lines)
    file_lines.all = [
        *file_lines.comments,
        '',
        *translated_lines,
        '',
        *untranslated_lines,
        '',
    ]
    _write_lines(path, file_lines.all)


def get_translated_untranslated_lines(
    file_lines: FileLines,
    default_file_lines: FileLines
) -> Tuple[List[str], List[str]]:
    """
    Return a tuple containing a list of translated lines and a list of
    untranslated lines.

    Both lists follow the order of the entries in "default". Untranslated
    lines are the lines of "default" prefixed with '!' and, if there are any,
    are preceded by a header.

    :param file_lines: File lines of the translation file.
    :param default_file_lines: File lines of "default".
    """
    key_to_entry = {
        entry.key: entry for entry in file_lines.changed
    }
    translated_lines = []
    untranslated_lines = []
    for default_key_line in default_file_lines.changed:
        entry = key_to_entry.get(default_key_line.key)
        if entry is not None:
            translated_lines.append(entry.line)
        else:
            untranslated_lines.append(f'!{default_key_line.line}')
    if untranslated_lines:
        header = [
            '!!!!!!!!!!!!!!!!!!!!!!!!!',
            (
                '! Untranslated keys follow;'
                ' remove the ! prefix after an entry is translated.'
            ),
            '!!!!!!!!!!!!!!!!!!!!!!!!!',
            '',
        ]
        untranslated_lines = header + untranslated_lines
    return translated_lines, untranslated_lines


def warn_removed_entry(file_lines: FileLines, default_file_lines: FileLines):
    """
    Emit a warning for any translation entries in the translation file but not
    in the default file, if any.

    :param file_lines: File lines of the translation file.
    :param default_file_lines: File lines of "default".
    """
    default_keys = {entry.key for entry in default_file_lines.changed}
    removed_lines = [
        entry.line for entry in file_lines.changed
        if entry.key not in default_keys
    ]
    if removed_lines:
        warning_lines = ['Removed entry/entries']
        warning_lines.extend(f'\t{line}' for line in removed_lines)
        get_logger().warning('\n'.join(warning_lines))


def read_file(path: Path) -> FileLines:
    """
    Return the file lines from a language file.

    Comments and translation entries are de-duplicated and sorted. Malformed
    entries are reported with a warning and skipped.

    :param path: Path of the language file.
    """
    # Begins with '#' followed by 1+ characters.
    comment_pattern = re.compile(r'^#.+')
    # Begins with a character other than '!' or '#'. Empty lines never match
    # because the character class requires one character.
    changed_pattern = re.compile(r'^[^!#]')
    file_lines = FileLines(
        path.read_text(encoding=FILE_ENCODING).splitlines(),
        [],
        []
    )
    # Line numbers are reported 1-based, as text editors show them.
    for line_num, line in enumerate(file_lines.all, start=1):
        if comment_pattern.match(line):
            file_lines.comments.append(line)
        elif changed_pattern.match(line):
            try:
                translation_entry = TranslationEntry.create_from_line(line)
            except ValueError:
                get_logger().warning(
                    'Malformed translation entry in "%s" on line %d: %s',
                    path.name,
                    line_num,
                    line
                )
            else:
                file_lines.changed.append(translation_entry)
    sort_file_lines(file_lines)
    return file_lines


def sort_file_lines(file_lines: FileLines):
    """
    Sort the comments and changed entries of a file lines.

    The existing list objects are updated in place.

    :param file_lines: The file lines to sort.
    """
    file_lines.comments[:] = sort_comment_lines(file_lines.comments)
    file_lines.changed[:] = sort_changed_unchanged_lines(file_lines.changed)


def sort_comment_lines(lines: List[str]) -> List[str]:
    """
    Sort comment lines using natural sort, removing duplicates.

    :param lines: The comment lines.
    """
    return sorted(set(lines), key=cmp_to_key(compare_string_natural))


def sort_changed_unchanged_lines(
    entries: List[TranslationEntry]
) -> List[TranslationEntry]:
    """
    Sort changed or unchanged lines using natural sort of the translation entry
    keys.

    Each key appears once in the result. If a key has several different
    lines, a warning is emitted and the first one encountered is kept.

    :param entries: The entries to sort.
    """
    # A dict is used as an insertion-ordered set so that the line kept for a
    # duplicated key does not depend on string hash randomization.
    key_to_lines: Dict[str, Dict[str, None]] = defaultdict(dict)
    for entry in entries:
        key_to_lines[entry.key][entry.line] = None

    warn_duplicate_entries(key_to_lines)

    sorted_keys = sorted(key_to_lines, key=cmp_to_key(compare_string_natural))
    return [
        TranslationEntry.create_from_line(next(iter(key_to_lines[key])))
        for key in sorted_keys
    ]


def warn_duplicate_entries(key_to_lines: Mapping[str, Iterable[str]]):
    """
    Emit a warning if there are duplicate translation entries.

    :param key_to_lines: Mapping from entry key to a sized iterable containing
    all distinct lines for that key.
    """
    duplicate_key_to_lines = {
        k: v for k, v in key_to_lines.items() if len(v) > 1
    }
    if duplicate_key_to_lines:
        warning_lines = ['Duplicate key(s)']
        for key, lines in duplicate_key_to_lines.items():
            warning_lines.append(f'\t{key}')
            warning_lines.extend(f'\t\t{line}' for line in lines)
        get_logger().warning('\n'.join(warning_lines))


def compare_string_natural(a: str, b: str) -> int:
    """
    Compare two strings using natural ordering.

    Runs of decimal digits are compared by numeric value; all other
    characters are compared one at a time.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    ia = 0  # Character index for a.
    ib = 0  # Character index for b.
    while ia < len(a) and ib < len(b):
        # isdecimal() rather than isdigit(): the latter is also true for
        # characters such as superscripts, which int() cannot parse.
        if a[ia].isdecimal() and b[ib].isdecimal():
            # Compare numbers.
            a_number, ia = read_int(a, ia)
            b_number, ib = read_int(b, ib)
            if a_number != b_number:
                return a_number - b_number
            continue
        # At most one side is a digit, or neither is.
        if a[ia] != b[ib]:
            return cmp_chars(a[ia], b[ib])
        ia += 1
        ib += 1
    if ia < len(a):
        # a is "longer".
        return 1
    if ib < len(b):
        # b is "longer".
        return -1
    # a and b are equivalent (e.g. they differ only in leading zeros), so fall
    # back to a plain comparison to keep the order well defined.
    return cmp_string(a, b)


def read_int(string: str, index: int) -> Tuple[int, int]:
    """
    Read an integer from the string starting at the index.

    :param string: The string.
    :param index: Index in the string where the number starts.
    :return: A tuple containing the number and the index after the end of where
    the number was extracted in the string. If there is no number, returns zero
    and the original index.
    """
    i = index
    while i < len(string) and string[i].isdecimal():
        i += 1
    number = 0 if i == index else int(string[index:i])
    return number, i


def cmp_chars(a: str, b: str) -> int:
    """
    Compare two characters according to the `sort -V` command.

    If either character is missing from the known ordering, the characters are
    compared by code point instead.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    a_index = SortHelper.get_char_index(a)
    b_index = SortHelper.get_char_index(b)
    if a_index is not None and b_index is not None:
        return a_index - b_index
    if a == b:
        return 0
    return -1 if a < b else 1


def cmp_string(a: str, b: str) -> int:
    """
    Compare two strings according to a character-by-character comparison using
    the `sort -V` command.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    for a_char, b_char in zip(a, b):
        cmp_result = cmp_chars(a_char, b_char)
        if cmp_result != 0:
            return cmp_result
    return len(a) - len(b)


def log_duration(start_time: datetime, end_time: datetime, file_count):
    """
    Log the time it took to format the files.

    :param start_time: When formatting started.
    :param end_time: When formatting finished.
    :param file_count: Number of files that were formatted.
    """
    duration = end_time - start_time
    get_logger().info(
        'Finished updating %d files.\nTotal time: %fs',
        file_count,
        duration.total_seconds()
    )


if __name__ == '__main__':
    main()