rawTherapee/tools/generateTranslationDiffs.py

#!/usr/bin/env python

# This script iterates through interface translation files, moves comments to
# the front, puts translated strings next, and finally looks for
# untranslated/missing strings by matching against "default" which it then adds
# to the translation, each line prepended by "!".
#
# Developers should run it after receiving a translation file from a
# translator:
# cp /tmp/new_japanese_translation rtdata/languages/Japanese
# ./tools/generateTranslationDiffs.py "Japanese"
#
# Running the script without an argument iterates through all files.
#
# Locale files are generated automatically:
# - English (UK)

import argparse
from collections import defaultdict
from datetime import datetime
from functools import cmp_to_key, reduce
import logging
import os
from pathlib import Path
import re
from sys import stdout
from typing import Dict, Iterable, List, Mapping, Optional


# Name of the reference language file; every translation file is matched
# against its entries.
FILE_DEFAULT = 'default'
# Name of the US English translation file; formatting it triggers generation
# of the English locale files.
FILE_ENGLISH_US = 'English (US)'
# Name of the UK English locale file, which this script writes itself.
FILE_ENGLISH_UK = 'English (UK)'


class SortHelper:
    """
    String sorting utilities.

    Holds a lazily built lookup table that maps each supported printable
    character to its position in the custom sort order.
    """
    # Maps a character to its sort position; populated on first lookup.
    char_indices: Optional[Dict[str, int]] = None

    @staticmethod
    def get_char_index(char: str) -> Optional[int]:
        """
        Return the sort order of a character.

        :param char: The character to look up.
        :return: The position of the character in the sort order, or None if
        the character is not part of the lookup table.
        """
        if SortHelper.char_indices is None:
            # Printable characters sorted using the `sort -V` command.
            characters = (
                '.~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz\t\x0b\x0c\n '
                '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}'
            )
            # The table is built with its own variable names on purpose:
            # reusing `char` as the loop variable would rebind the parameter
            # and make the very first call look up the last table character
            # instead of the requested one.
            SortHelper.char_indices = {
                table_char: position
                for position, table_char in enumerate(characters)
            }
        return SortHelper.char_indices.get(char)


class TranslationEntry:
    """
    A single key/value pair taken from a translation file, together with the
    line it came from.
    """

    def __init__(self, line: str, key: str, value: str):
        """
        :param line: The complete line that holds the entry.
        :param key: The entry key.
        :param value: The entry value.
        """
        self.line, self.key, self.value = line, key, value

    def __repr__(self):
        fields = f'line={self.line}, key={self.key}, value={self.value}'
        return f'TranslationEntry({fields})'

    def __str__(self):
        return self.__repr__()

    @staticmethod
    def create_from_line(line: str):
        """
        Build an entry by splitting a line at its first ';' character.

        Everything before the first ';' becomes the key and everything after
        it (including any further ';' characters) becomes the value.

        :param line: The line containing the entry definition.
        :raises ValueError: If the line contains no ';' separator.
        """
        key, separator, value = line.partition(';')
        if not separator:
            # No ';' at all, so there is no key/value pair to extract.
            raise ValueError()
        return TranslationEntry(line, key, value)


class FileLines:
    """
    The lines of a translation file, grouped by what they contain.

    Two groups are tracked next to the full list of lines:
    - Comment: Lines starting with the '#' character.
    - Changed: Translation entries made of a key and a value.
    Lines of any other kind are not categorized.
    """

    def __init__(
            self,
            all: List[str],
            comments: List[str],
            changed: List[TranslationEntry]
    ):
        """
        :param all: Every line of the file.
        :param comments: The comment lines.
        :param changed: The translation entries.
        """
        # Every line of the file.
        self.all = all
        # Comment lines only.
        self.comments = comments
        # Parsed key/value entries only.
        self.changed = changed

    def __repr__(self):
        line_fields = f'all={self.all}, comments={self.comments}'
        return f'FileLines({line_fields}, changed={self.changed})'

    def __str__(self):
        return self.__repr__()


def main():
    """
    Run the script: format "default", format the selected translation files,
    generate the locale files, and log how long it all took.
    """
    args = parse_args()
    configure_logger()
    started_at = datetime.now()
    paths = get_file_paths(args.file_names)
    default_lines = format_default()
    format_files(paths, default_lines)
    generate_locale_files(default_lines, paths)
    log_duration(started_at, datetime.now(), len(paths))


def parse_args():
    """
    Parse the command line and return the resulting namespace.

    The namespace has a single attribute, `file_names`, holding zero or more
    positional arguments.
    """
    file_names_help = (
        'list of language files to format, or leave empty to format all'
    )
    parser = argparse.ArgumentParser(
        prog='generateTranslationDiffs',
        description='Formats translation files in rtdata/languages.',
    )
    parser.add_argument('file_names', nargs='*', help=file_names_help)
    return parser.parse_args()


def configure_logger():
    """
    Attach a stdout handler to the script's logger and set its level to INFO.

    Messages are rendered as "<LEVEL>: <message>".
    """
    stdout_handler = logging.StreamHandler(stdout)
    stdout_handler.setFormatter(
        logging.Formatter('%(levelname)s: %(message)s')
    )
    log = get_logger()
    log.addHandler(stdout_handler)
    log.setLevel(logging.INFO)


def get_logger():
    """
    Return the logger shared by the whole script.
    """
    logger_name = 'generateTranslationDiffs'
    return logging.getLogger(logger_name)


def get_file_paths(file_names: List[str]):
    """
    Resolve the given file names to paths, dropping any that cannot be found
    or written. When no names are given, return the paths of all translation
    files instead.

    :param file_names: List of file names, as path strings and/or name.
    """
    if file_names:
        resolved = (get_file_path(name) for name in file_names)
        # Unresolvable names come back as None and are left out.
        return [path for path in resolved if path]
    return get_default_file_paths()


def get_languages_path():
    """
    Return the languages directory, located at "rtdata/languages" two levels
    above this script file.
    """
    script_dir = Path(__file__).parent
    base_dir = script_dir.parent
    return base_dir / 'rtdata' / 'languages'


def get_file_path(file_name):
    """
    Resolve a translation file name to a path.

    The name is first tried as a path on its own and then as a name inside the
    languages directory. A warning is logged when neither exists or the match
    is not a writable file.

    :param file_name: The file name as a path string or name.
    :return: The resolved path, or None if it doesn't exist or is not
    writable.
    """
    candidate = Path(file_name)
    if not candidate.exists():
        # Fall back to a bare name inside the languages directory.
        candidate = get_languages_path() / file_name
        if not candidate.exists():
            candidate = None

    if is_writable_file(candidate):
        return candidate
    get_logger().warning('File "%s" not found or not writable.', file_name)
    return None


def get_default_file_paths():
    """
    List every translation file in the languages directory.

    Skipped are "default", the generated UK English locale, the LICENSE,
    README and temp_file entries, shell scripts (*.sh), hidden files (names
    starting with '.'), and anything that is not a regular file.
    """
    excluded_names = [
        FILE_DEFAULT, FILE_ENGLISH_UK, 'LICENSE', 'README', 'temp_file'
    ]
    escaped_names = '|'.join(map(re.escape, excluded_names))
    excluded = re.compile(rf'({escaped_names}|.*\.sh|\..*)')
    return [
        entry
        for entry in get_languages_path().iterdir()
        if entry.is_file() and not excluded.fullmatch(entry.name)
    ]


def is_writable_file(path: Optional[Path]) -> bool:
    """
    Return if the file is writable with the current permissions.

    :param path: Path to check, or None when no candidate path was found.
    :return: True only if the path refers to an existing regular file that
    the current process may write to; False otherwise, including for None.
    """
    if not path:
        # Callers pass None for names they could not resolve; answer with a
        # real boolean rather than leaking the falsy input back out.
        return False
    return path.is_file() and os.access(path, os.W_OK)


def format_default():
    """
    Rewrite the "default" language file as its sorted comments followed by
    its sorted entries.

    :return: File lines of "default", with `all` replaced by the newly
    written lines.
    """
    log = get_logger()
    log.info('Formatting %s.', FILE_DEFAULT)
    default_path = get_languages_path() / FILE_DEFAULT
    file_lines = read_file(default_path)
    entry_lines = [entry.line for entry in file_lines.changed]
    # The trailing empty string makes the joined text end with a newline.
    file_lines.all = [*file_lines.comments, *entry_lines, '']
    contents = '\n'.join(file_lines.all)
    log.debug(
        'New contents for file %s:\n%s',
        FILE_DEFAULT,
        contents
    )
    # NOTE(review): no explicit encoding is passed, so the locale default is
    # used - confirm that matches the encoding of the language files.
    default_path.write_text(contents)
    return file_lines


def format_files(file_paths: List[Path], default_file_lines: FileLines):
    """
    Format each of the given translation files, or log a notice when there
    are none.

    :param file_paths: Files to format.
    :param default_file_lines: File lines of the default language file.
    """
    if not file_paths:
        get_logger().info('No language files to format.')
        return
    for translation_path in file_paths:
        format_file(translation_path, default_file_lines)


def generate_locale_files(
        default_file_lines: FileLines,
        file_paths: List[Path]
):
    """
    Run the locale generator registered for each given file, if there is one.

    :param default_file_lines: File lines of the default language file.
    :param file_paths: Paths of files to generate locale translations of.
    """
    # Dispatch table: source file name -> function producing its locales.
    generators = {FILE_ENGLISH_US: generate_english_locales}
    for source_path in file_paths:
        generate = generators.get(source_path.name)
        if not generate:
            continue
        generate(default_file_lines, source_path)


def generate_english_locales(default_file_lines: FileLines, us_path: Path):
    """
    Read the US English file and generate the English locale files from it.

    :param default_file_lines: File lines of "default".
    :param us_path: Path of the US English translation file.
    """
    generate_english_locale_uk(default_file_lines, read_file(us_path))


def generate_english_locale_uk(
        default_file_lines: FileLines,
        us_file_lines: FileLines
):
    """
    Write the UK English locale file.

    The file consists of the US English comments, the UK spellings derived
    from "default", and the remaining US English lines.

    :param default_file_lines: File lines of "default".
    :param us_file_lines: File lines of the US English file.
    """
    log = get_logger()
    log.info('Creating %s file', FILE_ENGLISH_UK)

    uk_lines = [
        *us_file_lines.comments,
        *get_english_uk_translations(default_file_lines),
        *get_english_uk_untranslated(us_file_lines.all),
        '',  # Ends the joined text with a newline.
    ]
    contents = '\n'.join(uk_lines)
    log.debug(
        'New contents for file %s:\n%s',
        FILE_ENGLISH_UK,
        contents
    )
    uk_path = get_languages_path() / FILE_ENGLISH_UK
    uk_path.write_text(contents)


def get_english_uk_translations(default_file_lines: FileLines):
    """
    Build the UK English entries from the entries of "default".

    Only entries whose value contains "color", "behavior" or "center" (in any
    letter case) are included; their values are respelled the UK way.

    :param default_file_lines: File lines of "default".
    :return: A list of "key;value" lines.
    """
    us_spelling = re.compile(r'(color|behavior|center)', flags=re.IGNORECASE)
    # Applied in this order. The replacements are case-sensitive while the
    # search above is not, so e.g. an all-caps match is emitted unchanged.
    uk_spellings = (
        ('olor', 'olour'),
        ('ehavior', 'ehaviour'),
        ('center', 'centre'),
        ('Center', 'Centre'),
    )

    translations = []
    for entry in default_file_lines.changed:
        if not us_spelling.search(entry.value):
            continue
        uk_value = entry.value
        for american, british in uk_spellings:
            uk_value = uk_value.replace(american, british)
        translations.append(f'{entry.key};{uk_value}')
    return translations


def get_english_uk_untranslated(us_lines: List[str]):
    """
    Filter the US English lines down to those carried over unchanged.

    Dropped are comment lines (starting with '#') and lines that have
    "color", "behavior" or "center" (in any letter case) somewhere after a
    ';' that is preceded by at least one character.

    :param us_lines: Lines of the US English file.
    :return: The remaining lines, in their original order.
    """
    respelled = re.compile(
        r'.+;.*(color|behavior|center).*',
        flags=re.IGNORECASE
    )
    return [
        line
        for line in us_lines
        if not (line.startswith('#') or respelled.search(line))
    ]


def format_file(path: Path, default_file_lines: FileLines):
    """
    Format a translation file.

    The file is rewritten as: its comments, a blank line, the translated
    entries in the order of "default", a blank line, and the entries that are
    still untranslated. Entries whose key is not in "default" are reported
    with a warning and left out of the rewritten file.

    :param path: Path of the translation file.
    :param default_file_lines: File lines of "default".
    """
    # Pass the name as a logging argument, like every other log call here,
    # instead of formatting the message eagerly.
    get_logger().info('Formatting %s.', path.name)
    file_lines = read_file(path)
    translated_lines, untranslated_lines = get_translated_untranslated_lines(
        file_lines,
        default_file_lines
    )
    warn_removed_entry(file_lines, default_file_lines)
    new_lines = list(file_lines.comments)
    new_lines.append('')
    new_lines.extend(translated_lines)
    new_lines.append('')
    new_lines.extend(untranslated_lines)
    # The trailing empty string makes the joined text end with a newline.
    new_lines.append('')
    file_lines.all = new_lines
    new_contents = '\n'.join(file_lines.all)
    get_logger().debug(
        'New contents for file %s:\n%s',
        path.name,
        new_contents
    )
    # NOTE(review): no explicit encoding is passed, so the locale default is
    # used - confirm that matches the encoding of the language files.
    path.write_text(new_contents)


def get_translated_untranslated_lines(
        file_lines: FileLines,
        default_file_lines: FileLines
):
    """
    Split the entries of "default" into those the translation file provides
    and those it is missing.

    :param file_lines: File lines of the translation file.
    :param default_file_lines: File lines of "default".
    :return: A tuple of two lists. The first holds the translation file's
    lines for keys found in "default". The second holds the "default" lines
    for all other keys, each prefixed with '!', and preceded by a header
    block if there are any.
    """
    line_by_key = {entry.key: entry.line for entry in file_lines.changed}
    translated_lines = []
    untranslated_lines = []
    # Walking "default" makes both lists follow its entry order.
    for default_entry in default_file_lines.changed:
        try:
            translated_line = line_by_key[default_entry.key]
        except KeyError:
            untranslated_lines.append(f'!{default_entry.line}')
        else:
            translated_lines.append(translated_line)

    if not untranslated_lines:
        return translated_lines, untranslated_lines

    banner = '!!!!!!!!!!!!!!!!!!!!!!!!!'
    notice = (
        '! Untranslated keys follow;'
        ' remove the ! prefix after an entry is translated.'
    )
    return translated_lines, [banner, notice, banner, '', *untranslated_lines]


def warn_removed_entry(file_lines: FileLines, default_file_lines: FileLines):
    """
    Log one warning listing every entry of the translation file whose key
    does not exist in the default file. Nothing is logged if there is none.

    :param file_lines: File lines of the translation file.
    :param default_file_lines: File lines of "default".
    """
    known_keys = {entry.key for entry in default_file_lines.changed}
    removed = [
        f'\t{entry.line}'
        for entry in file_lines.changed
        if entry.key not in known_keys
    ]
    if not removed:
        return
    get_logger().warning('\n'.join(['Removed entry/entries', *removed]))


def read_file(path: Path):
    """
    Return the file lines from a language file.

    Lines are categorized as comments or translation entries, and both
    categories are sorted before returning. Lines that look like entries but
    lack a ';' separator are reported with a warning and skipped.

    :param path: Path of the language file.
    :return: The file lines, with `all` holding every line as read.
    """
    # Begins with '#' followed by 1+ characters.
    comment_pattern = re.compile(r'^#.+')
    # Begins with any character other than '!', '#' or '$'. Inside the
    # character class '$' is a literal dollar sign; empty lines are excluded
    # because the class has to consume one character.
    changed_pattern = re.compile(r'^[^!#$]')
    # NOTE(review): no explicit encoding is passed, so the locale default is
    # used - confirm that matches the encoding of the language files.
    file_lines = FileLines(path.read_text().splitlines(), [], [])
    # Count from 1 so the warning below names the line as an editor shows it.
    for line_num, line in enumerate(file_lines.all, start=1):
        if comment_pattern.match(line):
            file_lines.comments.append(line)
        elif changed_pattern.match(line):
            try:
                translation_entry = TranslationEntry.create_from_line(line)
            except ValueError:
                get_logger().warning(
                    'Malformed translation entry in "%s" on line %d: %s',
                    path.name,
                    line_num,
                    line
                )
            else:
                file_lines.changed.append(translation_entry)
    sort_file_lines(file_lines)
    return file_lines


def sort_file_lines(file_lines: FileLines):
    """
    Sort the comments and the changed entries of a file lines in place.

    The existing list objects are kept and only their contents are replaced.
    """
    sorted_comments = sort_comment_lines(file_lines.comments)
    sorted_entries = sort_changed_unchanged_lines(file_lines.changed)
    file_lines.comments[:] = sorted_comments
    file_lines.changed[:] = sorted_entries


def sort_comment_lines(lines: List[str]):
    """
    Return the distinct comment lines in natural sort order.

    :param lines: The comment lines; duplicates are dropped.
    """
    distinct_lines = set(lines)
    natural_order = cmp_to_key(compare_string_natural)
    return sorted(distinct_lines, key=natural_order)


def sort_changed_unchanged_lines(entries: List[TranslationEntry]):
    """
    Sort changed or unchanged lines using natural sort of the translation entry
    keys.

    Each key appears once in the result. When a key occurs with several
    different lines, a warning is emitted and the last occurrence in
    `entries` is kept, so the outcome does not depend on set iteration order.

    :param entries: The entries to sort.
    :return: A list with one entry per key, ordered by key.
    """
    # Distinct lines per key, in order of first appearance; used for the
    # duplicate warning only.
    key_to_lines: Dict[str, List[str]] = defaultdict(list)
    # The entry to keep for each key; later occurrences replace earlier ones.
    key_to_entry: Dict[str, TranslationEntry] = {}
    for entry in entries:
        distinct_lines = key_to_lines[entry.key]
        if entry.line not in distinct_lines:
            distinct_lines.append(entry.line)
        key_to_entry[entry.key] = entry

    warn_duplicate_entries(key_to_lines)

    sorted_keys = sorted(key_to_entry, key=cmp_to_key(compare_string_natural))
    return [key_to_entry[key] for key in sorted_keys]


def warn_duplicate_entries(key_to_lines: Mapping[str, Iterable[str]]):
    """
    Log one warning listing every key that has more than one line, together
    with those lines. Nothing is logged if no key is duplicated.

    :param key_to_lines: Mapping from entry key to iterable containing all
    values.
    """
    report = ['Duplicate key(s)']
    for key, lines in key_to_lines.items():
        if len(lines) > 1:
            report.append(f'\t{key}')
            report += [f'\t\t{line}' for line in lines]
    # Anything beyond the heading means at least one duplicate was found.
    if len(report) > 1:
        get_logger().warning('\n'.join(report))


def compare_string_natural(a: str, b: str):
    """
    Compare two strings using natural ordering.

    Runs of decimal digits are compared by their numeric value; all other
    characters are compared one by one with `cmp_chars`. Strings that are
    equivalent under those rules (for example differing only in leading
    zeros) are ordered by `cmp_string`.

    :param a: The first string.
    :param b: The second string.
    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    ia = 0  # Character index for a.
    ib = 0  # Character index for b.
    while ia < len(a) and ib < len(b):
        # `isdecimal` rather than `isdigit`: the latter also accepts
        # characters such as superscript digits, which `int` cannot parse.
        a_is_number = a[ia].isdecimal()
        b_is_number = b[ib].isdecimal()
        if a_is_number and b_is_number:
            # Compare numbers; equal numbers just advance both indices.
            a_number, ia = read_int(a, ia)
            b_number, ib = read_int(b, ib)
            if a_number != b_number:
                return a_number - b_number
        elif a_is_number or b_is_number or a[ia] != b[ib]:
            # A number against a character, or two different characters.
            return cmp_chars(a[ia], b[ib])
        else:
            # Identical characters.
            ia += 1
            ib += 1
    if ia < len(a):
        # a is "longer".
        return 1
    if ib < len(b):
        # b is "longer".
        return -1
    # a and b are equivalent.
    return cmp_string(a, b)


def read_int(string: str, index: int):
    """
    Read an integer from the string starting at the index.

    Only decimal digits (`str.isdecimal`) are treated as part of the number,
    which guarantees that the extracted text can be converted with `int`.

    :param string: The string.
    :param index: Index in the string where the number starts.
    :return: A tuple containing the number and the index after the end of where
    the number was extracted in the string. If there is no number, returns zero
    and the original index.
    """
    i = index
    while i < len(string) and string[i].isdecimal():
        i += 1
    number = 0 if i == index else int(string[index:i])
    return number, i


def cmp_chars(a: str, b: str):
    """
    Compare two characters according to the `sort -V` command.

    When both characters are in the `SortHelper` table their table positions
    decide; otherwise the characters are compared directly.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    positions = (SortHelper.get_char_index(a), SortHelper.get_char_index(b))
    if None not in positions:
        return positions[0] - positions[1]
    # Sign of the direct comparison: -1, 0 or 1.
    return (a > b) - (a < b)


def cmp_string(a: str, b: str):
    """
    Compare two strings character by character with `cmp_chars`.

    The first differing pair of characters decides. If one string is a prefix
    of the other, the shorter one comes first.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    for pair in zip(a, b):
        order = cmp_chars(*pair)
        if order:
            return order
    return len(a) - len(b)


def log_duration(start_time: datetime, end_time: datetime, file_count):
    """
    Log how many files were updated and the elapsed time in seconds.

    :param start_time: When the work started.
    :param end_time: When the work finished.
    :param file_count: Number of files reported as updated.
    """
    elapsed_seconds = (end_time - start_time).total_seconds()
    get_logger().info(
        'Finished updating %d files.\nTotal time: %fs',
        file_count,
        elapsed_seconds
    )


# Run the formatter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()