rawTherapee/tools/generateTranslationDiffs.py
Lawrence Lee b10d3e36ea
Add improved version of generateTranslationDiffs
A Python script with better performance and does not have a bug in which
the '0' (zero) number is ignored when sorting and removing duplicates.
2024-07-06 22:24:32 -07:00

631 lines
18 KiB
Python

#!/usr/bin/env python
# This script iterates through interface translation files, moves comments to
# the front, puts translated strings next, and finally looks for
# untranslated/missing strings by matching against "default" which it then adds
# to the translation, each line prepended by "!".
#
# Developers should run it after receiving a translation file from a
# translator:
# cp /tmp/new_japanese_translation rtdata/languages/Japanese
# ./tools/generateTranslationDiffs "Japanese"
#
# Running the script without an argument iterates through all files.
#
# Locale files are generated automatically:
# - English (UK)
import argparse
from collections import defaultdict
from datetime import datetime
from functools import cmp_to_key, reduce
import logging
import os
from pathlib import Path
import re
from sys import stdout
from typing import Dict, Iterable, List, Mapping, Optional
FILE_DEFAULT = 'default'
FILE_ENGLISH_US = 'English (US)'
FILE_ENGLISH_UK = 'English (UK)'
class SortHelper:
"""
String sorting utilities.
"""
char_indices: Optional[Dict[str, str]] = None
@staticmethod
def get_char_index(char: str):
"""
Return the sort order of a character.
"""
if SortHelper.char_indices is None:
# Printable characters sorted using the `sort -V` command.
characters = (
'.~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
'abcdefghijklmnopqrstuvwxyz\t\x0b\x0c\n '
'!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}'
)
SortHelper.char_indices = {}
for i, char in enumerate(characters):
SortHelper.char_indices[char] = i
return SortHelper.char_indices.get(char)
class TranslationEntry:
"""
An entry in a translation file, consisting of a key and value.
"""
def __init__(self, line: str, key: str, value: str):
"""
:param line: The entire line containing the entry.
:param key: The key.
:param value: The value.
"""
self.line = line
self.key = key
self.value = value
def __repr__(self):
return (
f'TranslationEntry(line={self.line}, key={self.key},'
f' value={self.value})'
)
def __str__(self):
return repr(self)
@staticmethod
def create_from_line(line: str):
"""
Create an instance of this class from a line containing the entry
definition.
:raises ValueError: If the line does not contain a valid definition
consisting of a key and value separated by a ';' character.
"""
split_line = line.split(';', maxsplit=1)
if len(split_line) != 2:
raise ValueError()
key, value = split_line
return TranslationEntry(line, key, value)
class FileLines:
"""
Lines of a translation file categorized by type.
The types are:
- Comment: Comments, which start with the '#' character.
- Changed: Translation entries consisting of a key and value.
Other lines are ignored.
"""
def __init__(
self,
all: List[str],
comments: List[str],
changed: List[TranslationEntry]
):
self.all = all
self.comments = comments
self.changed = changed
def __repr__(self):
return (
f'FileLines(all={self.all}, comments={self.comments},'
f' changed={self.changed})'
)
def __str__(self):
return repr(self)
def main():
args = parse_args()
configure_logger()
start_time = datetime.now()
file_paths = get_file_paths(args.file_names)
default_file_lines = format_default()
format_files(file_paths, default_file_lines)
generate_locale_files(default_file_lines, file_paths)
end_time = datetime.now()
log_duration(start_time, end_time, len(file_paths))
def parse_args():
"""
Return the arguments passed to this program.
"""
parser = argparse.ArgumentParser(
prog='generateTranslationDiffs',
description='Formats translation files in rtdata/languages.',
)
parser.add_argument(
'file_names',
nargs='*',
help='list of language files to format, or leave empty to format all',
)
return parser.parse_args()
def configure_logger():
"""
Set up the logger.
"""
logger = get_logger()
handler = logging.StreamHandler(stdout)
formatter = logging.Formatter('%(levelname)s: %(message)s')
handler.setFormatter(formatter)
logger.addHandler(handler)
logger.setLevel(logging.INFO)
def get_logger():
"""
Return the logger.
"""
return logging.getLogger('generateTranslationDiffs')
def get_file_paths(file_names: List[str]):
"""
Return the paths for all the given file names if they exist and are
writable. If no files names are given, return paths of all translation
files.
:param file_names: List of file names, as path strings and/or name.
"""
if not file_names:
return get_default_file_paths()
return list(filter(lambda p: p, (get_file_path(n) for n in file_names)))
def get_languages_path():
"""
Return the path of the languages directory.
"""
return Path(__file__).parent.parent / 'rtdata' / 'languages'
def get_file_path(file_name):
"""
Return the path for the translation file, or None if it doesn't exist or is
not writable.
:param file_name: The file name as a path string or name.
"""
file_path = None
if Path(file_name).exists():
file_path = Path(file_name)
elif (get_languages_path() / file_name).exists():
file_path = get_languages_path() / file_name
if not is_writable_file(file_path):
get_logger().warning('File "%s" not found or not writable.', file_name)
return None
return file_path
def get_default_file_paths():
"""
Return a list of paths for all translation files excluding "default" and
locale translations.
"""
ignored_files = [
FILE_DEFAULT, FILE_ENGLISH_UK, 'LICENSE', 'README', 'temp_file'
]
ignored_files_regex = '|'.join(re.escape(file) for file in ignored_files)
ignore_pattern = re.compile(rf'({ignored_files_regex}|.*\.sh|\..*)')
return list(filter(
lambda p: p.is_file() and not ignore_pattern.fullmatch(p.name),
get_languages_path().iterdir()
))
def is_writable_file(path: Path):
"""
Return if the file is writable with the current permissions.
"""
return path and path.is_file() and os.access(path, os.W_OK)
def format_default():
"""
Format the default language file.
:return: File lines of "default".
"""
get_logger().info('Formatting %s.', FILE_DEFAULT)
path = get_languages_path() / FILE_DEFAULT
file_lines = read_file(path)
changed_lines = [key_line.line for key_line in file_lines.changed]
file_lines.all = file_lines.comments + changed_lines + ['']
new_contents = '\n'.join(file_lines.all)
get_logger().debug(
'New contents for file %s:\n%s',
FILE_DEFAULT,
new_contents
)
path.write_text(new_contents)
return file_lines
def format_files(file_paths: List[Path], default_file_lines: FileLines):
"""
Format the translation files.
:param file_paths: Files to format.
:param default_file_lines: File lines of the default language file.
"""
if not file_paths:
get_logger().info('No language files to format.')
for file_path in file_paths:
format_file(file_path, default_file_lines)
def generate_locale_files(
default_file_lines: FileLines,
file_paths: List[Path]
):
"""
Generate locale translation files.
:param default_file_lines: File lines of the default language file.
:param file_paths: Paths of files to generate locale translations of.
"""
file_name_to_generator = {
FILE_ENGLISH_US: generate_english_locales,
}
for path in file_paths:
generator = file_name_to_generator.get(path.name)
if generator:
generator(default_file_lines, path)
def generate_english_locales(default_file_lines: FileLines, us_path: Path):
"""
Generate English locale files.
"""
us_file_lines: FileLines = read_file(us_path)
generate_english_locale_uk(default_file_lines, us_file_lines)
def generate_english_locale_uk(
default_file_lines: FileLines,
us_file_lines: FileLines
):
"""
Generate the UK English locale file.
"""
get_logger().info('Creating %s file', FILE_ENGLISH_UK)
new_lines = list(us_file_lines.comments)
new_lines.extend(get_english_uk_translations(default_file_lines))
new_lines.extend(get_english_uk_untranslated(us_file_lines.all))
new_lines.append('')
new_contents = '\n'.join(new_lines)
get_logger().debug(
'New contents for file %s:\n%s',
FILE_ENGLISH_UK,
new_contents
)
path = get_languages_path() / FILE_ENGLISH_UK
path.write_text(new_contents)
def get_english_uk_translations(default_file_lines: FileLines):
"""
Return a list of lines with translated entries for UK English.
:param default_file_lines: File lines of "default".
"""
line_pattern = re.compile(r'(color|behavior|center)', flags=re.IGNORECASE)
replacements = {
'olor': 'olour',
'ehavior': 'ehaviour',
'center': 'centre',
'Center': 'Centre',
}
entries_to_translate = filter(
lambda entry: line_pattern.search(entry.value),
default_file_lines.changed
)
translations = []
for default_entry in entries_to_translate:
new_value = reduce(
lambda value, replacement: value.replace(*replacement),
replacements.items(),
default_entry.value
)
translations.append(f'{default_entry.key};{new_value}')
return translations
def get_english_uk_untranslated(us_lines: List[str]):
"""
Return a list of lines from the US English file excluding comments and
those translated for UK English.
"""
pattern = re.compile(
r'.+;.*(color|behavior|center).*',
flags=re.IGNORECASE
)
return list(filter(
lambda line: not pattern.search(line) and not line.startswith('#'),
us_lines
))
def format_file(path: Path, default_file_lines: FileLines):
"""
Format a translation file.
:param path: Path of the translation file.
:param default_file_lines: File lines of "default".
"""
get_logger().info(f'Formatting {path.name}.')
file_lines = read_file(path)
translated_lines, untranslated_lines = get_translated_untranslated_lines(
file_lines,
default_file_lines
)
warn_removed_entry(file_lines, default_file_lines)
new_lines = list(file_lines.comments)
new_lines.append('')
new_lines.extend(translated_lines)
new_lines.append('')
new_lines.extend(untranslated_lines)
new_lines.append('')
file_lines.all = new_lines
new_contents = '\n'.join(file_lines.all)
get_logger().debug(
'New contents for file %s:\n%s',
path.name,
new_contents
)
path.write_text(new_contents)
def get_translated_untranslated_lines(
file_lines: FileLines,
default_file_lines: FileLines
):
"""
Return a tuple containing a list of translated lines and a list of
untranslated lines.
"""
key_to_entry = {
entry.key: entry for entry in file_lines.changed
}
translated_lines = []
untranslated_lines = []
for default_key_line in default_file_lines.changed:
key = default_key_line.key
if key in key_to_entry:
translated_lines.append(key_to_entry[key].line)
else:
untranslated_lines.append(f'!{default_key_line.line}')
if untranslated_lines:
header = [
'!!!!!!!!!!!!!!!!!!!!!!!!!',
(
'! Untranslated keys follow;'
' remove the ! prefix after an entry is translated.'
),
'!!!!!!!!!!!!!!!!!!!!!!!!!',
'',
]
untranslated_lines = header + untranslated_lines
return translated_lines, untranslated_lines
def warn_removed_entry(file_lines: FileLines, default_file_lines: FileLines):
"""
Emit a warning for any translation entries in the translation file but not
in the default file, if any.
"""
default_keys = set(entry.key for entry in default_file_lines.changed)
removed_lines = [
entry.line for entry in file_lines.changed if entry.key not in
default_keys
]
if removed_lines:
warning_lines = ['Removed entry/entries']
warning_lines.extend([f'\t{line}' for line in removed_lines])
get_logger().warning('\n'.join(warning_lines))
def read_file(path: Path):
"""
Return the file lines from a language file.
"""
# Begins with '#' followed by 1+ characters.
comment_pattern = re.compile(r'^#.+')
# Does not begin with '!', '#', or end of line.
changed_pattern = re.compile(r'^[^!#$]')
file_lines = FileLines(path.read_text().splitlines(), [], [])
for line_num, line in enumerate(file_lines.all):
if comment_pattern.match(line):
file_lines.comments.append(line)
elif changed_pattern.match(line):
try:
translation_entry = TranslationEntry.create_from_line(line)
except ValueError:
get_logger().warning(
'Malformed translation entry in "%s" on line %d: %s',
path.name,
line_num,
line
)
else:
file_lines.changed.append(translation_entry)
sort_file_lines(file_lines)
return file_lines
def sort_file_lines(file_lines: FileLines):
"""
Sort the comments and changed entries of a file lines.
"""
comments = sort_comment_lines(file_lines.comments)
changed = sort_changed_unchanged_lines(file_lines.changed)
file_lines.comments.clear()
file_lines.changed.clear()
file_lines.comments.extend(comments)
file_lines.changed.extend(changed)
def sort_comment_lines(lines: List[str]):
"""
Sort comment lines using natural sort.
"""
return sorted(set(lines), key=cmp_to_key(compare_string_natural))
def sort_changed_unchanged_lines(entries: List[TranslationEntry]):
"""
Sort changed or unchanged lines using natural sort of the translation entry
keys.
"""
key_to_lines = defaultdict(set)
for entry in entries:
key_to_lines[entry.key].add(entry.line)
warn_duplicate_entries(key_to_lines)
sort_key = cmp_to_key(lambda a, b: compare_string_natural(a[0], b[0]))
return list(map(
lambda item: TranslationEntry.create_from_line(item[1].pop()),
sorted(key_to_lines.items(), key=sort_key)
))
def warn_duplicate_entries(key_to_lines: Mapping[str, Iterable[str]]):
"""
Emit a warning if there are duplicate translation entries.
:param key_to_lines: Mapping from entry key to iterable containing all
values.
"""
duplicate_key_to_lines = {
k: v for k, v in key_to_lines.items() if len(v) > 1
}
if duplicate_key_to_lines:
warning_lines = ['Duplicate key(s)']
for key, lines in duplicate_key_to_lines.items():
warning_lines.append(f'\t{key}')
warning_lines.extend(f'\t\t{line}' for line in lines)
get_logger().warning('\n'.join(warning_lines))
def compare_string_natural(a: str, b: str):
"""
Compare two strings using natural ordering.
:return: Negative integer if a comes before b, positive integer if b comes
before a, or zero if a and b are identical.
"""
ia = 0 # Character index for a.
ib = 0 # Character index for b.
while ia < len(a) and ib < len(b):
if a[ia].isdigit():
if b[ib].isdigit():
# Compare numbers
a_number, ia = read_int(a, ia)
b_number, ib = read_int(b, ib)
if a_number != b_number:
return a_number - b_number
else:
# Compare number with character.
return cmp_chars(a[ia], b[ib])
else:
if b[ib].isdigit():
# Compare character with number.
return cmp_chars(a[ia], b[ib])
else:
# Compare character with character.
if a[ia] != b[ib]:
return cmp_chars(a[ia], b[ib])
ia += 1
ib += 1
if ia < len(a):
# a is "longer".
return 1
if ib < len(b):
# b is "longer".
return -1
# a and b are equivalent.
return cmp_string(a, b)
def read_int(string: str, index: int):
"""
Read an integer from the string starting at the index.
:param string: The string.
:param index: Index in the string where the number starts.
:return: A tuple containing the number and the index after the end of where
the number was extracted in the string. If there is no number, returns zero
and the original index.
"""
i = index
while i < len(string) and string[i].isdigit():
i += 1
number = 0 if i == index else int(string[index:i])
return number, i
def cmp_chars(a: str, b: str):
"""
Compare two characters according to the `sort -v` command.
:return: Negative integer if a comes before b, positive integer if b comes
before a, or zero if a and b are identical.
"""
a_index = SortHelper.get_char_index(a)
b_index = SortHelper.get_char_index(b)
if a_index is not None and b_index is not None:
return a_index - b_index
if a == b:
return 0
return -1 if a < b else 1
def cmp_string(a: str, b: str):
"""
Compare two strings according to a character-by-character comparison using
the `sort -v` command.
:return: Negative integer if a comes before b, positive integer if b comes
before a, or zero if a and b are identical.
"""
for a_char, b_char in zip(a, b):
cmp_result = cmp_chars(a_char, b_char)
if cmp_result != 0:
return cmp_result
return len(a) - len(b)
def log_duration(start_time: datetime, end_time: datetime, file_count):
"""
Log the time it took to format the files.
"""
duration = end_time - start_time
get_logger().info(
'Finished updating %d files.\nTotal time: %fs',
file_count,
duration.total_seconds()
)
if __name__ == '__main__':
main()