Add improved version of generateTranslationDiffs
A Python script that performs better and fixes a bug in which the number '0' (zero) was ignored when sorting and removing duplicates.
This commit is contained in:
parent
64dce38c33
commit
b10d3e36ea
630
tools/generateTranslationDiffs.py
Normal file
630
tools/generateTranslationDiffs.py
Normal file
@ -0,0 +1,630 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This script iterates through interface translation files, moves comments to
|
||||
# the front, puts translated strings next, and finally looks for
|
||||
# untranslated/missing strings by matching against "default" which it then adds
|
||||
# to the translation, each line prepended by "!".
|
||||
#
|
||||
# Developers should run it after receiving a translation file from a
|
||||
# translator:
|
||||
# cp /tmp/new_japanese_translation rtdata/languages/Japanese
|
||||
# ./tools/generateTranslationDiffs.py "Japanese"
|
||||
#
|
||||
# Running the script without an argument iterates through all files.
|
||||
#
|
||||
# Locale files are generated automatically:
|
||||
# - English (UK)
|
||||
|
||||
import argparse
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
from functools import cmp_to_key, reduce
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
import re
|
||||
from sys import stdout
|
||||
from typing import Dict, Iterable, List, Mapping, Optional
|
||||
|
||||
|
||||
# Names of the special files inside the languages directory.
# The reference file every translation is matched against.
FILE_DEFAULT = 'default'
# The translation from which the English locale files are generated.
FILE_ENGLISH_US = 'English (US)'
# Generated locale file; never formatted as a regular translation.
FILE_ENGLISH_UK = 'English (UK)'
|
||||
|
||||
|
||||
class SortHelper:
    """
    String sorting utilities.

    Holds a lazily built lookup table that maps each supported character to
    its rank in the ordering used when two characters are compared.
    """
    # Character -> rank. Stays None until the first lookup builds it.
    char_indices: Optional[Dict[str, int]] = None

    @staticmethod
    def get_char_index(char: str) -> Optional[int]:
        """
        Return the sort order of a character.

        :param char: The character to look up.
        :return: Zero-based rank of the character, or None if the character
        is not part of the table.
        """
        if SortHelper.char_indices is None:
            # Printable characters sorted using the `sort -V` command.
            characters = (
                '.~0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                'abcdefghijklmnopqrstuvwxyz\t\x0b\x0c\n '
                '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}'
            )
            # The loop variable must not be called `char`: rebinding the
            # parameter would make the very first lookup return the rank of
            # the last table character instead of the requested one.
            SortHelper.char_indices = {
                table_char: rank for rank, table_char in enumerate(characters)
            }
        return SortHelper.char_indices.get(char)
|
||||
|
||||
|
||||
class TranslationEntry:
    """
    A single "key;value" definition taken from a translation file.
    """

    def __init__(self, line: str, key: str, value: str):
        """
        :param line: The complete line the entry was read from.
        :param key: The part of the line before the first ';'.
        :param value: The part of the line after the first ';'.
        """
        self.line, self.key, self.value = line, key, value

    def __repr__(self):
        fields = f'line={self.line}, key={self.key}, value={self.value}'
        return f'TranslationEntry({fields})'

    def __str__(self):
        return repr(self)

    @staticmethod
    def create_from_line(line: str):
        """
        Build an entry by splitting the line at its first ';' character.

        :param line: The line holding the entry definition.
        :raises ValueError: If the line has no ';' separating a key from a
        value.
        """
        key, separator, value = line.partition(';')
        if not separator:
            # No ';' anywhere in the line: not a key/value definition.
            raise ValueError()
        return TranslationEntry(line, key, value)
|
||||
|
||||
|
||||
class FileLines:
    """
    The lines of one translation file, grouped by kind.

    Kinds:
    - Comment: Lines starting with the '#' character.
    - Changed: Translation entries, each consisting of a key and value.
    Lines of any other kind are only kept in the list of all lines.
    """

    def __init__(
        self,
        all: List[str],
        comments: List[str],
        changed: List[TranslationEntry]
    ):
        """
        :param all: Every line of the file.
        :param comments: The comment lines.
        :param changed: The translation entries.
        """
        self.all, self.comments, self.changed = all, comments, changed

    def __repr__(self):
        fields = (
            f'all={self.all}, comments={self.comments},'
            f' changed={self.changed}'
        )
        return f'FileLines({fields})'

    def __str__(self):
        return repr(self)
|
||||
|
||||
|
||||
def main():
    """
    Format "default" and the requested translation files, regenerate the
    locale files and log how long the whole run took.
    """
    arguments = parse_args()
    configure_logger()
    started_at = datetime.now()
    paths = get_file_paths(arguments.file_names)
    # "default" is formatted first; its lines are the reference for the rest.
    default_lines = format_default()
    format_files(paths, default_lines)
    generate_locale_files(default_lines, paths)
    finished_at = datetime.now()
    log_duration(started_at, finished_at, len(paths))
|
||||
|
||||
|
||||
def parse_args():
    """
    Parse the command line and return the resulting namespace.

    The namespace has one attribute, `file_names`, holding zero or more
    language file names.
    """
    cli = argparse.ArgumentParser(
        prog='generateTranslationDiffs',
        description='Formats translation files in rtdata/languages.',
    )
    positional_help = (
        'list of language files to format, or leave empty to format all'
    )
    cli.add_argument('file_names', nargs='*', help=positional_help)
    return cli.parse_args()
|
||||
|
||||
|
||||
def configure_logger():
    """
    Attach a stdout handler to the script's logger and set its level to INFO.
    """
    stdout_handler = logging.StreamHandler(stdout)
    stdout_handler.setFormatter(
        logging.Formatter('%(levelname)s: %(message)s')
    )
    script_logger = get_logger()
    script_logger.addHandler(stdout_handler)
    script_logger.setLevel(logging.INFO)
|
||||
|
||||
|
||||
def get_logger():
    """
    Return the logger shared by the whole script.
    """
    logger_name = 'generateTranslationDiffs'
    return logging.getLogger(logger_name)
|
||||
|
||||
|
||||
def get_file_paths(file_names: List[str]):
    """
    Resolve the given file names to paths, dropping every name that does not
    lead to an existing, writable file. With no names at all, return the
    paths of all translation files instead.

    :param file_names: List of file names, as path strings and/or name.
    """
    if not file_names:
        return get_default_file_paths()
    resolved = (get_file_path(name) for name in file_names)
    # Unresolvable names come back as None and are left out.
    return [path for path in resolved if path]
|
||||
|
||||
|
||||
def get_languages_path():
    """
    Return the path of the languages directory.

    The directory is located relative to this script: two levels up from the
    script file, then "rtdata/languages".

    :return: Absolute path of the languages directory.
    """
    # Resolve first: when __file__ is a bare relative name (for example when
    # the script is started from inside its own directory), ".parent" of "."
    # is "." again and the lookup would otherwise end up in the wrong place.
    script_path = Path(__file__).resolve()
    return script_path.parent.parent / 'rtdata' / 'languages'
|
||||
|
||||
|
||||
def get_file_path(file_name):
    """
    Locate a translation file and return its path.

    The name is first tried as a path as given, then as a name inside the
    languages directory. A warning is logged and None is returned when
    neither exists or the match is not a writable file.

    :param file_name: The file name as a path string or name.
    """
    candidates = (Path(file_name), get_languages_path() / file_name)
    file_path = next(
        (candidate for candidate in candidates if candidate.exists()),
        None
    )

    if is_writable_file(file_path):
        return file_path
    get_logger().warning('File "%s" not found or not writable.', file_name)
    return None
|
||||
|
||||
|
||||
def get_default_file_paths():
    """
    List every translation file in the languages directory, leaving out
    "default", the generated locale translations, a few auxiliary files,
    shell scripts and hidden files.
    """
    ignored_files = [
        FILE_DEFAULT, FILE_ENGLISH_UK, 'LICENSE', 'README', 'temp_file'
    ]
    ignored_files_regex = '|'.join(map(re.escape, ignored_files))
    # Also skips "*.sh" and names starting with a dot.
    ignore_pattern = re.compile(rf'({ignored_files_regex}|.*\.sh|\..*)')
    return [
        candidate
        for candidate in get_languages_path().iterdir()
        if candidate.is_file()
        and not ignore_pattern.fullmatch(candidate.name)
    ]
|
||||
|
||||
|
||||
def is_writable_file(path: Optional[Path]) -> bool:
    """
    Return if the file is writable with the current permissions.

    :param path: Path to check, or None when no file was found.
    :return: True if the path refers to an existing regular file that can be
    written to, False otherwise (including when path is None).
    """
    if path is None:
        return False
    # bool() keeps the promise of a real boolean instead of leaking whatever
    # the last operand of the "and" chain happens to be.
    return bool(path.is_file() and os.access(path, os.W_OK))
|
||||
|
||||
|
||||
def format_default():
    """
    Rewrite the default language file as its sorted comments followed by its
    sorted entries.

    :return: File lines of "default".
    """
    logger = get_logger()
    logger.info('Formatting %s.', FILE_DEFAULT)
    default_path = get_languages_path() / FILE_DEFAULT
    file_lines = read_file(default_path)
    entry_lines = [entry.line for entry in file_lines.changed]
    # The trailing '' makes the joined text end with a newline.
    file_lines.all = [*file_lines.comments, *entry_lines, '']
    new_contents = '\n'.join(file_lines.all)
    logger.debug(
        'New contents for file %s:\n%s',
        FILE_DEFAULT,
        new_contents
    )
    default_path.write_text(new_contents)
    return file_lines
|
||||
|
||||
|
||||
def format_files(file_paths: List[Path], default_file_lines: FileLines):
    """
    Format every given translation file, logging a note when there is
    nothing to format.

    :param file_paths: Files to format.
    :param default_file_lines: File lines of the default language file.
    """
    nothing_to_do = not file_paths
    if nothing_to_do:
        get_logger().info('No language files to format.')
    for translation_path in file_paths:
        format_file(translation_path, default_file_lines)
|
||||
|
||||
|
||||
def generate_locale_files(
    default_file_lines: FileLines,
    file_paths: List[Path]
):
    """
    Run the locale generator registered for each given file, if there is one.

    :param default_file_lines: File lines of the default language file.
    :param file_paths: Paths of files to generate locale translations of.
    """
    # Source file name -> function generating the locales derived from it.
    generators = {
        FILE_ENGLISH_US: generate_english_locales,
    }
    for source_path in file_paths:
        generate = generators.get(source_path.name)
        if not generate:
            continue
        generate(default_file_lines, source_path)
|
||||
|
||||
|
||||
def generate_english_locales(default_file_lines: FileLines, us_path: Path):
    """
    Generate the English locale files derived from the US English file.

    :param default_file_lines: File lines of "default".
    :param us_path: Path of the US English translation file.
    """
    us_lines: FileLines = read_file(us_path)
    generate_english_locale_uk(default_file_lines, us_lines)
|
||||
|
||||
|
||||
def generate_english_locale_uk(
    default_file_lines: FileLines,
    us_file_lines: FileLines
):
    """
    Write the UK English locale file.

    The file consists of the US English comments, the entries of "default"
    respelled for UK English, and the remaining US English lines.

    :param default_file_lines: File lines of "default".
    :param us_file_lines: File lines of the US English file.
    """
    logger = get_logger()
    logger.info('Creating %s file', FILE_ENGLISH_UK)

    new_lines = [
        *us_file_lines.comments,
        *get_english_uk_translations(default_file_lines),
        *get_english_uk_untranslated(us_file_lines.all),
        '',  # Ends the joined text with a newline.
    ]

    new_contents = '\n'.join(new_lines)
    logger.debug(
        'New contents for file %s:\n%s',
        FILE_ENGLISH_UK,
        new_contents
    )
    uk_path = get_languages_path() / FILE_ENGLISH_UK
    uk_path.write_text(new_contents)
|
||||
|
||||
|
||||
def get_english_uk_translations(default_file_lines: FileLines):
    """
    Build the UK English entries: every "default" entry whose value mentions
    color, behavior or center (in any letter case), with the spelling
    replacements applied to the value.

    :param default_file_lines: File lines of "default".
    :return: List of "key;value" lines.
    """
    line_pattern = re.compile(r'(color|behavior|center)', flags=re.IGNORECASE)
    # Applied in this order to each selected value.
    replacements = {
        'olor': 'olour',
        'ehavior': 'ehaviour',
        'center': 'centre',
        'Center': 'Centre',
    }

    translations = []
    for default_entry in default_file_lines.changed:
        if not line_pattern.search(default_entry.value):
            continue
        new_value = default_entry.value
        for us_spelling, uk_spelling in replacements.items():
            new_value = new_value.replace(us_spelling, uk_spelling)
        translations.append(f'{default_entry.key};{new_value}')
    return translations
|
||||
|
||||
|
||||
def get_english_uk_untranslated(us_lines: List[str]):
    """
    Filter the US English lines, dropping comments and every entry whose
    value mentions color, behavior or center (in any letter case).

    :param us_lines: All lines of the US English file.
    :return: The remaining lines, in their original order.
    """
    uk_specific = re.compile(
        r'.+;.*(color|behavior|center).*',
        flags=re.IGNORECASE
    )
    return [
        line
        for line in us_lines
        if not (uk_specific.search(line) or line.startswith('#'))
    ]
|
||||
|
||||
|
||||
def format_file(path: Path, default_file_lines: FileLines):
    """
    Rewrite a translation file as comments, translated entries and
    untranslated entries, each section followed by an empty line.

    :param path: Path of the translation file.
    :param default_file_lines: File lines of "default".
    """
    logger = get_logger()
    logger.info(f'Formatting {path.name}.')
    file_lines = read_file(path)
    translated, untranslated = get_translated_untranslated_lines(
        file_lines,
        default_file_lines
    )
    warn_removed_entry(file_lines, default_file_lines)
    file_lines.all = [
        *file_lines.comments,
        '',
        *translated,
        '',
        *untranslated,
        '',
    ]
    new_contents = '\n'.join(file_lines.all)
    logger.debug(
        'New contents for file %s:\n%s',
        path.name,
        new_contents
    )
    path.write_text(new_contents)
|
||||
|
||||
|
||||
def get_translated_untranslated_lines(
    file_lines: FileLines,
    default_file_lines: FileLines
):
    """
    Split the keys of "default" into those the translation defines and those
    it lacks.

    :param file_lines: File lines of the translation.
    :param default_file_lines: File lines of "default".
    :return: A tuple of the translation's own lines for the defined keys and
    the "default" lines, each prefixed with '!', for the missing keys. The
    second list starts with a header when it is not empty. Both lists follow
    the order of "default".
    """
    entries_by_key = {entry.key: entry for entry in file_lines.changed}
    translated_lines = []
    untranslated_lines = []
    for default_entry in default_file_lines.changed:
        own_entry = entries_by_key.get(default_entry.key)
        if own_entry is None:
            untranslated_lines.append(f'!{default_entry.line}')
        else:
            translated_lines.append(own_entry.line)

    if not untranslated_lines:
        return translated_lines, untranslated_lines

    header = [
        '!!!!!!!!!!!!!!!!!!!!!!!!!',
        (
            '! Untranslated keys follow;'
            ' remove the ! prefix after an entry is translated.'
        ),
        '!!!!!!!!!!!!!!!!!!!!!!!!!',
        '',
    ]
    return translated_lines, header + untranslated_lines
|
||||
|
||||
|
||||
def warn_removed_entry(file_lines: FileLines, default_file_lines: FileLines):
    """
    Log one warning listing the translation's entries whose keys are absent
    from the default file. Nothing is logged when there are none.

    :param file_lines: File lines of the translation.
    :param default_file_lines: File lines of "default".
    """
    default_keys = {entry.key for entry in default_file_lines.changed}
    removed_lines = [
        entry.line
        for entry in file_lines.changed
        if entry.key not in default_keys
    ]
    if not removed_lines:
        return
    warning_lines = ['Removed entry/entries']
    warning_lines += [f'\t{line}' for line in removed_lines]
    get_logger().warning('\n'.join(warning_lines))
|
||||
|
||||
|
||||
def read_file(path: Path):
    """
    Return the file lines from a language file.

    Comment lines and translation entries are collected and sorted. Empty
    lines, a bare '#', and lines starting with '!' are only kept in the list
    of all lines. A line that looks like an entry but has no ';' is reported
    with a warning and otherwise ignored.

    :param path: Path of the language file.
    :return: The categorized lines of the file.
    """
    # Begins with '#' followed by 1+ characters.
    comment_pattern = re.compile(r'^#.+')
    # Begins with any character other than '!' or '#'. An empty line cannot
    # match because the class consumes one character; a '$' inside the class
    # would be a literal dollar sign, not an end-of-line anchor.
    changed_pattern = re.compile(r'^[^!#]')
    file_lines = FileLines(path.read_text().splitlines(), [], [])
    # Count from 1 so that reported line numbers match what an editor shows.
    for line_num, line in enumerate(file_lines.all, start=1):
        if comment_pattern.match(line):
            file_lines.comments.append(line)
        elif changed_pattern.match(line):
            try:
                translation_entry = TranslationEntry.create_from_line(line)
            except ValueError:
                get_logger().warning(
                    'Malformed translation entry in "%s" on line %d: %s',
                    path.name,
                    line_num,
                    line
                )
            else:
                file_lines.changed.append(translation_entry)
    sort_file_lines(file_lines)
    return file_lines
|
||||
|
||||
|
||||
def sort_file_lines(file_lines: FileLines):
    """
    Sort the comments and the entries of the file lines in place.

    :param file_lines: The file lines whose lists are replaced by their
    sorted contents.
    """
    sorted_comments = sort_comment_lines(file_lines.comments)
    sorted_entries = sort_changed_unchanged_lines(file_lines.changed)
    # Slice assignment keeps the same list objects, as callers may hold them.
    file_lines.comments[:] = sorted_comments
    file_lines.changed[:] = sorted_entries
|
||||
|
||||
|
||||
def sort_comment_lines(lines: List[str]):
    """
    Remove duplicate comment lines and return the rest in natural order.

    :param lines: The comment lines.
    :return: A new sorted list without duplicates.
    """
    unique_lines = list(set(lines))
    unique_lines.sort(key=cmp_to_key(compare_string_natural))
    return unique_lines
|
||||
|
||||
|
||||
def sort_changed_unchanged_lines(entries: List[TranslationEntry]):
    """
    Sort changed or unchanged lines using natural sort of the translation entry
    keys.

    Each key is kept once. When a key occurs with several different lines, a
    warning is emitted and the line that comes last in `entries` is kept, so
    the result does not vary between runs.

    :param entries: The translation entries to sort.
    :return: A new list with one entry per key, sorted by key.
    """
    # Lists instead of sets: they keep the order of appearance, which makes
    # both the duplicate warning and the surviving line deterministic.
    key_to_lines = defaultdict(list)
    for entry in entries:
        lines_for_key = key_to_lines[entry.key]
        if entry.line not in lines_for_key:
            lines_for_key.append(entry.line)

    warn_duplicate_entries(key_to_lines)

    sort_key = cmp_to_key(lambda a, b: compare_string_natural(a[0], b[0]))
    return [
        TranslationEntry.create_from_line(lines_for_key[-1])
        for _, lines_for_key in sorted(key_to_lines.items(), key=sort_key)
    ]
|
||||
|
||||
|
||||
def warn_duplicate_entries(key_to_lines: Mapping[str, Iterable[str]]):
    """
    Log one warning listing every key that has more than one line, together
    with those lines. Nothing is logged when there is no such key.

    :param key_to_lines: Mapping from entry key to iterable containing all
    values.
    """
    details = []
    for key, lines in key_to_lines.items():
        if len(lines) <= 1:
            continue
        details.append(f'\t{key}')
        details.extend(f'\t\t{line}' for line in lines)
    if details:
        get_logger().warning('\n'.join(['Duplicate key(s)'] + details))
|
||||
|
||||
|
||||
def compare_string_natural(a: str, b: str):
    """
    Compare two strings using natural ordering.

    Runs of decimal digits are compared by their numeric value; everything
    else is compared character by character.

    :param a: The first string.
    :param b: The second string.
    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    ia = 0  # Character index for a.
    ib = 0  # Character index for b.
    while ia < len(a) and ib < len(b):
        # isdecimal(), not isdigit(): isdigit() also accepts characters such
        # as superscript digits, which int() cannot convert.
        a_is_number = a[ia].isdecimal()
        b_is_number = b[ib].isdecimal()
        if a_is_number and b_is_number:
            # Compare numbers; both indices move past the digits read.
            a_number, ia = read_int(a, ia)
            b_number, ib = read_int(b, ib)
            if a_number != b_number:
                return a_number - b_number
        elif a_is_number or b_is_number or a[ia] != b[ib]:
            # Number against character, or two different characters.
            return cmp_chars(a[ia], b[ib])
        else:
            # Same character on both sides.
            ia += 1
            ib += 1
    if ia < len(a):
        # a is "longer".
        return 1
    if ib < len(b):
        # b is "longer".
        return -1
    # a and b are equivalent under natural ordering (for example "01" and
    # "1"); fall back to a plain comparison so only identical strings tie.
    return cmp_string(a, b)


def read_int(string: str, index: int):
    """
    Read an integer from the string starting at the index.

    :param string: The string.
    :param index: Index in the string where the number starts.
    :return: A tuple containing the number and the index after the end of where
    the number was extracted in the string. If there is no number, returns zero
    and the original index.
    """
    i = index
    # Only decimal digits are consumed, so int() below always succeeds.
    while i < len(string) and string[i].isdecimal():
        i += 1
    number = 0 if i == index else int(string[index:i])
    return number, i
|
||||
|
||||
|
||||
def cmp_chars(a: str, b: str):
    """
    Compare two characters following the order of the `sort -V` command.

    The ranks from the character table are used when both characters are in
    it; otherwise the characters are compared directly.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    rank_a = SortHelper.get_char_index(a)
    rank_b = SortHelper.get_char_index(b)
    if rank_a is None or rank_b is None:
        # At least one character is outside the table.
        if a == b:
            return 0
        return -1 if a < b else 1
    return rank_a - rank_b
|
||||
|
||||
|
||||
def cmp_string(a: str, b: str):
    """
    Compare two strings character by character, without any special handling
    of numbers.

    The first pair of differing characters decides the result. If one string
    is a prefix of the other, the shorter one comes first.

    :return: Negative integer if a comes before b, positive integer if b comes
    before a, or zero if a and b are identical.
    """
    pair_results = (cmp_chars(a_char, b_char) for a_char, b_char in zip(a, b))
    first_difference = next(
        (result for result in pair_results if result != 0),
        None
    )
    if first_difference is None:
        return len(a) - len(b)
    return first_difference
|
||||
|
||||
|
||||
def log_duration(start_time: datetime, end_time: datetime, file_count):
    """
    Log the number of files updated and the time between the two timestamps.

    :param start_time: When the work started.
    :param end_time: When the work finished.
    :param file_count: Number of files reported as updated.
    """
    elapsed_seconds = (end_time - start_time).total_seconds()
    get_logger().info(
        'Finished updating %d files.\nTotal time: %fs',
        file_count,
        elapsed_seconds
    )
|
||||
|
||||
|
||||
# Run the formatter only when this file is executed as a script.
if __name__ == '__main__':
    main()
|
Loading…
x
Reference in New Issue
Block a user