Source code for toolforge_i18n._translations

import json
import os
import re
from collections.abc import Callable, Mapping, Sequence
from dataclasses import dataclass, field
from typing import Any, Literal

import babel

from toolforge_i18n._get_gender import get_gender_by_user_name


# Lossless MediaWiki -> Babel language code mappings used by language_code_to_babel().
# Built once at import time rather than on every call; treat it as read-only.
_MEDIAWIKI_TO_BABEL: dict[str, str] = {
    'be-tarask': 'be_TARASK',
    'de-formal': 'de',  # the formal/informal distinction is irrelevant for I18nFormatter
    'nds-nl': 'nds_NL',
    'kk-cyrl': 'kk',  # Cyrl implied: see e.g. babel.Locale('kk').languages['en']
    'ko-kp': 'ko_KP',
    'ks-arab': 'ks_Arab',
    'ku-latn': 'ku',  # Latn implied: see e.g. babel.Locale('ku').languages['en']
    'ms-arab': 'ms_Arab',
    'nl-informal': 'nl',  # the formal/informal distinction is irrelevant for I18nFormatter
    'pt-br': 'pt_BR',
    'sr-ec': 'sr_Cyrl',
    'sr-el': 'sr_Latn',
    'tg-cyrl': 'tg',  # Cyrl implied: see e.g. babel.Locale('tg').languages['en']
    'tt-cyrl': 'tt',  # Cyrl implied: see e.g. babel.Locale('tt').languages['en']
    'ug-arab': 'ug',  # Arab implied: see e.g. babel.Locale('ug').languages['en']
    'yue-hant': 'yue_Hant',
    'zh-hans': 'zh_Hans',
    'zh-hant': 'zh_Hant',
}


def language_code_to_babel(code: str) -> str:
    """Default implementation to map a MediaWiki language code to Babel.

    This implementation is conservative and only maps language codes
    where Babel has an alternative that does not lose any information
    (at least as far as toolforge_i18n is concerned).
    Any code without such a mapping is returned unchanged.

    MediaWiki also supports many language codes that (as far as I know)
    have no lossless equivalent in Babel, such as (at the time of writing)
    sh-latn (Serbo-Croatian in Latin script).
    If your tool is translated into one of those languages,
    you will have to configure a custom ``language_code_to_babel`` implementation
    in your ``tool_translations_config`` and pick some lossy fallback
    (e.g. hr, Croatian, for sh-latn).
    Your implementation should generally delegate to this one first, for instance::

        def language_code_to_babel(code: str) -> str:
            mapped = toolforge_i18n.language_code_to_babel(code)
            if mapped != code:
                return mapped
            return {
                'sh-latn': 'hr',
                # ...
            }.get(code, code.partition('-')[0])
    """
    return _MEDIAWIKI_TO_BABEL.get(code, code)
@dataclass
class TranslationsConfig:
    """Configuration for loading message translations.

    To use this library, a tool should define a ``tool_translations_config`` module
    which exports a ``config`` member of this type, like so::

        # tool_translations_config.py
        from toolforge_i18n import TranslationsConfig

        config = TranslationsConfig(
            # ...
        )

    The most important config to define is :py:attr:`~variables`,
    which most tools will need (unless all your messages have no variables);
    the others may or may not be necessary depending on the tool.
    """

    directory: str = 'i18n/'
    """The path to the directory to load message files from."""

    variables: Mapping[str, Sequence[str]] = field(default_factory=dict)
    """Variable names used in messages.

    The source messages use $1, $2 etc.,
    but the Python format strings use named variables,
    whose names are specified here.
    The variable name (or its prefix) encodes the type:

    * ``url``, ``url_*`` - hyperlink:
      ``[$1 text]`` => ``{url!h:text}``
    * ``user_name``, ``user_name_*`` - gender:
      ``{{GENDER:$1|he|she|they}}`` => ``{user_name!g:m=he:f=she:n=they}``
    * ``num``, ``num_*`` - plural:
      ``{{PLURAL:$1|one egg|$1 eggs}}`` => ``{num_eggs!p:one=one egg:other={num_eggs} eggs}``
    * ``list``, ``list_*`` - list:
      ``$1`` => ``{list_chicken_names!l}``
    * anything else - markup without further formatting:
      ``$1`` => ``{description}``
    """

    derived_messages: Mapping[str, tuple[str, Callable[[str], str]]] = field(default_factory=dict)
    """Messages that are derived from other messages.

    The key is a message key that is not expected in the JSON files,
    but that is instead generated by taking another message
    (whose key is the first element of the tuple)
    and sending it through the callable in the second element of the tuple.
    Examples for that callable include the identity function (to copy a message)
    or simple case transformations.
    """

    language_code_to_babel: Callable[[str], str] = language_code_to_babel
    """Mapping from MediaWiki to Babel language codes.

    Message files use MediaWiki language codes,
    which are not always standard language codes;
    additionally, MediaWiki supports many languages that Babel does not,
    even when the language code is standard.
    You will need to map these codes to a supported alternative
    if you have any translations for such language codes.
    """

    allowed_html_elements: dict[str, set[str]] = field(
        default_factory=lambda: {
            'abbr': {'title'},
            'kbd': set(),
            'q': set(),
        }
    )
    """HTML elements that should be allowed in messages.

    The key is an element name,
    and the value is a set of attributes that are allowed on that element.
    All other elements and attributes will cause a test failure.
    (See also :py:attr:`~allowed_global_attributes`.)
    """

    allowed_global_attributes: set[str] = field(
        default_factory=lambda: {
            'dir',
            'lang',
        }
    )
    """HTML attributes that should be allowed on any element in messages.

    This is similar to :py:attr:`~allowed_html_elements`,
    but the given attribute names are allowed regardless of element name.
    """

    get_gender: Callable[[Any], Literal['m', 'f', 'n']] = get_gender_by_user_name
    """The ``get_gender`` function used by :py:class:`I18nFormatter`.

    Defaults to a function that expects the argument to be a user name,
    and looks it up on Meta-Wiki.
    You may specify a different function,
    where the type (the value you pass as user_name= into messages)
    can be anything you want.
    """

    check_translations: bool = True
    """Whether to check translations when they are loaded.

    By default, translations are checked as soon as they are loaded,
    and if there is a problem with the translations,
    an error is raised and the translations cannot be used.
    (This generally means that the tool cannot run;
    you will probably have to revert the latest localisation updates
    and fix the translation on translatewiki.net.)
    This protects against broken or even malicious messages.

    If you have set up Continuous Integration (CI),
    e.g. using GitLab CI or GitHub actions,
    and you are running ``pytest`` as part of your tests,
    then the translations checks will also be registered as tests
    (you should see various ``i18n/*.json`` files in pytest’s output).
    In this case, assuming CI also runs on translatewiki.net exports
    (and you won’t merge any localisation updates where CI fails),
    you can set this config to ``False`` to disable the runtime checks;
    this will speed up translation loading and therefore the tool’s startup
    (for a well-translated tool, by more than a second).

    Beware that, if you set this to ``False``,
    only the pytest integration in CI protects your tool from malicious translations.
    You must be confident that CI will run, and will run all pytest tests,
    and if possible you should configure your repository
    so that localisation updates cannot be merged if CI fails
    (in GitLab: Settings > Merge requests > Pipelines must succeed;
    no direct equivalent in GitHub).
    Otherwise, it is always safe to leave this set to ``True``.
    """
def mw2py(mw: str, locale: babel.Locale, variables: Sequence[str]) -> str:
    """Convert a MediaWiki-formatted message to a Python format string.

    The following conversions are applied in this order,
    each one operating on the result of the previous one:

    * ``{{PLURAL:$N|...}}`` => ``{name!p:...}``
    * ``{{GENDER:$N|...}}`` => ``{name!g:...}``
    * ``[$N text]`` => ``{name!h:text}``
    * any remaining ``$N`` => ``{name!l}`` if the variable is a list
      (named ``list`` or ``list_*``), otherwise ``{name}``

    In each case, ``name`` is ``variables[N - 1]``.

    :param mw: the message in MediaWiki syntax
    :param locale: the locale of the message; only its ``plural_form.tags``
        are used, to decide which plural categories the plural forms stand for
    :param variables: the variable names for ``$1``, ``$2`` etc., in that order
    :raises IndexError: if the message uses a ``$N`` for which ``variables`` has no name
    :raises AssertionError: if the text of a ``[$N text]`` link contains ``{`` or ``}``
        (also after the PLURAL and GENDER conversions, i.e. those cannot be used inside links)
    """

    def variable_name(match: re.Match[str]) -> str:
        # group 1 is always the N of $N; the patterns guarantee N >= 1
        return variables[int(match[1]) - 1]

    def replace_plural(match: re.Match[str]) -> str:
        variable = variable_name(match)
        plurals = []  # finished 'key=text' parts; explicit-number forms come first
        tag_args = []  # form texts that still need to be paired with a plural category
        for arg in match[2].split('|'):
            key, separator, _ = arg.partition('=')
            # Only "N=text" is an explicit-number form. A form without '=' whose
            # whole text happens to be a number is an ordinary plural form.
            if separator and key.isnumeric():
                plurals.append(arg)
            else:
                tag_args.append(arg)
        # the categories of this locale in a fixed order; the last form given is always 'other'
        tags = [tag for tag in ['zero', 'one', 'two', 'few', 'many'] if tag in locale.plural_form.tags]
        tags = tags[: len(tag_args) - 1] + ['other']
        # strict=False: surplus forms (more forms than categories) are dropped
        plurals.extend(f'{tag}={tag_arg}' for tag, tag_arg in zip(tags, tag_args, strict=False))
        return '{' + variable + '!p:' + ':'.join(plurals) + '}'

    py = re.sub(r'\{\{PLURAL:\$([1-9][0-9]*)\|([^}]*)\}\}', replace_plural, mw)

    def replace_gender(match: re.Match[str]) -> str:
        variable = variable_name(match)
        args = match[2].split('|')
        # forms are given in the order masculine, feminine, neutral; later ones may be missing
        genders = [f'{gender}={arg}' for gender, arg in zip(['m', 'f', 'n'], args, strict=False)]
        return '{' + variable + '!g:' + ':'.join(genders) + '}'

    py = re.sub(r'\{\{GENDER:\$([1-9][0-9]*)\|([^}]*)\}\}', replace_gender, py)

    def replace_hyperlink(match: re.Match[str]) -> str:
        variable = variable_name(match)
        inner_html = match[2]
        # Braces in the link text would end up nested inside the {name!h:...} field.
        # Raised explicitly rather than via an ``assert`` statement,
        # so that the check is not stripped when Python runs with -O.
        if '{' in inner_html or '}' in inner_html:
            raise AssertionError(f'Link text must not contain braces: {match[0]!r}')
        return '{' + variable + '!h:' + inner_html + '}'

    py = re.sub(r'\[\$([1-9][0-9]*) ([^]]*)\]', replace_hyperlink, py)

    def replace_unconverted(match: re.Match[str]) -> str:
        variable = variable_name(match)
        # both the exact name 'list' and the 'list_' prefix denote list variables
        if variable == 'list' or variable.startswith('list_'):
            return '{' + variable + '!l}'
        return '{' + variable + '}'

    py = re.sub(r'\$([1-9][0-9]*)', replace_unconverted, py)

    return py
def load_translations(config: TranslationsConfig) -> tuple[dict[str, dict[str, str]], dict[str, str]]:
    """Load the translations according to the given ``config``.

    Returns a tuple of ``translations, documentation``
    where ``translations`` is a nested ``dict`` from language code to message key to message,
    and ``documentation`` is a ``dict`` from message key to message documentation.
    The messages in ``translations`` are Python format strings
    intended to be formatted by :py:class:`~I18nFormatter`.

    Every ``*.json`` file directly inside :py:attr:`~TranslationsConfig.directory` is read (as UTF-8);
    ``qqq.json`` provides the documentation, every other file the translations
    for the language code in its file name.
    Keys starting with ``@`` (metadata) are skipped.

    If :py:attr:`~TranslationsConfig.check_translations` is enabled in the ``config``,
    the translation checks are run before this function returns,
    ensuring that the translations are safe to use.

    Flask-based tools don’t need to call this function directly
    (it’s called by :py:class:`~ToolforgeI18n`).

    :raises babel.UnknownLocaleError: if a (mapped) language code is not known to Babel;
        a note explaining the likely configuration problem is added to the exception
    """
    translations: dict[str, dict[str, str]] = {}
    documentation: dict[str, str] = {}
    # context manager: the directory handle is closed even if loading a file fails
    with os.scandir(config.directory) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            match = re.match(r'(.*)\.json$', entry.name)
            if not match:
                continue
            language = match[1]
            # explicit encoding: do not depend on the platform's default encoding
            with open(entry.path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if language == 'qqq':
                # message documentation, not a translation
                documentation = {key: value for key, value in data.items() if not key.startswith('@')}
                continue
            babel_language = config.language_code_to_babel(language)
            try:
                locale = babel.Locale(babel_language)
            except babel.UnknownLocaleError as e:
                if babel_language != language:
                    note = f"The translation language code '{language}' was mapped to the Babel language code '{babel_language}', yet the "
                else:
                    note = 'The '
                note += f"language code '{babel_language}' was not recognized by Babel. "
                if config.language_code_to_babel != language_code_to_babel:
                    note += 'You may have to update your language_code_to_babel implementation in your tool_translations_config.'
                else:
                    note += 'You may have to configure a custom language_code_to_babel implementation in your tool_translations_config.'
                e.add_note(note)
                raise
            messages: dict[str, str] = {}
            translations[language] = messages
            for key, value in data.items():
                if key.startswith('@'):
                    continue
                messages[key] = mw2py(value, locale, config.variables.get(key, []))
            # derived messages are only created where the source message is translated
            for key, (source_key, transformation) in config.derived_messages.items():
                if source_key in messages:
                    messages[key] = transformation(messages[source_key])
    if config.check_translations:
        # imported only when the checks are actually enabled
        from toolforge_i18n._translations_checks import check_all_translations

        check_all_translations(config, translations, documentation)
    return translations, documentation