Merge remote-tracking branch 'origin/encukou-translations'
Commit 8e1bbe0004
6 changed files with 2509 additions and 20 deletions
374	bin/poupdate	Executable file

@@ -0,0 +1,374 @@
#! /usr/bin/env python
# Encoding: UTF-8

u"""Creation and loading of GNU Gettext language files.

poupdate [options] [file1.csv file2.csv ...]

Use this script to
- Create .pot files (in pokedex/i18n/)
- Update the .po files (in pokedex/i18n/<lang>)
- Update the pokedex .csv files (in pokedex/data/csv/translations)

To make .po files for a new language, make sure it is in the database, make
a directory for it in pokedex/i18n/, and run this.

You can also give one or more translation CSVs as arguments.
These are in the same format as veekun's main database CSVs, for example
pokedex/data/csv/ability_prose.csv. Be sure to set the correct language
ID (which implies the language must be in the database).
Also be sure to have the correct column order: first an appropriately named
foreign key, then local_language_id, and then the text columns.

"""

# Everything related to Gettext files, and the CLI interface, is here.
# General message handling and CSV I/O is in the pokedex library.

# Notes on how we use PO format:
# The source information is stored in the occurrences fields, using
# "table_name.column_name" for file and object ID for line number. This is used
# as a message key, instead of the source string. So it's important not to
# discard location information. It also means "obsolete" and "fuzzy" mean
# pretty much the same in our context.
#
# Also note that a pot file is just a po file with all strings untranslated.
# So some functions here will work on either.
#
# Gettext context (msgctxt) is written to the files so that tools don't merge
# unrelated strings together. It is ignored when reading the PO files.

# Also of note, "polib" means "(do) kiss!" in Czech.

import os
import re
import sys
from datetime import datetime
from optparse import OptionParser
from collections import defaultdict

import pkg_resources

from pokedex.db import tables, translations
from pokedex.defaults import get_default_csv_dir

try:
    import polib
except ImportError:
    if __name__ == '__main__':
        exit('This utility needs polib installed.\n$ pip install polib')
    raise

number_replacement_flag = '-pokedex-number-replacement'

default_gettext_directory = pkg_resources.resource_filename('pokedex', 'i18n')

mapped_class_dict = dict((c.__name__, c) for c in tables.mapped_classes)
for cls in tables.mapped_classes:
    mapped_class_dict.update(dict((c.__name__, cls) for c in cls.translation_classes))

class PokedexPot(polib.POFile):
    def __init__(self, name):
        super(PokedexPot, self).__init__()
        self.metadata = {
            'Project-Id-Version': 'pokedex-%s 0.1' % name,
            'Report-Msgid-Bugs-To': 'encukou@gmail.com',
            'POT-Creation-Date': datetime.now().isoformat(),
            'PO-Revision-Date': 'YEAR-MO-DA HO:MI+ZONE',
            'MIME-Version': '1.0',
            'Content-Type': 'text/plain; charset=utf-8',
            'Content-Transfer-Encoding': '8bit',
            'Generated-By': "The pokedex",
        }
        self.seen_entries = {}

    def append(self, entry):
        """Append an entry. POEntries that only differ in numbers are merged.

        For example "Route 1", "Route 2", etc. are replaced by a single
        "Route {num}".

        Multiple numbers might be replaced, for example in "{num}--{num}
        different Unown caught"

        Entries without numbers are merged as well (e.g. "Has no overworld
        effect" appears quite a few times in AbilityChangelog)
        """
        replaced = translations.number_re.sub('{num}', entry.msgid)
        try:
            common_entry = self.seen_entries[(entry.msgctxt, replaced)]
        except KeyError:
            self.seen_entries[(entry.msgctxt, replaced)] = entry
        else:
            common_entry.occurrences += entry.occurrences
            # Only now is the actual entry replaced. So we get
            # "Route {num}", but "Porygon2" because there's no Porygon3.
            common_entry.msgid = replaced
            common_entry.msgstr = translations.number_re.sub('{num}', common_entry.msgstr)
            if replaced != entry.msgid and number_replacement_flag not in common_entry.flags:
                common_entry.flags.append(number_replacement_flag)
            return
        self += [entry]
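
To see the merging rule concretely, an illustrative check (not part of the commit; `translations.number_re` is just `re.compile("[0-9]+")`, as defined in pokedex/db/translations.py below):

    import re
    number_re = re.compile("[0-9]+")
    assert number_re.sub('{num}', 'Route 1') == 'Route {num}'
    assert number_re.sub('{num}', 'Route 104') == 'Route {num}'
    # A lone "Porygon2" also maps to the key "Porygon{num}", but since no
    # second "Porygon<n>" entry ever arrives, its msgid is never rewritten.
    assert number_re.sub('{num}', 'Porygon2') == 'Porygon{num}'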

class PotDict(dict):
    """A defaultdict of pot files"""
    def __missing__(self, name):
        pot = PokedexPot(name)
        self[name] = pot
        return pot

def yield_po_messages(pos):
    """Yield messages from all given .po files
    """
    merger = translations.Merge()
    for po in pos.values():
        merger.add_iterator(_yield_one_po_messages(po, merger))
    return merger

def entry_sort_key(entry):
    try:
        cls_col, line = entry.occurrences[0]
    except IndexError:
        return
    else:
        if line:
            classname, col = cls_col.split('.')
            fuzzy = entry.obsolete or 'fuzzy' in entry.flags
            try:
                cls = mapped_class_dict[classname]
            except KeyError:
                # Renamed table?
                print 'Warning: Unknown class %s' % classname
                return '', int(line), col, fuzzy
            else:
                return cls.__name__, int(line), col, fuzzy

def _yield_one_po_messages(pofile, merger):
    # Yield messages from one po file
    #
    # Messages in our po files are ordered by the first occurrence.
    # The occurrences of a single message are also ordered.
    # So just merge all the subsequences as we go
    for entry in sorted(pofile, key=entry_sort_key):
        if entry.msgstr:
            fuzzy = (entry.obsolete or 'fuzzy' in entry.flags)
            messages = []
            for occurrence in entry.occurrences:
                cls_colname, id = occurrence
                if id:
                    clsname, colname = cls_colname.split('.')
                    cls = mapped_class_dict[clsname]
                    messages.append(translations.Message(
                        mapped_class_dict[clsname].__name__,
                        int(id),
                        colname,
                        entry.msgstr,
                        source=entry.msgid,
                        number_replacement=number_replacement_flag in entry.flags,
                        origin='PO file',
                        fuzzy=fuzzy,
                    ))
            if messages[1:]:
                # Spawn extra iterators before yielding
                merger.add_iterator(messages[1:])
            if messages:
                yield messages[0]

def create_pots(source, *translation_streams):
    """Convert an iterator of Messages to a dictionary of pot/po files

    If translations are given, they're merged, and any exact matches are put
    in the po file. Give some for po files, don't give any for pot files.
    """
    obsolete = []
    pots = PotDict()
    merged = translations.merge_translations(source, *translation_streams, unused=obsolete.append)
    for source, sourcehash, string, exact in merged:
        ctxt = '.'.join((source.cls, source.colname))
        entry = polib.POEntry(
            msgid=source.string,
            occurrences=[(ctxt, source.id)],
            msgctxt=ctxt,
        )
        if string:
            entry.msgstr = string
            if not exact:
                entry.flags.append('fuzzy')
        pots[source.pot].append(entry)
    for message in obsolete:
        ctxt = '.'.join((message.cls, message.colname))
        entry = polib.POEntry(
            msgid=message.source or '???',
            occurrences=[(ctxt, message.id)],
            msgctxt=ctxt,
            obsolete=True,
        )
    return pots

def save_pots(pots, gettext_directory=default_gettext_directory):
    """Save pot files to a directory."""
    for name, pot in pots.items():
        pot.save(os.path.join(gettext_directory, 'pokedex-%s.pot' % name))

def save_pos(pos, lang, gettext_directory=default_gettext_directory):
    """Save po files to the appropriate directory."""
    for name, po in pos.items():
        po.save(os.path.join(gettext_directory, lang, 'pokedex-%s.po' % name))

def read_pots(directory=default_gettext_directory, extension='.pot'):
    """Read all files from the given directory with the given extension as pofiles

    Works on pos or pots.
    """
    pots = {}
    for filename in os.listdir(directory):
        basename, ext = os.path.splitext(filename)
        if ext == extension:
            pots[basename] = polib.pofile(os.path.join(directory, filename))

    return pots

def all_langs(gettext_directory=default_gettext_directory):
    return [
        d for d in os.listdir(gettext_directory)
        if os.path.isdir(os.path.join(gettext_directory, d))
    ]

def merge_pos(transl, lang, language_directory):
    """Update all po files for the given language

    Takes into account the source, the official translations from the database,
    the existing PO files, and the current translation CSV, in that order.

    Returns a name -> pofile dict
    """
    return create_pots(
        transl.source,
        transl.official_messages(lang),
        yield_po_messages(pos=read_pots(language_directory, '.po')),
        transl.yield_target_messages(lang),
    )

def bar(fraction, size, done_char='=', split_char='|', notdone_char='-'):
    """Build an ASCII art progress bar
    """
    size -= 1
    if fraction == 1:
        split_char = done_char
    completed = int(round(size * fraction))
    bar = [done_char] * completed
    bar.append(split_char)
    bar += notdone_char * (size - completed)
    return ''.join(bar)
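
Two sanity checks for bar() with the default characters (illustrative only, verifiable by hand):

    assert bar(0.5, 11) == '=====|-----'
    assert bar(1.0, 11) == '==========='  # the split marker blends in when done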

def print_stats(pos):
    """Print out some fun stats about a set of po files
    """
    template = u"{0:>10}: {1:4}/{2:4} {3:6.2f}% [{4}]"
    total_translated = 0
    total = 0
    for name, po in pos.items():
        num_translated = len(po.translated_entries())
        total_translated += num_translated
        fraction_translated = 1. * num_translated / len(po)
        total += len(po)
        print template.format(
            name,
            num_translated,
            len(po),
            100 * fraction_translated,
            bar(fraction_translated, 47),
        ).encode('utf-8')
    fraction_translated = 1. * total_translated / total
    print template.format(
        'Total',
        total_translated,
        total,
        100 * fraction_translated,
        bar(fraction_translated, 47),
    ).encode('utf-8')


if __name__ == '__main__':
    parser = OptionParser(__doc__)

    parser.add_option('-l', '--langs', dest='langs',
        help="List of languages to handle, separated by commas (example: -l 'en,de,ja') (default: all in gettext directory)")
    parser.add_option('-P', '--no-pots', dest='pots', action='store_false', default=True,
        help='Do not create POT files (templates)')
    parser.add_option('-p', '--no-pos', dest='pos', action='store_false', default=True,
        help='Do not update PO files (message catalogs)')

    parser.add_option('-c', '--no-csv', dest='csv', action='store_false', default=True,
        help='Do not update pokedex translations files')

    parser.add_option('-d', '--directory', dest='directory',
        help='Veekun data directory')
    parser.add_option('-L', '--source-language', dest='source_lang',
        help="Source language identifier (default: 'en')")

    parser.add_option('-g', '--gettext-dir', dest='gettext_directory', default=default_gettext_directory,
        help='Gettext directory (default: pokedex/i18n/)')

    parser.add_option('-q', '--quiet', dest='verbose', default=True, action='store_false',
        help="Don't print what's going on")

    options, arguments = parser.parse_args()

    transl = translations.Translations.from_parsed_options(options)

    gettext_directory = options.gettext_directory

    if (options.pots or options.pos) and not os.path.exists(gettext_directory):
        print "Error: Gettext directory doesn't exist. Skipping pot/po creation"
        options.pots = options.pos = False

    if options.pots:
        if options.verbose:
            print 'Creating pots in', gettext_directory
        save_pots(create_pots(transl.source), gettext_directory=gettext_directory)

    if options.pos or options.csv:
        # Merge in CSV files from command line
        csv_streams = defaultdict(translations.Merge)
        for argument in arguments:
            # Add each message in its own stream, to sort them.
            file = open(argument, 'rb')
            with file:
                for message in translations.yield_guessed_csv_messages(file):
                    lang = transl.language_identifiers[message.language_id]
                    csv_streams[lang].add_iterator([message])
        streams = defaultdict(list)
        for lang, stream in csv_streams.items():
            streams[lang].append(stream)

        if os.path.exists(gettext_directory):
            # Merge in the PO files
            if options.langs:
                langs = options.langs.split(',')
            else:
                langs = all_langs(gettext_directory)

            for lang in langs:
                language_directory = os.path.join(gettext_directory, lang)
                if options.verbose:
                    print 'Merging translations for %s in %s' % (lang, language_directory)
                pos = merge_pos(transl, lang, language_directory)

                if options.pos:
                    if options.verbose:
                        print 'Writing POs for %s' % lang
                    save_pos(pos, lang, gettext_directory=gettext_directory)

                if options.verbose:
                    print_stats(pos)

                streams[lang].append(yield_po_messages(pos))

        if options.csv:
            for lang, lang_streams in streams.items():
                if options.verbose:
                    print "Merging %s translation stream(s) for '%s'" % (len(lang_streams), lang)
                existing_messages = list(transl.yield_target_messages(lang))
                lang_streams.append(existing_messages)
                transl.write_translations(lang, *lang_streams)
1205	pokedex/data/csv/translations/cs.csv	Normal file
File diff suppressed because it is too large
@@ -8,8 +8,8 @@ from sqlalchemy.orm.attributes import instrumentation_registry
 import sqlalchemy.sql.util
 import sqlalchemy.types

-from pokedex.db import metadata
-import pokedex.db.tables as tables
+import pokedex
+from pokedex.db import metadata, tables, translations
 from pokedex.defaults import get_default_csv_dir
 from pokedex.db.dependencies import find_dependent_tables

@@ -96,7 +96,7 @@ def _get_verbose_prints(verbose):
     return print_start, print_status, print_done


-def load(session, tables=[], directory=None, drop_tables=False, verbose=False, safe=True, recursive=False):
+def load(session, tables=[], directory=None, drop_tables=False, verbose=False, safe=True, recursive=True, langs=None):
     """Load data from CSV files into the given database session.

     Tables are created automatically.

@@ -123,6 +123,9 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
     `recursive`
         If set to True, load all dependent tables too.

+    `langs`
+        List of identifiers of extra languages to load, or None to load them all
     """

     # First take care of verbosity

@@ -300,13 +303,30 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
     print_done()

+    print_start('Translations')
+    transl = translations.Translations(csv_directory=directory)
+
+    new_row_count = 0
+    for translation_class, rows in transl.get_load_data(langs):
+        table_obj = translation_class.__table__
+        if table_obj in table_objs:
+            insert_stmt = table_obj.insert()
+            session.connection().execute(insert_stmt, rows)
+            session.commit()
+            # We don't have a total, but at least show some increasing number
+            new_row_count += len(rows)
+            print_status(str(new_row_count))
+
+    print_done()
+
     # SQLite check
     if session.connection().dialect.name == 'sqlite':
         session.connection().execute("PRAGMA integrity_check")


-def dump(session, tables=[], directory=None, verbose=False):
+def dump(session, tables=[], directory=None, verbose=False, langs=['en']):
     """Dumps the contents of a database to a set of CSV files. Probably not
     useful to anyone besides a developer.

@@ -322,11 +342,15 @@ def dump(session, tables=[], directory=None, verbose=False):
     `verbose`
         If set to True, status messages will be printed to stdout.

+    `langs`
+        List of identifiers of languages to dump unofficial texts for
     """

     # First take care of verbosity
     print_start, print_status, print_done = _get_verbose_prints(verbose)

+    languages = dict((l.id, l) for l in session.query(pokedex.db.tables.Language))
+
     if not directory:
         directory = get_default_csv_dir()

@@ -342,10 +366,28 @@ def dump(session, tables=[], directory=None, verbose=False):
         writer = csv.writer(open("%s/%s.csv" % (directory, table_name), 'wb'),
                             lineterminator='\n')
         columns = [col.name for col in table.columns]
+
+        # For name tables, dump rows for official languages, as well as
+        # for those in `langs`.
+        # For other translation tables, only dump rows for languages in `langs`.
+        # For non-translation tables, dump all rows.
+        if 'local_language_id' in columns:
+            if any(col.info.get('official') for col in table.columns):
+                def include_row(row):
+                    return (languages[row.local_language_id].official or
+                            languages[row.local_language_id].identifier in langs)
+            else:
+                def include_row(row):
+                    return languages[row.local_language_id].identifier in langs
+        else:
+            def include_row(row):
+                return True
+
         writer.writerow(columns)

         primary_key = table.primary_key
         for row in session.query(table).order_by(*primary_key).all():
+            if include_row(row):
                 csvs = []
                 for col in columns:
                     # Convert Pythony values to something more universal
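A usage sketch for the new langs parameters (the connect() helper from pokedex.db and the engine URI are assumptions for illustration; the load/dump signatures are from the diff above):

    from pokedex.db import connect, load

    session = connect('sqlite:///pokedex.sqlite')  # illustrative engine URI
    # Core data plus German and French translation texts:
    load.load(session, drop_tables=True, langs=['de', 'fr'])
    # Dump back out, keeping unofficial texts only for English and German:
    load.dump(session, langs=['en', 'de'])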
659	pokedex/db/translations.py	Executable file

@@ -0,0 +1,659 @@
#! /usr/bin/env python
u"""General handling of translations

The general idea is to get messages from somewhere: the source pokedex CSVs,
or the translation CSVs, etc., then merge them together in some way, and shove
them into the database.

If a message is translated, it has a source string attached to it, with the
original English version. Or at least it has a CRC of the original.
When that doesn't match, it means the English string changed and the
translation has to be updated.
Also this is why we can't dump translations from the database: there's no
original string info.

Some complications:

Flavor text is so repetitive that we take strings from all the versions,
separate the unique ones by blank lines, let translators work on that, and then
put it in flavor_summary tables.

Route names and other repetitive numeric things are replaced by e.g.
"Route {num}" so translators only have to work on each set once.

"""

import binascii
import csv
import heapq
import itertools
import os
import re
import sys
from collections import defaultdict

from pokedex.db import tables
from pokedex.defaults import get_default_csv_dir

default_source_lang = 'en'

# Top-level classes we want translations for: in order, and by name
# These are all mapped_classes that have translatable texts and aren't summarized
toplevel_classes = []
toplevel_class_by_name = {}

# summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
summary_map = {}

# translation_class_by_column[class_name, column_name] == translation_class
translation_class_by_column = {}

for cls in tables.mapped_classes:
    try:
        summary_class, col = cls.summary_column
    except AttributeError:
        if cls.translation_classes:
            toplevel_classes.append(cls)
            toplevel_class_by_name[cls.__name__] = cls
            for translation_class in cls.translation_classes:
                for column in translation_class.__table__.c:
                    translation_class_by_column[cls, column.name] = translation_class
    else:
        summary_map.setdefault(summary_class, {})[col] = cls

number_re = re.compile("[0-9]+")

def crc(string):
    """Return the hash we use in translation CSV files"""
    return "%08x" % (binascii.crc32(string.encode('utf-8')) & 0xffffffff)
# Two special values are also used in source_crc:
# UNKNOWN: no source string was available
# OFFICIAL: an official string from the main database
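
crc() always yields eight lowercase hex digits, so it can never collide with the two sentinels above (an illustrative check; the digest value itself is not asserted):

    digest = crc(u'Route {num}')
    assert len(digest) == 8
    int(digest, 16)  # parses as hexadecimal
    assert digest not in ('UNKNOWN', 'OFFICIAL')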

class Message(object):
    """Holds all info about a translatable or translated string

    cls: Name of the mapped class the message belongs to
    id: The id of the thing the message belongs to
    colname: name of the database column
    strings: A list of strings in the message, usually of length 1.

    Optional attributes (None if not set):
    colsize: Max length of the database column
    source: The string this was translated from
    number_replacement: True if this is a translation with {num} placeholders
    pot: Name of the pot the message goes to (see pot_for_column)
    source_crc: CRC of the source
    origin: Some indication of where the string came from (CSV, PO, ...)
    fuzzy: True for fuzzy translations
    language_id: ID of the language
    official: True if this is a known-good translation
    """
    __slots__ = 'cls id colname strings colsize source number_replacement pot source_crc origin fuzzy language_id official'.split()
    def __init__(self, cls, id, colname, string,
            colsize=None, source=None, number_replacement=None, pot=None,
            source_crc=None, origin=None, fuzzy=None, language_id=None,
            official=None,
        ):
        self.cls = cls
        self.id = id
        self.colname = colname
        self.strings = [string]
        self.colsize = colsize
        self.source = source
        self.number_replacement = number_replacement
        self.pot = pot
        self.source_crc = source_crc
        if source and not source_crc:
            self.source_crc = crc(source)
        self.origin = origin
        self.fuzzy = fuzzy
        self.language_id = language_id
        self.official = official

    def merge(self, other):
        """Merge two messages, as required for flavor text summarizing
        """
        assert self.merge_key == other.merge_key
        for string in other.strings:
            if string not in self.strings:
                self.strings.append(string)
        self.colsize = self.colsize or other.colsize
        self.pot = self.pot or other.pot
        self.source = None
        self.source_crc = None
        self.number_replacement = None

    @property
    def string(self):
        return '\n\n'.join(self.strings)

    @property
    def merge_key(self):
        return self.cls, self.id, self.colname

    @property
    def sort_key(self):
        return self.merge_key, self.language_id, self.fuzzy

    @property
    def eq_key(self):
        return self.sort_key, self.strings

    def __eq__(self, other): return self.eq_key == other.eq_key
    def __ne__(self, other): return self.eq_key != other.eq_key
    def __gt__(self, other): return self.sort_key > other.sort_key
    def __lt__(self, other): return self.sort_key < other.sort_key
    def __ge__(self, other): return self.sort_key >= other.sort_key
    def __le__(self, other): return self.sort_key <= other.sort_key

    def __unicode__(self):
        string = '"%s"' % self.string
        if len(string) > 20:
            string = string[:15] + u'"...'
        template = u'<Message from {self.origin} for {self.cls}.{self.colname}:{self.id} -- {string}>'
        return template.format(self=self, string=string)

    def __str__(self):
        return unicode(self).encode('utf-8')

    def __repr__(self):
        return unicode(self).encode('utf-8')
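
Message.merge() is what merge_adjacent() below leans on for flavor-text summaries; a small illustration (values invented):

    m1 = Message('Version', 1, 'flavor_text', u'A curious specimen.')
    m2 = Message('Version', 1, 'flavor_text', u'It is rarely seen.')
    m1.merge(m2)
    # Unique strings are joined by blank lines -- the flavor-summary format.
    assert m1.string == u'A curious specimen.\n\nIt is rarely seen.'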

class Translations(object):
    """Data and operations specific to a location on disk (and a source language)
    """
    def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None):
        if csv_directory is None:
            csv_directory = get_default_csv_dir()

        if translation_directory is None:
            translation_directory = os.path.join(csv_directory, 'translations')

        self.source_lang = source_lang
        self.csv_directory = csv_directory
        self.translation_directory = translation_directory

        self.language_ids = {}
        self.language_identifiers = {}
        self.official_langs = []
        for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
            self.language_ids[row['identifier']] = int(row['id'])
            self.language_identifiers[int(row['id'])] = row['identifier']
            if row['official'] and int(row['official']):
                self.official_langs.append(row['identifier'])

        self.source_lang_id = self.language_ids[self.source_lang]

    @classmethod
    def from_parsed_options(cls, options):
        return cls(options.source_lang, options.directory)

    @property
    def source(self):
        """All source (i.e. English) messages
        """
        return self.official_messages(self.source_lang)

    def official_messages(self, lang):
        """All official messages (i.e. from main database) for the given lang
        """
        # Cached as tuples, since they're used pretty often
        lang_id = self.language_ids[lang]
        try:
            return self._sources[lang_id]
        except AttributeError:
            self._sources = {}
            for message in self.yield_source_messages():
                self._sources.setdefault(message.language_id, []).append(message)
            self._sources = dict((k, tuple(merge_adjacent(v))) for k, v in self._sources.items())
            return self.official_messages(lang)
        except KeyError:
            # Looks like there are no messages in the DB for this language
            # This should only happen for non-official languages
            assert lang not in self.official_langs
            return ()

    def write_translations(self, lang, *streams):
        """Write a translation CSV containing messages from streams.

        Streams should be ordered by priority, from highest to lowest.

        Any official translations (from the main database) are added automatically.
        """
        writer = self.writer_for_lang(lang)

        writer.writerow('language_id table id column source_crc string'.split())

        messages = merge_translations(self.source, self.official_messages(lang), *streams)

        warnings = {}
        for source, sourcehash, string, exact in messages:
            if string and sourcehash != 'OFFICIAL':
                utf8len = len(string.encode('utf-8'))
                if source.colsize and utf8len > source.colsize:
                    key = source.cls, source.colname
                    warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string))
                else:
                    writer.writerow((
                        self.language_ids[lang],
                        source.cls,
                        source.id,
                        source.colname,
                        sourcehash,
                        string.encode('utf-8'),
                    ))
        for utf8len, source, string in warnings.values():
            template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}'
            warning = template.format(
                key=source.merge_key,
                string=string,
                size=utf8len,
                colsize=source.colsize,
            )
            if len(warning) > 79:
                warning = warning[:76] + u'...'
            print warning.encode('utf-8')

    def reader_for_class(self, cls, reader_class=csv.reader):
        tablename = cls.__table__.name
        csvpath = os.path.join(self.csv_directory, tablename + '.csv')
        return reader_class(open(csvpath, 'rb'), lineterminator='\n')

    def writer_for_lang(self, lang):
        csvpath = os.path.join(self.translation_directory, '%s.csv' % lang)
        return csv.writer(open(csvpath, 'wb'), lineterminator='\n')

    def yield_source_messages(self, language_id=None):
        """Yield all messages from source CSV files

        Messages from all languages are returned. The messages are not ordered
        properly, but splitting the stream by language (and filtering results
        by merge_adjacent) will produce proper streams.
        """
        if language_id is None:
            language_id = self.source_lang_id

        for cls in sorted(toplevel_classes, key=lambda c: c.__name__):
            streams = []
            for translation_class in cls.translation_classes:
                streams.append(yield_source_csv_messages(
                    translation_class,
                    cls,
                    self.reader_for_class(translation_class),
                ))
                try:
                    colmap = summary_map[translation_class]
                except KeyError:
                    pass
                else:
                    for colname, summary_class in colmap.items():
                        column = translation_class.__table__.c[colname]
                        streams.append(yield_source_csv_messages(
                            summary_class,
                            cls,
                            self.reader_for_class(summary_class),
                            force_column=column,
                        ))
            for message in Merge(*streams):
                yield message

    def yield_target_messages(self, lang):
        """Yield messages from the data/csv/translations/<lang>.csv file
        """
        path = os.path.join(self.csv_directory, 'translations', '%s.csv' % lang)
        try:
            file = open(path, 'rb')
        except IOError:
            return ()
        return yield_translation_csv_messages(file)

    def yield_all_translations(self):
        stream = Merge()
        for lang in self.language_identifiers.values():
            stream.add_iterator(self.yield_target_messages(lang))
        return (message for message in stream if not message.official)

    def get_load_data(self, langs=None):
        """Yield (translation_class, data for INSERT) pairs for loading into the DB

        langs is either a list of language identifiers or None
        """
        if langs is None:
            langs = self.language_identifiers.values()
        stream = Merge()
        for lang in self.language_identifiers.values():
            stream.add_iterator(self.yield_target_messages(lang))
        stream = (message for message in stream if not message.official)
        count = 0
        class GroupDict(dict):
            """Dict to automatically set the foreign_id and local_language_id for new items
            """
            def __missing__(self, key):
                # depends on `cls` from outside scope
                id, language_id = key
                data = self[key] = defaultdict(lambda: None)
                column_names = (c.name for c in translation_class.__table__.columns)
                data.update(dict.fromkeys(column_names))
                data.update({
                    '%s_id' % cls.__singlename__: id,
                    'local_language_id': language_id,
                })
                return data
        # Nested dict:
        # translation_class -> (lang, id) -> column -> value
        everything = defaultdict(GroupDict)
        # Group by object so we always have all of the messages for one DB row
        for (cls_name, id), group in group_by_object(stream):
            cls = toplevel_class_by_name[cls_name]
            for message in group:
                translation_class = translation_class_by_column[cls, message.colname]
                key = id, message.language_id
                colname = str(message.colname)
                everything[translation_class][key][colname] = message.string
                count += 1
            if count > 1000:
                for translation_class, key_data in everything.items():
                    yield translation_class, key_data.values()
                count = 0
                everything.clear()
        for translation_class, data_dict in everything.items():
            yield translation_class, data_dict.values()
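
A sketch of driving Translations by hand (directory layout and language code are illustrative; this writes data/csv/translations/de.csv):

    transl = Translations()  # defaults to the pokedex data/csv directory
    # Re-emit the German translation CSV; official DB strings and the
    # current translations/de.csv content are merged in by priority.
    transl.write_translations('de', transl.yield_target_messages('de'))
    # The resulting file starts with the header:
    # language_id,table,id,column,source_crc,string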

def group_by_object(stream):
    """Group stream by object

    Yields ((class name, object ID), (list of messages)) pairs.
    """
    stream = iter(stream)
    current = stream.next()
    current_key = current.cls, current.id
    group = [current]
    for message in stream:
        if (message.cls, message.id) != current_key:
            yield current_key, group
            group = []
        group.append(message)
        current = message
        current_key = current.cls, current.id
    yield current_key, group
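
group_by_object() assumes its input is sorted by (cls, id); a quick sketch of the grouping (values invented):

    msgs = [
        Message('Version', 1, 'name', u'Red'),
        Message('Version', 2, 'name', u'Blue'),
        Message('Version', 2, 'name', u'Blau', language_id=6),
    ]
    groups = list(group_by_object(msgs))
    assert [key for key, group in groups] == [('Version', 1), ('Version', 2)]
    assert len(groups[1][1]) == 2  # both rows for Version 2 arrive together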

class Merge(object):
    """Merge several sorted iterators together

    Additional iterators may be added at any time with add_iterator.
    Accepts None for the initial iterators.
    If the same value appears in more iterators, there will be duplicates in
    the output.
    """
    def __init__(self, *iterators):
        self.next_values = []
        for iterator in iterators:
            if iterator is not None:
                self.add_iterator(iterator)

    def add_iterator(self, iterator):
        iterator = iter(iterator)
        try:
            value = iterator.next()
        except StopIteration:
            return
        else:
            heapq.heappush(self.next_values, (value, iterator))

    def __iter__(self):
        return self

    def next(self):
        if self.next_values:
            value, iterator = heapq.heappop(self.next_values)
            self.add_iterator(iterator)
            return value
        else:
            raise StopIteration
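
Merge in action (this mirrors what test_merge in the new test file below pins down):

    assert list(Merge([1, 3], [2, 4])) == [1, 2, 3, 4]
    assert list(Merge([1, 2], [2, 3])) == [1, 2, 2, 3]  # duplicates are kept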
def merge_adjacent(gen):
    """Merge adjacent messages that compare equal"""
    gen = iter(gen)
    last = gen.next()
    for this in gen:
        if this.merge_key == last.merge_key:
            last.merge(this)
        elif last < this:
            yield last
            last = this
        else:
            raise AssertionError('Bad order, %s > %s' % (last, this))
    yield last

def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None):
    """A "left join" operation on sorted iterators

    Yields (left, right) pairs, where left comes from left_stream and right
    is the corresponding item from right, or None

    Note that if there are duplicates in right_stream, you won't get duplicate
    rows for them.

    If given, unused should be a one-arg function that will get called on all
    unused items in right_stream.
    """
    left_stream = iter(left_stream)
    right_stream = iter(right_stream)
    try:
        right = right_stream.next()
        for left in left_stream:
            while right and key(left) > key(right):
                if unused is not None:
                    unused(right)
                right = right_stream.next()
            if key(left) == key(right):
                yield left, right
                del left
                right = right_stream.next()
            else:
                yield left, None
    except StopIteration:
        try:
            yield left, None
        except NameError:
            pass
        for left in left_stream:
            yield left, None
    else:
        if unused is not None:
            try:
                unused(right)
            except NameError:
                pass
            for right in right_stream:
                unused(right)
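
leftjoin() behaviour at a glance (these cases also appear in test_leftjoin below):

    assert list(leftjoin([1, 2, 3], [1, 3])) == [(1, 1), (2, None), (3, 3)]
    unused = []
    assert list(leftjoin([1], [1, 2], unused=unused.append)) == [(1, 1)]
    assert unused == [2]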

def synchronize(reference, stream, key=lambda x: x, unused=None):
    """Just the right side part of leftjoin(), Nones included"""
    for left, right in leftjoin(reference, stream, key, unused):
        yield right

def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None):
    """Yield all messages from one source CSV file.
    """
    columns = list(cls.__table__.c)
    column_names = csvreader.next()
    # Assumptions: rows are in lexicographic order
    # (taking numeric values as numbers of course)
    # Assumptions about the order of columns:
    # 1. It's the same in the table and in CSV
    # 2. Primary key is at the beginning
    # 3. First thing in the PK is the object id
    # 4. Last thing in the PK is the language
    # 5. Everything that follows is some translatable text
    assert [cls.__table__.c[name] for name in column_names] == columns, ','.join(c.name for c in columns)
    pk = columns[:len(cls.__table__.primary_key.columns)]
    first_string_index = len(pk)
    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, force_column=force_column)

def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None):
    language_index = first_string_index - 1
    assert 'language' in columns[language_index].name, columns[language_index].name
    string_columns = columns[first_string_index:]
    if force_column is not None:
        assert len(string_columns) == 1
        string_columns = [force_column]
    for values in csvreader:
        id = int(values[0])
        messages = []
        for string, column in zip(values[first_string_index:], string_columns):
            message = Message(
                foreign_cls.__name__,
                id,
                column.name,
                string.decode('utf-8'),
                column.type.length,
                pot=pot_for_column(cls, column, force_column is not None),
                origin=origin,
                official=True,
                source_crc=crc_value,
                language_id=int(values[language_index]),
            )
            messages.append(message)
        messages.sort()
        for message in messages:
            yield message

def yield_guessed_csv_messages(file):
    """Yield messages from a CSV file, using the header to figure out what the data means.
    """
    csvreader = csv.reader(file, lineterminator='\n')
    column_names = csvreader.next()
    if column_names == 'language_id,table,id,column,source_crc,string'.split(','):
        # A translation CSV
        return yield_translation_csv_messages(file, True)
    # Not a translation CSV, figure out what the columns mean
    assert column_names[0].endswith('_id')
    assert column_names[1] == 'local_language_id'
    first_string_index = 2
    foreign_singlename = column_names[0][:-len('_id')]
    columns = [None] * len(column_names)
    column_indexes = dict((name, i) for i, name in enumerate(column_names))
    for foreign_cls in toplevel_classes:
        if foreign_cls.__singlename__ == foreign_singlename:
            break
    else:
        raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (column_names[0], file))
    for translation_class in foreign_cls.translation_classes:
        for column in translation_class.__table__.c:
            column_index = column_indexes.get(column.name)
            if column_index is not None:
                columns[column_index] = column
    assert all([c is not None for c in columns[first_string_index:]])
    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=file.name, crc_value='UNKNOWN')

def yield_translation_csv_messages(file, no_header=False):
    """Yield messages from a translation CSV file
    """
    csvreader = csv.reader(file, lineterminator='\n')
    if not no_header:
        columns = csvreader.next()
        assert columns == 'language_id,table,id,column,source_crc,string'.split(',')
    for language_id, table, id, column, source_crc, string in csvreader:
        yield Message(
            table,
            int(id),
            column,
            string.decode('utf-8'),
            origin='target CSV',
            source_crc=source_crc,
            language_id=int(language_id),
        )

def pot_for_column(cls, column, summary=False):
    """Translatable texts get categorized into different POT files to help
    translators prioritize. The pots are:

    - flavor: Flavor texts: here, strings from multiple versions are summarized
    - ripped: Strings ripped from the games; translators for "official"
        languages don't need to bother with these
    - effects: Fanon descriptions of things; they usually use technical
        language
    - misc: Everything else; usually small texts

    Set summary to True if this is a flavor summary column. The other
    categories are determined by the column itself.
    """
    if summary:
        return 'flavor'
    elif column.info.get('ripped'):
        return 'ripped'
    elif column.name.endswith('effect'):
        return 'effects'
    else:
        return 'misc'

def number_replace(source, string):
    numbers_iter = iter(number_re.findall(source))
    next_number = lambda match: numbers_iter.next()
    return re.sub(r'\{num\}', next_number, string)
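
number_replace() pushes the digits from the English source back into a {num} template; two checks (the first invented, the second straight from the tests below):

    assert number_replace(u'Route 12', u'Weg {num}') == u'Weg 12'
    assert number_replace(u'Numbers: 1, 2, and 003', u'{num} {num} {num}') == u'1 2 003'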

def match_to_source(source, *translations):
    """Match translated string(s) to a source

    The first translation whose source matches the source message (or whose
    CRC matches, or which is official), and which is not fuzzy, is used.
    If there's no such translation, the first translation is used.

    Returns (source, source string CRC, string for CSV file, exact match?)
    If there are no translations, returns (source, None, None, None)

    Handles translations where numbers have been replaced by {num}, if they
    have source information.
    """
    first = True
    best_crc = None
    for translation in translations:
        if translation is None:
            continue
        if translation.number_replacement:
            current_string = number_replace(source.string, translation.string)
            current_source = number_replace(source.string, translation.source)
            current_crc = crc(current_source)
        elif '{num}' in translation.string:
            print (u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8')
            continue
        else:
            current_string = translation.string
            current_source = translation.source
            current_crc = translation.source_crc
        if translation.fuzzy:
            match = False
        elif translation.official:
            match = True
        elif current_source:
            match = source.string == current_source
        else:
            match = current_crc == crc(source.string)
        if first or match:
            best_string = current_string
            best_crc = current_crc
            best_message = translation
        if match:
            break
        first = False
    if best_crc:
        return source, best_crc, best_string, match
    else:
        return source, None, None, None

def merge_translations(source_stream, *translation_streams, **kwargs):
    """For each source message, get its best translation from translations.

    Translations should be ordered by priority, highest to lowest.

    Messages that don't appear in translations at all aren't included.
    """
    source = tuple(source_stream)
    streams = [
        synchronize(source, t, key=lambda m: m.merge_key, unused=kwargs.get('unused'))
        for t in translation_streams
    ]
    for messages in itertools.izip(source, *streams):
        yield match_to_source(*messages)
@@ -108,14 +108,20 @@ def get_csv_directory(options):
 def command_dump(*args):
     parser = get_parser(verbose=True)
     parser.add_option('-d', '--directory', dest='directory', default=None)
+    parser.add_option('-l', '--langs', dest='langs', default='en',
+        help="Comma-separated list of languages to dump all strings for. "
+             "Default is English ('en')")
     options, tables = parser.parse_args(list(args))

     session = get_session(options)
     get_csv_directory(options)

+    langs = [l.strip() for l in options.langs.split(',')]
+
     pokedex.db.load.dump(session, directory=options.directory,
         tables=tables,
-        verbose=options.verbose)
+        verbose=options.verbose,
+        langs=langs)

 def command_load(*args):
     parser = get_parser(verbose=True)

@@ -124,6 +130,9 @@ def command_load(*args):
     parser.add_option('-r', '--recursive', dest='recursive', default=False, action='store_true')
     parser.add_option('-S', '--safe', dest='safe', default=False, action='store_true',
         help="Do not use backend-specific optimizations.")
+    parser.add_option('-l', '--langs', dest='langs', default=None,
+        help="Comma-separated list of extra languages to load, or 'none' for none. "
+             "Default is to load 'em all. Example: 'fr,de'")
     options, tables = parser.parse_args(list(args))

     if not options.engine_uri:

@@ -133,6 +142,13 @@ def command_load(*args):
     print "`pokedex setup` to do both at once."
     print

+    if options.langs == 'none':
+        langs = []
+    elif options.langs is None:
+        langs = None
+    else:
+        langs = [l.strip() for l in options.langs.split(',')]
+
     session = get_session(options)
     get_csv_directory(options)

@@ -141,7 +157,8 @@ def command_load(*args):
         tables=tables,
         verbose=options.verbose,
         safe=options.safe,
-        recursive=options.recursive)
+        recursive=options.recursive,
+        langs=langs)

 def command_reindex(*args):
     parser = get_parser(verbose=True)

@@ -284,6 +301,15 @@ Load options:
 -D|--drop-tables    Drop all tables before loading data.
 -S|--safe           Disable engine-specific optimizations.
 -r|--recursive      Load (and drop) all dependent tables.
+-l|--langs          Load translations for the given languages.
+                    By default, all available translations are loaded.
+                    Separate multiple languages by a comma (-l en,de,fr)

 Dump options:
+-l|--langs          Dump unofficial texts for given languages.
+                    By default, English (en) is dumped.
+                    Separate multiple languages by a comma (-l en,de,fr)
+                    Use 'none' to not dump any unofficial texts.

 Additionally, load and dump accept a list of table names (possibly with
 wildcards) and/or csv filenames as an argument list.
183	pokedex/tests/test_translations.py	Normal file

@@ -0,0 +1,183 @@
# Encoding: UTF-8

import csv

from nose.tools import *

from pokedex.db import translations, tables

fake_version_names = (
    'version_id,local_language_id,name',
    '1,0,name1', '2,0,name2', '3,0,name3', '3,1,othername3',
)

fake_translation_csv = (
    'language_id,table,id,column,source_crc,string',
    '0,Version,1,name,,name1',
    '0,Version,2,name,,name2',
    '0,Version,3,name,,name3',
    '1,Version,3,name,,othername3',
)

def test_yield_source_csv_messages():
    check_version_message_stream(translations.yield_source_csv_messages(
        tables.Version.names_table,
        tables.Version,
        csv.reader(iter(fake_version_names)),
    ))

def test_yield_guessed_csv_messages():
    check_version_message_stream(translations.yield_guessed_csv_messages(
        iter(fake_translation_csv),
    ))

def test_yield_translation_csv_messages():
    check_version_message_stream(translations.yield_translation_csv_messages(
        iter(fake_translation_csv),
    ))

def check_version_message_stream(messages):
    messages = list(messages)
    assert messages[0].string == 'name1'
    assert messages[1].string == 'name2'
    assert messages[2].string == 'name3'
    assert messages[3].string == 'othername3'
    for message in messages[:3]:
        assert message.language_id == 0
    assert messages[3].language_id == 1
    for id, message in zip((1, 2, 3, 3), messages):
        assert message.merge_key == ('Version', id, 'name'), message.merge_key

def get_messages(*rows):
    return list(translations.yield_translation_csv_messages(iter(rows), True))

def test_merge_translations():
    source = get_messages(
        '0,Table,1,col,,none',
        '0,Table,2,col,,new',
        '0,Table,3,col,,existing',
        '0,Table,4,col,,both',
        '0,Table,5,col,,(gap)',
        '0,Table,6,col,,new-bad',
        '0,Table,7,col,,existing-bad',
        '0,Table,8,col,,both-bad',
        '0,Table,9,col,,new-bad-ex-good',
        '0,Table,10,col,,new-good-ex-bad',
        '0,Table,11,col,,(gap)',
        '0,Table,12,col,,"Numbers: 1, 2, and 003"',
        '0,Table,13,col,,"Numbers: 3, 2, and 001"',
    )
    new = get_messages(
        '0,Table,2,col,%s,new' % translations.crc('new'),
        '0,Table,4,col,%s,new' % translations.crc('both'),
        '0,Table,6,col,%s,new' % translations.crc('----'),
        '0,Table,8,col,%s,new' % translations.crc('----'),
        '0,Table,9,col,%s,new' % translations.crc('----'),
        '0,Table,10,col,%s,new' % translations.crc('new-good-ex-bad'),
        '0,Table,12,col,%s,{num} {num} {num}' % translations.crc('Numbers: {num}, {num}, and {num}'),
        '0,Table,13,col,%s,{num} {num} {num}' % translations.crc('----'),
        '0,Table,100,col,%s,unused' % translations.crc('----'),
    )
    new[-3].number_replacement = True
    new[-3].source = 'Numbers: 1, 2, and 003'
    new[-2].number_replacement = True
    new[-2].source = '----'
    existing = get_messages(
        '0,Table,3,col,%s,existing' % translations.crc('existing'),
        '0,Table,4,col,%s,existing' % translations.crc('both'),
        '0,Table,7,col,%s,existing' % translations.crc('----'),
        '0,Table,8,col,%s,existing' % translations.crc('----'),
        '0,Table,9,col,%s,existing' % translations.crc('new-bad-ex-good'),
        '0,Table,10,col,%s,existing' % translations.crc('----'),
        '0,Table,100,col,%s,unused' % translations.crc('----'),
    )
    expected_list = (
        ('none', None, None),
        ('new', True, 'new'),
        ('existing', True, 'existing'),
        ('both', True, 'new'),
        ('(gap)', None, None),
        ('new-bad', False, 'new'),
        ('existing-bad', False, 'existing'),
        ('both-bad', False, 'new'),
        ('new-bad-ex-good', True, 'existing'),
        ('new-good-ex-bad', True, 'new'),
        ('(gap)', None, None),
        ('Numbers: 1, 2, and 003', True, '1 2 003'),
        ('Numbers: 3, 2, and 001', False, '3 2 001'),
    )
    unused = []
    result_stream = list(translations.merge_translations(source, new, [], existing, unused=unused.append))
    for result, expected in zip(result_stream, expected_list):
        res_src, res_crc, res_str, res_match = result
        exp_src, exp_match, exp_str = expected
        print result, expected
        assert res_src.string == exp_src
        assert res_str == exp_str, (res_str, exp_str)
        if exp_match is None:
            assert res_crc is None
        elif exp_match is True:
            assert res_crc == translations.crc(res_src.string)
        elif exp_match is False:
            assert res_crc == translations.crc('----')
        assert res_match == exp_match
    print 'unused:', unused
    for message in unused:
        assert message.string == 'unused'
        assert message.id == 100

def test_merge():
    check_merge((0, 1, 2, 3))
    check_merge((0, 1), (2, 3))
    check_merge((2, 3), (0, 1))
    check_merge((0, 2), (1, 3))
    check_merge((0, 3), (1, 2))
    check_merge((0, 1), (2, 3), (2, 3))

def check_merge(*sequences):
    merged = list(translations.Merge(*sequences))
    concatenated = [val for seq in sequences for val in seq]
    assert merged == sorted(concatenated)

def test_merge_dynamic_add():
    merge = translations.Merge((1, 2, 3))
    def adder():
        for val in (1, 2, 3):
            yield val
            merge.add_iterator([4])
    merge.add_iterator(adder())
    assert tuple(merge) == (1, 1, 2, 2, 3, 3, 4, 4, 4)

def test_merge_adjacent():
    messages = get_messages(
        '0,Table,1,col,,strA',
        '0,Table,2,col,,strB',
        '0,Table,2,col,,strC',
        '0,Table,2,col,,strB',
        '0,Table,2,col,,strD',
        '0,Table,3,col,,strE',
    )
    result = [m.string for m in translations.merge_adjacent(messages)]
    expected = ['strA', 'strB\n\nstrC\n\nstrD', 'strE']
    assert result == expected

def test_leftjoin():
    check_leftjoin([], [], [], [])
    check_leftjoin([], [1], [], [1])
    check_leftjoin([], [1, 2], [], [1, 2])
    check_leftjoin([1], [], [(1, None)], [])
    check_leftjoin([1], [1], [(1, 1)], [])
    check_leftjoin([1], [2], [(1, None)], [2])
    check_leftjoin([1, 2], [1], [(1, 1), (2, None)], [])
    check_leftjoin([1, 2], [1, 2], [(1, 1), (2, 2)], [])
    check_leftjoin([1], [1, 2], [(1, 1)], [2])
    check_leftjoin([1, 2], [1, 3], [(1, 1), (2, None)], [3])
    check_leftjoin([1, 2, 3], [1, 3], [(1, 1), (2, None), (3, 3)], [])
    check_leftjoin([1, 2, 2, 3], [1, 3], [(1, 1), (2, None), (2, None), (3, 3)], [])
    check_leftjoin([1, 2, 2, 3], [2, 2, 2], [(1, None), (2, 2), (2, 2), (3, None)], [2])

def check_leftjoin(seqa, seqb, expected, expected_unused):
    unused = []
    result = list(translations.leftjoin(seqa, seqb, unused=unused.append))
    assert result == list(expected)
    assert unused == list(expected_unused)