Reading, merging, and writing translations

2024-08-20 18:16:34 +00:00 · 2011-04-07 01:28:54 +03:00 · 2011-04-07 01:28:54 +03:00 · 4c2ad2bdf1
commit 4c2ad2bdf1
parent 36fa8b7093
3 changed files with 1210 additions and 0 deletions
--- a/bin/poupdate
+++ b/bin/poupdate
@ -0,0 +1,368 @@
 #! /usr/bin/env python
 # Encoding: UTF-8
 u"""Creation and loading of GNU Gettext language files.
 poupdate [options] [file1.csv file2.csv ...]
 Use this script to
 - Create .pot files (in pokedex/i18n/)
 - Update the .po files (in pokedex/i18n/<lang>)
 - Update the pokedex .csv files in (pokedex/data/csv/translations)
 To make pos for a new language, make sure it is in the database, make
 a directory for it in pokedex/i18n/, and run this.
 You can also give one or more translation CSVs as arguments.
 These are in the same format as veekun's main database CSVs, for example
 pokedex/data/csv/ability_prose.csv. Be sure to set the correct language
 ID (which implies the language must be in the database).
 Also be sure to have the correct column order: first an appropriately named
 foreign key, then local_language_id, and then the text columns.
 """
 # Everything related to Gettext files, and the CLI interface, is here.
 # General message handling and CSV I/O is in the pokedex library.
 # Notes on how we use PO format:
 # The source information is stored in the occurrences fields, using
 # "table_name.column_name" for file and object ID for line number. This is used
 # as a message key, instead of the source string. So it's important not to
 # discard location information. It also means "obsolete" and "fuzzy" mean
 # pretty much the same in our context.
 #
 # Also note that a pot file is just a po file with all strings untranslated.
 # So some functions here will work on either.
 #
 # Gettext context (msgctxt) is written to the files so that tools don't merge
 # unrelated strings together. It is ignored when reading the PO files.
 # Also of note, "polib" means "(do) kiss!" in Czech.
 import os
 import re
 import sys
 from datetime import datetime
 from optparse import OptionParser
 from collections import defaultdict
 import pkg_resources
 from pokedex.db import tables, translations
 from pokedex.defaults import get_default_csv_dir
 try:
    import polib
 except ImportError:
    if __name__ == '__main__':
        exit('This utility needs polib installed.\n$ pip install polib')
    raise
 number_replacement_flag = '-pokedex-number-replacement'
 default_gettext_directory = pkg_resources.resource_filename('pokedex', 'i18n')
 # Class name -> top-level mapped class. Mapped classes are registered under
 # their own name first; then every translation class name is pointed at the
 # mapped class that lists it in its translation_classes.
 mapped_class_dict = {}
 for cls in tables.mapped_classes:
    mapped_class_dict[cls.__name__] = cls
 for cls in tables.mapped_classes:
    mapped_class_dict.update((c.__name__, cls) for c in cls.translation_classes)
 class PokedexPot(polib.POFile):
    """A PO/POT file that folds together entries differing only in numbers

    seen_entries maps (msgctxt, msgid with numbers replaced by "{num}") to
    the first entry that was appended for that key.
    """
    def __init__(self, name):
        super(PokedexPot, self).__init__()
        project_id = 'pokedex-%s 0.1' % name
        created = datetime.now().isoformat()
        self.metadata = {
                'Project-Id-Version': project_id,
                'Report-Msgid-Bugs-To': 'encukou@gmail.com',
                'POT-Creation-Date': created,
                'PO-Revision-Date': 'YEAR-MO-DA HO:MI+ZONE',
                'MIME-Version': '1.0',
                'Content-Type': 'text/plain; charset=utf-8',
                'Content-Transfer-Encoding': '8bit',
                'Generated-By': "The pokedex",
            }
        self.seen_entries = {}
    def append(self, entry):
        """Add an entry, merging it with an earlier one if only numbers differ

        "Route 1", "Route 2", ... end up as a single "Route {num}" entry.
        Several numbers may be replaced at once, as in "{num}--{num}
        different Unown caught".
        Entries that contain no numbers but share the same text and context
        are merged too (e.g. "Has no overworld effect" shows up many times
        in AbilityChangelog).
        """
        normalized = translations.number_re.sub('{num}', entry.msgid)
        key = (entry.msgctxt, normalized)
        if key not in self.seen_entries:
            # First sighting: keep the entry verbatim. The numbers are only
            # replaced once a second entry collides with it, so we get
            # "Route {num}", but "Porygon2" because there's no Porygon3.
            self.seen_entries[key] = entry
            # In-place list addition, so this overridden append() is not
            # re-entered.
            self += [entry]
            return
        merged = self.seen_entries[key]
        merged.occurrences += entry.occurrences
        merged.msgid = normalized
        merged.msgstr = translations.number_re.sub('{num}', merged.msgstr)
        needs_flag = normalized != entry.msgid
        if needs_flag and number_replacement_flag not in merged.flags:
            merged.flags.append(number_replacement_flag)
 class PotDict(dict):
    """A dict that creates (and stores) a PokedexPot for any missing name"""
    def __missing__(self, name):
        # The key is known to be absent here, so setdefault stores and
        # returns the freshly created pot.
        return self.setdefault(name, PokedexPot(name))
 def yield_po_messages(pos):
    """Return a merged stream of the messages from all given .po files

    pos is a name -> pofile mapping; only its values are used.
    """
    combined = translations.Merge()
    for pofile in pos.values():
        # Each file's generator may register extra iterators on the merger
        combined.add_iterator(_yield_one_po_messages(pofile, combined))
    return combined
 def entry_sort_key(entry):
    """Sort key for a PO entry, based on its first occurrence

    Returns (class name, object id, column name, fuzzy), or None when the
    entry has no occurrences or its first occurrence has no line/ID.
    Unknown class names produce a warning and sort under ''.
    """
    try:
        first_occurrence = entry.occurrences[0]
    except IndexError:
        return None
    cls_col, line = first_occurrence
    if not line:
        return None
    classname, col = cls_col.split('.')
    fuzzy = entry.obsolete or 'fuzzy' in entry.flags
    if classname in mapped_class_dict:
        sort_name = mapped_class_dict[classname].__name__
    else:
        # Renamed table?
        print('Warning: Unknown class %s' % classname)
        sort_name = ''
    return sort_name, int(line), col, fuzzy
 def _yield_one_po_messages(pofile, merger):
    # Yield messages from one po file
    #
    # Entries are visited in entry_sort_key order, i.e. by first occurrence.
    # The first occurrence of each translated entry is yielded from here; the
    # remaining occurrences are handed to the merger as an extra iterator, so
    # the merged output stays ordered.
    for entry in sorted(pofile, key=entry_sort_key):
        if not entry.msgstr:
            # Untranslated: nothing to report
            continue
        is_fuzzy = (entry.obsolete or 'fuzzy' in entry.flags)
        uses_number_replacement = number_replacement_flag in entry.flags
        messages = []
        for cls_colname, object_id in entry.occurrences:
            if not object_id:
                continue
            clsname, colname = cls_colname.split('.')
            owner = mapped_class_dict[clsname]
            messages.append(translations.Message(
                    owner.__name__,
                    int(object_id),
                    colname,
                    entry.msgstr,
                    source=entry.msgid,
                    number_replacement=uses_number_replacement,
                    origin='PO file',
                    fuzzy=is_fuzzy,
                ))
        if not messages:
            continue
        remaining = messages[1:]
        if remaining:
            # Spawn extra iterators before yielding
            merger.add_iterator(remaining)
        yield messages[0]
 def create_pots(source, *translation_streams):
    """Turn an iterator of source Messages into a dict of pot/po files

    Any translation streams given are merged in; translations that are found
    are stored as msgstr, and flagged 'fuzzy' unless they match exactly.
    Pass translation streams to get po files, none to get pot files.
    Returns a PotDict keyed by pot name.
    """
    obsolete = []
    pots = PotDict()
    merged = translations.merge_translations(source, *translation_streams, unused=obsolete.append)
    for src, sourcehash, string, exact in merged:
        ctxt = '.'.join([src.cls, src.colname])
        entry = polib.POEntry(
                msgid=src.string,
                occurrences=[(ctxt, src.id)],
                msgctxt=ctxt,
            )
        if string:
            entry.msgstr = string
            if not exact:
                entry.flags.append('fuzzy')
        pots[src.pot].append(entry)
    # NOTE(review): the entries built below for unused (obsolete)
    # translations are never appended to any pot, so they are discarded.
    # Presumably they were meant to be saved -- confirm the intended target.
    for message in obsolete:
        ctxt = '.'.join([message.cls, message.colname])
        entry = polib.POEntry(
                msgid=message.source or '???',
                occurrences=[(ctxt, message.id)],
                msgctxt=ctxt,
                obsolete=True,
            )
    return pots
 def save_pots(pots, gettext_directory=default_gettext_directory):
    """Write every pot as pokedex-<name>.pot into gettext_directory."""
    for name in pots:
        filename = 'pokedex-%s.pot' % name
        pots[name].save(os.path.join(gettext_directory, filename))
 def save_pos(pos, lang, gettext_directory=default_gettext_directory):
    """Write every po as pokedex-<name>.po into gettext_directory/<lang>."""
    for name in pos:
        filename = 'pokedex-%s.po' % name
        pos[name].save(os.path.join(gettext_directory, lang, filename))
 def read_pots(directory=default_gettext_directory, extension='.pot'):
    """Load every file in directory that has the given extension as a pofile

    Works for both po and pot files. Returns a dict keyed by the file name
    without its extension.
    """
    loaded = {}
    for filename in os.listdir(directory):
        basename, ext = os.path.splitext(filename)
        if ext != extension:
            continue
        loaded[basename] = polib.pofile(os.path.join(directory, filename))
    return loaded
 def all_langs(gettext_directory=default_gettext_directory):
    """List the names of all subdirectories of gettext_directory."""
    langs = []
    for name in os.listdir(gettext_directory):
        if os.path.isdir(os.path.join(gettext_directory, name)):
            langs.append(name)
    return langs
 def merge_pos(transl, lang, language_directory):
    """Build up-to-date po files for the given language

    The translation streams are, in this order: the official translations
    from the database, the existing PO files in language_directory, and the
    current translation CSV. They are merged against the source messages.
    Returns a name -> pofile dict
    """
    source_messages = transl.source
    official = transl.official_messages(lang)
    from_po_files = yield_po_messages(pos=read_pots(language_directory, '.po'))
    from_csv = transl.yield_target_messages(lang)
    return create_pots(source_messages, official, from_po_files, from_csv)
 def bar(fraction, size, done_char='=', split_char='|', notdone_char='-'):
    """Build an ASCII art progress bar

    The result is `size` characters wide for single-character arguments:
    the completed part, one separator, then the remaining part. When
    fraction is exactly 1 the separator is drawn as done_char.
    """
    width = size - 1
    filled = int(round(width * fraction))
    separator = done_char if fraction == 1 else split_char
    return done_char * filled + separator + notdone_char * (width - filled)
 def print_stats(pos):
    """Print out some fun stats about a set of po files

    pos is a name -> pofile mapping. One line is printed per file (name,
    translated/total counts, percentage and a progress bar), followed by a
    'Total' line summing all files.
    A file without entries -- or an empty `pos` -- is reported as 0%
    translated instead of raising ZeroDivisionError.
    """
    template = u"{0:>10}: {1:4}/{2:4} {3:6.2f}% [{4}]"

    def print_line(label, translated, count):
        # Print one stats line; with nothing to translate there is nothing
        # to divide by, so the fraction is taken to be zero.
        fraction = 1. * translated / count if count else 0.
        line = template.format(
                label,
                translated,
                count,
                100 * fraction,
                bar(fraction, 47),
            )
        print(line.encode('utf-8'))

    total_translated = 0
    total = 0
    for name, po in pos.items():
        num_translated = len(po.translated_entries())
        total_translated += num_translated
        total += len(po)
        print_line(name, num_translated, len(po))
    print_line('Total', total_translated, total)
 # Command-line entry point: optionally writes POT templates, updates the PO
 # files of each language, and rewrites the translation CSVs.
 if __name__ == '__main__':
    # The module docstring doubles as the usage text
    parser = OptionParser(__doc__)
    parser.add_option('-l', '--langs', dest='langs',
            help="List of languages to handle, separated by commas (example: -l 'en,de,ja') (default: all in gettext directory)")
    parser.add_option('-P', '--no-pots', dest='pots', action='store_false', default=True,
            help='Do not create POT files (templates)')
    parser.add_option('-p', '--no-pos', dest='pos', action='store_false', default=True,
            help='Do not update PO files (message catalogs)')
    parser.add_option('-c', '--no-csv', dest='csv', action='store_false', default=True,
            help='Do not update pokedex translations files')
    parser.add_option('-d', '--directory', dest='directory',
            help='Veekun data directory')
    parser.add_option('-L', '--source-language', dest='source_lang',
            help="Source language identifier (default: 'en')")
    parser.add_option('-g', '--gettext-dir', dest='gettext_directory', default=default_gettext_directory,
            help='Gettext directory (default: pokedex/i18n/)')
    parser.add_option('-q', '--quiet', dest='verbose', default=True, action='store_false',
            help="Don't print what's going on")
    # Positional arguments are paths of translation CSVs to merge in
    options, arguments = parser.parse_args()
    transl = translations.Translations.from_parsed_options(options)
    gettext_directory = options.gettext_directory
    # Step 1: POT templates are built from the source messages alone
    if options.pots:
        if options.verbose:
            print 'Creating pots in', gettext_directory
        save_pots(create_pots(transl.source), gettext_directory=gettext_directory)
    # Step 2: gather, per language, the streams of translated messages
    if options.pos or options.csv:
        # Merge in CSV files from command line
        csv_streams = defaultdict(translations.Merge)
        for argument in arguments:
            # Add each message in its own stream, to sort them.
            file = open(argument, 'rb')
            with file:
                for message in translations.yield_guessed_csv_messages(file):
                    lang = transl.language_identifiers[message.language_id]
                    csv_streams[lang].add_iterator([message])
        # streams: language identifier -> list of message streams; the
        # command-line CSVs are added first, the PO files after them
        streams = defaultdict(list)
        for lang, stream in csv_streams.items():
            streams[lang].append(stream)
        # Merge in the PO files
        if options.langs:
            langs = options.langs.split(',')
        else:
            langs = all_langs(gettext_directory)
        for lang in langs:
            language_directory = os.path.join(gettext_directory, lang)
            if options.verbose:
                print 'Merging translations for %s in %s' % (lang, language_directory)
            pos = merge_pos(transl, lang, language_directory)
            if options.pos:
                if options.verbose:
                    print 'Writing POs for %s' % lang
                save_pos(pos, lang, gettext_directory=gettext_directory)
                if options.verbose:
                    print_stats(pos)
            # The merged po files feed the CSV step below, whether or not
            # they were written to disk
            streams[lang].append(yield_po_messages(pos))
    # Step 3: rewrite the translation CSVs. `streams` is always defined here,
    # because options.csv being true makes the branch above run.
    if options.csv:
        for lang, lang_streams in streams.items():
            if options.verbose:
                print "Merging %s translation stream/s for '%s'" % (len(lang_streams), lang)
            # Read the existing CSV fully into a list before
            # write_translations opens the same file for writing
            existing_messages = list(transl.yield_target_messages(lang))
            lang_streams.append(existing_messages)
            transl.write_translations(lang, *lang_streams)
--- a/pokedex/db/translations.py
+++ b/pokedex/db/translations.py
@ -0,0 +1,659 @@
 #! /usr/bin/env python
 u"""General handling of translations
 The general idea is to get messages from somewhere: the source pokedex CSVs,
 or the translation CSVs, etc., then merge them together in some way, and shove
 them into the database.
 If a message is translated, it has a source string attached to it, with the
 original English version. Or at least it has a CRC of the original.
 When that doesn't match, it means the English string changed and the
 translation has to be updated.
 Also this is why we can't dump translations from the database: there's no
 original string info.
 Some complications:
 Flavor text is so repetitive that we take strings from all the versions,
 separate the unique ones by blank lines, let translators work on that, and then
 put it in flavor_summary tables.
 Route names and other repetitive numeric things are replaced by e.g.
 "Route {num}" so translators only have to work on each set once.
 """
 import binascii
 import csv
 import heapq
 import itertools
 import os
 import re
 import sys
 from collections import defaultdict
 from pokedex.db import tables
 from pokedex.defaults import get_default_csv_dir
 default_source_lang = 'en'
 # Mapped classes that have translation classes and are not summarized into
 # another class: as a list in mapped_classes order, and by class name
 toplevel_classes = []
 toplevel_class_by_name = {}
 # Classes that carry a summary_column, grouped by what they summarize into:
 # summary_map[pokemon_prose]['flavor_summary'] == PokemonFlavorTexts
 summary_map = {}
 # translation_class_by_column[mapped class, column name] == translation class
 translation_class_by_column = {}
 for cls in tables.mapped_classes:
    try:
        summary_class, col = cls.summary_column
    except AttributeError:
        # No summary_column: candidate for a top-level class, handled below
        pass
    else:
        summary_map.setdefault(summary_class, {})[col] = cls
        continue
    if not cls.translation_classes:
        continue
    toplevel_classes.append(cls)
    toplevel_class_by_name[cls.__name__] = cls
    for translation_class in cls.translation_classes:
        for column in translation_class.__table__.c:
            translation_class_by_column[cls, column.name] = translation_class
 # Matches any run of ASCII digits
 number_re = re.compile("[0-9]+")
 def crc(string):
    """Return the hash we use in translation CSV files

    This is the CRC-32 of the UTF-8 encoded string, written as eight
    lowercase hexadecimal digits.
    """
    # Two special values are also used in source_crc:
    # UNKNOWN: no source string was available
    # OFFICIAL: an official string from the main database
    checksum = binascii.crc32(string.encode('utf-8')) & 0xffffffff
    return format(checksum, '08x')
 class Message(object):
    """Everything known about one translatable or translated string

    Required attributes:
    cls: Name of the mapped class the message belongs to
    id: The id of the thing the message belongs to
    colname: Name of the database column
    strings: List of the strings in the message, usually of length 1

    Optional attributes (None if not set):
    colsize: Max length of the database column
    source: The string this was translated from
    number_replacement: True if this is a translation with {num} placeholders
    pot: Name of the pot the message goes to (see pot_for_column)
    source_crc: CRC of the source; computed from source when not given
    origin: Some indication of where the string came from (CSV, PO, ...)
    fuzzy: True for fuzzy translations
    language_id: ID of the language
    official: True if this is a known-good translation

    Messages order by (cls, id, colname), then language_id, then fuzzy;
    equality additionally takes the strings into account.
    """
    __slots__ = [
            'cls', 'id', 'colname', 'strings', 'colsize', 'source',
            'number_replacement', 'pot', 'source_crc', 'origin', 'fuzzy',
            'language_id', 'official',
        ]
    def __init__(self, cls, id, colname, string,
            colsize=None, source=None, number_replacement=None, pot=None,
            source_crc=None, origin=None, fuzzy=None, language_id=None,
            official=None,
        ):
        # Derive the CRC from the source text when only the text is known
        if source and not source_crc:
            source_crc = crc(source)
        self.cls, self.id, self.colname = cls, id, colname
        self.strings = [string]
        self.colsize = colsize
        self.source = source
        self.source_crc = source_crc
        self.number_replacement = number_replacement
        self.pot = pot
        self.origin = origin
        self.fuzzy = fuzzy
        self.language_id = language_id
        self.official = official
    def merge(self, other):
        """Absorb another message for the same object and column

        Used for flavor text summarizing: strings not seen yet are added,
        missing colsize/pot are taken over, and the source information is
        dropped.
        """
        assert self.merge_key == other.merge_key
        known = self.strings
        for candidate in other.strings:
            if candidate not in known:
                known.append(candidate)
        self.colsize = self.colsize or other.colsize
        self.pot = self.pot or other.pot
        self.source = self.source_crc = self.number_replacement = None
    @property
    def string(self):
        # All strings, separated by blank lines
        return '\n\n'.join(self.strings)
    @property
    def merge_key(self):
        return (self.cls, self.id, self.colname)
    @property
    def sort_key(self):
        return (self.merge_key, self.language_id, self.fuzzy)
    @property
    def eq_key(self):
        return (self.sort_key, self.strings)
    def __eq__(self, other):
        return self.eq_key == other.eq_key
    def __ne__(self, other):
        return self.eq_key != other.eq_key
    def __gt__(self, other):
        return self.sort_key > other.sort_key
    def __lt__(self, other):
        return self.sort_key < other.sort_key
    def __ge__(self, other):
        return self.sort_key >= other.sort_key
    def __le__(self, other):
        return self.sort_key <= other.sort_key
    def __unicode__(self):
        # Show the text quoted, cut down when it gets long
        quoted = '"%s"' % self.string
        if len(quoted) > 20:
            quoted = quoted[:15] + u'"...'
        template = u'<Message from {self.origin} for {self.cls}.{self.colname}:{self.id} -- {string}>'
        return template.format(self=self, string=quoted)
    def __str__(self):
        return unicode(self).encode('utf-8')
    def __repr__(self):
        return unicode(self).encode('utf-8')
 class Translations(object):
    """Data and operations specific to a location on disk (and a source language)

    Instance attributes:
    source_lang, source_lang_id: identifier and ID of the source language
    csv_directory: directory with the main database CSVs
    translation_directory: directory with the per-language translation CSVs
    language_ids: language identifier -> language ID
    language_identifiers: language ID -> language identifier
    official_langs: identifiers of the languages whose 'official' column in
        the language CSV is a non-zero number
    """
    def __init__(self, source_lang=default_source_lang, csv_directory=None, translation_directory=None):
        """Set up the directories and read the language list

        source_lang: identifier of the source language; a false value (such
            as the None that from_parsed_options passes when the option was
            not given) selects default_source_lang
        csv_directory: defaults to get_default_csv_dir()
        translation_directory: defaults to <csv_directory>/translations
        Raises KeyError if the source language is not in the language CSV.
        """
        if csv_directory is None:
            csv_directory = get_default_csv_dir()
        if translation_directory is None:
            translation_directory = os.path.join(csv_directory, 'translations')
        # Honour the requested source language instead of always using the
        # default one.
        self.source_lang = source_lang or default_source_lang
        self.csv_directory = csv_directory
        self.translation_directory = translation_directory
        self.language_ids = {}
        self.language_identifiers = {}
        self.official_langs = []
        for row in self.reader_for_class(tables.Language, reader_class=csv.DictReader):
            language_id = int(row['id'])
            identifier = row['identifier']
            self.language_ids[identifier] = language_id
            self.language_identifiers[language_id] = identifier
            if row['official'] and int(row['official']):
                self.official_langs.append(identifier)
        self.source_lang_id = self.language_ids[self.source_lang]
    @classmethod
    def from_parsed_options(cls, options):
        """Build an instance from parsed command-line options

        Reads options.source_lang and options.directory.
        """
        return cls(options.source_lang, options.directory)
    @property
    def source(self):
        """All source (i.e. English) messages
        """
        return self.official_messages(self.source_lang)
    def official_messages(self, lang):
        """All official messages (i.e. from main database) for the given lang

        Returns a tuple of messages, or an empty tuple when the database has
        no messages for the language. Raises KeyError for an unknown
        language identifier.
        """
        lang_id = self.language_ids[lang]
        # Cached as tuples, since they're used pretty often
        try:
            sources = self._sources
        except AttributeError:
            # First use: read everything once and split it by language.
            # The cache attribute is only set after it has been built
            # completely, so a failure here cannot leave a partial cache.
            by_language = {}
            for message in self.yield_source_messages():
                by_language.setdefault(message.language_id, []).append(message)
            sources = dict((k, tuple(merge_adjacent(v))) for k, v in by_language.items())
            self._sources = sources
        try:
            return sources[lang_id]
        except KeyError:
            # Looks like there are no messages in the DB for this language
            # This should only happen for non-official languages
            assert lang not in self.official_langs
            return ()
    def write_translations(self, lang, *streams):
        """Write a translation CSV containing messages from streams.

        Streams should be ordered by priority, from highest to lowest.
        Any official translations (from the main database) are added
        automatically, but rows whose source hash is 'OFFICIAL' are not
        written out.
        Translations that are too long for their column are skipped; for
        each (class, column) the longest offender is reported on stdout.
        """
        writer = self.writer_for_lang(lang)
        writer.writerow('language_id table id column source_crc string'.split())
        messages = merge_translations(self.source, self.official_messages(lang), *streams)
        # (class name, column name) -> (utf8 length, source message, string)
        warnings = {}
        for source, sourcehash, string, exact in messages:
            if not string or sourcehash == 'OFFICIAL':
                continue
            utf8len = len(string.encode('utf-8'))
            if source.colsize and utf8len > source.colsize:
                key = source.cls, source.colname
                warnings[key] = max(warnings.get(key, (0,)), (utf8len, source, string))
            else:
                writer.writerow((
                        self.language_ids[lang],
                        source.cls,
                        source.id,
                        source.colname,
                        sourcehash,
                        string.encode('utf-8'),
                    ))
        for utf8len, source, string in warnings.values():
            template = u'Error: {size}B value for {colsize}B column! {key[0]}.{key[2]}:{key[1]}: {string}'
            warning = template.format(
                    key=source.merge_key,
                    string=string,
                    size=utf8len,
                    colsize=source.colsize,
                )
            # Keep the message on one terminal line
            if len(warning) > 79:
                warning = warning[:76] + u'...'
            print(warning.encode('utf-8'))
    def reader_for_class(self, cls, reader_class=csv.reader):
        """Return a CSV reader for the main database CSV of a mapped class

        NOTE(review): the underlying file is never closed explicitly.
        """
        tablename = cls.__table__.name
        csvpath = os.path.join(self.csv_directory, tablename + '.csv')
        return reader_class(open(csvpath, 'rb'), lineterminator='\n')
    def writer_for_lang(self, lang):
        """Return a CSV writer for <translation_directory>/<lang>.csv

        The file is opened for writing, which truncates it.
        NOTE(review): the underlying file is never closed explicitly.
        """
        csvpath = os.path.join(self.translation_directory, '%s.csv' % lang)
        return csv.writer(open(csvpath, 'wb'), lineterminator='\n')
    def yield_source_messages(self, language_id=None):
        """Yield all messages from source CSV files

        Messages from all languages are returned. The messages are not ordered
        properly, but splitting the stream by language (and filtering results
        by merge_adjacent) will produce proper streams.
        NOTE(review): language_id is accepted but does not influence the
        result.
        """
        if language_id is None:
            language_id = self.source_lang_id
        for cls in sorted(toplevel_classes, key=lambda c: c.__name__):
            streams = []
            for translation_class in cls.translation_classes:
                streams.append(yield_source_csv_messages(
                        translation_class,
                        cls,
                        self.reader_for_class(translation_class),
                    ))
                try:
                    colmap = summary_map[translation_class]
                except KeyError:
                    # Nothing is summarized into this translation class
                    continue
                for colname, summary_class in colmap.items():
                    column = translation_class.__table__.c[colname]
                    streams.append(yield_source_csv_messages(
                            summary_class,
                            cls,
                            self.reader_for_class(summary_class),
                            force_column=column,
                        ))
            for message in Merge(*streams):
                yield message
    def yield_target_messages(self, lang):
        """Yield messages from the <translation_directory>/<lang>.csv file

        Returns an empty tuple if the file cannot be opened.
        """
        # Read from the same place writer_for_lang() writes to, so that a
        # custom translation_directory is honoured.
        path = os.path.join(self.translation_directory, '%s.csv' % lang)
        try:
            file = open(path, 'rb')
        except IOError:
            return ()
        return yield_translation_csv_messages(file)
    def yield_all_translations(self):
        """Return the merged non-official messages of all languages' CSVs"""
        stream = Merge()
        for lang in self.language_identifiers.values():
            stream.add_iterator(self.yield_target_messages(lang))
        return (message for message in stream if not message.official)
    def get_load_data(self, langs=None):
        """Yield (translation_class, data for INSERT) pairs for loading into the DB

        langs is either a list of language identifiers or None, meaning all
        known languages. Only the translation CSVs of those languages are
        read. Data is yielded in batches, as lists of column -> value dicts.
        """
        if langs is None:
            langs = self.language_identifiers.values()
        stream = Merge()
        for lang in langs:
            stream.add_iterator(self.yield_target_messages(lang))
        stream = (message for message in stream if not message.official)
        count = 0
        class GroupDict(dict):
            """Dict to automatically set the foreign_id and local_language_id for new items
            """
            def __missing__(self, key):
                # depends on `cls` and `translation_class` from outside scope
                id, language_id = key
                data = self[key] = defaultdict(lambda: None)
                column_names = (c.name for c in translation_class.__table__.columns)
                data.update(dict.fromkeys(column_names))
                data.update({
                        '%s_id' % cls.__singlename__: id,
                        'local_language_id': language_id,
                    })
                return data
        # Nested dict:
        # translation_class -> (id, language_id) -> column -> value
        everything = defaultdict(GroupDict)
        # Group by object so we always have all of the messages for one DB row
        for (cls_name, id), group in group_by_object(stream):
            cls = toplevel_class_by_name[cls_name]
            for message in group:
                translation_class = translation_class_by_column[cls, message.colname]
                key = id, message.language_id
                colname = str(message.colname)
                everything[translation_class][key][colname] = message.string
                count += 1
            # Flush between objects once enough messages have piled up
            if count > 1000:
                for translation_class, key_data in everything.items():
                    yield translation_class, key_data.values()
                count = 0
                everything.clear()
        for translation_class, data_dict in everything.items():
            yield translation_class, data_dict.values()
 def group_by_object(stream):
    """Group stream by object

    Yields ((class name, object ID), (list of messages)) pairs, one for each
    run of consecutive messages that share the same `cls` and `id`. The
    stream is expected to be ordered so that messages for one object are
    adjacent. An empty stream yields nothing.
    """
    stream = iter(stream)
    try:
        first = next(stream)
    except StopIteration:
        # Nothing to group. Returning explicitly avoids relying on
        # StopIteration propagating out of the generator body.
        return
    current_key = first.cls, first.id
    group = [first]
    for message in stream:
        key = message.cls, message.id
        if key != current_key:
            yield current_key, group
            current_key = key
            group = []
        group.append(message)
    yield current_key, group
 class Merge(object):
    """Merge several sorted iterators together

    Additional iterators may be added at any time with add_iterator.
    Accepts None for the initial iterators
    If the same value appears in more iterators, there will be duplicates in
    the output.

    next_values is a heap of (value, sequence number, iterator) entries,
    holding the pending value of every iterator that is not exhausted yet.
    The sequence number breaks ties between equal values, so iterators
    themselves are never compared.
    """
    def __init__(self, *iterators):
        self.next_values = []
        # Number of heap entries pushed so far; used as the tie-breaker
        self._sequence = 0
        for iterator in iterators:
            if iterator is not None:
                self.add_iterator(iterator)
    def add_iterator(self, iterator):
        """Add another sorted iterable; exhausted ones are silently dropped"""
        iterator = iter(iterator)
        try:
            value = next(iterator)
        except StopIteration:
            return
        self._sequence += 1
        heapq.heappush(self.next_values, (value, self._sequence, iterator))
    def __iter__(self):
        return self
    def next(self):
        """Return the smallest pending value and refill from its iterator"""
        if not self.next_values:
            raise StopIteration
        value, _, iterator = heapq.heappop(self.next_values)
        self.add_iterator(iterator)
        return value
    # Support the iterator protocol under both of its names
    __next__ = next
 def merge_adjacent(gen):
    """Merge adjacent messages that share the same merge_key

    Each run of consecutive items with equal merge_key is folded into its
    first item via that item's merge() method, and the item is then yielded.
    Items must arrive in increasing order; AssertionError is raised when an
    item with a different merge_key does not compare greater than the one
    before it. An empty input yields nothing.
    """
    gen = iter(gen)
    try:
        last = next(gen)
    except StopIteration:
        # Nothing to merge. Returning explicitly avoids relying on
        # StopIteration propagating out of the generator body.
        return
    for this in gen:
        if this.merge_key == last.merge_key:
            last.merge(this)
        elif last < this:
            yield last
            last = this
        else:
            raise AssertionError('Bad order, %s > %s' % (last, this))
    yield last
 def leftjoin(left_stream, right_stream, key=lambda x: x, unused=None):
    """A "left join" operation on sorted iterators

    Yields (left, right) pairs, where left comes from left_stream and right
    is the corresponding item from right_stream, or None
    Items correspond when key() returns equal values for them.
    Note that if there are duplicates in right_stream, you won't get duplicate
    rows for them.
    If given, unused should be a one-arg function that will get called on all
    unused items in right_stream.
    """
    left_stream = iter(left_stream)
    right_stream = iter(right_stream)
    try:
        # `right` always holds the next right-hand item not yet consumed.
        # Any StopIteration raised below means right_stream ran out.
        right = right_stream.next()
        for left in left_stream:
            # Skip (and report) right-hand items that sort before `left`.
            # NOTE(review): a falsy `right` item also ends this loop.
            while right and key(left) > key(right):
                if unused is not None:
                    unused(right)
                right = right_stream.next()
            if key(left) == key(right):
                yield left, right
                # Mark `left` as already yielded: if fetching the next right
                # item raises StopIteration, the handler below gets a
                # NameError for `left` and skips it.
                del left
                right = right_stream.next()
            else:
                yield left, None
    except StopIteration:
        # right_stream is exhausted: all remaining left items are unmatched
        try:
            # The left item that was being processed, unless it was already
            # yielded (deleted above) or none was fetched yet
            yield left, None
        except NameError:
            pass
        for left in left_stream:
            yield left, None
    else:
        # left_stream is exhausted: everything still pending on the right
        # side was not used
        if unused is not None:
            try:
                unused(right)
            except NameError:
                # NOTE(review): `right` looks always bound on this path, so
                # this appears to be purely defensive
                pass
            for right in right_stream:
                unused(right)
 def synchronize(reference, stream, key=lambda x: x, unused=None):
    """Yield only the right-hand item of each leftjoin() pair.

    One item is produced per item of reference: the matching item from
    stream, or None where leftjoin() found no match.
    """
    for pair in leftjoin(reference, stream, key, unused):
        yield pair[1]
 def yield_source_csv_messages(cls, foreign_cls, csvreader, force_column=None):
    """Yield all messages from one source CSV file.

    The header row is read from csvreader and checked against the columns of
    cls.__table__; the remaining rows are handed to _yield_csv_messages().
    """
    table = cls.__table__
    table_columns = list(table.c)
    header = csvreader.next()
    # Assumptions: rows are in lexicographic order
    #  (taking numeric values as numbers of course)
    # Assumptions about the order of columns:
    # 1. It's the same in the table and in CSV
    # 2. Primary key is at the beginning
    # 3. First thing in the PK is the object id
    # 4. Last thing in the PK is the language
    # 5. Everything that follows is some translatable text
    assert [table.c[name] for name in header] == table_columns, ','.join(c.name for c in table_columns)
    # The text columns start right after the primary-key columns.
    first_string_index = len(table_columns[:len(table.primary_key.columns)])
    return _yield_csv_messages(foreign_cls, table_columns, first_string_index, csvreader, force_column=force_column)
 def _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin='source CSV', crc_value='OFFICIAL', force_column=None):
    """Yield one Message per text cell of the remaining rows of csvreader.

    foreign_cls: class whose __name__ is used as the message's table name
    columns: column objects, in the same order as the CSV columns
    first_string_index: index of the first text column; the column just
        before it must be the language column
    csvreader: iterator over rows, positioned after the header
    origin, crc_value: passed to Message as origin and source_crc
    force_column: if given, used instead of the (single) text column

    The messages of each row are sorted before being yielded.
    """
    language_index = first_string_index - 1
    assert 'language' in columns[language_index].name, columns[language_index].name
    string_columns = columns[first_string_index:]
    is_summary = force_column is not None
    if is_summary:
        assert len(string_columns) == 1
        string_columns = [force_column]
    for values in csvreader:
        object_id = int(values[0])
        messages = []
        for string, column in zip(values[first_string_index:], string_columns):
            message = Message(
                    foreign_cls.__name__,
                    object_id,
                    column.name,
                    string.decode('utf-8'),
                    column.type.length,
                    # The original passed `cls` here, a name that is not
                    # defined in this function.  pot_for_column() does not
                    # use its first argument, so foreign_cls is passed.
                    pot=pot_for_column(foreign_cls, column, is_summary),
                    origin=origin,
                    official=True,
                    source_crc=crc_value,
                    language_id=int(values[language_index]),
                )
            messages.append(message)
        messages.sort()
        for message in messages:
            yield message
 def yield_guessed_csv_messages(file):
    """Yield messages from a CSV file, using the header to decide its format.

    A file with the translation-CSV header is passed on to
    yield_translation_csv_messages().  Otherwise the first header column must
    be "<singlename>_id" and the second "local_language_id"; the remaining
    columns are looked up in the translation classes of the matching
    top-level class.
    """
    csvreader = csv.reader(file, lineterminator='\n')
    header = csvreader.next()
    translation_header = 'language_id,table,id,column,source_crc,string'.split(',')
    if header == translation_header:
        # A translation CSV
        return yield_translation_csv_messages(file, True)
    # Not a translation CSV, figure out what the columns mean
    assert header[0].endswith('_id')
    assert header[1] == 'local_language_id'
    first_string_index = 2
    foreign_singlename = header[0][:-len('_id')]
    columns = [None] * len(header)
    position_of = dict((name, i) for i, name in enumerate(header))
    for foreign_cls in toplevel_classes:
        if foreign_cls.__singlename__ == foreign_singlename:
            break
    else:
        raise ValueError("Foreign key column name %s in %s doesn't correspond to a table" % (header[0], file))
    # Fill in the column object for every header name that some translation
    # table of foreign_cls knows about.
    for translation_class in foreign_cls.translation_classes:
        for column in translation_class.__table__.c:
            if column.name in position_of:
                columns[position_of[column.name]] = column
    assert all(c is not None for c in columns[first_string_index:])
    return _yield_csv_messages(foreign_cls, columns, first_string_index, csvreader, origin=file.name, crc_value='UNKNOWN')
 def yield_translation_csv_messages(file, no_header=False):
    """Yield one Message per row of a translation CSV file.

    Rows have the columns language_id, table, id, column, source_crc, string.
    Unless no_header is true, the first row is read and asserted to be
    exactly that header.
    """
    csvreader = csv.reader(file, lineterminator='\n')
    if not no_header:
        header = csvreader.next()
        assert header == 'language_id,table,id,column,source_crc,string'.split(',')
    for row in csvreader:
        language_id, table, object_id, column, source_crc, string = row
        yield Message(
                table,
                int(object_id),
                column,
                string.decode('utf-8'),
                origin='target CSV',
                source_crc=source_crc,
                language_id=int(language_id),
            )
 def pot_for_column(cls, column, summary=False):
    """Return the name of the POT file a column's texts belong to.

    Translatable texts get categorized into different POT files to help
    translators prioritize. The pots are:
    - flavor: Flavor texts: here, strings from multiple versions are summarized
    - ripped: Strings ripped from the games; translators for "official"
      languages don't need to bother with these
    - effects: Fanon descriptions of things; they usually use technical
      language
    - misc: Everything else; usually small texts

    Set summary to true if this is a flavor summary column. Others are
    determined by the column itself (its info dict and its name).
    cls is accepted but not used.
    """
    if summary:
        return 'flavor'
    if column.info.get('ripped'):
        return 'ripped'
    if column.name.endswith('effect'):
        return 'effects'
    return 'misc'
 def number_replace(source, string):
    """Fill the {num} placeholders of string with numbers taken from source.

    The numbers are whatever number_re finds in source, used in order.  If
    string has more placeholders than source has numbers, the StopIteration
    raised by the exhausted iterator propagates to the caller.
    """
    found = iter(number_re.findall(source))
    def take_next(match):
        return found.next()
    return re.sub(r'\{num\}', take_next, string)
 def match_to_source(source, *translations):
    """Match translated string(s) to a source message.

    Translations are tried in order; None entries are skipped.  A translation
    is an exact match if it is not fuzzy and either it is official, or its
    source string equals the source message's string, or (when it carries no
    source string) its CRC equals the CRC of the source string.  The first
    exact match is used.  If there's no such translation, the first usable
    translation is used.

    Returns (source, source string CRC, string for CSV file, exact match?)
    If there are no usable translations, or the chosen one has an empty CRC,
    returns (source, None, None, None)

    Handles translations where numbers have been replaced by {num}, if they
    are marked for number replacement.  A translation that contains {num}
    without being marked is discarded with a warning.
    """
    first = True
    best_crc = None
    best_string = None
    match = None
    for translation in translations:
        if translation is None:
            continue
        if translation.number_replacement:
            # Put the numbers of the actual source back in before comparing.
            current_string = number_replace(source.string, translation.string)
            current_source = number_replace(source.string, translation.source)
            current_crc = crc(current_source)
        elif '{num}' in translation.string:
            print (u'Warning: {num} appears in %s, but not marked for number replacement. Discarding!' % translation).encode('utf-8')
            continue
        else:
            current_string = translation.string
            current_source = translation.source
            current_crc = translation.source_crc
        if translation.fuzzy:
            match = False
        elif translation.official:
            match = True
        elif current_source:
            match = source.string == current_source
        else:
            match = current_crc == crc(source.string)
        # The first usable translation is the fallback; an exact match
        # replaces it and ends the search.
        if first or match:
            best_string = current_string
            best_crc = current_crc
        if match:
            break
        first = False
    if best_crc:
        return source, best_crc, best_string, match
    return source, None, None, None
 def merge_translations(source_stream, *translation_streams, **kwargs):
    """For each source message, get its best translation from translations.

    Translations should be ordered by priority, highest to lowest.
    Every stream must be sorted by merge_key, like the source stream.

    Yields the match_to_source() result for every source message, i.e.
    (source, crc, string, exact match?) tuples; the last three are None for
    messages that have no usable translation.

    The only keyword argument used is `unused`: a one-arg function that gets
    called on every translation that has no corresponding source message.
    Translations sorting after the last source message are reported once
    the generator has been run to completion.
    """
    # The source is iterated once per translation stream, plus once here.
    source = tuple(source_stream)
    unused = kwargs.get('unused')
    streams = [
            synchronize(source, t, key=lambda m: m.merge_key, unused=unused)
            for t in translation_streams
        ]
    for messages in itertools.izip(source, *streams):
        yield match_to_source(*messages)
    if unused is not None:
        # izip() stops as soon as `source` runs out, which leaves the
        # synchronize() generators suspended at their last item.  Run them
        # to completion so trailing unmatched translations are reported.
        for stream in streams:
            for leftover in stream:
                pass
--- a/pokedex/tests/test_translations.py
+++ b/pokedex/tests/test_translations.py
@ -0,0 +1,183 @@
 # Encoding: UTF-8
 import csv
 from nose.tools import *
 from pokedex.db import translations, tables
 # Rows of a source CSV for version names: a header, then one row per name.
 fake_version_names = (
        'version_id,local_language_id,name',
        '1,0,name1',
        '2,0,name2',
        '3,0,name3',
        '3,1,othername3',
    )
 # The same four names as fake_version_names, as rows of a translation CSV
 # (header first; the source_crc column is left empty).
 fake_translation_csv = (
        'language_id,table,id,column,source_crc,string',
        '0,Version,1,name,,name1',
        '0,Version,2,name,,name2',
        '0,Version,3,name,,name3',
        '1,Version,3,name,,othername3',
    )
 def test_yield_source_csv_messages():
    """Read fake_version_names as a source CSV and check the messages."""
    reader = csv.reader(iter(fake_version_names))
    stream = translations.yield_source_csv_messages(
            tables.Version.names_table,
            tables.Version,
            reader,
        )
    check_version_message_stream(stream)
 def test_yield_guessed_csv_messages():
    """Let the header of fake_translation_csv pick the reader, then check."""
    rows = iter(fake_translation_csv)
    stream = translations.yield_guessed_csv_messages(rows)
    check_version_message_stream(stream)
 def test_yield_translation_csv_messages():
    """Read fake_translation_csv (header included) and check the messages."""
    rows = iter(fake_translation_csv)
    stream = translations.yield_translation_csv_messages(rows)
    check_version_message_stream(stream)
 def check_version_message_stream(messages):
    """Assert that messages starts with the four fake version-name messages.

    Checks the strings, then the language ids, then the merge keys.
    """
    messages = list(messages)
    expected_strings = ('name1', 'name2', 'name3', 'othername3')
    for index, expected_string in enumerate(expected_strings):
        assert messages[index].string == expected_string
    # The first three names are in language 0, the last one in language 1.
    for message in messages[:3]:
        assert message.language_id == 0
    assert messages[3].language_id == 1
    expected_ids = (1, 2, 3, 3)
    for object_id, message in zip(expected_ids, messages):
        assert message.merge_key == ('Version', object_id, 'name'), message.key
 def get_messages(*rows):
    """Parse header-less translation CSV rows into a list of messages."""
    stream = translations.yield_translation_csv_messages(iter(rows), True)
    return list(stream)
 def test_merge_translations():
    """Check merge_translations() with two prioritized translation streams.

    The string of each source row names the scenario it exercises: which of
    the "new" and "existing" streams has a translation for it, and whether
    that translation's CRC matches the source string.
    """
    # Source messages, ids 1-13; the source_crc column is empty.
    source = get_messages(
            '0,Table,1,col,,none',
            '0,Table,2,col,,new',
            '0,Table,3,col,,existing',
            '0,Table,4,col,,both',
            '0,Table,5,col,,(gap)',
            '0,Table,6,col,,new-bad',
            '0,Table,7,col,,existing-bad',
            '0,Table,8,col,,both-bad',
            '0,Table,9,col,,new-bad-ex-good',
            '0,Table,10,col,,new-good-ex-bad',
            '0,Table,11,col,,(gap)',
            '0,Table,12,col,,"Numbers: 1, 2, and 003"',
            '0,Table,13,col,,"Numbers: 3, 2, and 001"',
        )
    # Higher-priority translations.  crc('----') stands for a CRC that does
    # not match the source string.  Id 100 has no source message.
    new = get_messages(
            '0,Table,2,col,%s,new' % translations.crc('new'),
            '0,Table,4,col,%s,new' % translations.crc('both'),
            '0,Table,6,col,%s,new' % translations.crc('----'),
            '0,Table,8,col,%s,new' % translations.crc('----'),
            '0,Table,9,col,%s,new' % translations.crc('----'),
            '0,Table,10,col,%s,new' % translations.crc('new-good-ex-bad'),
            '0,Table,12,col,%s,{num} {num} {num}' % translations.crc('Numbers: {num}, {num}, and {num}'),
            '0,Table,13,col,%s,{num} {num} {num}' % translations.crc('----'),
            '0,Table,100,col,%s,unused' % translations.crc('----'),
        )
    # new[-3] and new[-2] are the rows for ids 12 and 13: mark them for
    # number replacement and give them a source string.
    new[-3].number_replacement = True
    new[-3].source = 'Numbers: 1, 2, and 003'
    new[-2].number_replacement = True
    new[-2].source = '----'
    # Lower-priority translations; again id 100 has no source message.
    existing = get_messages(
            '0,Table,3,col,%s,existing' % translations.crc('existing'),
            '0,Table,4,col,%s,existing' % translations.crc('both'),
            '0,Table,7,col,%s,existing' % translations.crc('----'),
            '0,Table,8,col,%s,existing' % translations.crc('----'),
            '0,Table,9,col,%s,existing' % translations.crc('new-bad-ex-good'),
            '0,Table,10,col,%s,existing' % translations.crc('----'),
            '0,Table,100,col,%s,unused' % translations.crc('----'),
        )
    # One entry per source row:
    # (source string, expected exact-match flag, expected translated string)
    expected_list = (
            ('none', None, None),
            ('new', True, 'new'),
            ('existing', True, 'existing'),
            ('both', True, 'new'),
            ('(gap)', None, None),
            ('new-bad', False, 'new'),
            ('existing-bad', False, 'existing'),
            ('both-bad', False, 'new'),
            ('new-bad-ex-good', True, 'existing'),
            ('new-good-ex-bad', True, 'new'),
            ('(gap)', None, None),
            ('Numbers: 1, 2, and 003', True, '1 2 003'),
            ('Numbers: 3, 2, and 001', False, '3 2 001'),
        )
    unused = []
    # The empty list is a translation stream with no messages at all.
    result_stream = list(translations.merge_translations(source, new, [], existing, unused=unused.append))
    # NOTE: zip() stops at the shorter sequence, so a result list that is too
    # short would not be detected by this loop.
    for result, expected in zip(result_stream, expected_list):
        res_src, res_crc, res_str, res_match = result
        exp_src, exp_match, exp_str = expected
        print result, expected
        assert res_src.string == exp_src
        assert res_str == exp_str, (res_str, exp_str)
        if exp_match is None:
            assert res_crc is None
        elif exp_match is True:
            assert res_crc == translations.crc(res_src.string)
        elif exp_match is False:
            assert res_crc == translations.crc('----')
        assert res_match == exp_match
    print 'unused:', unused
    # NOTE: this checks only the messages that were reported; it also passes
    # when `unused` is empty.
    for message in unused:
        assert message.string == 'unused'
        assert message.id == 100
 def test_merge():
    """Run check_merge() on several combinations of input sequences."""
    cases = (
            ((0, 1, 2, 3),),
            ((0, 1), (2, 3)),
            ((2, 3), (0, 1)),
            ((0, 2), (1, 3)),
            ((0, 3), (1, 2)),
            ((0, 1), (2, 3), (2, 3)),
        )
    for sequences in cases:
        check_merge(*sequences)
 def check_merge(*sequences):
    """Assert that Merge yields all values of the sequences in sorted order."""
    merged = list(translations.Merge(*sequences))
    expected = []
    for seq in sequences:
        expected.extend(seq)
    expected.sort()
    assert merged == expected
 def test_merge_dynamic_add():
    """Iterators may be added to a Merge while it is being consumed."""
    merge = translations.Merge((1, 2, 3))
    def growing():
        # Each time this generator is resumed after a value, it registers
        # one more single-item iterator with the merge.
        for number in (1, 2, 3):
            yield number
            merge.add_iterator([4])
    merge.add_iterator(growing())
    merged = tuple(merge)
    assert merged == (1, 1, 2, 2, 3, 3, 4, 4, 4)
 def test_merge_adjacent():
    """Messages with the same key are merged; their strings are combined."""
    rows = (
            '0,Table,1,col,,strA',
            '0,Table,2,col,,strB',
            '0,Table,2,col,,strC',
            '0,Table,2,col,,strB',
            '0,Table,2,col,,strD',
            '0,Table,3,col,,strE',
        )
    messages = get_messages(*rows)
    merged = translations.merge_adjacent(messages)
    result = [message.string for message in merged]
    assert result == ['strA', 'strB\n\nstrC\n\nstrD', 'strE']
 def test_leftjoin():
    """Run check_leftjoin() on a table of cases.

    Each case is (left, right, expected pairs, expected unused items).
    """
    cases = (
            ([], [], [], []),
            ([], [1], [], [1]),
            ([], [1, 2], [], [1, 2]),
            ([1], [], [(1, None)], []),
            ([1], [1], [(1, 1)], []),
            ([1], [2], [(1, None)], [2]),
            ([1, 2], [1], [(1, 1), (2, None)], []),
            ([1, 2], [1, 2], [(1, 1), (2, 2)], []),
            ([1], [1, 2], [(1, 1)], [2]),
            ([1, 2], [1, 3], [(1, 1), (2, None)], [3]),
            ([1, 2, 3], [1, 3], [(1, 1), (2, None), (3, 3)], []),
            ([1, 2, 2, 3], [1, 3], [(1, 1), (2, None), (2, None), (3, 3)], []),
            ([1, 2, 2, 3], [2, 2, 2], [(1, None), (2, 2), (2, 2), (3, None)], [2]),
        )
    for seqa, seqb, expected, expected_unused in cases:
        check_leftjoin(seqa, seqb, expected, expected_unused)
 def check_leftjoin(seqa, seqb, expected, expected_unused):
    """Assert the pairs and the unused items leftjoin() gives for seqa, seqb."""
    leftovers = []
    pairs = translations.leftjoin(seqa, seqb, unused=leftovers.append)
    result = list(pairs)
    assert result == list(expected)
    assert leftovers == list(expected_unused)