Faster pokedex load for PostgreSQL #526

Also added the -S (--safe) option, which disables the backend-specific
optimizations.

This gives more than a 3× speedup on my machine :)
Petr Viktorin 2011-03-14 05:11:27 +02:00
parent 4daa6ab0c3
commit bb4861b8c6
2 changed files with 33 additions and 1 deletion
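
For reference, the new flag is exposed both on the command line (`pokedex load -S`) and as the `safe` keyword to the loader. A minimal sketch of driving it from Python; the `pokedex.db.connect()` helper and the engine URI here are assumptions, not part of this commit:

    # Hedged sketch: calling the loader with the new flag from Python.
    import pokedex.db
    import pokedex.db.load

    session = pokedex.db.connect('postgresql:///pokedex')  # assumed URI
    # safe=True skips the PostgreSQL COPY fast path, same as `pokedex load -S`.
    pokedex.db.load.load(session, verbose=True, safe=True)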

@@ -122,6 +122,8 @@ def command_load(*args):
     parser = get_parser(verbose=True)
     parser.add_option('-d', '--directory', dest='directory', default=None)
     parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
+    parser.add_option('-S', '--safe', dest='safe', default=False, action='store_true',
+        help="Do not use backend-specific optimizations.")
 
     options, tables = parser.parse_args(list(args))
     if not options.engine_uri:
@@ -138,7 +140,7 @@ def command_load(*args):
         drop_tables=options.drop_tables,
         tables=tables,
         verbose=options.verbose,
-        safe=False)
+        safe=options.safe)
 
 def command_reindex(*args):
     parser = get_parser(verbose=True)
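
The flag wiring in miniature, runnable on its own with only stdlib optparse; `command_load` here is a stripped-down stand-in for the real one:

    # Stand-alone miniature of the -S wiring above; only the flag
    # handling is real, the rest of command_load is elided.
    from optparse import OptionParser

    def command_load(*args):
        parser = OptionParser()
        parser.add_option('-S', '--safe', dest='safe', default=False,
                          action='store_true',
                          help="Do not use backend-specific optimizations.")
        options, tables = parser.parse_args(list(args))
        return options.safe

    assert command_load() is False      # default: fast path enabled
    assert command_load('-S') is True   # -S disables optimizations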

@@ -168,6 +168,36 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
         reader = csv.reader(csvfile, lineterminator='\n')
         column_names = [unicode(column) for column in reader.next()]
 
+        if not safe and session.connection().dialect.name == 'postgresql':
+            """
+            Postgres' CSV dialect is nearly the same as ours, except that
+            it treats completely empty values as NULL, and empty quoted
+            strings ("") as empty strings.
+            The pokedex dump does not quote empty strings, so both empty
+            strings and NULLs are read in as NULL.
+            For an empty string in a NOT NULL column, the load will fail,
+            and fall back to the cross-backend row-by-row loading. In
+            nullable columns, we already load empty strings as NULL.
+            """
+            session.commit()
+            not_null_cols = [c for c in column_names if not table_obj.c[c].nullable]
+            if not_null_cols:
+                force_not_null = 'FORCE NOT NULL ' + ','.join('"%s"' % c for c in not_null_cols)
+            else:
+                force_not_null = ''
+            command = "COPY {table_name} ({columns}) FROM '{csvpath}' CSV HEADER {force_not_null}"
+            session.connection().execute(
+                command.format(
+                    table_name=table_name,
+                    csvpath=csvpath,
+                    columns=','.join('"%s"' % c for c in column_names),
+                    force_not_null=force_not_null,
+                )
+            )
+            session.commit()
+            print_done()
+            continue
+
         # Self-referential tables may contain rows with foreign keys of other
         # rows in the same table that do not yet exist. Pull these out and add
         # them to the session last
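
Note that COPY ... FROM 'file' is server-side: the CSV path must be readable by the PostgreSQL server process, which is one reason the -S escape hatch can matter. A client-side variant streams the file over the connection instead; here is a hedged sketch using psycopg2's copy_expert, where the function name, table, column, and paths are illustrative assumptions rather than the commit's code:

    # Client-side sketch of the same COPY technique; not the commit's code.
    # Assumes psycopg2 and a pokedex-style CSV with a header row.
    import csv

    import psycopg2

    def copy_csv(conn, table_name, csvpath, not_null_cols=()):
        # Read the header to learn the column order, then rewind so COPY
        # sees the whole file (HEADER tells it to skip the first line).
        with open(csvpath) as csvfile:
            column_names = next(csv.reader(csvfile, lineterminator='\n'))
            csvfile.seek(0)
            force_not_null = ''
            if not_null_cols:
                # Empty (unquoted) values in these columns load as empty
                # strings instead of NULL, matching the dump format.
                force_not_null = 'FORCE NOT NULL ' + ','.join(
                    '"%s"' % c for c in not_null_cols)
            command = "COPY %s (%s) FROM STDIN CSV HEADER %s" % (
                table_name,
                ','.join('"%s"' % c for c in column_names),
                force_not_null)
            conn.cursor().copy_expert(command, csvfile)
        conn.commit()

    conn = psycopg2.connect('dbname=pokedex')  # assumed DSN
    copy_csv(conn, 'pokemon', 'pokedex/data/csv/pokemon.csv',  # assumed paths
             not_null_cols=('identifier',))                    # assumed column

Streaming via STDIN trades a little client CPU for not needing server filesystem access or superuser rights; either way, the row-by-row path remains the portable default for other backends.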