Faster pokedex load for PostgreSQL #526

Also added the -S (--safe) option, which disables the backend-specific
optimizations.

This gives more than a 3× speedup on my machine :)
Petr Viktorin 2011-03-14 05:11:27 +02:00
parent 4daa6ab0c3
commit bb4861b8c6
2 changed files with 33 additions and 1 deletion
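
For reference, the new flag is exposed both on the command line (`pokedex load -S`) and as the `safe` keyword to the loader. A minimal sketch of driving it from Python; the `pokedex.db.connect()` helper and the engine URI here are assumptions, not part of this commit:

    # Hedged sketch: calling the loader with the new flag from Python.
    import pokedex.db
    import pokedex.db.load

    session = pokedex.db.connect('postgresql:///pokedex')  # assumed URI
    # safe=True skips the PostgreSQL COPY fast path, same as `pokedex load -S`.
    pokedex.db.load.load(session, verbose=True, safe=True)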

@@ -122,6 +122,8 @@ def command_load(*args):
     parser = get_parser(verbose=True)
     parser.add_option('-d', '--directory', dest='directory', default=None)
     parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
+    parser.add_option('-S', '--safe', dest='safe', default=False, action='store_true',
+        help="Do not use backend-specific optimizations.")
 
     options, tables = parser.parse_args(list(args))
     if not options.engine_uri:
@@ -138,7 +140,7 @@ def command_load(*args):
         drop_tables=options.drop_tables,
         tables=tables,
         verbose=options.verbose,
-        safe=False)
+        safe=options.safe)
 
 def command_reindex(*args):
     parser = get_parser(verbose=True)
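
The flag wiring in miniature, runnable on its own with only stdlib optparse; `command_load` here is a stripped-down stand-in for the real one:

    # Stand-alone miniature of the -S wiring above; only the flag
    # handling is real, the rest of command_load is elided.
    from optparse import OptionParser

    def command_load(*args):
        parser = OptionParser()
        parser.add_option('-S', '--safe', dest='safe', default=False,
                          action='store_true',
                          help="Do not use backend-specific optimizations.")
        options, tables = parser.parse_args(list(args))
        return options.safe

    assert command_load() is False      # default: fast path enabled
    assert command_load('-S') is True   # -S disables optimizations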

@@ -168,6 +168,36 @@ def load(session, tables=[], directory=None, drop_tables=False, verbose=False, s
         reader = csv.reader(csvfile, lineterminator='\n')
         column_names = [unicode(column) for column in reader.next()]
 
+        if not safe and session.connection().dialect.name == 'postgresql':
+            """
+            Postgres' CSV dialect is nearly the same as ours, except that
+            it treats completely empty values as NULL, and empty quoted
+            strings ("") as empty strings.
+            The pokedex dump does not quote empty strings, so both empty
+            strings and NULLs are read in as NULL.
+            For an empty string in a NOT NULL column, the load will fail,
+            and fall back to the cross-backend row-by-row loading. In
+            nullable columns, we already load empty strings as NULL.
+            """
+            session.commit()
+            not_null_cols = [c for c in column_names if not table_obj.c[c].nullable]
+            if not_null_cols:
+                force_not_null = 'FORCE NOT NULL ' + ','.join('"%s"' % c for c in not_null_cols)
+            else:
+                force_not_null = ''
+            command = "COPY {table_name} ({columns}) FROM '{csvpath}' CSV HEADER {force_not_null}"
+            session.connection().execute(
+                command.format(
+                    table_name=table_name,
+                    csvpath=csvpath,
+                    columns=','.join('"%s"' % c for c in column_names),
+                    force_not_null=force_not_null,
+                )
+            )
+            session.commit()
+            print_done()
+            continue
+
         # Self-referential tables may contain rows with foreign keys of other
         # rows in the same table that do not yet exist. Pull these out and add
         # them to the session last
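
Note that COPY ... FROM 'file' is server-side: the CSV path must be readable by the PostgreSQL server process, which is one reason the -S escape hatch can matter. A client-side variant streams the file over the connection instead; here is a hedged sketch using psycopg2's copy_expert, where the function name, table, column, and paths are illustrative assumptions rather than the commit's code:

    # Client-side sketch of the same COPY technique; not the commit's code.
    # Assumes psycopg2 and a pokedex-style CSV with a header row.
    import csv

    import psycopg2

    def copy_csv(conn, table_name, csvpath, not_null_cols=()):
        # Read the header to learn the column order, then rewind so COPY
        # sees the whole file (HEADER tells it to skip the first line).
        with open(csvpath) as csvfile:
            column_names = next(csv.reader(csvfile, lineterminator='\n'))
            csvfile.seek(0)
            force_not_null = ''
            if not_null_cols:
                # Empty (unquoted) values in these columns load as empty
                # strings instead of NULL, matching the dump format.
                force_not_null = 'FORCE NOT NULL ' + ','.join(
                    '"%s"' % c for c in not_null_cols)
            command = "COPY %s (%s) FROM STDIN CSV HEADER %s" % (
                table_name,
                ','.join('"%s"' % c for c in column_names),
                force_not_null)
            conn.cursor().copy_expert(command, csvfile)
        conn.commit()

    conn = psycopg2.connect('dbname=pokedex')  # assumed DSN
    copy_csv(conn, 'pokemon', 'pokedex/data/csv/pokemon.csv',  # assumed paths
             not_null_cols=('identifier',))                    # assumed column

Streaming via STDIN trades a little client CPU for not needing server filesystem access or superuser rights; either way, the row-by-row path remains the portable default for other backends.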