Vastly improved the pokedex import/export UI.

csvimport is now load; csvexport is now dump.

Both take an optional -e switch to specify an engine, but will happily
use a default SQLite database in the pokedex package directory.

Additionally, the CSV directory is now controlled by the optional -d
switch, and defaults to Doing The Right Thing.

So `pokedex load` now does exactly what you'd expect: loads the data
from the right files into a consistently-located database.
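
By way of illustration, the new interface is meant to be driven like this (illustrative invocations only; the URI and directory below are placeholders, but the commands and switches match the new help text):

    pokedex load                      # load the bundled CSVs into the default SQLite database
    pokedex load -e <uri> -D          # load into another database, dropping its tables first
    pokedex dump -d /path/to/csv      # dump a database back out to CSVs in a chosen directory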
Eevee 2009-08-18 18:02:53 -07:00
parent 238487c908
commit 1a7d046fbc
3 changed files with 246 additions and 164 deletions


@@ -1,15 +1,14 @@
 # encoding: utf8
+from optparse import OptionParser
 import sys
 
-from sqlalchemy.exc import IntegrityError
-import sqlalchemy.types
-
-from .db import connect, metadata, tables as tables_module
+from .db import connect, metadata
+import pokedex.db.load
 from pokedex.lookup import lookup as pokedex_lookup
 
 def main():
     if len(sys.argv) <= 1:
-        help()
+        command_help()
 
     command = sys.argv[1]
     args = sys.argv[2:]
@@ -22,161 +21,27 @@ def main():
         command_help()
 
-def command_csvimport(engine_uri, directory='.'):
-    import csv
-
-    from sqlalchemy.orm.attributes import instrumentation_registry
-
-    session = connect(engine_uri)
-
-    metadata.create_all()
-
-    # SQLAlchemy is retarded and there is no way for me to get a list of ORM
-    # classes besides to inspect the module they all happen to live in for
-    # things that look right.
-    table_base = tables_module.TableBase
-    orm_classes = {}     # table object => table class
-
-    for name in dir(tables_module):
-        # dir() returns strings!  How /convenient/.
-        thingy = getattr(tables_module, name)
-
-        if not isinstance(thingy, type):
-            # Not a class; bail
-            continue
-        elif not issubclass(thingy, table_base):
-            # Not a declarative table; bail
-            continue
-        elif thingy == table_base:
-            # Declarative table base, so not a real table; bail
-            continue
-
-        # thingy is definitely a table class!  Hallelujah.
-        orm_classes[thingy.__table__] = thingy
-
-    # Okay, run through the tables and actually load the data now
-    for table_obj in metadata.sorted_tables:
-        table_class = orm_classes[table_obj]
-        table_name = table_obj.name
-
-        # Print the table name but leave the cursor in a fixed column
-        print table_name + '...', ' ' * (40 - len(table_name)),
-        sys.stdout.flush()
-
-        try:
-            csvfile = open("%s/%s.csv" % (directory, table_name), 'rb')
-        except IOError:
-            # File doesn't exist; don't load anything!
-            print 'no data!'
-            continue
-
-        reader = csv.reader(csvfile, lineterminator='\n')
-        column_names = [unicode(column) for column in reader.next()]
-
-        # Self-referential tables may contain rows with foreign keys of other
-        # rows in the same table that do not yet exist.  Pull these out and add
-        # them to the session last
-        # ASSUMPTION: Self-referential tables have a single PK called "id"
-        deferred_rows = []  # ( row referring to id, [foreign ids we need] )
-        seen_ids = {}       # primary key we've seen => 1
-
-        # Fetch foreign key columns that point at this table, if any
-        self_ref_columns = []
-        for column in table_obj.c:
-            if any(_.references(table_obj) for _ in column.foreign_keys):
-                self_ref_columns.append(column)
-
-        for csvs in reader:
-            row = table_class()
-
-            for column_name, value in zip(column_names, csvs):
-                column = table_obj.c[column_name]
-                if column.nullable and value == '':
-                    # Empty string in a nullable column really means NULL
-                    value = None
-                elif isinstance(column.type, sqlalchemy.types.Boolean):
-                    # Boolean values are stored as string values 0/1, but both
-                    # of those evaluate as true; SQLA wants True/False
-                    if value == '0':
-                        value = False
-                    else:
-                        value = True
-                else:
-                    # Otherwise, unflatten from bytes
-                    value = value.decode('utf-8')
-
-                setattr(row, column_name, value)
-
-            # May need to stash this row and add it later if it refers to a
-            # later row in this table
-            if self_ref_columns:
-                foreign_ids = [getattr(row, _.name) for _ in self_ref_columns]
-                foreign_ids = [_ for _ in foreign_ids if _]  # remove NULL ids
-
-                if not foreign_ids:
-                    # NULL key.  Remember this row and add as usual.
-                    seen_ids[row.id] = 1
-
-                elif all(_ in seen_ids for _ in foreign_ids):
-                    # Non-NULL key we've already seen.  Remember it and commit
-                    # so we know the old row exists when we add the new one
-                    session.commit()
-                    seen_ids[row.id] = 1
-
-                else:
-                    # Non-NULL future id.  Save this and insert it later!
-                    deferred_rows.append((row, foreign_ids))
-                    continue
-
-            session.add(row)
-
-        session.commit()
-
-        # Attempt to add any spare rows we've collected
-        for row, foreign_ids in deferred_rows:
-            if not all(_ in seen_ids for _ in foreign_ids):
-                # Could happen if row A refers to B which refers to C.
-                # This is ridiculous and doesn't happen in my data so far
-                raise ValueError("Too many levels of self-reference!  "
-                                 "Row was: " + str(row.__dict__))
-
-            session.add(row)
-            seen_ids[row.id] = 1
-            session.commit()
-
-        print 'loaded'
-
-def command_csvexport(engine_uri, directory='.'):
-    import csv
-    session = connect(engine_uri)
-
-    for table_name in sorted(metadata.tables.keys()):
-        print table_name
-        table = metadata.tables[table_name]
-
-        writer = csv.writer(open("%s/%s.csv" % (directory, table_name), 'wb'),
-                            lineterminator='\n')
-        columns = [col.name for col in table.columns]
-        writer.writerow(columns)
-
-        primary_key = table.primary_key
-        for row in session.query(table).order_by(*primary_key).all():
-            csvs = []
-            for col in columns:
-                # Convert Pythony values to something more universal
-                val = getattr(row, col)
-                if val == None:
-                    val = ''
-                elif val == True:
-                    val = '1'
-                elif val == False:
-                    val = '0'
-                else:
-                    val = unicode(val).encode('utf-8')
-
-                csvs.append(val)
-
-            writer.writerow(csvs)
+def command_dump(*args):
+    parser = OptionParser()
+    parser.add_option('-e', '--engine', dest='engine_uri', default=None)
+    parser.add_option('-d', '--directory', dest='directory', default=None)
+    options, _ = parser.parse_args(list(args))
+
+    session = connect(options.engine_uri)
+    pokedex.db.load.dump(session, directory=options.directory)
+
+def command_load(*args):
+    parser = OptionParser()
+    parser.add_option('-e', '--engine', dest='engine_uri', default=None)
+    parser.add_option('-d', '--directory', dest='directory', default=None)
+    parser.add_option('-D', '--drop-tables', dest='drop_tables', default=False, action='store_true')
+    options, _ = parser.parse_args(list(args))
+
+    session = connect(options.engine_uri)
+    pokedex.db.load.load(session, directory=options.directory,
+                         drop_tables=options.drop_tables)
 
 def command_lookup(engine_uri, name):
     # XXX don't require uri!  somehow
@@ -194,16 +59,25 @@ def command_lookup(engine_uri, name):
 
 def command_help():
     print u"""pokedex -- a command-line Pokédex interface
-
-help                Displays this message.
-
-lookup {uri} [name] Look up something in the Pokédex.
-
-These commands are only useful for developers:
-csvimport {uri} [dir]   Import data from a set of CSVs to the database
-                        given by the URI.
-csvexport {uri} [dir]   Export data from the database given by the URI
-                        to a set of CSVs.
-Directory defaults to cwd.
+usage: pokedex {command} [options...]
+
+Run `pokedex setup` first, or nothing will work!
+
+Commands:
+    help                Displays this message.
+    lookup [thing]      Look up something in the Pokédex.
+
+System commands:
+    load                Load Pokédex data into a database from CSV files.
+    dump                Dump Pokédex data from a database into CSV files.
+
+Options:
+    -d|--directory      By default, load and dump will use the CSV files in the
+                        pokedex install directory.  Use this option to specify
+                        a different directory.
+    -D|--drop-tables    With load, drop all tables before loading data.
+    -e|--engine=URI     By default, all commands try to use a SQLite database
+                        in the pokedex install directory.  Use this option to
+                        specify an alternate database.
 """.encode(sys.getdefaultencoding(), 'replace')
     sys.exit(0)


@@ -1,13 +1,24 @@
+import pkg_resources
+
 from sqlalchemy import MetaData, Table, create_engine, orm
 
 from .tables import metadata
 
-def connect(uri, **kwargs):
+def connect(uri=None, **kwargs):
     """Connects to the requested URI.  Returns a session object.
 
+    With the URI omitted, attempts to connect to a default SQLite database
+    contained within the package directory.
+
     Calling this function also binds the metadata object to the created engine.
     """
 
+    # Default to a URI within the package, which was hopefully created at some point
+    if not uri:
+        sqlite_path = pkg_resources.resource_filename('pokedex',
+                                                      'data/pokedex.sqlite')
+        uri = 'sqlite:///' + sqlite_path
+
     ### Do some fixery for MySQL
     if uri[0:5] == 'mysql':
         # MySQL uses latin1 for connections by default even if the server is
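
The practical upshot of the connect() change is that a bare call now resolves to the SQLite file bundled with the package. A minimal sketch of the intended usage, using nothing beyond what the diff above defines:

    from pokedex.db import connect

    session = connect()                             # falls back to data/pokedex.sqlite in the package
    session = connect('sqlite:///pokedex.sqlite')   # an explicit URI still behaves as before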

pokedex/db/load.py (new file)

@@ -0,0 +1,197 @@
"""CSV to database or vice versa."""
import csv
import pkg_resources
import sys
from sqlalchemy.orm.attributes import instrumentation_registry
import sqlalchemy.types
from pokedex.db import metadata
import pokedex.db.tables as tables
def load(session, directory=None, drop_tables=False):
"""Load data from CSV files into the given database session.
Tables are created automatically.
`session`
SQLAlchemy session to use.
`directory`
Directory the CSV files reside in. Defaults to the `pokedex` data
directory.
`drop_tables`
If set to True, existing `pokedex`-related tables will be dropped.
"""
if not directory:
directory = pkg_resources.resource_filename('pokedex', 'data/csv')
# Drop all tables if requested
if options.drop_tables:
print 'Dropping tables...'
metadata.drop_all()
metadata.create_all()
# SQLAlchemy is retarded and there is no way for me to get a list of ORM
# classes besides to inspect the module they all happen to live in for
# things that look right.
table_base = tables.TableBase
orm_classes = {} # table object => table class
for name in dir(tables):
# dir() returns strings! How /convenient/.
thingy = getattr(tables, name)
if not isinstance(thingy, type):
# Not a class; bail
continue
elif not issubclass(thingy, table_base):
# Not a declarative table; bail
continue
elif thingy == table_base:
# Declarative table base, so not a real table; bail
continue
# thingy is definitely a table class! Hallelujah.
orm_classes[thingy.__table__] = thingy
# Okay, run through the tables and actually load the data now
for table_obj in metadata.sorted_tables:
table_class = orm_classes[table_obj]
table_name = table_obj.name
# Print the table name but leave the cursor in a fixed column
print table_name + '...', ' ' * (40 - len(table_name)),
sys.stdout.flush()
try:
csvfile = open("%s/%s.csv" % (directory, table_name), 'rb')
except IOError:
# File doesn't exist; don't load anything!
print 'no data!'
continue
reader = csv.reader(csvfile, lineterminator='\n')
column_names = [unicode(column) for column in reader.next()]
# Self-referential tables may contain rows with foreign keys of other
# rows in the same table that do not yet exist. Pull these out and add
# them to the session last
# ASSUMPTION: Self-referential tables have a single PK called "id"
deferred_rows = [] # ( row referring to id, [foreign ids we need] )
seen_ids = {} # primary key we've seen => 1
# Fetch foreign key columns that point at this table, if any
self_ref_columns = []
for column in table_obj.c:
if any(_.references(table_obj) for _ in column.foreign_keys):
self_ref_columns.append(column)
for csvs in reader:
row = table_class()
for column_name, value in zip(column_names, csvs):
column = table_obj.c[column_name]
if column.nullable and value == '':
# Empty string in a nullable column really means NULL
value = None
elif isinstance(column.type, sqlalchemy.types.Boolean):
# Boolean values are stored as string values 0/1, but both
# of those evaluate as true; SQLA wants True/False
if value == '0':
value = False
else:
value = True
else:
# Otherwise, unflatten from bytes
value = value.decode('utf-8')
setattr(row, column_name, value)
# May need to stash this row and add it later if it refers to a
# later row in this table
if self_ref_columns:
foreign_ids = [getattr(row, _.name) for _ in self_ref_columns]
foreign_ids = [_ for _ in foreign_ids if _] # remove NULL ids
if not foreign_ids:
# NULL key. Remember this row and add as usual.
seen_ids[row.id] = 1
elif all(_ in seen_ids for _ in foreign_ids):
# Non-NULL key we've already seen. Remember it and commit
# so we know the old row exists when we add the new one
session.commit()
seen_ids[row.id] = 1
else:
# Non-NULL future id. Save this and insert it later!
deferred_rows.append((row, foreign_ids))
continue
session.add(row)
session.commit()
# Attempt to add any spare rows we've collected
for row, foreign_ids in deferred_rows:
if not all(_ in seen_ids for _ in foreign_ids):
# Could happen if row A refers to B which refers to C.
# This is ridiculous and doesn't happen in my data so far
raise ValueError("Too many levels of self-reference! "
"Row was: " + str(row.__dict__))
session.add(row)
seen_ids[row.id] = 1
session.commit()
print 'loaded'
def dump(session, directory=None):
"""Dumps the contents of a database to a set of CSV files. Probably not
useful to anyone besides a developer.
`session`
SQLAlchemy session to use.
`directory`
Directory the CSV files should be put in. Defaults to the `pokedex`
data directory.
"""
if not directory:
directory = pkg_resources.resource_filename('pokedex', 'data/csv')
for table_name in sorted(metadata.tables.keys()):
print table_name
table = metadata.tables[table_name]
writer = csv.writer(open("%s/%s.csv" % (directory, table_name), 'wb'),
lineterminator='\n')
columns = [col.name for col in table.columns]
writer.writerow(columns)
primary_key = table.primary_key
for row in session.query(table).order_by(*primary_key).all():
csvs = []
for col in columns:
# Convert Pythony values to something more universal
val = getattr(row, col)
if val == None:
val = ''
elif val == True:
val = '1'
elif val == False:
val = '0'
else:
val = unicode(val).encode('utf-8')
csvs.append(val)
writer.writerow(csvs)
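
For completeness, the command_load and command_dump wrappers shown earlier boil down to calls like the following. This is only a sketch of how the new module is meant to be driven; the dump directory is a made-up example, not anything this commit creates:

    from pokedex.db import connect
    import pokedex.db.load

    session = connect()                                          # default package SQLite database
    pokedex.db.load.load(session, drop_tables=True)              # rebuild from the bundled CSVs
    pokedex.db.load.dump(session, directory='/tmp/pokedex-csv')  # hypothetical output directory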