Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion csvapi/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,16 @@ def cli():
help='Do not parse CSV again if DB already exists')
@click.option('-w', '--max-workers', default=3,
help='Max number of ThreadPoolExecutor workers')
@click.option('-m', '--parse-module', default=None,
help='CSV pre-parser module to use')
@cli.command()
def serve(dbs, host, port, debug, reload, cache, max_workers):
def serve(dbs, host, port, debug, reload, cache, max_workers, parse_module):
if reload:
import hupper
hupper.start_reloader('csvapi.cli.serve')
app.config.DB_ROOT_DIR = dbs
app.config.CSV_CACHE_ENABLED = cache
app.config.MAX_WORKERS = max_workers
app.config.RESPONSE_TIMEOUT = RESPONSE_TIMEOUT
app.config.PARSE_MODULE = parse_module
app.run(host=host, port=port, debug=debug)
29 changes: 24 additions & 5 deletions csvapi/parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import logging

from itertools import islice

import agate
import agatesql # noqa
Expand All @@ -10,6 +11,7 @@
# Module-level logger used by the parsing helpers below.
# NOTE(review): the name passed here is the literal string '__name__', not the
# module's ``__name__`` variable — this looks unintended, but other modules may
# share this logger name; confirm before changing it.
log = logging.getLogger('__name__')

# Value passed to agate as ``sniff_limit`` when reading a CSV file.
SNIFF_LIMIT = 2048
# Maximum number of leading lines of the file handed to a pre-parse module.
MAX_PREPARSE_LINES = 50


def is_binary(filepath):
Expand All @@ -19,11 +21,11 @@ def is_binary(filepath):

def detect_encoding(filepath):
    """Return the character encoding of a file as reported by ``file``.

    Runs ``file -b --mime-encoding -- <filepath>`` and returns whatever it
    prints on standard output with newline characters removed (for example
    ``'utf-8'`` or ``'us-ascii'``).

    The command is passed as an argument list rather than a formatted shell
    string, so paths containing spaces or shell metacharacters are handled
    correctly and cannot be interpreted by a shell.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        The output of ``file`` with newlines stripped.

    Raises:
        FileNotFoundError: If the ``file`` executable is not installed.
    """
    import subprocess

    # '--' ends option parsing so a path starting with '-' is not taken
    # for a command-line switch.
    result = subprocess.run(
        ['file', '-b', '--mime-encoding', '--', filepath],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    return result.stdout.replace('\n', '')


def from_csv(filepath, **agate_params):
    """Load the CSV file at ``filepath`` into an agate table.

    Every keyword argument is forwarded unchanged to
    ``agate.Table.from_csv``.
    """
    load_table = agate.Table.from_csv
    return load_table(filepath, **agate_params)


def from_excel(filepath):
Expand All @@ -36,10 +38,27 @@ def to_sql(table, _hash, storage):
table.to_sql(db_info['dsn'], db_info['db_name'], overwrite=True)


def parse(filepath, _hash, storage='.', parse_module=None):
    """Load a tabular file and store it through ``to_sql``.

    Files for which ``is_binary`` is true are read with ``from_excel``; all
    others are read as CSV using the encoding returned by
    ``detect_encoding``.

    Args:
        filepath: Path of the file to parse.
        _hash: Identifier forwarded to ``to_sql``.
        storage: Storage location forwarded to ``to_sql``.
        parse_module: Optional importable module name (dotted names are
            supported). When given, its ``parse_csv`` function receives the
            first ``MAX_PREPARSE_LINES`` lines of the file and must return a
            ``(delimiter, skip_lines)`` pair that is passed on to the CSV
            reader. If the module cannot be found a warning is logged and
            the CSV is parsed with the default parameters.

    Returns:
        Whatever ``to_sql`` returns.
    """
    import importlib

    if is_binary(filepath):
        table = from_excel(filepath)
    else:
        encoding = detect_encoding(filepath)
        agate_params = {
            'encoding': encoding,
            'sniff_limit': SNIFF_LIMIT,
        }
        # TODO: exceptions raised here do not bubble up to parseview.py
        if parse_module:
            try:
                # import_module returns the leaf module for dotted names,
                # whereas __import__ would return the top-level package.
                pm = importlib.import_module(parse_module)
            except ModuleNotFoundError:
                log.warning('Pre-parse module "%s" not found', parse_module)
            else:
                # Only the head of the file is handed to the pre-parser.
                with open(filepath, encoding=encoding) as f:
                    head = list(islice(f, MAX_PREPARSE_LINES))
                delimiter, skip_lines = pm.parse_csv(head)
                agate_params.update({
                    'delimiter': delimiter,
                    'skip_lines': skip_lines,
                })
        table = from_csv(filepath, **agate_params)
    return to_sql(table, _hash, storage)
3 changes: 2 additions & 1 deletion csvapi/parseview.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,8 @@ def do_parse_in_thread():
tmp.write(chunk)
tmp.close()
try:
parse(tmp.name, _hash, storage=request.app.config.DB_ROOT_DIR)
parse(tmp.name, _hash, storage=request.app.config.DB_ROOT_DIR,
parse_module=request.app.config.PARSE_MODULE)
except Exception as e:
return api_error('Error parsing CSV', details=str(e))
finally:
Expand Down