From 62ca65265569593b3affd51508f74b2f133f6ef5 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sat, 19 Sep 2020 22:13:13 -0700 Subject: [PATCH 01/12] loading code with sqlite3 + new unit tests + new speed testing --- Pipfile | 1 + litecoder/__init__.py | 4 +- litecoder/usa.py | 62 +++-- scripts/cities_to_yaml.py | 119 +++++++++ scripts/speed_test.py | 123 +++++++++ tests/prod_db/conftest.py | 4 +- tests/prod_db/test_us_city_index_2.yml | 340 ++++++++++++++++++++++++ tests/prod_db/test_us_state_index.py | 10 +- tests/prod_db/test_us_state_index_2.yml | 340 ++++++++++++++++++++++++ 9 files changed, 971 insertions(+), 32 deletions(-) create mode 100644 scripts/cities_to_yaml.py create mode 100644 scripts/speed_test.py create mode 100644 tests/prod_db/test_us_city_index_2.yml create mode 100644 tests/prod_db/test_us_state_index_2.yml diff --git a/Pipfile b/Pipfile index b65227a..c0b76e4 100644 --- a/Pipfile +++ b/Pipfile @@ -35,6 +35,7 @@ PyYAML = "*" Shapely = "*" numpy = "*" scipy = "*" +sqlitedict = "*" [dev-packages] diff --git a/litecoder/__init__.py b/litecoder/__init__.py index e67b0f9..9a01450 100644 --- a/litecoder/__init__.py +++ b/litecoder/__init__.py @@ -9,9 +9,9 @@ DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.p') +US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.db') -US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.p') +US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.db') logging.basicConfig( diff --git a/litecoder/usa.py b/litecoder/usa.py index d6ff15a..749cdcd 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -1,8 +1,10 @@ import re -import pickle - +import os +import hashlib +import struct +from sqlitedict import SqliteDict from tqdm import tqdm from collections import defaultdict from itertools import product @@ -192,14 +194,16 @@ def __repr__(self): class Index: - @classmethod - def load(cls, path): - with open(path, 'rb') as fh: - return pickle.load(fh) + # Now that loading the database is instantenous, it is better to put it in the constructor over a + # separate load method + # @classmethod + # def load(cls, path): + # with open(path, 'rb') as fh: + # return pickle.load(fh) - def __init__(self): - self._key_to_ids = defaultdict(set) - self._id_to_loc = dict() + def __init__(self, path): + self._key_to_ids = SqliteDict(filename=path, tablename="keys") + self._id_to_loc = SqliteDict(filename=path, tablename="locations") def __len__(self): return len(self._key_to_ids) @@ -211,38 +215,47 @@ def __repr__(self): len(self._id_to_loc), ) - def __getitem__(self, text): + def __getitem__(self, key): """Get ids, map to records only if there is a match in the index """ - if keyify(text) not in self._key_to_ids: + # convert string to integer + hash = hashlib.md5(bytes(keyify(key), encoding="utf-8")).digest() + hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63) + if hashed_key not in self._key_to_ids: return None - ids = self._key_to_ids[keyify(text)] + ids = self._key_to_ids[hashed_key] return [self._id_to_loc[id] for id in ids] def add_key(self, key, id): - self._key_to_ids[key].add(id) + # convert string to integer + hash = hashlib.md5(bytes(key, encoding="utf-8")).digest() + hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63) + if hashed_key not in self._key_to_ids: + self._key_to_ids[hashed_key] = set() + curr_ids = self._key_to_ids[hashed_key] + curr_ids.add(id) + self._key_to_ids[hashed_key] = curr_ids + self._key_to_ids.commit() + del curr_ids def add_location(self, id, location): self._id_to_loc[id] = location + self._id_to_loc.commit() def locations(self): return list(self._id_to_loc.values()) - def save(self, path): - with open(path, 'wb') as fh: - pickle.dump(self, fh) + def close(self): + self._key_to_ids.close() + self._id_to_loc.close() class USCityIndex(Index): - @classmethod - def load(cls, path=US_CITY_PATH): - return super().load(path) - def __init__(self, bare_name_blocklist=None): - super().__init__() + super().__init__(path=US_CITY_PATH) self.bare_name_blocklist = bare_name_blocklist def build(self): @@ -269,9 +282,8 @@ def build(self): class USStateIndex(Index): - @classmethod - def load(cls, path=US_STATE_PATH): - return super().load(path) + def __init__(self): + super().__init__(path=US_STATE_PATH) def build(self): """Index all US states. @@ -287,4 +299,4 @@ def build(self): self.add_key(key, row.wof_id) # ID -> state - self.add_location(row.wof_id, StateMatch(row)) + self.add_location(row.wof_id, StateMatch(row)) \ No newline at end of file diff --git a/scripts/cities_to_yaml.py b/scripts/cities_to_yaml.py new file mode 100644 index 0000000..113f145 --- /dev/null +++ b/scripts/cities_to_yaml.py @@ -0,0 +1,119 @@ +from litecoder.usa import USCityIndex, USStateIndex + +city_idx = USCityIndex() +state_idx = USStateIndex() + +states = """North Carolina, USA +District of Columbia +Illinois, United States +Georgia United States +north carolina +texas +iowa +Florida, United States +Vermont, USA +TX USA +FL U.S.A. +pennsylvania usa +nebraska + Oregon +Pennsylvania +New Hampshire USA +Nebraska, USA +New mexico +Indiana +South Dakota + Oklahoma +Ohio,US +Kansas, USA +indiana +MA, USA + New York +Ohio, United States +NJ USA +ohio usa +Connecticut, USA +MICHIGAN, United States +Missouri +New York +California - USA +Massachusetts, USA + Missouri +FL, United States of America +New Hampshire +Georgia. +Nevada USA + PENNSYLVANIA +Virginia, USA. +Alabama, USA +Indiana +Louisiana, United States +New Mexico +Ohio USA +Nevada, USA +LOUISIANA +New Jersey, us""".split("\n") + +cities = """Edinburg, Texas +Lakeville , Minnesota +Woodland, CA. +Gary, IN +Cornelius, NC +Okeechobee, Fl +Saginaw Township South, MI +Lansdowne, PA +Knoxville TN +OAKLAND, CA +suffolk va +Port Orange, FL +Sedona, AZ +Cedar City UT +Cincinnati. +Huntington Beach CA +Wooster,Ohio +Lewisville, Texas +traverse city mi +Pennsauken, New Jersey +Jonesboro, Arkansas +Zephyrhills, FL +West Jefferson, NC +Escondido, CA +Lumberton, NC +Cayce, SC +Stratford, Connecticut, USA +Avondale, AZ +Coral Springs, FL +Gaithersburg, MD +Westchester, IL +Louisa, Virginia +Norway, ME +Philadelphia PA, USA +Fort worth, tx +Eureka Springs, Arkansas +Nashville , TN +Ellenwood Ga +Floral Park, NY +Nashville Tennessee +Malvern, AR +Valdosta, Georgia +Valley Center Ca +St. Robert Mo. +Hollandale, MS +New Castle, PA +Harlem, FL +Kings Mills, OH +knoxville Tennessee +BrooklYn""".split("\n") +output = "" +for state in states: + if (len(state_idx[state]) == 0): + print(state) + wof_ids = [result.data.wof_id for result in state_idx[state]] + output += """- query: + - {} + matches:\n""".format(state) + for wof_id in wof_ids: + output += " - {}".format(wof_id) + output += "\n" +with open("output.yml", "w") as o_file: + o_file.write(output) diff --git a/scripts/speed_test.py b/scripts/speed_test.py new file mode 100644 index 0000000..e7c3756 --- /dev/null +++ b/scripts/speed_test.py @@ -0,0 +1,123 @@ +from litecoder.usa import USCityIndex, USStateIndex +import time + +print("Loading USCityIndex... ", end="") +start_time = time.time() +city_idx = USCityIndex() +print("finished: {}s!".format(time.time() - start_time)) + +print("Loading USStateIndex... ", end="") +start_time = time.time() +state_idx = USStateIndex() +print("finished: {}s!".format(time.time() - start_time)) + +city_tests = """Edinburg, Texas +Lakeville , Minnesota +Woodland, CA. +Gary, IN +Cornelius, NC +Okeechobee, Fl +Saginaw Township South, MI +Lansdowne, PA +Knoxville TN +OAKLAND, CA +suffolk va +Port Orange, FL +Sedona, AZ +Cedar City UT +Cincinnati. +Huntington Beach CA +Wooster,Ohio +Lewisville, Texas +traverse city mi +Pennsauken, New Jersey +Jonesboro, Arkansas +Zephyrhills, FL +West Jefferson, NC +Escondido, CA +Lumberton, NC +Cayce, SC +Stratford, Connecticut, USA +Avondale, AZ +Coral Springs, FL +Gaithersburg, MD +Westchester, IL +Louisa, Virginia +Norway, ME +Philadelphia PA, USA +Fort worth, tx +Eureka Springs, Arkansas +Nashville , TN +Ellenwood Ga +Floral Park, NY +Nashville Tennessee +Malvern, AR +Valdosta, Georgia +Valley Center Ca +St. Robert Mo. +Hollandale, MS +New Castle, PA +Harlem, FL +Kings Mills, OH +knoxville Tennessee +BrooklYn""".split("\n") +print("measuring time for {} cities... ".format(len(city_tests)), end="") +start_time = time.time() +for city in city_tests: + x = city_idx[city] +print("finished: took {}s!".format(1000*(time.time() - start_time))) +state_tests = """North Carolina, USA +District of Columbia +Illinois, United States +Georgia United States +north carolina +texas +iowa +Florida, United States +Vermont, USA +TX USA +FL U.S.A. +pennsylvania usa +nebraska + Oregon +Pennsylvania +New Hampshire USA +Nebraska, USA +New mexico +Indiana +South Dakota + Oklahoma +Ohio,US +Kansas, USA +indiana +MA, USA + New York +Ohio, United States +NJ USA +ohio usa +Connecticut, USA +MICHIGAN, United States +Missouri +New York +California - USA +Massachusetts, USA + Missouri +FL, United States of America +New Hampshire +Georgia. +Nevada USA + PENNSYLVANIA +Virginia, USA. +Alabama, USA +Indiana +Louisiana, United States +New Mexico +Ohio USA +Nevada, USA +LOUISIANA +New Jersey, us""".split("\n") +print("measuring time for {} states... ".format(len(state_tests)), end="") +start_time = time.time() +for state in state_tests: + x = state_idx[state] +print("finished: took {}s!".format(1000 * (time.time() - start_time))) \ No newline at end of file diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py index 1005d12..2a746d5 100644 --- a/tests/prod_db/conftest.py +++ b/tests/prod_db/conftest.py @@ -7,9 +7,9 @@ @pytest.fixture(scope='session') def city_idx(): - return USCityIndex.load() + return USCityIndex() @pytest.fixture(scope='session') def state_idx(): - return USStateIndex.load() + return USStateIndex() diff --git a/tests/prod_db/test_us_city_index_2.yml b/tests/prod_db/test_us_city_index_2.yml new file mode 100644 index 0000000..89c2f34 --- /dev/null +++ b/tests/prod_db/test_us_city_index_2.yml @@ -0,0 +1,340 @@ +- query: + - Edinburg, Texas + matches: + - 101723563 +- query: + - Lakeville , Minnesota + matches: + - 85968479 +- query: + - Woodland, CA. + matches: + - 85922405 +- query: + - Gary, IN + matches: + - 85941813 +- query: + - Cornelius, NC + matches: + - 85981335 +- query: + - Okeechobee, Fl + matches: + - 85932281 +- query: + - Saginaw Township South, MI + matches: + - 1125767499 +- query: + - Lansdowne, PA + matches: + - 101718067 +- query: + - Knoxville TN + matches: + - 101722865 +- query: + - OAKLAND, CA + matches: + - 85921881 +- query: + - suffolk va + matches: + - 101728729 +- query: + - Port Orange, FL + matches: + - 85932629 +- query: + - Sedona, AZ + matches: + - 85917431 +- query: + - Cedar City UT + matches: + - 101727685 +- query: + - Cincinnati. + matches: + - 101712203 +- query: + - Huntington Beach CA + matches: + - 85923137 +- query: + - Wooster,Ohio + matches: + - 101712345 +- query: + - Lewisville, Texas + matches: + - 101724413 +- query: + - traverse city mi + matches: + - 85950881 +- query: + - Pennsauken, New Jersey + matches: + - 1125947935 +- query: + - Jonesboro, Arkansas + matches: + - 85920203 +- query: + - Zephyrhills, FL + matches: + - 85932233 +- query: + - West Jefferson, NC + matches: + - 85981009 +- query: + - Escondido, CA + matches: + - 85922263 +- query: + - Lumberton, NC + matches: + - 85981189 +- query: + - Cayce, SC + matches: + - 101720791 +- query: + - Stratford, Connecticut, USA + matches: + - 85930997 +- query: + - Avondale, AZ + matches: + - 85917553 +- query: + - Coral Springs, FL + matches: + - 85932415 +- query: + - Gaithersburg, MD + matches: + - 85949491 +- query: + - Westchester, IL + matches: + - 85940923 +- query: + - Louisa, Virginia + matches: + - 101728949 +- query: + - Norway, ME + matches: + - 85948973 +- query: + - Philadelphia PA, USA + matches: + - 101718083 +- query: + - Fort worth, tx + matches: + - 101724443 +- query: + - Eureka Springs, Arkansas + matches: + - 85919765 +- query: + - Nashville , TN + matches: + - 101723183 +- query: + - Ellenwood Ga + matches: + - 1126054897 +- query: + - Floral Park, NY + matches: + - 85977689 +- query: + - Nashville Tennessee + matches: + - 101723183 +- query: + - Malvern, AR + matches: + - 85920689 +- query: + - Valdosta, Georgia + matches: + - 85936921 +- query: + - Valley Center Ca + matches: + - 85925201 +- query: + - St. Robert Mo. + matches: + - 85971093 +- query: + - Hollandale, MS + matches: + - 85969835 +- query: + - New Castle, PA + matches: + - 101716721 +- query: + - Harlem, FL + matches: + - 85934341 +- query: + - Kings Mills, OH + matches: + - 101713989 +- query: + - knoxville Tennessee + matches: + - 101722865 +- query: + - BrooklYn + matches: + - 85977539 +- query: + - Nuketown + matches: [] + xfail: true +- query: + - Under Lefty's Skin + matches: [] + xfail: true +- query: + - oY’– 7/6/20 oY’– + matches: [] + xfail: true +- query: + - Palestine.Ramallah + matches: [] + xfail: true +- query: + - DONT REPOST MY ART ! carrd byf + matches: [] + xfail: true +- query: + - AJ&K, Pakistan. + matches: [] + xfail: true +- query: + - La Plana Alta + matches: [] + xfail: true +- query: + - Com o ChorAEo + matches: [] + xfail: true +- query: + - PLUTO + matches: [] + xfail: true +- query: + - All Over The WORLD + matches: [] + xfail: true +- query: + - /dev/null + matches: [] + xfail: true +- query: + - University of Bath + matches: [] + xfail: true +- query: + - Gaia + matches: [] + xfail: true +- query: + - Plc + matches: [] + xfail: true +- query: + - Jay cooke mn + matches: [] + xfail: true +- query: + - Toulouse - Perpignan + matches: [] + xfail: true +- query: + - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’ + matches: [] + xfail: true +- query: + - oYscoY + matches: [] + xfail: true +- query: + - sheher ecoe 20 + matches: [] + xfail: true +- query: + - pequitas de sunoo^^ + matches: [] + xfail: true +- query: + - West M + matches: [] + xfail: true +- query: + - Maputo city + matches: [] + xfail: true +- query: + - Entwined w peace + matches: [] + xfail: true +- query: + - DDDDDDDDDDDDDDD + matches: [] + xfail: true +- query: + - mons vaticanus, subterrane + matches: [] + xfail: true +- query: + - oYoO oyOY Oyoyo OY + matches: [] + xfail: true +- query: + - Loin du hood + matches: [] + xfail: true +- query: + - acab blm + matches: [] + xfail: true +- query: + - 010997 + matches: [] + xfail: true +- query: + - tidytuanzebeaETMs basement + matches: [] + xfail: true +- query: + - oYOyomasdEmr + matches: [] + xfail: true +- query: + - somewhere + matches: [] + xfail: true +- query: + - 330 + matches: [] + xfail: true +- query: + - oSUSdod + matches: [] + xfail: true +- query: + - she/her 16 isfp + matches: [] + xfail: true \ No newline at end of file diff --git a/tests/prod_db/test_us_state_index.py b/tests/prod_db/test_us_state_index.py index 9adafc6..c1353e9 100644 --- a/tests/prod_db/test_us_state_index.py +++ b/tests/prod_db/test_us_state_index.py @@ -16,15 +16,19 @@ def yield_cases(): queries = group['query'] + xfail = group.get('xfail', False) + if type(queries) is str: queries = [queries] for query in queries: - yield query, group['matches'] + yield query, group['matches'], xfail -@pytest.mark.parametrize('query,matches', yield_cases()) -def test_cases(state_idx, query, matches): +@pytest.mark.parametrize('query,matches,xfail', yield_cases()) +def test_cases(state_idx, query, matches, xfail): + if xfail: + pytest.xfail() res = state_idx[query] diff --git a/tests/prod_db/test_us_state_index_2.yml b/tests/prod_db/test_us_state_index_2.yml new file mode 100644 index 0000000..c0eedb2 --- /dev/null +++ b/tests/prod_db/test_us_state_index_2.yml @@ -0,0 +1,340 @@ +- query: + - North Carolina, USA + matches: + - 85688773 +- query: + - District of Columbia + matches: + - 85688741 +- query: + - Illinois, United States + matches: + - 85688697 +- query: + - Georgia United States + matches: + - 85688535 +- query: + - north carolina + matches: + - 85688773 +- query: + - texas + matches: + - 85688753 +- query: + - iowa + matches: + - 85688713 +- query: + - Florida, United States + matches: + - 85688651 +- query: + - Vermont, USA + matches: + - 85688763 +- query: + - TX USA + matches: + - 85688753 +- query: + - FL U.S.A. + matches: + - 85688651 +- query: + - pennsylvania usa + matches: + - 85688481 +- query: + - nebraska + matches: + - 85688563 +- query: + - Oregon + matches: + - 85688513 +- query: + - Pennsylvania + matches: + - 85688481 +- query: + - New Hampshire USA + matches: + - 85688689 +- query: + - Nebraska, USA + matches: + - 85688563 +- query: + - New mexico + matches: + - 85688493 +- query: + - Indiana + matches: + - 85688709 +- query: + - South Dakota + matches: + - 85688693 +- query: + - Oklahoma + matches: + - 85688585 +- query: + - Ohio,US + matches: + - 85688485 +- query: + - Kansas, USA + matches: + - 85688555 +- query: + - indiana + matches: + - 85688709 +- query: + - MA, USA + matches: + - 85688645 +- query: + - New York + matches: + - 85688543 +- query: + - Ohio, United States + matches: + - 85688485 +- query: + - NJ USA + matches: + - 85688607 +- query: + - ohio usa + matches: + - 85688485 +- query: + - Connecticut, USA + matches: + - 85688629 +- query: + - MICHIGAN, United States + matches: + - 85688599 +- query: + - Missouri + matches: + - 85688661 +- query: + - New York + matches: + - 85688543 +- query: + - California - USA + matches: + - 85688637 +- query: + - Massachusetts, USA + matches: + - 85688645 +- query: + - Missouri + matches: + - 85688661 +- query: + - FL, United States of America + matches: + - 85688651 +- query: + - New Hampshire + matches: + - 85688689 +- query: + - Georgia. + matches: + - 85688535 +- query: + - Nevada USA + matches: + - 85688531 +- query: + - PENNSYLVANIA + matches: + - 85688481 +- query: + - Virginia, USA. + matches: + - 85688747 +- query: + - Alabama, USA + matches: + - 85688675 +- query: + - Indiana + matches: + - 85688709 +- query: + - Louisiana, United States + matches: + - 85688735 +- query: + - New Mexico + matches: + - 85688493 +- query: + - Ohio USA + matches: + - 85688485 +- query: + - Nevada, USA + matches: + - 85688531 +- query: + - LOUISIANA + matches: + - 85688735 +- query: + - New Jersey, us + matches: + - 85688607 +- query: + - Nuketown + matches: [] + xfail: true +- query: + - Under Lefty's Skin + matches: [] + xfail: true +- query: + - oY’– 7/6/20 oY’– + matches: [] + xfail: true +- query: + - Palestine.Ramallah + matches: [] + xfail: true +- query: + - DONT REPOST MY ART ! carrd byf + matches: [] + xfail: true +- query: + - AJ&K, Pakistan. + matches: [] + xfail: true +- query: + - La Plana Alta + matches: [] + xfail: true +- query: + - Com o ChorAEo + matches: [] + xfail: true +- query: + - PLUTO + matches: [] + xfail: true +- query: + - All Over The WORLD + matches: [] + xfail: true +- query: + - /dev/null + matches: [] + xfail: true +- query: + - University of Bath + matches: [] + xfail: true +- query: + - Gaia + matches: [] + xfail: true +- query: + - Plc + matches: [] + xfail: true +- query: + - Jay cooke mn + matches: [] + xfail: true +- query: + - Toulouse - Perpignan + matches: [] + xfail: true +- query: + - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’ + matches: [] + xfail: true +- query: + - oYscoY + matches: [] + xfail: true +- query: + - sheher ecoe 20 + matches: [] + xfail: true +- query: + - pequitas de sunoo^^ + matches: [] + xfail: true +- query: + - West M + matches: [] + xfail: true +- query: + - Maputo city + matches: [] + xfail: true +- query: + - Entwined w peace + matches: [] + xfail: true +- query: + - DDDDDDDDDDDDDDD + matches: [] + xfail: true +- query: + - mons vaticanus, subterrane + matches: [] + xfail: true +- query: + - oYoO oyOY Oyoyo OY + matches: [] + xfail: true +- query: + - Loin du hood + matches: [] + xfail: true +- query: + - acab blm + matches: [] + xfail: true +- query: + - 010997 + matches: [] + xfail: true +- query: + - tidytuanzebeaETMs basement + matches: [] + xfail: true +- query: + - oYOyomasdEmr + matches: [] + xfail: true +- query: + - somewhere + matches: [] + xfail: true +- query: + - 330 + matches: [] + xfail: true +- query: + - oSUSdod + matches: [] + xfail: true +- query: + - she/her 16 isfp + matches: [] + xfail: true \ No newline at end of file From 6dab318f7d3a119fafa7773b2d2af70abe9ad58b Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sat, 19 Sep 2020 22:17:27 -0700 Subject: [PATCH 02/12] Revert "loading code with sqlite3 + new unit tests + new speed testing" This reverts commit 62ca65265569593b3affd51508f74b2f133f6ef5. --- Pipfile | 1 - litecoder/__init__.py | 4 +- litecoder/usa.py | 62 ++--- scripts/cities_to_yaml.py | 119 --------- scripts/speed_test.py | 123 --------- tests/prod_db/conftest.py | 4 +- tests/prod_db/test_us_city_index_2.yml | 340 ------------------------ tests/prod_db/test_us_state_index.py | 10 +- tests/prod_db/test_us_state_index_2.yml | 340 ------------------------ 9 files changed, 32 insertions(+), 971 deletions(-) delete mode 100644 scripts/cities_to_yaml.py delete mode 100644 scripts/speed_test.py delete mode 100644 tests/prod_db/test_us_city_index_2.yml delete mode 100644 tests/prod_db/test_us_state_index_2.yml diff --git a/Pipfile b/Pipfile index c0b76e4..b65227a 100644 --- a/Pipfile +++ b/Pipfile @@ -35,7 +35,6 @@ PyYAML = "*" Shapely = "*" numpy = "*" scipy = "*" -sqlitedict = "*" [dev-packages] diff --git a/litecoder/__init__.py b/litecoder/__init__.py index 9a01450..e67b0f9 100644 --- a/litecoder/__init__.py +++ b/litecoder/__init__.py @@ -9,9 +9,9 @@ DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.db') +US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.p') -US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.db') +US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.p') logging.basicConfig( diff --git a/litecoder/usa.py b/litecoder/usa.py index 749cdcd..d6ff15a 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -1,10 +1,8 @@ import re -import os -import hashlib -import struct -from sqlitedict import SqliteDict +import pickle + from tqdm import tqdm from collections import defaultdict from itertools import product @@ -194,16 +192,14 @@ def __repr__(self): class Index: - # Now that loading the database is instantenous, it is better to put it in the constructor over a - # separate load method - # @classmethod - # def load(cls, path): - # with open(path, 'rb') as fh: - # return pickle.load(fh) + @classmethod + def load(cls, path): + with open(path, 'rb') as fh: + return pickle.load(fh) - def __init__(self, path): - self._key_to_ids = SqliteDict(filename=path, tablename="keys") - self._id_to_loc = SqliteDict(filename=path, tablename="locations") + def __init__(self): + self._key_to_ids = defaultdict(set) + self._id_to_loc = dict() def __len__(self): return len(self._key_to_ids) @@ -215,47 +211,38 @@ def __repr__(self): len(self._id_to_loc), ) - def __getitem__(self, key): + def __getitem__(self, text): """Get ids, map to records only if there is a match in the index """ - # convert string to integer - hash = hashlib.md5(bytes(keyify(key), encoding="utf-8")).digest() - hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63) - if hashed_key not in self._key_to_ids: + if keyify(text) not in self._key_to_ids: return None - ids = self._key_to_ids[hashed_key] + ids = self._key_to_ids[keyify(text)] return [self._id_to_loc[id] for id in ids] def add_key(self, key, id): - # convert string to integer - hash = hashlib.md5(bytes(key, encoding="utf-8")).digest() - hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63) - if hashed_key not in self._key_to_ids: - self._key_to_ids[hashed_key] = set() - curr_ids = self._key_to_ids[hashed_key] - curr_ids.add(id) - self._key_to_ids[hashed_key] = curr_ids - self._key_to_ids.commit() - del curr_ids + self._key_to_ids[key].add(id) def add_location(self, id, location): self._id_to_loc[id] = location - self._id_to_loc.commit() def locations(self): return list(self._id_to_loc.values()) - def close(self): - self._key_to_ids.close() - self._id_to_loc.close() + def save(self, path): + with open(path, 'wb') as fh: + pickle.dump(self, fh) class USCityIndex(Index): + @classmethod + def load(cls, path=US_CITY_PATH): + return super().load(path) + def __init__(self, bare_name_blocklist=None): - super().__init__(path=US_CITY_PATH) + super().__init__() self.bare_name_blocklist = bare_name_blocklist def build(self): @@ -282,8 +269,9 @@ def build(self): class USStateIndex(Index): - def __init__(self): - super().__init__(path=US_STATE_PATH) + @classmethod + def load(cls, path=US_STATE_PATH): + return super().load(path) def build(self): """Index all US states. @@ -299,4 +287,4 @@ def build(self): self.add_key(key, row.wof_id) # ID -> state - self.add_location(row.wof_id, StateMatch(row)) \ No newline at end of file + self.add_location(row.wof_id, StateMatch(row)) diff --git a/scripts/cities_to_yaml.py b/scripts/cities_to_yaml.py deleted file mode 100644 index 113f145..0000000 --- a/scripts/cities_to_yaml.py +++ /dev/null @@ -1,119 +0,0 @@ -from litecoder.usa import USCityIndex, USStateIndex - -city_idx = USCityIndex() -state_idx = USStateIndex() - -states = """North Carolina, USA -District of Columbia -Illinois, United States -Georgia United States -north carolina -texas -iowa -Florida, United States -Vermont, USA -TX USA -FL U.S.A. -pennsylvania usa -nebraska - Oregon -Pennsylvania -New Hampshire USA -Nebraska, USA -New mexico -Indiana -South Dakota - Oklahoma -Ohio,US -Kansas, USA -indiana -MA, USA - New York -Ohio, United States -NJ USA -ohio usa -Connecticut, USA -MICHIGAN, United States -Missouri -New York -California - USA -Massachusetts, USA - Missouri -FL, United States of America -New Hampshire -Georgia. -Nevada USA - PENNSYLVANIA -Virginia, USA. -Alabama, USA -Indiana -Louisiana, United States -New Mexico -Ohio USA -Nevada, USA -LOUISIANA -New Jersey, us""".split("\n") - -cities = """Edinburg, Texas -Lakeville , Minnesota -Woodland, CA. -Gary, IN -Cornelius, NC -Okeechobee, Fl -Saginaw Township South, MI -Lansdowne, PA -Knoxville TN -OAKLAND, CA -suffolk va -Port Orange, FL -Sedona, AZ -Cedar City UT -Cincinnati. -Huntington Beach CA -Wooster,Ohio -Lewisville, Texas -traverse city mi -Pennsauken, New Jersey -Jonesboro, Arkansas -Zephyrhills, FL -West Jefferson, NC -Escondido, CA -Lumberton, NC -Cayce, SC -Stratford, Connecticut, USA -Avondale, AZ -Coral Springs, FL -Gaithersburg, MD -Westchester, IL -Louisa, Virginia -Norway, ME -Philadelphia PA, USA -Fort worth, tx -Eureka Springs, Arkansas -Nashville , TN -Ellenwood Ga -Floral Park, NY -Nashville Tennessee -Malvern, AR -Valdosta, Georgia -Valley Center Ca -St. Robert Mo. -Hollandale, MS -New Castle, PA -Harlem, FL -Kings Mills, OH -knoxville Tennessee -BrooklYn""".split("\n") -output = "" -for state in states: - if (len(state_idx[state]) == 0): - print(state) - wof_ids = [result.data.wof_id for result in state_idx[state]] - output += """- query: - - {} - matches:\n""".format(state) - for wof_id in wof_ids: - output += " - {}".format(wof_id) - output += "\n" -with open("output.yml", "w") as o_file: - o_file.write(output) diff --git a/scripts/speed_test.py b/scripts/speed_test.py deleted file mode 100644 index e7c3756..0000000 --- a/scripts/speed_test.py +++ /dev/null @@ -1,123 +0,0 @@ -from litecoder.usa import USCityIndex, USStateIndex -import time - -print("Loading USCityIndex... ", end="") -start_time = time.time() -city_idx = USCityIndex() -print("finished: {}s!".format(time.time() - start_time)) - -print("Loading USStateIndex... ", end="") -start_time = time.time() -state_idx = USStateIndex() -print("finished: {}s!".format(time.time() - start_time)) - -city_tests = """Edinburg, Texas -Lakeville , Minnesota -Woodland, CA. -Gary, IN -Cornelius, NC -Okeechobee, Fl -Saginaw Township South, MI -Lansdowne, PA -Knoxville TN -OAKLAND, CA -suffolk va -Port Orange, FL -Sedona, AZ -Cedar City UT -Cincinnati. -Huntington Beach CA -Wooster,Ohio -Lewisville, Texas -traverse city mi -Pennsauken, New Jersey -Jonesboro, Arkansas -Zephyrhills, FL -West Jefferson, NC -Escondido, CA -Lumberton, NC -Cayce, SC -Stratford, Connecticut, USA -Avondale, AZ -Coral Springs, FL -Gaithersburg, MD -Westchester, IL -Louisa, Virginia -Norway, ME -Philadelphia PA, USA -Fort worth, tx -Eureka Springs, Arkansas -Nashville , TN -Ellenwood Ga -Floral Park, NY -Nashville Tennessee -Malvern, AR -Valdosta, Georgia -Valley Center Ca -St. Robert Mo. -Hollandale, MS -New Castle, PA -Harlem, FL -Kings Mills, OH -knoxville Tennessee -BrooklYn""".split("\n") -print("measuring time for {} cities... ".format(len(city_tests)), end="") -start_time = time.time() -for city in city_tests: - x = city_idx[city] -print("finished: took {}s!".format(1000*(time.time() - start_time))) -state_tests = """North Carolina, USA -District of Columbia -Illinois, United States -Georgia United States -north carolina -texas -iowa -Florida, United States -Vermont, USA -TX USA -FL U.S.A. -pennsylvania usa -nebraska - Oregon -Pennsylvania -New Hampshire USA -Nebraska, USA -New mexico -Indiana -South Dakota - Oklahoma -Ohio,US -Kansas, USA -indiana -MA, USA - New York -Ohio, United States -NJ USA -ohio usa -Connecticut, USA -MICHIGAN, United States -Missouri -New York -California - USA -Massachusetts, USA - Missouri -FL, United States of America -New Hampshire -Georgia. -Nevada USA - PENNSYLVANIA -Virginia, USA. -Alabama, USA -Indiana -Louisiana, United States -New Mexico -Ohio USA -Nevada, USA -LOUISIANA -New Jersey, us""".split("\n") -print("measuring time for {} states... ".format(len(state_tests)), end="") -start_time = time.time() -for state in state_tests: - x = state_idx[state] -print("finished: took {}s!".format(1000 * (time.time() - start_time))) \ No newline at end of file diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py index 2a746d5..1005d12 100644 --- a/tests/prod_db/conftest.py +++ b/tests/prod_db/conftest.py @@ -7,9 +7,9 @@ @pytest.fixture(scope='session') def city_idx(): - return USCityIndex() + return USCityIndex.load() @pytest.fixture(scope='session') def state_idx(): - return USStateIndex() + return USStateIndex.load() diff --git a/tests/prod_db/test_us_city_index_2.yml b/tests/prod_db/test_us_city_index_2.yml deleted file mode 100644 index 89c2f34..0000000 --- a/tests/prod_db/test_us_city_index_2.yml +++ /dev/null @@ -1,340 +0,0 @@ -- query: - - Edinburg, Texas - matches: - - 101723563 -- query: - - Lakeville , Minnesota - matches: - - 85968479 -- query: - - Woodland, CA. - matches: - - 85922405 -- query: - - Gary, IN - matches: - - 85941813 -- query: - - Cornelius, NC - matches: - - 85981335 -- query: - - Okeechobee, Fl - matches: - - 85932281 -- query: - - Saginaw Township South, MI - matches: - - 1125767499 -- query: - - Lansdowne, PA - matches: - - 101718067 -- query: - - Knoxville TN - matches: - - 101722865 -- query: - - OAKLAND, CA - matches: - - 85921881 -- query: - - suffolk va - matches: - - 101728729 -- query: - - Port Orange, FL - matches: - - 85932629 -- query: - - Sedona, AZ - matches: - - 85917431 -- query: - - Cedar City UT - matches: - - 101727685 -- query: - - Cincinnati. - matches: - - 101712203 -- query: - - Huntington Beach CA - matches: - - 85923137 -- query: - - Wooster,Ohio - matches: - - 101712345 -- query: - - Lewisville, Texas - matches: - - 101724413 -- query: - - traverse city mi - matches: - - 85950881 -- query: - - Pennsauken, New Jersey - matches: - - 1125947935 -- query: - - Jonesboro, Arkansas - matches: - - 85920203 -- query: - - Zephyrhills, FL - matches: - - 85932233 -- query: - - West Jefferson, NC - matches: - - 85981009 -- query: - - Escondido, CA - matches: - - 85922263 -- query: - - Lumberton, NC - matches: - - 85981189 -- query: - - Cayce, SC - matches: - - 101720791 -- query: - - Stratford, Connecticut, USA - matches: - - 85930997 -- query: - - Avondale, AZ - matches: - - 85917553 -- query: - - Coral Springs, FL - matches: - - 85932415 -- query: - - Gaithersburg, MD - matches: - - 85949491 -- query: - - Westchester, IL - matches: - - 85940923 -- query: - - Louisa, Virginia - matches: - - 101728949 -- query: - - Norway, ME - matches: - - 85948973 -- query: - - Philadelphia PA, USA - matches: - - 101718083 -- query: - - Fort worth, tx - matches: - - 101724443 -- query: - - Eureka Springs, Arkansas - matches: - - 85919765 -- query: - - Nashville , TN - matches: - - 101723183 -- query: - - Ellenwood Ga - matches: - - 1126054897 -- query: - - Floral Park, NY - matches: - - 85977689 -- query: - - Nashville Tennessee - matches: - - 101723183 -- query: - - Malvern, AR - matches: - - 85920689 -- query: - - Valdosta, Georgia - matches: - - 85936921 -- query: - - Valley Center Ca - matches: - - 85925201 -- query: - - St. Robert Mo. - matches: - - 85971093 -- query: - - Hollandale, MS - matches: - - 85969835 -- query: - - New Castle, PA - matches: - - 101716721 -- query: - - Harlem, FL - matches: - - 85934341 -- query: - - Kings Mills, OH - matches: - - 101713989 -- query: - - knoxville Tennessee - matches: - - 101722865 -- query: - - BrooklYn - matches: - - 85977539 -- query: - - Nuketown - matches: [] - xfail: true -- query: - - Under Lefty's Skin - matches: [] - xfail: true -- query: - - oY’– 7/6/20 oY’– - matches: [] - xfail: true -- query: - - Palestine.Ramallah - matches: [] - xfail: true -- query: - - DONT REPOST MY ART ! carrd byf - matches: [] - xfail: true -- query: - - AJ&K, Pakistan. - matches: [] - xfail: true -- query: - - La Plana Alta - matches: [] - xfail: true -- query: - - Com o ChorAEo - matches: [] - xfail: true -- query: - - PLUTO - matches: [] - xfail: true -- query: - - All Over The WORLD - matches: [] - xfail: true -- query: - - /dev/null - matches: [] - xfail: true -- query: - - University of Bath - matches: [] - xfail: true -- query: - - Gaia - matches: [] - xfail: true -- query: - - Plc - matches: [] - xfail: true -- query: - - Jay cooke mn - matches: [] - xfail: true -- query: - - Toulouse - Perpignan - matches: [] - xfail: true -- query: - - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’ - matches: [] - xfail: true -- query: - - oYscoY - matches: [] - xfail: true -- query: - - sheher ecoe 20 - matches: [] - xfail: true -- query: - - pequitas de sunoo^^ - matches: [] - xfail: true -- query: - - West M - matches: [] - xfail: true -- query: - - Maputo city - matches: [] - xfail: true -- query: - - Entwined w peace - matches: [] - xfail: true -- query: - - DDDDDDDDDDDDDDD - matches: [] - xfail: true -- query: - - mons vaticanus, subterrane - matches: [] - xfail: true -- query: - - oYoO oyOY Oyoyo OY - matches: [] - xfail: true -- query: - - Loin du hood - matches: [] - xfail: true -- query: - - acab blm - matches: [] - xfail: true -- query: - - 010997 - matches: [] - xfail: true -- query: - - tidytuanzebeaETMs basement - matches: [] - xfail: true -- query: - - oYOyomasdEmr - matches: [] - xfail: true -- query: - - somewhere - matches: [] - xfail: true -- query: - - 330 - matches: [] - xfail: true -- query: - - oSUSdod - matches: [] - xfail: true -- query: - - she/her 16 isfp - matches: [] - xfail: true \ No newline at end of file diff --git a/tests/prod_db/test_us_state_index.py b/tests/prod_db/test_us_state_index.py index c1353e9..9adafc6 100644 --- a/tests/prod_db/test_us_state_index.py +++ b/tests/prod_db/test_us_state_index.py @@ -16,19 +16,15 @@ def yield_cases(): queries = group['query'] - xfail = group.get('xfail', False) - if type(queries) is str: queries = [queries] for query in queries: - yield query, group['matches'], xfail + yield query, group['matches'] -@pytest.mark.parametrize('query,matches,xfail', yield_cases()) -def test_cases(state_idx, query, matches, xfail): - if xfail: - pytest.xfail() +@pytest.mark.parametrize('query,matches', yield_cases()) +def test_cases(state_idx, query, matches): res = state_idx[query] diff --git a/tests/prod_db/test_us_state_index_2.yml b/tests/prod_db/test_us_state_index_2.yml deleted file mode 100644 index c0eedb2..0000000 --- a/tests/prod_db/test_us_state_index_2.yml +++ /dev/null @@ -1,340 +0,0 @@ -- query: - - North Carolina, USA - matches: - - 85688773 -- query: - - District of Columbia - matches: - - 85688741 -- query: - - Illinois, United States - matches: - - 85688697 -- query: - - Georgia United States - matches: - - 85688535 -- query: - - north carolina - matches: - - 85688773 -- query: - - texas - matches: - - 85688753 -- query: - - iowa - matches: - - 85688713 -- query: - - Florida, United States - matches: - - 85688651 -- query: - - Vermont, USA - matches: - - 85688763 -- query: - - TX USA - matches: - - 85688753 -- query: - - FL U.S.A. - matches: - - 85688651 -- query: - - pennsylvania usa - matches: - - 85688481 -- query: - - nebraska - matches: - - 85688563 -- query: - - Oregon - matches: - - 85688513 -- query: - - Pennsylvania - matches: - - 85688481 -- query: - - New Hampshire USA - matches: - - 85688689 -- query: - - Nebraska, USA - matches: - - 85688563 -- query: - - New mexico - matches: - - 85688493 -- query: - - Indiana - matches: - - 85688709 -- query: - - South Dakota - matches: - - 85688693 -- query: - - Oklahoma - matches: - - 85688585 -- query: - - Ohio,US - matches: - - 85688485 -- query: - - Kansas, USA - matches: - - 85688555 -- query: - - indiana - matches: - - 85688709 -- query: - - MA, USA - matches: - - 85688645 -- query: - - New York - matches: - - 85688543 -- query: - - Ohio, United States - matches: - - 85688485 -- query: - - NJ USA - matches: - - 85688607 -- query: - - ohio usa - matches: - - 85688485 -- query: - - Connecticut, USA - matches: - - 85688629 -- query: - - MICHIGAN, United States - matches: - - 85688599 -- query: - - Missouri - matches: - - 85688661 -- query: - - New York - matches: - - 85688543 -- query: - - California - USA - matches: - - 85688637 -- query: - - Massachusetts, USA - matches: - - 85688645 -- query: - - Missouri - matches: - - 85688661 -- query: - - FL, United States of America - matches: - - 85688651 -- query: - - New Hampshire - matches: - - 85688689 -- query: - - Georgia. - matches: - - 85688535 -- query: - - Nevada USA - matches: - - 85688531 -- query: - - PENNSYLVANIA - matches: - - 85688481 -- query: - - Virginia, USA. - matches: - - 85688747 -- query: - - Alabama, USA - matches: - - 85688675 -- query: - - Indiana - matches: - - 85688709 -- query: - - Louisiana, United States - matches: - - 85688735 -- query: - - New Mexico - matches: - - 85688493 -- query: - - Ohio USA - matches: - - 85688485 -- query: - - Nevada, USA - matches: - - 85688531 -- query: - - LOUISIANA - matches: - - 85688735 -- query: - - New Jersey, us - matches: - - 85688607 -- query: - - Nuketown - matches: [] - xfail: true -- query: - - Under Lefty's Skin - matches: [] - xfail: true -- query: - - oY’– 7/6/20 oY’– - matches: [] - xfail: true -- query: - - Palestine.Ramallah - matches: [] - xfail: true -- query: - - DONT REPOST MY ART ! carrd byf - matches: [] - xfail: true -- query: - - AJ&K, Pakistan. - matches: [] - xfail: true -- query: - - La Plana Alta - matches: [] - xfail: true -- query: - - Com o ChorAEo - matches: [] - xfail: true -- query: - - PLUTO - matches: [] - xfail: true -- query: - - All Over The WORLD - matches: [] - xfail: true -- query: - - /dev/null - matches: [] - xfail: true -- query: - - University of Bath - matches: [] - xfail: true -- query: - - Gaia - matches: [] - xfail: true -- query: - - Plc - matches: [] - xfail: true -- query: - - Jay cooke mn - matches: [] - xfail: true -- query: - - Toulouse - Perpignan - matches: [] - xfail: true -- query: - - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’ - matches: [] - xfail: true -- query: - - oYscoY - matches: [] - xfail: true -- query: - - sheher ecoe 20 - matches: [] - xfail: true -- query: - - pequitas de sunoo^^ - matches: [] - xfail: true -- query: - - West M - matches: [] - xfail: true -- query: - - Maputo city - matches: [] - xfail: true -- query: - - Entwined w peace - matches: [] - xfail: true -- query: - - DDDDDDDDDDDDDDD - matches: [] - xfail: true -- query: - - mons vaticanus, subterrane - matches: [] - xfail: true -- query: - - oYoO oyOY Oyoyo OY - matches: [] - xfail: true -- query: - - Loin du hood - matches: [] - xfail: true -- query: - - acab blm - matches: [] - xfail: true -- query: - - 010997 - matches: [] - xfail: true -- query: - - tidytuanzebeaETMs basement - matches: [] - xfail: true -- query: - - oYOyomasdEmr - matches: [] - xfail: true -- query: - - somewhere - matches: [] - xfail: true -- query: - - 330 - matches: [] - xfail: true -- query: - - oSUSdod - matches: [] - xfail: true -- query: - - she/her 16 isfp - matches: [] - xfail: true \ No newline at end of file From 0380cd880cd2127345f73e67e41d2d80a6282e31 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sun, 20 Sep 2020 00:04:21 -0700 Subject: [PATCH 03/12] two level dict => marisa trie --- litecoder/usa.py | 61 ++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/litecoder/usa.py b/litecoder/usa.py index d6ff15a..8ef2345 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -1,6 +1,7 @@ import re +import marisa_trie import pickle from tqdm import tqdm @@ -192,14 +193,13 @@ def __repr__(self): class Index: - @classmethod - def load(cls, path): - with open(path, 'rb') as fh: - return pickle.load(fh) + def load(self, key_to_ids_path, id_to_loc_path): + self._key_to_ids.load(key_to_ids_path) + self._id_to_loc.load(id_to_loc_path) def __init__(self): - self._key_to_ids = defaultdict(set) - self._id_to_loc = dict() + self._key_to_ids = marisa_trie.BytesTrie() + self._id_to_loc = marisa_trie.BytesTrie() def __len__(self): return len(self._key_to_ids) @@ -214,33 +214,30 @@ def __repr__(self): def __getitem__(self, text): """Get ids, map to records only if there is a match in the index """ - if keyify(text) not in self._key_to_ids: + normalized_key = keyify(text) + if normalized_key not in self._key_to_ids: return None - ids = self._key_to_ids[keyify(text)] + ids = pickle.loads(self._key_to_ids[normalized_key][0]) - return [self._id_to_loc[id] for id in ids] + return [pickle.loads(self._id_to_loc[id][0]) for id in ids] - def add_key(self, key, id): - self._key_to_ids[key].add(id) + # def add_key(self, key, id): + # self._key_to_ids[key].add(id) - def add_location(self, id, location): - self._id_to_loc[id] = location + # def add_location(self, id, location): + # self._id_to_loc[id] = location def locations(self): return list(self._id_to_loc.values()) - def save(self, path): - with open(path, 'wb') as fh: - pickle.dump(self, fh) + def save(self, key_to_ids_path, id_to_loc_path): + self._key_to_ids.save(key_to_ids_path) + self._id_to_loc.save(id_to_loc_path) class USCityIndex(Index): - @classmethod - def load(cls, path=US_CITY_PATH): - return super().load(path) - def __init__(self, bare_name_blocklist=None): super().__init__() self.bare_name_blocklist = bare_name_blocklist @@ -257,22 +254,24 @@ def build(self): logger.info('Indexing US cities.') + key_to_ids = defaultdict(set) + id_to_loc = dict() + for row in tqdm(cities): # Key -> id(s) for key in map(keyify, iter_keys(row)): - self.add_key(key, row.wof_id) + key_to_ids[key].add(str(row.wof_id)) # ID -> city - self.add_location(row.wof_id, CityMatch(row)) + id_to_loc[str(row.wof_id)] = pickle.dumps(CityMatch(row)) + + self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]) + self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items()) class USStateIndex(Index): - @classmethod - def load(cls, path=US_STATE_PATH): - return super().load(path) - def build(self): """Index all US states. """ @@ -280,11 +279,17 @@ def build(self): logger.info('Indexing US states.') + key_to_ids = defaultdict(set) + id_to_loc = dict() + for row in tqdm(states): # Key -> id(s) for key in map(keyify, state_key_iter(row)): - self.add_key(key, row.wof_id) + key_to_ids[key].add(str(row.wof_id)) # ID -> state - self.add_location(row.wof_id, StateMatch(row)) + id_to_loc[str(row.wof_id)] = pickle.dumps(StateMatch(row)) + + self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]) + self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items()) From 9bc934d1c6037a692fcea40c3b59d851aa969b73 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sun, 27 Sep 2020 14:56:40 -0700 Subject: [PATCH 04/12] combine all trie files into one --- litecoder/__init__.py | 4 +- litecoder/usa.py | 64 ++++++++++++------- speed_test.py | 127 ++++++++++++++++++++++++++++++++++++++ tests/prod_db/conftest.py | 8 ++- 4 files changed, 174 insertions(+), 29 deletions(-) create mode 100644 speed_test.py diff --git a/litecoder/__init__.py b/litecoder/__init__.py index e67b0f9..52ad2ed 100644 --- a/litecoder/__init__.py +++ b/litecoder/__init__.py @@ -9,9 +9,7 @@ DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.p') - -US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.p') +DATA_PATH = os.path.join(DATA_DIR, 'trie.marisa') logging.basicConfig( diff --git a/litecoder/usa.py b/litecoder/usa.py index 8ef2345..21d7a57 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -12,7 +12,7 @@ from sqlalchemy.inspection import inspect -from . import logger, US_CITY_PATH, US_STATE_PATH +from . import logger, DATA_PATH from .models import WOFRegion, WOFLocality @@ -193,34 +193,39 @@ def __repr__(self): class Index: - def load(self, key_to_ids_path, id_to_loc_path): - self._key_to_ids.load(key_to_ids_path) - self._id_to_loc.load(id_to_loc_path) + # city keys -> ids = A + # city ids -> loc = B + # state keys -> ids = C + # state ids -> loc = D - def __init__(self): - self._key_to_ids = marisa_trie.BytesTrie() - self._id_to_loc = marisa_trie.BytesTrie() + def load(self, trie_path=DATA_PATH): + self._trie.load(trie_path) + + def __init__(self, keys_key, ids_key): + self._trie = marisa_trie.BytesTrie() + self._keys_key = keys_key + self._ids_key = ids_key def __len__(self): - return len(self._key_to_ids) + return len(self._trie.keys(self._keys_key)) def __repr__(self): return '%s<%d keys, %d entities>' % ( self.__class__.__name__, - len(self._key_to_ids), - len(self._id_to_loc), + len(self._trie.keys(self._keys_key)), + len(self._trie.keys(self._ids_key)), ) def __getitem__(self, text): """Get ids, map to records only if there is a match in the index """ - normalized_key = keyify(text) - if normalized_key not in self._key_to_ids: + normalized_key = self._keys_key + keyify(text) + if normalized_key not in self._trie: return None - ids = pickle.loads(self._key_to_ids[normalized_key][0]) + ids = pickle.loads(self._trie[normalized_key][0]) - return [pickle.loads(self._id_to_loc[id][0]) for id in ids] + return [pickle.loads(self._trie[self._ids_key + id][0]) for id in ids] # def add_key(self, key, id): # self._key_to_ids[key].add(id) @@ -231,20 +236,20 @@ def __getitem__(self, text): def locations(self): return list(self._id_to_loc.values()) - def save(self, key_to_ids_path, id_to_loc_path): - self._key_to_ids.save(key_to_ids_path) - self._id_to_loc.save(id_to_loc_path) + def save(self, path): + self._trie.save(path) class USCityIndex(Index): def __init__(self, bare_name_blocklist=None): - super().__init__() + super().__init__(u"A", u"B") self.bare_name_blocklist = bare_name_blocklist def build(self): """Index all US cities. """ + allow_bare = AllowBareCityName(blocklist=self.bare_name_blocklist) iter_keys = CityKeyIter(allow_bare) @@ -264,14 +269,21 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> city - id_to_loc[str(row.wof_id)] = pickle.dumps(CityMatch(row)) + id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(CityMatch(row)) + + # In case the loaded trie already has states data + previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))] + key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] + id_to_loc_trie_data = list(id_to_loc.items()) - self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]) - self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items()) + self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data) class USStateIndex(Index): + def __init__(self): + super().__init__(u"C", u"D") + def build(self): """Index all US states. """ @@ -289,7 +301,11 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> state - id_to_loc[str(row.wof_id)] = pickle.dumps(StateMatch(row)) + id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(StateMatch(row)) + + # In case the loaded trie already has states data + previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))] + key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] + id_to_loc_trie_data = list(id_to_loc.items()) - self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]) - self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items()) + self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data) \ No newline at end of file diff --git a/speed_test.py b/speed_test.py new file mode 100644 index 0000000..aa43c75 --- /dev/null +++ b/speed_test.py @@ -0,0 +1,127 @@ +from litecoder.usa import USCityIndex, USStateIndex +import time + +print("Loading USCityIndex... ", end="") +start_time = time.time() +city_idx = USCityIndex() +city_idx.load() +print("finished: {}s!".format(time.time() - start_time)) + +print("Loading USStateIndex... ", end="") +start_time = time.time() +state_idx = USStateIndex() +state_idx.load() +print("finished: {}s!".format(time.time() - start_time)) + +city_tests = """Edinburg, Texas +Lakeville , Minnesota +Woodland, CA. +Gary, IN +Cornelius, NC +Okeechobee, Fl +Saginaw Township South, MI +Lansdowne, PA +Knoxville TN +OAKLAND, CA +suffolk va +Port Orange, FL +Sedona, AZ +Cedar City UT +Cincinnati. +Huntington Beach CA +Wooster,Ohio +Lewisville, Texas +traverse city mi +Pennsauken, New Jersey +Jonesboro, Arkansas +Zephyrhills, FL +West Jefferson, NC +Escondido, CA +Lumberton, NC +Cayce, SC +Stratford, Connecticut, USA +Avondale, AZ +Coral Springs, FL +Gaithersburg, MD +Westchester, IL +Louisa, Virginia +Norway, ME +Philadelphia PA, USA +Fort worth, tx +Eureka Springs, Arkansas +Nashville , TN +Ellenwood Ga +Floral Park, NY +Nashville Tennessee +Malvern, AR +Valdosta, Georgia +Valley Center Ca +St. Robert Mo. +Hollandale, MS +New Castle, PA +Harlem, FL +Kings Mills, OH +knoxville Tennessee +BrooklYn""".split("\n") +for x in range (8): + city_tests += city_tests +print("measuring time for {} cities... ".format(len(city_tests)), end="") +start_time = time.time() +for city in city_tests: + x = city_idx[city] +print("finished: took {}s!".format(1000*(time.time() - start_time))) +state_tests = """North Carolina, USA +District of Columbia +Illinois, United States +Georgia United States +north carolina +texas +iowa +Florida, United States +Vermont, USA +TX USA +FL U.S.A. +pennsylvania usa +nebraska + Oregon +Pennsylvania +New Hampshire USA +Nebraska, USA +New mexico +Indiana +South Dakota + Oklahoma +Ohio,US +Kansas, USA +indiana +MA, USA + New York +Ohio, United States +NJ USA +ohio usa +Connecticut, USA +MICHIGAN, United States +Missouri +New York +California - USA +Massachusetts, USA + Missouri +FL, United States of America +New Hampshire +Georgia. +Nevada USA + PENNSYLVANIA +Virginia, USA. +Alabama, USA +Indiana +Louisiana, United States +New Mexico +Ohio USA +Nevada, USA +LOUISIANA +New Jersey, us""".split("\n") +print("measuring time for {} states... ".format(len(state_tests)), end="") +start_time = time.time() +for state in state_tests: + x = state_idx[state] +print("finished: took {}s!".format(1000 * (time.time() - start_time))) diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py index 1005d12..a9420a6 100644 --- a/tests/prod_db/conftest.py +++ b/tests/prod_db/conftest.py @@ -7,9 +7,13 @@ @pytest.fixture(scope='session') def city_idx(): - return USCityIndex.load() + city_idx = USCityIndex() + city_idx.load() + return city_idx @pytest.fixture(scope='session') def state_idx(): - return USStateIndex.load() + state_idx = USStateIndex() + state_idx.load() + return state_idx \ No newline at end of file From 19e557f441466f69d54a8db71a2fa047cf4c4a77 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sun, 27 Sep 2020 15:06:21 -0700 Subject: [PATCH 05/12] bug fix --- litecoder/usa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/litecoder/usa.py b/litecoder/usa.py index 21d7a57..12d1f1c 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -234,7 +234,7 @@ def __getitem__(self, text): # self._id_to_loc[id] = location def locations(self): - return list(self._id_to_loc.values()) + return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_key)] def save(self, path): self._trie.save(path) From 59a46b36c44324e66c54911702d4a7b5a17b7120 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Wed, 30 Sep 2020 17:19:44 -0700 Subject: [PATCH 06/12] separate tries per index --- litecoder/__init__.py | 4 +++- litecoder/usa.py | 53 +++++++++++++++++++++---------------------- speed_test.py | 4 ++-- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/litecoder/__init__.py b/litecoder/__init__.py index 52ad2ed..ec7ce0a 100644 --- a/litecoder/__init__.py +++ b/litecoder/__init__.py @@ -9,7 +9,9 @@ DATA_DIR = os.path.join(os.path.dirname(__file__), 'data') -DATA_PATH = os.path.join(DATA_DIR, 'trie.marisa') +US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.marisa') + +US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.marisa') logging.basicConfig( diff --git a/litecoder/usa.py b/litecoder/usa.py index 12d1f1c..ef325fe 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -12,7 +12,7 @@ from sqlalchemy.inspection import inspect -from . import logger, DATA_PATH +from . import logger, US_CITY_PATH, US_STATE_PATH from .models import WOFRegion, WOFLocality @@ -198,34 +198,34 @@ class Index: # state keys -> ids = C # state ids -> loc = D - def load(self, trie_path=DATA_PATH): - self._trie.load(trie_path) + def load(self, path): + self._trie.load(path) - def __init__(self, keys_key, ids_key): + def __init__(self): self._trie = marisa_trie.BytesTrie() - self._keys_key = keys_key - self._ids_key = ids_key + self._keys_prefix = "A" + self._ids_prefix = "B" def __len__(self): - return len(self._trie.keys(self._keys_key)) + return len(self._trie.keys(self._keys_prefix)) def __repr__(self): return '%s<%d keys, %d entities>' % ( self.__class__.__name__, - len(self._trie.keys(self._keys_key)), - len(self._trie.keys(self._ids_key)), + len(self._trie.keys(self._keys_prefix)), + len(self._trie.keys(self._ids_prefix)), ) def __getitem__(self, text): """Get ids, map to records only if there is a match in the index """ - normalized_key = self._keys_key + keyify(text) + normalized_key = self._keys_prefix + keyify(text) if normalized_key not in self._trie: return None ids = pickle.loads(self._trie[normalized_key][0]) - return [pickle.loads(self._trie[self._ids_key + id][0]) for id in ids] + return [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids] # def add_key(self, key, id): # self._key_to_ids[key].add(id) @@ -234,7 +234,7 @@ def __getitem__(self, text): # self._id_to_loc[id] = location def locations(self): - return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_key)] + return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_prefix)] def save(self, path): self._trie.save(path) @@ -242,8 +242,11 @@ def save(self, path): class USCityIndex(Index): + def load(self, path=US_CITY_PATH): + return super().load(path) + def __init__(self, bare_name_blocklist=None): - super().__init__(u"A", u"B") + super().__init__() self.bare_name_blocklist = bare_name_blocklist def build(self): @@ -269,20 +272,18 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> city - id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(CityMatch(row)) + id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row)) - # In case the loaded trie already has states data - previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))] - key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] - id_to_loc_trie_data = list(id_to_loc.items()) + key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] + id_to_loc_data = list(id_to_loc.items()) - self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data) + self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data) class USStateIndex(Index): - def __init__(self): - super().__init__(u"C", u"D") + def load(self, path=US_STATE_PATH): + return super().load(path) def build(self): """Index all US states. @@ -301,11 +302,9 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> state - id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(StateMatch(row)) + id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row)) - # In case the loaded trie already has states data - previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))] - key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] - id_to_loc_trie_data = list(id_to_loc.items()) + key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] + id_to_loc_data = list(id_to_loc.items()) - self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data) \ No newline at end of file + self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data) \ No newline at end of file diff --git a/speed_test.py b/speed_test.py index aa43c75..5f8b435 100644 --- a/speed_test.py +++ b/speed_test.py @@ -63,8 +63,8 @@ Kings Mills, OH knoxville Tennessee BrooklYn""".split("\n") -for x in range (8): - city_tests += city_tests +# for x in range (10): +# city_tests += city_tests print("measuring time for {} cities... ".format(len(city_tests)), end="") start_time = time.time() for city in city_tests: From 81d3bb6d9b6223f7bdd47af9d2eecccc0aecf43d Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sat, 3 Oct 2020 21:54:56 -0700 Subject: [PATCH 07/12] some speed optimizations --- litecoder/__init__.py | 2 +- litecoder/usa.py | 22 +++++++++++++--------- speed_test.py | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/litecoder/__init__.py b/litecoder/__init__.py index ec7ce0a..d3249e5 100644 --- a/litecoder/__init__.py +++ b/litecoder/__init__.py @@ -11,7 +11,7 @@ US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.marisa') -US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.marisa') +US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities-ujson.marisa') logging.basicConfig( diff --git a/litecoder/usa.py b/litecoder/usa.py index ef325fe..4908231 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -2,20 +2,20 @@ import re import marisa_trie -import pickle +import _pickle as pickle +import ujson from tqdm import tqdm from collections import defaultdict from itertools import product from cached_property import cached_property from box import Box +import gc from sqlalchemy.inspection import inspect from . import logger, US_CITY_PATH, US_STATE_PATH from .models import WOFRegion, WOFLocality - - # TODO: Country alt-names YAML. USA_NAMES = ( 'USA', @@ -199,6 +199,7 @@ class Index: # state ids -> loc = D def load(self, path): + print(path) self._trie.load(path) def __init__(self): @@ -223,9 +224,12 @@ def __getitem__(self, text): if normalized_key not in self._trie: return None - ids = pickle.loads(self._trie[normalized_key][0]) + ids = ujson.loads(self._trie[normalized_key][0]) - return [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids] + gc.disable() + z= [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids] + gc.enable() + return z # def add_key(self, key, id): # self._key_to_ids[key].add(id) @@ -272,9 +276,9 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> city - id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row)) + id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row), protocol=-1) - key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] + key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] id_to_loc_data = list(id_to_loc.items()) self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data) @@ -302,9 +306,9 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> state - id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row)) + id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row), protocol=-1) - key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids] + key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] id_to_loc_data = list(id_to_loc.items()) self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data) \ No newline at end of file diff --git a/speed_test.py b/speed_test.py index 5f8b435..74e026b 100644 --- a/speed_test.py +++ b/speed_test.py @@ -63,8 +63,8 @@ Kings Mills, OH knoxville Tennessee BrooklYn""".split("\n") -# for x in range (10): -# city_tests += city_tests +for x in range (5): + city_tests += city_tests print("measuring time for {} cities... ".format(len(city_tests)), end="") start_time = time.time() for city in city_tests: From 4963a3f9bbce0ca02bd4c1d8a2025dc2b946fb65 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Sun, 25 Oct 2020 16:50:57 -0700 Subject: [PATCH 08/12] use raw json --- litecoder/__init__.py | 2 +- litecoder/usa.py | 70 +++++++------------------------------------ 2 files changed, 12 insertions(+), 60 deletions(-) diff --git a/litecoder/__init__.py b/litecoder/__init__.py index d3249e5..ec7ce0a 100644 --- a/litecoder/__init__.py +++ b/litecoder/__init__.py @@ -11,7 +11,7 @@ US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.marisa') -US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities-ujson.marisa') +US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.marisa') logging.basicConfig( diff --git a/litecoder/usa.py b/litecoder/usa.py index 4908231..758bee0 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -3,7 +3,7 @@ import re import marisa_trie import _pickle as pickle -import ujson +import ujson as json from tqdm import tqdm from collections import defaultdict @@ -148,49 +148,6 @@ def state_key_iter(row): yield ' '.join((abbr, usa)) -class Match: - - def __init__(self, row): - """Set model class, PK, metadata. - """ - state = inspect(row) - - # Don't store the actual row, so we can serialize. - self._model_cls = state.class_ - self._pk = state.identity - - self.data = Box(dict(row)) - - @cached_property - def db_row(self): - """Hydrate database row, lazily. - """ - return self._model_cls.query.get(self._pk) - - -class CityMatch(Match): - - def __repr__(self): - return '%s<%s, %s, %s, wof:%d>' % ( - self.__class__.__name__, - self.data.name, - self.data.name_a1, - self.data.name_a0, - self.data.wof_id, - ) - - -class StateMatch(Match): - - def __repr__(self): - return '%s<%s, %s, wof:%d>' % ( - self.__class__.__name__, - self.data.name, - self.data.name_a0, - self.data.wof_id, - ) - - class Index: # city keys -> ids = A @@ -224,12 +181,9 @@ def __getitem__(self, text): if normalized_key not in self._trie: return None - ids = ujson.loads(self._trie[normalized_key][0]) + ids = json.loads(self._trie[normalized_key][0]) - gc.disable() - z= [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids] - gc.enable() - return z + return [json.loads(self._trie[self._ids_prefix + id][0]) for id in ids] # def add_key(self, key, id): # self._key_to_ids[key].add(id) @@ -267,7 +221,7 @@ def build(self): logger.info('Indexing US cities.') key_to_ids = defaultdict(set) - id_to_loc = dict() + id_to_loc_items = list() for row in tqdm(cities): @@ -276,12 +230,11 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> city - id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row), protocol=-1) + id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8"))) - key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] - id_to_loc_data = list(id_to_loc.items()) + key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] - self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data) + self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items) class USStateIndex(Index): @@ -297,7 +250,7 @@ def build(self): logger.info('Indexing US states.') key_to_ids = defaultdict(set) - id_to_loc = dict() + id_to_loc_items = list() for row in tqdm(states): @@ -306,9 +259,8 @@ def build(self): key_to_ids[key].add(str(row.wof_id)) # ID -> state - id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row), protocol=-1) + id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8"))) - key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] - id_to_loc_data = list(id_to_loc.items()) + key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] - self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data) \ No newline at end of file + self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items) \ No newline at end of file From 6eacada3c0eac767596188079fdd6306c0284453 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Wed, 28 Oct 2020 11:56:07 -0700 Subject: [PATCH 09/12] remove unused imports --- litecoder/usa.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/litecoder/usa.py b/litecoder/usa.py index 758bee0..a0b5561 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -2,7 +2,6 @@ import re import marisa_trie -import _pickle as pickle import ujson as json from tqdm import tqdm @@ -10,7 +9,6 @@ from itertools import product from cached_property import cached_property from box import Box -import gc from sqlalchemy.inspection import inspect From cfff9b2c2c369835969ff2f894aef36ec16440bd Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Wed, 28 Oct 2020 17:09:22 -0700 Subject: [PATCH 10/12] cleanup --- litecoder/usa.py | 11 +-- speed_test.py | 127 --------------------------- tests/runtime/concurrency_test.py | 45 ++++++++++ tests/runtime/speed_test.py | 44 ++++++++++ tests/runtime/test_city_lookups.txt | 50 +++++++++++ tests/runtime/test_state_lookups.txt | 50 +++++++++++ 6 files changed, 192 insertions(+), 135 deletions(-) delete mode 100644 speed_test.py create mode 100644 tests/runtime/concurrency_test.py create mode 100644 tests/runtime/speed_test.py create mode 100644 tests/runtime/test_city_lookups.txt create mode 100644 tests/runtime/test_state_lookups.txt diff --git a/litecoder/usa.py b/litecoder/usa.py index a0b5561..0afabf4 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -14,6 +14,8 @@ from . import logger, US_CITY_PATH, US_STATE_PATH from .models import WOFRegion, WOFLocality + + # TODO: Country alt-names YAML. USA_NAMES = ( 'USA', @@ -154,7 +156,6 @@ class Index: # state ids -> loc = D def load(self, path): - print(path) self._trie.load(path) def __init__(self): @@ -183,12 +184,6 @@ def __getitem__(self, text): return [json.loads(self._trie[self._ids_prefix + id][0]) for id in ids] - # def add_key(self, key, id): - # self._key_to_ids[key].add(id) - - # def add_location(self, id, location): - # self._id_to_loc[id] = location - def locations(self): return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_prefix)] @@ -261,4 +256,4 @@ def build(self): key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] - self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items) \ No newline at end of file + self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items) diff --git a/speed_test.py b/speed_test.py deleted file mode 100644 index 74e026b..0000000 --- a/speed_test.py +++ /dev/null @@ -1,127 +0,0 @@ -from litecoder.usa import USCityIndex, USStateIndex -import time - -print("Loading USCityIndex... ", end="") -start_time = time.time() -city_idx = USCityIndex() -city_idx.load() -print("finished: {}s!".format(time.time() - start_time)) - -print("Loading USStateIndex... ", end="") -start_time = time.time() -state_idx = USStateIndex() -state_idx.load() -print("finished: {}s!".format(time.time() - start_time)) - -city_tests = """Edinburg, Texas -Lakeville , Minnesota -Woodland, CA. -Gary, IN -Cornelius, NC -Okeechobee, Fl -Saginaw Township South, MI -Lansdowne, PA -Knoxville TN -OAKLAND, CA -suffolk va -Port Orange, FL -Sedona, AZ -Cedar City UT -Cincinnati. -Huntington Beach CA -Wooster,Ohio -Lewisville, Texas -traverse city mi -Pennsauken, New Jersey -Jonesboro, Arkansas -Zephyrhills, FL -West Jefferson, NC -Escondido, CA -Lumberton, NC -Cayce, SC -Stratford, Connecticut, USA -Avondale, AZ -Coral Springs, FL -Gaithersburg, MD -Westchester, IL -Louisa, Virginia -Norway, ME -Philadelphia PA, USA -Fort worth, tx -Eureka Springs, Arkansas -Nashville , TN -Ellenwood Ga -Floral Park, NY -Nashville Tennessee -Malvern, AR -Valdosta, Georgia -Valley Center Ca -St. Robert Mo. -Hollandale, MS -New Castle, PA -Harlem, FL -Kings Mills, OH -knoxville Tennessee -BrooklYn""".split("\n") -for x in range (5): - city_tests += city_tests -print("measuring time for {} cities... ".format(len(city_tests)), end="") -start_time = time.time() -for city in city_tests: - x = city_idx[city] -print("finished: took {}s!".format(1000*(time.time() - start_time))) -state_tests = """North Carolina, USA -District of Columbia -Illinois, United States -Georgia United States -north carolina -texas -iowa -Florida, United States -Vermont, USA -TX USA -FL U.S.A. -pennsylvania usa -nebraska - Oregon -Pennsylvania -New Hampshire USA -Nebraska, USA -New mexico -Indiana -South Dakota - Oklahoma -Ohio,US -Kansas, USA -indiana -MA, USA - New York -Ohio, United States -NJ USA -ohio usa -Connecticut, USA -MICHIGAN, United States -Missouri -New York -California - USA -Massachusetts, USA - Missouri -FL, United States of America -New Hampshire -Georgia. -Nevada USA - PENNSYLVANIA -Virginia, USA. -Alabama, USA -Indiana -Louisiana, United States -New Mexico -Ohio USA -Nevada, USA -LOUISIANA -New Jersey, us""".split("\n") -print("measuring time for {} states... ".format(len(state_tests)), end="") -start_time = time.time() -for state in state_tests: - x = state_idx[state] -print("finished: took {}s!".format(1000 * (time.time() - start_time))) diff --git a/tests/runtime/concurrency_test.py b/tests/runtime/concurrency_test.py new file mode 100644 index 0000000..6adc618 --- /dev/null +++ b/tests/runtime/concurrency_test.py @@ -0,0 +1,45 @@ +from multiprocessing import Pool +from litecoder.usa import USCityIndex, USStateIndex +import time + +NUM_PROCESSES = 4 + +# Load 50 test city lookups +with open("tests/runtime/test_city_lookups.txt", "r") as lookups_file: + city_tests = lookups_file.read().splitlines() + +# Increase the number of lookups for the speed test if necessary +for x in range (10): + city_tests += city_tests +num_tests_per_process = len(city_tests) +num_tests = NUM_PROCESSES * num_tests_per_process + +# Load USCityIndex +city_idx = USCityIndex() +city_idx.load() + + +def lookup_cities(process_num): + print ('Process {}: looking up {} cities'.format(process_num, num_tests_per_process)) + start_time = time.time() + for city in city_tests: + city_idx[city] + ms = 1000*(time.time() - start_time) + print("Process {}: finished, took {}ms @ {} ms/lookup!".format(process_num, ms, float(ms/num_tests_per_process))) + +if __name__ == '__main__': + print("Looking up {} cities on {} processes...".format(num_tests, NUM_PROCESSES)) + start_time = time.time() + with Pool(5) as p: + p.map(lookup_cities, range(1, NUM_PROCESSES+1)) + ms = 1000*(time.time() - start_time) + print("Fully finished: took {}ms @ {} ms/lookup!".format(ms, float(ms/num_tests))) + + print() + print("Looking up all {} cities on one process...".format(num_tests), end="") + start_time = time.time() + for i in range(NUM_PROCESSES): + for city in city_tests: + city_idx[city] + ms = 1000*(time.time() - start_time) + print("finished: took {}ms @ {} ms/lookup!".format(ms, ms/num_tests)) \ No newline at end of file diff --git a/tests/runtime/speed_test.py b/tests/runtime/speed_test.py new file mode 100644 index 0000000..de226d9 --- /dev/null +++ b/tests/runtime/speed_test.py @@ -0,0 +1,44 @@ +from litecoder.usa import USCityIndex, USStateIndex +import time + +print("Loading USCityIndex... ", end="") +start_time = time.time() +city_idx = USCityIndex() +city_idx.load() +print("finished: {}s!".format(time.time() - start_time)) + +# Load 50 test city lookups +with open("tests/runtime/test_city_lookups.txt", "r") as lookups_file: + city_tests = lookups_file.read().splitlines() + +# Increase the number of lookups for the speed test if necessary +for x in range (5): + city_tests += city_tests +num_tests = len(city_tests) +print("measuring time for {} cities... ".format(num_tests), end="") +start_time = time.time() +for city in city_tests: + city_idx[city] +ms = 1000*(time.time() - start_time) +print("finished: took {}ms at {} ms/lookup!".format(ms, float(ms/num_tests))) + +print("Loading USStateIndex... ", end="") +start_time = time.time() +state_idx = USStateIndex() +state_idx.load() +print("finished: {}s!".format(time.time() - start_time)) + +# Load 50 test state lookups +with open("tests/runtime/test_state_lookups.txt", "r") as lookups_file: + state_tests = lookups_file.read().splitlines() + +# Increase the number of lookups for the speed test if necessary +for x in range (5): + state_tests += state_tests +num_tests = len(state_tests) +print("measuring time for {} states... ".format(num_tests), end="") +start_time = time.time() +for state in state_tests: + state_idx[state] +ms = 1000*(time.time() - start_time) +print("finished: took {}ms at {} ms/lookup!".format(ms, float(ms/num_tests))) diff --git a/tests/runtime/test_city_lookups.txt b/tests/runtime/test_city_lookups.txt new file mode 100644 index 0000000..43d1bd5 --- /dev/null +++ b/tests/runtime/test_city_lookups.txt @@ -0,0 +1,50 @@ +Edinburg, Texas +Lakeville , Minnesota +Woodland, CA. +Gary, IN +Cornelius, NC +Okeechobee, Fl +Saginaw Township South, MI +Lansdowne, PA +Knoxville TN +OAKLAND, CA +suffolk va +Port Orange, FL +Sedona, AZ +Cedar City UT +Cincinnati. +Huntington Beach CA +Wooster,Ohio +Lewisville, Texas +traverse city mi +Pennsauken, New Jersey +Jonesboro, Arkansas +Zephyrhills, FL +West Jefferson, NC +Escondido, CA +Lumberton, NC +Cayce, SC +Stratford, Connecticut, USA +Avondale, AZ +Coral Springs, FL +Gaithersburg, MD +Westchester, IL +Louisa, Virginia +Norway, ME +Philadelphia PA, USA +Fort worth, tx +Eureka Springs, Arkansas +Nashville , TN +Ellenwood Ga +Floral Park, NY +Nashville Tennessee +Malvern, AR +Valdosta, Georgia +Valley Center Ca +St. Robert Mo. +Hollandale, MS +New Castle, PA +Harlem, FL +Kings Mills, OH +knoxville Tennessee +BrooklYn \ No newline at end of file diff --git a/tests/runtime/test_state_lookups.txt b/tests/runtime/test_state_lookups.txt new file mode 100644 index 0000000..fd8b3c9 --- /dev/null +++ b/tests/runtime/test_state_lookups.txt @@ -0,0 +1,50 @@ +North Carolina, USA +District of Columbia +Illinois, United States +Georgia United States +north carolina +texas +iowa +Florida, United States +Vermont, USA +TX USA +FL U.S.A. +pennsylvania usa +nebraska + Oregon +Pennsylvania +New Hampshire USA +Nebraska, USA +New mexico +Indiana +South Dakota + Oklahoma +Ohio,US +Kansas, USA +indiana +MA, USA + New York +Ohio, United States +NJ USA +ohio usa +Connecticut, USA +MICHIGAN, United States +Missouri +New York +California - USA +Massachusetts, USA + Missouri +FL, United States of America +New Hampshire +Georgia. +Nevada USA + PENNSYLVANIA +Virginia, USA. +Alabama, USA +Indiana +Louisiana, United States +New Mexico +Ohio USA +Nevada, USA +LOUISIANA +New Jersey, us \ No newline at end of file From a2f49156f7c98fcbef254ec2081a94517a2cdfd6 Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Wed, 28 Oct 2020 17:12:44 -0700 Subject: [PATCH 11/12] Update tests to match new match output format --- tests/prod_db/conftest.py | 2 +- tests/prod_db/test_us_city_index.py | 4 ++-- tests/prod_db/test_us_state_index.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py index a9420a6..06583e3 100644 --- a/tests/prod_db/conftest.py +++ b/tests/prod_db/conftest.py @@ -16,4 +16,4 @@ def city_idx(): def state_idx(): state_idx = USStateIndex() state_idx.load() - return state_idx \ No newline at end of file + return state_idx diff --git a/tests/prod_db/test_us_city_index.py b/tests/prod_db/test_us_city_index.py index aec57f0..e05cfa2 100644 --- a/tests/prod_db/test_us_city_index.py +++ b/tests/prod_db/test_us_city_index.py @@ -33,7 +33,7 @@ def test_cases(city_idx, query, matches, xfail): res = city_idx[query] - ids = [r.data.wof_id for r in res] + ids = [r["wof_id"] for r in res] # Exact id list match. assert sorted(ids) == sorted(matches) @@ -49,6 +49,6 @@ def test_topn(city_idx, city): """Smoke test N most populous cities. """ res = city_idx['%s, %s' % (city.name, city.name_a1)] - res_ids = [r.data.wof_id for r in res] + res_ids = [r["wof_id"] for r in res] assert city.wof_id in res_ids diff --git a/tests/prod_db/test_us_state_index.py b/tests/prod_db/test_us_state_index.py index 9adafc6..2ae1561 100644 --- a/tests/prod_db/test_us_state_index.py +++ b/tests/prod_db/test_us_state_index.py @@ -28,7 +28,7 @@ def test_cases(state_idx, query, matches): res = state_idx[query] - ids = [r.data.wof_id for r in res] + ids = [r["wof_id"] for r in res] assert sorted(ids) == sorted(matches) @@ -41,6 +41,6 @@ def test_all(state_idx, state): """Smoke test N most populous cities. """ res = state_idx[state.name] - res_ids = [r.data.wof_id for r in res] + res_ids = [r["wof_id"] for r in res] assert state.wof_id in res_ids From 0ad83aece64cb1c3f96aad6f4b97dbf6cf6bf94f Mon Sep 17 00:00:00 2001 From: Sheshank Shankar Date: Wed, 18 Nov 2020 14:50:06 -0800 Subject: [PATCH 12/12] Resolve merge request reviews --- Pipfile | 1 + litecoder/usa.py | 34 +++++++++++++++++----------------- tests/runtime/speed_test.py | 4 ++-- 3 files changed, 20 insertions(+), 19 deletions(-) diff --git a/Pipfile b/Pipfile index b65227a..576163e 100644 --- a/Pipfile +++ b/Pipfile @@ -35,6 +35,7 @@ PyYAML = "*" Shapely = "*" numpy = "*" scipy = "*" +marisa_trie = "*" [dev-packages] diff --git a/litecoder/usa.py b/litecoder/usa.py index 0afabf4..362d485 100644 --- a/litecoder/usa.py +++ b/litecoder/usa.py @@ -150,16 +150,16 @@ def state_key_iter(row): class Index: - # city keys -> ids = A - # city ids -> loc = B - # state keys -> ids = C - # state ids -> loc = D - - def load(self, path): - self._trie.load(path) + def load(self, path, mmap=False): + if mmap: + self._trie.mmap(path) + else: + self._trie.load(path) def __init__(self): self._trie = marisa_trie.BytesTrie() + + # We use prefixes here to store the keys -> ids and ids -> loc "maps" as subtrees in one marisa trie. self._keys_prefix = "A" self._ids_prefix = "B" @@ -177,15 +177,15 @@ def __getitem__(self, text): """Get ids, map to records only if there is a match in the index """ normalized_key = self._keys_prefix + keyify(text) - if normalized_key not in self._trie: + val = self._trie.get(normalized_key, None) + if not val: return None - - ids = json.loads(self._trie[normalized_key][0]) + ids = json.loads(val[0]) return [json.loads(self._trie[self._ids_prefix + id][0]) for id in ids] def locations(self): - return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_prefix)] + return self._trie.items(self._ids_prefix) def save(self, path): self._trie.save(path) @@ -193,8 +193,8 @@ def save(self, path): class USCityIndex(Index): - def load(self, path=US_CITY_PATH): - return super().load(path) + def load(self, path=US_CITY_PATH, mmap=False): + return super().load(path, mmap) def __init__(self, bare_name_blocklist=None): super().__init__() @@ -225,15 +225,15 @@ def build(self): # ID -> city id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8"))) - key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] + key_to_ids_items = [(self._keys_prefix + key, json.dumps(list(key_to_ids[key])).encode("utf-8")) for key in key_to_ids] self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items) class USStateIndex(Index): - def load(self, path=US_STATE_PATH): - return super().load(path) + def load(self, path=US_STATE_PATH, mmap=False): + return super().load(path, mmap) def build(self): """Index all US states. @@ -254,6 +254,6 @@ def build(self): # ID -> state id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8"))) - key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids] + key_to_ids_items = [(self._keys_prefix + key, json.dumps(list(key_to_ids[key])).encode("utf-8")) for key in key_to_ids] self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items) diff --git a/tests/runtime/speed_test.py b/tests/runtime/speed_test.py index de226d9..17a7092 100644 --- a/tests/runtime/speed_test.py +++ b/tests/runtime/speed_test.py @@ -8,7 +8,7 @@ print("finished: {}s!".format(time.time() - start_time)) # Load 50 test city lookups -with open("tests/runtime/test_city_lookups.txt", "r") as lookups_file: +with open("test_city_lookups.txt", "r") as lookups_file: city_tests = lookups_file.read().splitlines() # Increase the number of lookups for the speed test if necessary @@ -29,7 +29,7 @@ print("finished: {}s!".format(time.time() - start_time)) # Load 50 test state lookups -with open("tests/runtime/test_state_lookups.txt", "r") as lookups_file: +with open("test_state_lookups.txt", "r") as lookups_file: state_tests = lookups_file.read().splitlines() # Increase the number of lookups for the speed test if necessary