From 62ca65265569593b3affd51508f74b2f133f6ef5 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sat, 19 Sep 2020 22:13:13 -0700
Subject: [PATCH 01/12] loading code with sqlite3 + new unit tests + new speed
 testing

---
 Pipfile                                 |   1 +
 litecoder/__init__.py                   |   4 +-
 litecoder/usa.py                        |  62 +++--
 scripts/cities_to_yaml.py               | 119 +++++++++
 scripts/speed_test.py                   | 123 +++++++++
 tests/prod_db/conftest.py               |   4 +-
 tests/prod_db/test_us_city_index_2.yml  | 340 ++++++++++++++++++++++++
 tests/prod_db/test_us_state_index.py    |  10 +-
 tests/prod_db/test_us_state_index_2.yml | 340 ++++++++++++++++++++++++
 9 files changed, 971 insertions(+), 32 deletions(-)
 create mode 100644 scripts/cities_to_yaml.py
 create mode 100644 scripts/speed_test.py
 create mode 100644 tests/prod_db/test_us_city_index_2.yml
 create mode 100644 tests/prod_db/test_us_state_index_2.yml

diff --git a/Pipfile b/Pipfile
index b65227a..c0b76e4 100644
--- a/Pipfile
+++ b/Pipfile
@@ -35,6 +35,7 @@ PyYAML = "*"
 Shapely = "*"
 numpy = "*"
 scipy = "*"
+sqlitedict = "*"
 
 [dev-packages]
 
diff --git a/litecoder/__init__.py b/litecoder/__init__.py
index e67b0f9..9a01450 100644
--- a/litecoder/__init__.py
+++ b/litecoder/__init__.py
@@ -9,9 +9,9 @@
 
 DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
 
-US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.p')
+US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.db')
 
-US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.p')
+US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.db')
 
 
 logging.basicConfig(
diff --git a/litecoder/usa.py b/litecoder/usa.py
index d6ff15a..749cdcd 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -1,8 +1,10 @@
 
 
 import re
-import pickle
-
+import os
+import hashlib
+import struct
+from sqlitedict import SqliteDict
 from tqdm import tqdm
 from collections import defaultdict
 from itertools import product
@@ -192,14 +194,16 @@ def __repr__(self):
 
 class Index:
 
-    @classmethod
-    def load(cls, path):
-        with open(path, 'rb') as fh:
-            return pickle.load(fh)
+    # Now that loading the database is instantenous, it is better to put it in the constructor over a
+    #   separate load method
+    # @classmethod
+    # def load(cls, path):
+    #     with open(path, 'rb') as fh:
+    #         return pickle.load(fh)
 
-    def __init__(self):
-        self._key_to_ids = defaultdict(set)
-        self._id_to_loc = dict()
+    def __init__(self, path):
+        self._key_to_ids = SqliteDict(filename=path, tablename="keys")
+        self._id_to_loc = SqliteDict(filename=path, tablename="locations")
 
     def __len__(self):
         return len(self._key_to_ids)
@@ -211,38 +215,47 @@ def __repr__(self):
             len(self._id_to_loc),
         )
 
-    def __getitem__(self, text):
+    def __getitem__(self, key):
         """Get ids, map to records only if there is a match in the index
         """
-        if keyify(text) not in self._key_to_ids:
+        # convert string to integer
+        hash = hashlib.md5(bytes(keyify(key), encoding="utf-8")).digest()
+        hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63)
+        if hashed_key not in self._key_to_ids:
             return None
 
-        ids = self._key_to_ids[keyify(text)]
+        ids = self._key_to_ids[hashed_key]
 
         return [self._id_to_loc[id] for id in ids]
 
     def add_key(self, key, id):
-        self._key_to_ids[key].add(id)
+        # convert string to integer
+        hash = hashlib.md5(bytes(key, encoding="utf-8")).digest()
+        hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63)
+        if hashed_key not in self._key_to_ids:
+            self._key_to_ids[hashed_key] = set()
+        curr_ids = self._key_to_ids[hashed_key]
+        curr_ids.add(id)
+        self._key_to_ids[hashed_key] = curr_ids
+        self._key_to_ids.commit()
+        del curr_ids
 
     def add_location(self, id, location):
         self._id_to_loc[id] = location
+        self._id_to_loc.commit()
 
     def locations(self):
         return list(self._id_to_loc.values())
 
-    def save(self, path):
-        with open(path, 'wb') as fh:
-            pickle.dump(self, fh)
+    def close(self):
+        self._key_to_ids.close()
+        self._id_to_loc.close()
 
 
 class USCityIndex(Index):
 
-    @classmethod
-    def load(cls, path=US_CITY_PATH):
-        return super().load(path)
-
     def __init__(self, bare_name_blocklist=None):
-        super().__init__()
+        super().__init__(path=US_CITY_PATH)
         self.bare_name_blocklist = bare_name_blocklist
 
     def build(self):
@@ -269,9 +282,8 @@ def build(self):
 
 class USStateIndex(Index):
 
-    @classmethod
-    def load(cls, path=US_STATE_PATH):
-        return super().load(path)
+    def __init__(self):
+        super().__init__(path=US_STATE_PATH)
 
     def build(self):
         """Index all US states.
@@ -287,4 +299,4 @@ def build(self):
                 self.add_key(key, row.wof_id)
 
             # ID -> state
-            self.add_location(row.wof_id, StateMatch(row))
+            self.add_location(row.wof_id, StateMatch(row))
\ No newline at end of file
diff --git a/scripts/cities_to_yaml.py b/scripts/cities_to_yaml.py
new file mode 100644
index 0000000..113f145
--- /dev/null
+++ b/scripts/cities_to_yaml.py
@@ -0,0 +1,119 @@
+from litecoder.usa import USCityIndex, USStateIndex
+
+city_idx = USCityIndex()
+state_idx = USStateIndex()
+
+states = """North Carolina, USA   
+District of Columbia
+Illinois, United States
+Georgia United States 
+north carolina
+texas
+iowa
+Florida, United States
+Vermont,  USA
+TX USA 
+FL U.S.A.
+pennsylvania usa
+nebraska 
+ Oregon 
+Pennsylvania 
+New Hampshire  USA
+Nebraska, USA
+New mexico
+Indiana    
+South Dakota 
+ Oklahoma
+Ohio,US
+Kansas, USA 
+indiana
+MA, USA
+ New York
+Ohio, United States
+NJ USA
+ohio usa
+Connecticut, USA
+MICHIGAN, United States
+Missouri
+New York
+California - USA
+Massachusetts, USA 
+ Missouri
+FL, United States of America
+New Hampshire
+Georgia.
+Nevada USA
+ PENNSYLVANIA
+Virginia, USA.
+Alabama, USA
+Indiana 
+Louisiana, United States
+New Mexico 
+Ohio USA
+Nevada, USA
+LOUISIANA
+New Jersey, us""".split("\n")
+
+cities = """Edinburg, Texas
+Lakeville , Minnesota
+Woodland, CA.
+Gary, IN
+Cornelius, NC
+Okeechobee, Fl
+Saginaw Township South, MI
+Lansdowne, PA
+Knoxville TN
+OAKLAND, CA
+suffolk va
+Port Orange, FL
+Sedona, AZ
+Cedar City UT 
+Cincinnati. 
+Huntington Beach CA
+Wooster,Ohio
+Lewisville, Texas
+traverse city mi
+Pennsauken, New Jersey
+Jonesboro, Arkansas
+Zephyrhills, FL
+West Jefferson, NC
+Escondido, CA 
+Lumberton, NC
+Cayce, SC
+Stratford, Connecticut, USA
+Avondale, AZ
+Coral Springs, FL 
+Gaithersburg, MD
+Westchester, IL
+Louisa, Virginia 
+Norway, ME
+Philadelphia PA, USA
+Fort worth, tx
+Eureka Springs, Arkansas
+Nashville , TN
+Ellenwood Ga
+Floral Park, NY
+Nashville Tennessee
+Malvern, AR
+Valdosta, Georgia
+Valley Center Ca
+St. Robert Mo. 
+Hollandale, MS
+New Castle, PA 
+Harlem, FL
+Kings Mills, OH
+knoxville Tennessee
+BrooklYn""".split("\n")
+output = ""
+for state in states:
+	if (len(state_idx[state]) == 0):
+		print(state)
+	wof_ids = [result.data.wof_id for result in state_idx[state]]
+	output += """- query:
+    - {}
+  matches:\n""".format(state)
+	for wof_id in wof_ids:
+		output += "    - {}".format(wof_id)
+	output += "\n"
+with open("output.yml", "w") as o_file:
+	o_file.write(output)
diff --git a/scripts/speed_test.py b/scripts/speed_test.py
new file mode 100644
index 0000000..e7c3756
--- /dev/null
+++ b/scripts/speed_test.py
@@ -0,0 +1,123 @@
+from litecoder.usa import USCityIndex, USStateIndex
+import time
+
+print("Loading USCityIndex... ", end="")
+start_time = time.time()
+city_idx = USCityIndex()
+print("finished: {}s!".format(time.time() - start_time))
+
+print("Loading USStateIndex... ", end="")
+start_time = time.time()
+state_idx = USStateIndex()
+print("finished: {}s!".format(time.time() - start_time))
+
+city_tests = """Edinburg, Texas
+Lakeville , Minnesota
+Woodland, CA.
+Gary, IN
+Cornelius, NC
+Okeechobee, Fl
+Saginaw Township South, MI
+Lansdowne, PA
+Knoxville TN
+OAKLAND, CA
+suffolk va
+Port Orange, FL
+Sedona, AZ
+Cedar City UT 
+Cincinnati. 
+Huntington Beach CA
+Wooster,Ohio
+Lewisville, Texas
+traverse city mi
+Pennsauken, New Jersey
+Jonesboro, Arkansas
+Zephyrhills, FL
+West Jefferson, NC
+Escondido, CA 
+Lumberton, NC
+Cayce, SC
+Stratford, Connecticut, USA
+Avondale, AZ
+Coral Springs, FL 
+Gaithersburg, MD
+Westchester, IL
+Louisa, Virginia 
+Norway, ME
+Philadelphia PA, USA
+Fort worth, tx
+Eureka Springs, Arkansas
+Nashville , TN
+Ellenwood Ga
+Floral Park, NY
+Nashville Tennessee
+Malvern, AR
+Valdosta, Georgia
+Valley Center Ca
+St. Robert Mo. 
+Hollandale, MS
+New Castle, PA 
+Harlem, FL
+Kings Mills, OH
+knoxville Tennessee
+BrooklYn""".split("\n")
+print("measuring time for {} cities... ".format(len(city_tests)), end="")
+start_time = time.time()
+for city in city_tests:
+    x = city_idx[city]
+print("finished: took {}s!".format(1000*(time.time() - start_time)))
+state_tests = """North Carolina, USA   
+District of Columbia
+Illinois, United States
+Georgia United States 
+north carolina
+texas
+iowa
+Florida, United States
+Vermont,  USA
+TX USA 
+FL U.S.A.
+pennsylvania usa
+nebraska 
+ Oregon 
+Pennsylvania 
+New Hampshire  USA
+Nebraska, USA
+New mexico
+Indiana    
+South Dakota 
+ Oklahoma
+Ohio,US
+Kansas, USA 
+indiana
+MA, USA
+ New York
+Ohio, United States
+NJ USA
+ohio usa
+Connecticut, USA
+MICHIGAN, United States
+Missouri
+New York
+California - USA
+Massachusetts, USA 
+ Missouri
+FL, United States of America
+New Hampshire
+Georgia.
+Nevada USA
+ PENNSYLVANIA
+Virginia, USA.
+Alabama, USA
+Indiana 
+Louisiana, United States
+New Mexico 
+Ohio USA
+Nevada, USA
+LOUISIANA
+New Jersey, us""".split("\n")
+print("measuring time for {} states... ".format(len(state_tests)), end="")
+start_time = time.time()
+for state in state_tests:
+    x = state_idx[state]
+print("finished: took {}s!".format(1000 * (time.time() - start_time)))
\ No newline at end of file
diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py
index 1005d12..2a746d5 100644
--- a/tests/prod_db/conftest.py
+++ b/tests/prod_db/conftest.py
@@ -7,9 +7,9 @@
 
 @pytest.fixture(scope='session')
 def city_idx():
-    return USCityIndex.load()
+    return USCityIndex()
 
 
 @pytest.fixture(scope='session')
 def state_idx():
-    return USStateIndex.load()
+    return USStateIndex()
diff --git a/tests/prod_db/test_us_city_index_2.yml b/tests/prod_db/test_us_city_index_2.yml
new file mode 100644
index 0000000..89c2f34
--- /dev/null
+++ b/tests/prod_db/test_us_city_index_2.yml
@@ -0,0 +1,340 @@
+- query:
+    - Edinburg, Texas
+  matches:
+    - 101723563
+- query:
+    - Lakeville , Minnesota
+  matches:
+    - 85968479
+- query:
+    - Woodland, CA.
+  matches:
+    - 85922405
+- query:
+    - Gary, IN
+  matches:
+    - 85941813
+- query:
+    - Cornelius, NC
+  matches:
+    - 85981335
+- query:
+    - Okeechobee, Fl
+  matches:
+    - 85932281
+- query:
+    - Saginaw Township South, MI
+  matches:
+    - 1125767499
+- query:
+    - Lansdowne, PA
+  matches:
+    - 101718067
+- query:
+    - Knoxville TN
+  matches:
+    - 101722865
+- query:
+    - OAKLAND, CA
+  matches:
+    - 85921881
+- query:
+    - suffolk va
+  matches:
+    - 101728729
+- query:
+    - Port Orange, FL
+  matches:
+    - 85932629
+- query:
+    - Sedona, AZ
+  matches:
+    - 85917431
+- query:
+    - Cedar City UT 
+  matches:
+    - 101727685
+- query:
+    - Cincinnati. 
+  matches:
+    - 101712203
+- query:
+    - Huntington Beach CA
+  matches:
+    - 85923137
+- query:
+    - Wooster,Ohio
+  matches:
+    - 101712345
+- query:
+    - Lewisville, Texas
+  matches:
+    - 101724413
+- query:
+    - traverse city mi
+  matches:
+    - 85950881
+- query:
+    - Pennsauken, New Jersey
+  matches:
+    - 1125947935
+- query:
+    - Jonesboro, Arkansas
+  matches:
+    - 85920203
+- query:
+    - Zephyrhills, FL
+  matches:
+    - 85932233
+- query:
+    - West Jefferson, NC
+  matches:
+    - 85981009
+- query:
+    - Escondido, CA 
+  matches:
+    - 85922263
+- query:
+    - Lumberton, NC
+  matches:
+    - 85981189
+- query:
+    - Cayce, SC
+  matches:
+    - 101720791
+- query:
+    - Stratford, Connecticut, USA
+  matches:
+    - 85930997
+- query:
+    - Avondale, AZ
+  matches:
+    - 85917553
+- query:
+    - Coral Springs, FL 
+  matches:
+    - 85932415
+- query:
+    - Gaithersburg, MD
+  matches:
+    - 85949491
+- query:
+    - Westchester, IL
+  matches:
+    - 85940923
+- query:
+    - Louisa, Virginia 
+  matches:
+    - 101728949
+- query:
+    - Norway, ME
+  matches:
+    - 85948973
+- query:
+    - Philadelphia PA, USA
+  matches:
+    - 101718083
+- query:
+    - Fort worth, tx
+  matches:
+    - 101724443
+- query:
+    - Eureka Springs, Arkansas
+  matches:
+    - 85919765
+- query:
+    - Nashville , TN
+  matches:
+    - 101723183
+- query:
+    - Ellenwood Ga
+  matches:
+    - 1126054897
+- query:
+    - Floral Park, NY
+  matches:
+    - 85977689
+- query:
+    - Nashville Tennessee
+  matches:
+    - 101723183
+- query:
+    - Malvern, AR
+  matches:
+    - 85920689
+- query:
+    - Valdosta, Georgia
+  matches:
+    - 85936921
+- query:
+    - Valley Center Ca
+  matches:
+    - 85925201
+- query:
+    - St. Robert Mo. 
+  matches:
+    - 85971093
+- query:
+    - Hollandale, MS
+  matches:
+    - 85969835
+- query:
+    - New Castle, PA 
+  matches:
+    - 101716721
+- query:
+    - Harlem, FL
+  matches:
+    - 85934341
+- query:
+    - Kings Mills, OH
+  matches:
+    - 101713989
+- query:
+    - knoxville Tennessee
+  matches:
+    - 101722865
+- query:
+    - BrooklYn
+  matches:
+    - 85977539
+- query: 
+    - Nuketown
+  matches: []
+  xfail: true
+- query: 
+    - Under Lefty's Skin
+  matches: []
+  xfail: true
+- query: 
+    - oY’– 7/6/20 oY’–
+  matches: []
+  xfail: true
+- query: 
+    - Palestine.Ramallah
+  matches: []
+  xfail: true
+- query: 
+    - DONT REPOST MY ART ! carrd byf
+  matches: []
+  xfail: true
+- query: 
+    - AJ&K, Pakistan.
+  matches: []
+  xfail: true
+- query: 
+    - La Plana Alta 
+  matches: []
+  xfail: true
+- query: 
+    - Com o ChorAEo
+  matches: []
+  xfail: true
+- query: 
+    - PLUTO
+  matches: []
+  xfail: true
+- query: 
+    - All Over The WORLD
+  matches: []
+  xfail: true
+- query: 
+    - /dev/null
+  matches: []
+  xfail: true
+- query: 
+    - University of Bath
+  matches: []
+  xfail: true
+- query: 
+    - Gaia
+  matches: []
+  xfail: true
+- query: 
+    - Plc
+  matches: []
+  xfail: true
+- query: 
+    - Jay cooke mn
+  matches: []
+  xfail: true
+- query: 
+    - Toulouse - Perpignan 
+  matches: []
+  xfail: true
+- query: 
+    - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’
+  matches: []
+  xfail: true
+- query: 
+    - oYscoY
+  matches: []
+  xfail: true
+- query: 
+    - sheher ecoe 20
+  matches: []
+  xfail: true
+- query: 
+    - pequitas de sunoo^^
+  matches: []
+  xfail: true
+- query: 
+    - West M
+  matches: []
+  xfail: true
+- query: 
+    - Maputo city
+  matches: []
+  xfail: true
+- query: 
+    - Entwined w peace 
+  matches: []
+  xfail: true
+- query: 
+    - DDDDDDDDDDDDDDD
+  matches: []
+  xfail: true
+- query: 
+    - mons vaticanus, subterrane
+  matches: []
+  xfail: true
+- query: 
+    - oYoO oyOY Oyoyo OY
+  matches: []
+  xfail: true
+- query: 
+    - Loin du hood 
+  matches: []
+  xfail: true
+- query: 
+    - acab blm
+  matches: []
+  xfail: true
+- query: 
+    - 010997
+  matches: []
+  xfail: true
+- query: 
+    - tidytuanzebeaETMs basement
+  matches: []
+  xfail: true
+- query: 
+    - oYOyomasdEmr
+  matches: []
+  xfail: true
+- query: 
+    - somewhere
+  matches: []
+  xfail: true
+- query: 
+    - 330
+  matches: []
+  xfail: true
+- query: 
+    - oSUSdod
+  matches: []
+  xfail: true
+- query: 
+    - she/her 16 isfp
+  matches: []
+  xfail: true
\ No newline at end of file
diff --git a/tests/prod_db/test_us_state_index.py b/tests/prod_db/test_us_state_index.py
index 9adafc6..c1353e9 100644
--- a/tests/prod_db/test_us_state_index.py
+++ b/tests/prod_db/test_us_state_index.py
@@ -16,15 +16,19 @@ def yield_cases():
 
         queries = group['query']
 
+        xfail = group.get('xfail', False)
+
         if type(queries) is str:
             queries = [queries]
 
         for query in queries:
-            yield query, group['matches']
+            yield query, group['matches'], xfail
 
 
-@pytest.mark.parametrize('query,matches', yield_cases())
-def test_cases(state_idx, query, matches):
+@pytest.mark.parametrize('query,matches,xfail', yield_cases())
+def test_cases(state_idx, query, matches, xfail):
+    if xfail:
+        pytest.xfail()
 
     res = state_idx[query]
 
diff --git a/tests/prod_db/test_us_state_index_2.yml b/tests/prod_db/test_us_state_index_2.yml
new file mode 100644
index 0000000..c0eedb2
--- /dev/null
+++ b/tests/prod_db/test_us_state_index_2.yml
@@ -0,0 +1,340 @@
+- query:
+    - North Carolina, USA   
+  matches:
+    - 85688773
+- query:
+    - District of Columbia
+  matches:
+    - 85688741
+- query:
+    - Illinois, United States
+  matches:
+    - 85688697
+- query:
+    - Georgia United States 
+  matches:
+    - 85688535
+- query:
+    - north carolina
+  matches:
+    - 85688773
+- query:
+    - texas
+  matches:
+    - 85688753
+- query:
+    - iowa
+  matches:
+    - 85688713
+- query:
+    - Florida, United States
+  matches:
+    - 85688651
+- query:
+    - Vermont,  USA
+  matches:
+    - 85688763
+- query:
+    - TX USA 
+  matches:
+    - 85688753
+- query:
+    - FL U.S.A.
+  matches:
+    - 85688651
+- query:
+    - pennsylvania usa
+  matches:
+    - 85688481
+- query:
+    - nebraska 
+  matches:
+    - 85688563
+- query:
+    -  Oregon 
+  matches:
+    - 85688513
+- query:
+    - Pennsylvania 
+  matches:
+    - 85688481
+- query:
+    - New Hampshire  USA
+  matches:
+    - 85688689
+- query:
+    - Nebraska, USA
+  matches:
+    - 85688563
+- query:
+    - New mexico
+  matches:
+    - 85688493
+- query:
+    - Indiana    
+  matches:
+    - 85688709
+- query:
+    - South Dakota 
+  matches:
+    - 85688693
+- query:
+    -  Oklahoma
+  matches:
+    - 85688585
+- query:
+    - Ohio,US
+  matches:
+    - 85688485
+- query:
+    - Kansas, USA 
+  matches:
+    - 85688555
+- query:
+    - indiana
+  matches:
+    - 85688709
+- query:
+    - MA, USA
+  matches:
+    - 85688645
+- query:
+    -  New York
+  matches:
+    - 85688543
+- query:
+    - Ohio, United States
+  matches:
+    - 85688485
+- query:
+    - NJ USA
+  matches:
+    - 85688607
+- query:
+    - ohio usa
+  matches:
+    - 85688485
+- query:
+    - Connecticut, USA
+  matches:
+    - 85688629
+- query:
+    - MICHIGAN, United States
+  matches:
+    - 85688599
+- query:
+    - Missouri
+  matches:
+    - 85688661
+- query:
+    - New York
+  matches:
+    - 85688543
+- query:
+    - California - USA
+  matches:
+    - 85688637
+- query:
+    - Massachusetts, USA 
+  matches:
+    - 85688645
+- query:
+    -  Missouri
+  matches:
+    - 85688661
+- query:
+    - FL, United States of America
+  matches:
+    - 85688651
+- query:
+    - New Hampshire
+  matches:
+    - 85688689
+- query:
+    - Georgia.
+  matches:
+    - 85688535
+- query:
+    - Nevada USA
+  matches:
+    - 85688531
+- query:
+    -  PENNSYLVANIA
+  matches:
+    - 85688481
+- query:
+    - Virginia, USA.
+  matches:
+    - 85688747
+- query:
+    - Alabama, USA
+  matches:
+    - 85688675
+- query:
+    - Indiana 
+  matches:
+    - 85688709
+- query:
+    - Louisiana, United States
+  matches:
+    - 85688735
+- query:
+    - New Mexico 
+  matches:
+    - 85688493
+- query:
+    - Ohio USA
+  matches:
+    - 85688485
+- query:
+    - Nevada, USA
+  matches:
+    - 85688531
+- query:
+    - LOUISIANA
+  matches:
+    - 85688735
+- query:
+    - New Jersey, us
+  matches:
+    - 85688607
+- query: 
+    - Nuketown
+  matches: []
+  xfail: true
+- query: 
+    - Under Lefty's Skin
+  matches: []
+  xfail: true
+- query: 
+    - oY’– 7/6/20 oY’–
+  matches: []
+  xfail: true
+- query: 
+    - Palestine.Ramallah
+  matches: []
+  xfail: true
+- query: 
+    - DONT REPOST MY ART ! carrd byf
+  matches: []
+  xfail: true
+- query: 
+    - AJ&K, Pakistan.
+  matches: []
+  xfail: true
+- query: 
+    - La Plana Alta 
+  matches: []
+  xfail: true
+- query: 
+    - Com o ChorAEo
+  matches: []
+  xfail: true
+- query: 
+    - PLUTO
+  matches: []
+  xfail: true
+- query: 
+    - All Over The WORLD
+  matches: []
+  xfail: true
+- query: 
+    - /dev/null
+  matches: []
+  xfail: true
+- query: 
+    - University of Bath
+  matches: []
+  xfail: true
+- query: 
+    - Gaia
+  matches: []
+  xfail: true
+- query: 
+    - Plc
+  matches: []
+  xfail: true
+- query: 
+    - Jay cooke mn
+  matches: []
+  xfail: true
+- query: 
+    - Toulouse - Perpignan 
+  matches: []
+  xfail: true
+- query: 
+    - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’
+  matches: []
+  xfail: true
+- query: 
+    - oYscoY
+  matches: []
+  xfail: true
+- query: 
+    - sheher ecoe 20
+  matches: []
+  xfail: true
+- query: 
+    - pequitas de sunoo^^
+  matches: []
+  xfail: true
+- query: 
+    - West M
+  matches: []
+  xfail: true
+- query: 
+    - Maputo city
+  matches: []
+  xfail: true
+- query: 
+    - Entwined w peace 
+  matches: []
+  xfail: true
+- query: 
+    - DDDDDDDDDDDDDDD
+  matches: []
+  xfail: true
+- query: 
+    - mons vaticanus, subterrane
+  matches: []
+  xfail: true
+- query: 
+    - oYoO oyOY Oyoyo OY
+  matches: []
+  xfail: true
+- query: 
+    - Loin du hood 
+  matches: []
+  xfail: true
+- query: 
+    - acab blm
+  matches: []
+  xfail: true
+- query: 
+    - 010997
+  matches: []
+  xfail: true
+- query: 
+    - tidytuanzebeaETMs basement
+  matches: []
+  xfail: true
+- query: 
+    - oYOyomasdEmr
+  matches: []
+  xfail: true
+- query: 
+    - somewhere
+  matches: []
+  xfail: true
+- query: 
+    - 330
+  matches: []
+  xfail: true
+- query: 
+    - oSUSdod
+  matches: []
+  xfail: true
+- query: 
+    - she/her 16 isfp
+  matches: []
+  xfail: true
\ No newline at end of file

From 6dab318f7d3a119fafa7773b2d2af70abe9ad58b Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sat, 19 Sep 2020 22:17:27 -0700
Subject: [PATCH 02/12] Revert "loading code with sqlite3 + new unit tests +
 new speed testing"

This reverts commit 62ca65265569593b3affd51508f74b2f133f6ef5.
---
 Pipfile                                 |   1 -
 litecoder/__init__.py                   |   4 +-
 litecoder/usa.py                        |  62 ++---
 scripts/cities_to_yaml.py               | 119 ---------
 scripts/speed_test.py                   | 123 ---------
 tests/prod_db/conftest.py               |   4 +-
 tests/prod_db/test_us_city_index_2.yml  | 340 ------------------------
 tests/prod_db/test_us_state_index.py    |  10 +-
 tests/prod_db/test_us_state_index_2.yml | 340 ------------------------
 9 files changed, 32 insertions(+), 971 deletions(-)
 delete mode 100644 scripts/cities_to_yaml.py
 delete mode 100644 scripts/speed_test.py
 delete mode 100644 tests/prod_db/test_us_city_index_2.yml
 delete mode 100644 tests/prod_db/test_us_state_index_2.yml

diff --git a/Pipfile b/Pipfile
index c0b76e4..b65227a 100644
--- a/Pipfile
+++ b/Pipfile
@@ -35,7 +35,6 @@ PyYAML = "*"
 Shapely = "*"
 numpy = "*"
 scipy = "*"
-sqlitedict = "*"
 
 [dev-packages]
 
diff --git a/litecoder/__init__.py b/litecoder/__init__.py
index 9a01450..e67b0f9 100644
--- a/litecoder/__init__.py
+++ b/litecoder/__init__.py
@@ -9,9 +9,9 @@
 
 DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
 
-US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.db')
+US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.p')
 
-US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.db')
+US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.p')
 
 
 logging.basicConfig(
diff --git a/litecoder/usa.py b/litecoder/usa.py
index 749cdcd..d6ff15a 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -1,10 +1,8 @@
 
 
 import re
-import os
-import hashlib
-import struct
-from sqlitedict import SqliteDict
+import pickle
+
 from tqdm import tqdm
 from collections import defaultdict
 from itertools import product
@@ -194,16 +192,14 @@ def __repr__(self):
 
 class Index:
 
-    # Now that loading the database is instantenous, it is better to put it in the constructor over a
-    #   separate load method
-    # @classmethod
-    # def load(cls, path):
-    #     with open(path, 'rb') as fh:
-    #         return pickle.load(fh)
+    @classmethod
+    def load(cls, path):
+        with open(path, 'rb') as fh:
+            return pickle.load(fh)
 
-    def __init__(self, path):
-        self._key_to_ids = SqliteDict(filename=path, tablename="keys")
-        self._id_to_loc = SqliteDict(filename=path, tablename="locations")
+    def __init__(self):
+        self._key_to_ids = defaultdict(set)
+        self._id_to_loc = dict()
 
     def __len__(self):
         return len(self._key_to_ids)
@@ -215,47 +211,38 @@ def __repr__(self):
             len(self._id_to_loc),
         )
 
-    def __getitem__(self, key):
+    def __getitem__(self, text):
         """Get ids, map to records only if there is a match in the index
         """
-        # convert string to integer
-        hash = hashlib.md5(bytes(keyify(key), encoding="utf-8")).digest()
-        hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63)
-        if hashed_key not in self._key_to_ids:
+        if keyify(text) not in self._key_to_ids:
             return None
 
-        ids = self._key_to_ids[hashed_key]
+        ids = self._key_to_ids[keyify(text)]
 
         return [self._id_to_loc[id] for id in ids]
 
     def add_key(self, key, id):
-        # convert string to integer
-        hash = hashlib.md5(bytes(key, encoding="utf-8")).digest()
-        hashed_key = struct.unpack("L", hash[:8])[0] % (2 ** 63)
-        if hashed_key not in self._key_to_ids:
-            self._key_to_ids[hashed_key] = set()
-        curr_ids = self._key_to_ids[hashed_key]
-        curr_ids.add(id)
-        self._key_to_ids[hashed_key] = curr_ids
-        self._key_to_ids.commit()
-        del curr_ids
+        self._key_to_ids[key].add(id)
 
     def add_location(self, id, location):
         self._id_to_loc[id] = location
-        self._id_to_loc.commit()
 
     def locations(self):
         return list(self._id_to_loc.values())
 
-    def close(self):
-        self._key_to_ids.close()
-        self._id_to_loc.close()
+    def save(self, path):
+        with open(path, 'wb') as fh:
+            pickle.dump(self, fh)
 
 
 class USCityIndex(Index):
 
+    @classmethod
+    def load(cls, path=US_CITY_PATH):
+        return super().load(path)
+
     def __init__(self, bare_name_blocklist=None):
-        super().__init__(path=US_CITY_PATH)
+        super().__init__()
         self.bare_name_blocklist = bare_name_blocklist
 
     def build(self):
@@ -282,8 +269,9 @@ def build(self):
 
 class USStateIndex(Index):
 
-    def __init__(self):
-        super().__init__(path=US_STATE_PATH)
+    @classmethod
+    def load(cls, path=US_STATE_PATH):
+        return super().load(path)
 
     def build(self):
         """Index all US states.
@@ -299,4 +287,4 @@ def build(self):
                 self.add_key(key, row.wof_id)
 
             # ID -> state
-            self.add_location(row.wof_id, StateMatch(row))
\ No newline at end of file
+            self.add_location(row.wof_id, StateMatch(row))
diff --git a/scripts/cities_to_yaml.py b/scripts/cities_to_yaml.py
deleted file mode 100644
index 113f145..0000000
--- a/scripts/cities_to_yaml.py
+++ /dev/null
@@ -1,119 +0,0 @@
-from litecoder.usa import USCityIndex, USStateIndex
-
-city_idx = USCityIndex()
-state_idx = USStateIndex()
-
-states = """North Carolina, USA   
-District of Columbia
-Illinois, United States
-Georgia United States 
-north carolina
-texas
-iowa
-Florida, United States
-Vermont,  USA
-TX USA 
-FL U.S.A.
-pennsylvania usa
-nebraska 
- Oregon 
-Pennsylvania 
-New Hampshire  USA
-Nebraska, USA
-New mexico
-Indiana    
-South Dakota 
- Oklahoma
-Ohio,US
-Kansas, USA 
-indiana
-MA, USA
- New York
-Ohio, United States
-NJ USA
-ohio usa
-Connecticut, USA
-MICHIGAN, United States
-Missouri
-New York
-California - USA
-Massachusetts, USA 
- Missouri
-FL, United States of America
-New Hampshire
-Georgia.
-Nevada USA
- PENNSYLVANIA
-Virginia, USA.
-Alabama, USA
-Indiana 
-Louisiana, United States
-New Mexico 
-Ohio USA
-Nevada, USA
-LOUISIANA
-New Jersey, us""".split("\n")
-
-cities = """Edinburg, Texas
-Lakeville , Minnesota
-Woodland, CA.
-Gary, IN
-Cornelius, NC
-Okeechobee, Fl
-Saginaw Township South, MI
-Lansdowne, PA
-Knoxville TN
-OAKLAND, CA
-suffolk va
-Port Orange, FL
-Sedona, AZ
-Cedar City UT 
-Cincinnati. 
-Huntington Beach CA
-Wooster,Ohio
-Lewisville, Texas
-traverse city mi
-Pennsauken, New Jersey
-Jonesboro, Arkansas
-Zephyrhills, FL
-West Jefferson, NC
-Escondido, CA 
-Lumberton, NC
-Cayce, SC
-Stratford, Connecticut, USA
-Avondale, AZ
-Coral Springs, FL 
-Gaithersburg, MD
-Westchester, IL
-Louisa, Virginia 
-Norway, ME
-Philadelphia PA, USA
-Fort worth, tx
-Eureka Springs, Arkansas
-Nashville , TN
-Ellenwood Ga
-Floral Park, NY
-Nashville Tennessee
-Malvern, AR
-Valdosta, Georgia
-Valley Center Ca
-St. Robert Mo. 
-Hollandale, MS
-New Castle, PA 
-Harlem, FL
-Kings Mills, OH
-knoxville Tennessee
-BrooklYn""".split("\n")
-output = ""
-for state in states:
-	if (len(state_idx[state]) == 0):
-		print(state)
-	wof_ids = [result.data.wof_id for result in state_idx[state]]
-	output += """- query:
-    - {}
-  matches:\n""".format(state)
-	for wof_id in wof_ids:
-		output += "    - {}".format(wof_id)
-	output += "\n"
-with open("output.yml", "w") as o_file:
-	o_file.write(output)
diff --git a/scripts/speed_test.py b/scripts/speed_test.py
deleted file mode 100644
index e7c3756..0000000
--- a/scripts/speed_test.py
+++ /dev/null
@@ -1,123 +0,0 @@
-from litecoder.usa import USCityIndex, USStateIndex
-import time
-
-print("Loading USCityIndex... ", end="")
-start_time = time.time()
-city_idx = USCityIndex()
-print("finished: {}s!".format(time.time() - start_time))
-
-print("Loading USStateIndex... ", end="")
-start_time = time.time()
-state_idx = USStateIndex()
-print("finished: {}s!".format(time.time() - start_time))
-
-city_tests = """Edinburg, Texas
-Lakeville , Minnesota
-Woodland, CA.
-Gary, IN
-Cornelius, NC
-Okeechobee, Fl
-Saginaw Township South, MI
-Lansdowne, PA
-Knoxville TN
-OAKLAND, CA
-suffolk va
-Port Orange, FL
-Sedona, AZ
-Cedar City UT 
-Cincinnati. 
-Huntington Beach CA
-Wooster,Ohio
-Lewisville, Texas
-traverse city mi
-Pennsauken, New Jersey
-Jonesboro, Arkansas
-Zephyrhills, FL
-West Jefferson, NC
-Escondido, CA 
-Lumberton, NC
-Cayce, SC
-Stratford, Connecticut, USA
-Avondale, AZ
-Coral Springs, FL 
-Gaithersburg, MD
-Westchester, IL
-Louisa, Virginia 
-Norway, ME
-Philadelphia PA, USA
-Fort worth, tx
-Eureka Springs, Arkansas
-Nashville , TN
-Ellenwood Ga
-Floral Park, NY
-Nashville Tennessee
-Malvern, AR
-Valdosta, Georgia
-Valley Center Ca
-St. Robert Mo. 
-Hollandale, MS
-New Castle, PA 
-Harlem, FL
-Kings Mills, OH
-knoxville Tennessee
-BrooklYn""".split("\n")
-print("measuring time for {} cities... ".format(len(city_tests)), end="")
-start_time = time.time()
-for city in city_tests:
-    x = city_idx[city]
-print("finished: took {}s!".format(1000*(time.time() - start_time)))
-state_tests = """North Carolina, USA   
-District of Columbia
-Illinois, United States
-Georgia United States 
-north carolina
-texas
-iowa
-Florida, United States
-Vermont,  USA
-TX USA 
-FL U.S.A.
-pennsylvania usa
-nebraska 
- Oregon 
-Pennsylvania 
-New Hampshire  USA
-Nebraska, USA
-New mexico
-Indiana    
-South Dakota 
- Oklahoma
-Ohio,US
-Kansas, USA 
-indiana
-MA, USA
- New York
-Ohio, United States
-NJ USA
-ohio usa
-Connecticut, USA
-MICHIGAN, United States
-Missouri
-New York
-California - USA
-Massachusetts, USA 
- Missouri
-FL, United States of America
-New Hampshire
-Georgia.
-Nevada USA
- PENNSYLVANIA
-Virginia, USA.
-Alabama, USA
-Indiana 
-Louisiana, United States
-New Mexico 
-Ohio USA
-Nevada, USA
-LOUISIANA
-New Jersey, us""".split("\n")
-print("measuring time for {} states... ".format(len(state_tests)), end="")
-start_time = time.time()
-for state in state_tests:
-    x = state_idx[state]
-print("finished: took {}s!".format(1000 * (time.time() - start_time)))
\ No newline at end of file
diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py
index 2a746d5..1005d12 100644
--- a/tests/prod_db/conftest.py
+++ b/tests/prod_db/conftest.py
@@ -7,9 +7,9 @@
 
 @pytest.fixture(scope='session')
 def city_idx():
-    return USCityIndex()
+    return USCityIndex.load()
 
 
 @pytest.fixture(scope='session')
 def state_idx():
-    return USStateIndex()
+    return USStateIndex.load()
diff --git a/tests/prod_db/test_us_city_index_2.yml b/tests/prod_db/test_us_city_index_2.yml
deleted file mode 100644
index 89c2f34..0000000
--- a/tests/prod_db/test_us_city_index_2.yml
+++ /dev/null
@@ -1,340 +0,0 @@
-- query:
-    - Edinburg, Texas
-  matches:
-    - 101723563
-- query:
-    - Lakeville , Minnesota
-  matches:
-    - 85968479
-- query:
-    - Woodland, CA.
-  matches:
-    - 85922405
-- query:
-    - Gary, IN
-  matches:
-    - 85941813
-- query:
-    - Cornelius, NC
-  matches:
-    - 85981335
-- query:
-    - Okeechobee, Fl
-  matches:
-    - 85932281
-- query:
-    - Saginaw Township South, MI
-  matches:
-    - 1125767499
-- query:
-    - Lansdowne, PA
-  matches:
-    - 101718067
-- query:
-    - Knoxville TN
-  matches:
-    - 101722865
-- query:
-    - OAKLAND, CA
-  matches:
-    - 85921881
-- query:
-    - suffolk va
-  matches:
-    - 101728729
-- query:
-    - Port Orange, FL
-  matches:
-    - 85932629
-- query:
-    - Sedona, AZ
-  matches:
-    - 85917431
-- query:
-    - Cedar City UT 
-  matches:
-    - 101727685
-- query:
-    - Cincinnati. 
-  matches:
-    - 101712203
-- query:
-    - Huntington Beach CA
-  matches:
-    - 85923137
-- query:
-    - Wooster,Ohio
-  matches:
-    - 101712345
-- query:
-    - Lewisville, Texas
-  matches:
-    - 101724413
-- query:
-    - traverse city mi
-  matches:
-    - 85950881
-- query:
-    - Pennsauken, New Jersey
-  matches:
-    - 1125947935
-- query:
-    - Jonesboro, Arkansas
-  matches:
-    - 85920203
-- query:
-    - Zephyrhills, FL
-  matches:
-    - 85932233
-- query:
-    - West Jefferson, NC
-  matches:
-    - 85981009
-- query:
-    - Escondido, CA 
-  matches:
-    - 85922263
-- query:
-    - Lumberton, NC
-  matches:
-    - 85981189
-- query:
-    - Cayce, SC
-  matches:
-    - 101720791
-- query:
-    - Stratford, Connecticut, USA
-  matches:
-    - 85930997
-- query:
-    - Avondale, AZ
-  matches:
-    - 85917553
-- query:
-    - Coral Springs, FL 
-  matches:
-    - 85932415
-- query:
-    - Gaithersburg, MD
-  matches:
-    - 85949491
-- query:
-    - Westchester, IL
-  matches:
-    - 85940923
-- query:
-    - Louisa, Virginia 
-  matches:
-    - 101728949
-- query:
-    - Norway, ME
-  matches:
-    - 85948973
-- query:
-    - Philadelphia PA, USA
-  matches:
-    - 101718083
-- query:
-    - Fort worth, tx
-  matches:
-    - 101724443
-- query:
-    - Eureka Springs, Arkansas
-  matches:
-    - 85919765
-- query:
-    - Nashville , TN
-  matches:
-    - 101723183
-- query:
-    - Ellenwood Ga
-  matches:
-    - 1126054897
-- query:
-    - Floral Park, NY
-  matches:
-    - 85977689
-- query:
-    - Nashville Tennessee
-  matches:
-    - 101723183
-- query:
-    - Malvern, AR
-  matches:
-    - 85920689
-- query:
-    - Valdosta, Georgia
-  matches:
-    - 85936921
-- query:
-    - Valley Center Ca
-  matches:
-    - 85925201
-- query:
-    - St. Robert Mo. 
-  matches:
-    - 85971093
-- query:
-    - Hollandale, MS
-  matches:
-    - 85969835
-- query:
-    - New Castle, PA 
-  matches:
-    - 101716721
-- query:
-    - Harlem, FL
-  matches:
-    - 85934341
-- query:
-    - Kings Mills, OH
-  matches:
-    - 101713989
-- query:
-    - knoxville Tennessee
-  matches:
-    - 101722865
-- query:
-    - BrooklYn
-  matches:
-    - 85977539
-- query: 
-    - Nuketown
-  matches: []
-  xfail: true
-- query: 
-    - Under Lefty's Skin
-  matches: []
-  xfail: true
-- query: 
-    - oY’– 7/6/20 oY’–
-  matches: []
-  xfail: true
-- query: 
-    - Palestine.Ramallah
-  matches: []
-  xfail: true
-- query: 
-    - DONT REPOST MY ART ! carrd byf
-  matches: []
-  xfail: true
-- query: 
-    - AJ&K, Pakistan.
-  matches: []
-  xfail: true
-- query: 
-    - La Plana Alta 
-  matches: []
-  xfail: true
-- query: 
-    - Com o ChorAEo
-  matches: []
-  xfail: true
-- query: 
-    - PLUTO
-  matches: []
-  xfail: true
-- query: 
-    - All Over The WORLD
-  matches: []
-  xfail: true
-- query: 
-    - /dev/null
-  matches: []
-  xfail: true
-- query: 
-    - University of Bath
-  matches: []
-  xfail: true
-- query: 
-    - Gaia
-  matches: []
-  xfail: true
-- query: 
-    - Plc
-  matches: []
-  xfail: true
-- query: 
-    - Jay cooke mn
-  matches: []
-  xfail: true
-- query: 
-    - Toulouse - Perpignan 
-  matches: []
-  xfail: true
-- query: 
-    - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’
-  matches: []
-  xfail: true
-- query: 
-    - oYscoY
-  matches: []
-  xfail: true
-- query: 
-    - sheher ecoe 20
-  matches: []
-  xfail: true
-- query: 
-    - pequitas de sunoo^^
-  matches: []
-  xfail: true
-- query: 
-    - West M
-  matches: []
-  xfail: true
-- query: 
-    - Maputo city
-  matches: []
-  xfail: true
-- query: 
-    - Entwined w peace 
-  matches: []
-  xfail: true
-- query: 
-    - DDDDDDDDDDDDDDD
-  matches: []
-  xfail: true
-- query: 
-    - mons vaticanus, subterrane
-  matches: []
-  xfail: true
-- query: 
-    - oYoO oyOY Oyoyo OY
-  matches: []
-  xfail: true
-- query: 
-    - Loin du hood 
-  matches: []
-  xfail: true
-- query: 
-    - acab blm
-  matches: []
-  xfail: true
-- query: 
-    - 010997
-  matches: []
-  xfail: true
-- query: 
-    - tidytuanzebeaETMs basement
-  matches: []
-  xfail: true
-- query: 
-    - oYOyomasdEmr
-  matches: []
-  xfail: true
-- query: 
-    - somewhere
-  matches: []
-  xfail: true
-- query: 
-    - 330
-  matches: []
-  xfail: true
-- query: 
-    - oSUSdod
-  matches: []
-  xfail: true
-- query: 
-    - she/her 16 isfp
-  matches: []
-  xfail: true
\ No newline at end of file
diff --git a/tests/prod_db/test_us_state_index.py b/tests/prod_db/test_us_state_index.py
index c1353e9..9adafc6 100644
--- a/tests/prod_db/test_us_state_index.py
+++ b/tests/prod_db/test_us_state_index.py
@@ -16,19 +16,15 @@ def yield_cases():
 
         queries = group['query']
 
-        xfail = group.get('xfail', False)
-
         if type(queries) is str:
             queries = [queries]
 
         for query in queries:
-            yield query, group['matches'], xfail
+            yield query, group['matches']
 
 
-@pytest.mark.parametrize('query,matches,xfail', yield_cases())
-def test_cases(state_idx, query, matches, xfail):
-    if xfail:
-        pytest.xfail()
+@pytest.mark.parametrize('query,matches', yield_cases())
+def test_cases(state_idx, query, matches):
 
     res = state_idx[query]
 
diff --git a/tests/prod_db/test_us_state_index_2.yml b/tests/prod_db/test_us_state_index_2.yml
deleted file mode 100644
index c0eedb2..0000000
--- a/tests/prod_db/test_us_state_index_2.yml
+++ /dev/null
@@ -1,340 +0,0 @@
-- query:
-    - North Carolina, USA   
-  matches:
-    - 85688773
-- query:
-    - District of Columbia
-  matches:
-    - 85688741
-- query:
-    - Illinois, United States
-  matches:
-    - 85688697
-- query:
-    - Georgia United States 
-  matches:
-    - 85688535
-- query:
-    - north carolina
-  matches:
-    - 85688773
-- query:
-    - texas
-  matches:
-    - 85688753
-- query:
-    - iowa
-  matches:
-    - 85688713
-- query:
-    - Florida, United States
-  matches:
-    - 85688651
-- query:
-    - Vermont,  USA
-  matches:
-    - 85688763
-- query:
-    - TX USA 
-  matches:
-    - 85688753
-- query:
-    - FL U.S.A.
-  matches:
-    - 85688651
-- query:
-    - pennsylvania usa
-  matches:
-    - 85688481
-- query:
-    - nebraska 
-  matches:
-    - 85688563
-- query:
-    -  Oregon 
-  matches:
-    - 85688513
-- query:
-    - Pennsylvania 
-  matches:
-    - 85688481
-- query:
-    - New Hampshire  USA
-  matches:
-    - 85688689
-- query:
-    - Nebraska, USA
-  matches:
-    - 85688563
-- query:
-    - New mexico
-  matches:
-    - 85688493
-- query:
-    - Indiana    
-  matches:
-    - 85688709
-- query:
-    - South Dakota 
-  matches:
-    - 85688693
-- query:
-    -  Oklahoma
-  matches:
-    - 85688585
-- query:
-    - Ohio,US
-  matches:
-    - 85688485
-- query:
-    - Kansas, USA 
-  matches:
-    - 85688555
-- query:
-    - indiana
-  matches:
-    - 85688709
-- query:
-    - MA, USA
-  matches:
-    - 85688645
-- query:
-    -  New York
-  matches:
-    - 85688543
-- query:
-    - Ohio, United States
-  matches:
-    - 85688485
-- query:
-    - NJ USA
-  matches:
-    - 85688607
-- query:
-    - ohio usa
-  matches:
-    - 85688485
-- query:
-    - Connecticut, USA
-  matches:
-    - 85688629
-- query:
-    - MICHIGAN, United States
-  matches:
-    - 85688599
-- query:
-    - Missouri
-  matches:
-    - 85688661
-- query:
-    - New York
-  matches:
-    - 85688543
-- query:
-    - California - USA
-  matches:
-    - 85688637
-- query:
-    - Massachusetts, USA 
-  matches:
-    - 85688645
-- query:
-    -  Missouri
-  matches:
-    - 85688661
-- query:
-    - FL, United States of America
-  matches:
-    - 85688651
-- query:
-    - New Hampshire
-  matches:
-    - 85688689
-- query:
-    - Georgia.
-  matches:
-    - 85688535
-- query:
-    - Nevada USA
-  matches:
-    - 85688531
-- query:
-    -  PENNSYLVANIA
-  matches:
-    - 85688481
-- query:
-    - Virginia, USA.
-  matches:
-    - 85688747
-- query:
-    - Alabama, USA
-  matches:
-    - 85688675
-- query:
-    - Indiana 
-  matches:
-    - 85688709
-- query:
-    - Louisiana, United States
-  matches:
-    - 85688735
-- query:
-    - New Mexico 
-  matches:
-    - 85688493
-- query:
-    - Ohio USA
-  matches:
-    - 85688485
-- query:
-    - Nevada, USA
-  matches:
-    - 85688531
-- query:
-    - LOUISIANA
-  matches:
-    - 85688735
-- query:
-    - New Jersey, us
-  matches:
-    - 85688607
-- query: 
-    - Nuketown
-  matches: []
-  xfail: true
-- query: 
-    - Under Lefty's Skin
-  matches: []
-  xfail: true
-- query: 
-    - oY’– 7/6/20 oY’–
-  matches: []
-  xfail: true
-- query: 
-    - Palestine.Ramallah
-  matches: []
-  xfail: true
-- query: 
-    - DONT REPOST MY ART ! carrd byf
-  matches: []
-  xfail: true
-- query: 
-    - AJ&K, Pakistan.
-  matches: []
-  xfail: true
-- query: 
-    - La Plana Alta 
-  matches: []
-  xfail: true
-- query: 
-    - Com o ChorAEo
-  matches: []
-  xfail: true
-- query: 
-    - PLUTO
-  matches: []
-  xfail: true
-- query: 
-    - All Over The WORLD
-  matches: []
-  xfail: true
-- query: 
-    - /dev/null
-  matches: []
-  xfail: true
-- query: 
-    - University of Bath
-  matches: []
-  xfail: true
-- query: 
-    - Gaia
-  matches: []
-  xfail: true
-- query: 
-    - Plc
-  matches: []
-  xfail: true
-- query: 
-    - Jay cooke mn
-  matches: []
-  xfail: true
-- query: 
-    - Toulouse - Perpignan 
-  matches: []
-  xfail: true
-- query: 
-    - oYtaoYt |oYtaoYt¦ aeC sc squad.a†’
-  matches: []
-  xfail: true
-- query: 
-    - oYscoY
-  matches: []
-  xfail: true
-- query: 
-    - sheher ecoe 20
-  matches: []
-  xfail: true
-- query: 
-    - pequitas de sunoo^^
-  matches: []
-  xfail: true
-- query: 
-    - West M
-  matches: []
-  xfail: true
-- query: 
-    - Maputo city
-  matches: []
-  xfail: true
-- query: 
-    - Entwined w peace 
-  matches: []
-  xfail: true
-- query: 
-    - DDDDDDDDDDDDDDD
-  matches: []
-  xfail: true
-- query: 
-    - mons vaticanus, subterrane
-  matches: []
-  xfail: true
-- query: 
-    - oYoO oyOY Oyoyo OY
-  matches: []
-  xfail: true
-- query: 
-    - Loin du hood 
-  matches: []
-  xfail: true
-- query: 
-    - acab blm
-  matches: []
-  xfail: true
-- query: 
-    - 010997
-  matches: []
-  xfail: true
-- query: 
-    - tidytuanzebeaETMs basement
-  matches: []
-  xfail: true
-- query: 
-    - oYOyomasdEmr
-  matches: []
-  xfail: true
-- query: 
-    - somewhere
-  matches: []
-  xfail: true
-- query: 
-    - 330
-  matches: []
-  xfail: true
-- query: 
-    - oSUSdod
-  matches: []
-  xfail: true
-- query: 
-    - she/her 16 isfp
-  matches: []
-  xfail: true
\ No newline at end of file

From 0380cd880cd2127345f73e67e41d2d80a6282e31 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sun, 20 Sep 2020 00:04:21 -0700
Subject: [PATCH 03/12] two level dict => marisa trie

---
 litecoder/usa.py | 61 ++++++++++++++++++++++++++----------------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/litecoder/usa.py b/litecoder/usa.py
index d6ff15a..8ef2345 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -1,6 +1,7 @@
 
 
 import re
+import marisa_trie
 import pickle
 
 from tqdm import tqdm
@@ -192,14 +193,13 @@ def __repr__(self):
 
 class Index:
 
-    @classmethod
-    def load(cls, path):
-        with open(path, 'rb') as fh:
-            return pickle.load(fh)
+    def load(self, key_to_ids_path, id_to_loc_path):
+        self._key_to_ids.load(key_to_ids_path)
+        self._id_to_loc.load(id_to_loc_path)
 
     def __init__(self):
-        self._key_to_ids = defaultdict(set)
-        self._id_to_loc = dict()
+        self._key_to_ids = marisa_trie.BytesTrie()
+        self._id_to_loc = marisa_trie.BytesTrie()
 
     def __len__(self):
         return len(self._key_to_ids)
@@ -214,33 +214,30 @@ def __repr__(self):
     def __getitem__(self, text):
         """Get ids, map to records only if there is a match in the index
         """
-        if keyify(text) not in self._key_to_ids:
+        normalized_key = keyify(text)
+        if normalized_key not in self._key_to_ids:
             return None
 
-        ids = self._key_to_ids[keyify(text)]
+        ids = pickle.loads(self._key_to_ids[normalized_key][0])
 
-        return [self._id_to_loc[id] for id in ids]
+        return [pickle.loads(self._id_to_loc[id][0]) for id in ids]
 
-    def add_key(self, key, id):
-        self._key_to_ids[key].add(id)
+    # def add_key(self, key, id):
+    #     self._key_to_ids[key].add(id)
 
-    def add_location(self, id, location):
-        self._id_to_loc[id] = location
+    # def add_location(self, id, location):
+    #     self._id_to_loc[id] = location
 
     def locations(self):
         return list(self._id_to_loc.values())
 
-    def save(self, path):
-        with open(path, 'wb') as fh:
-            pickle.dump(self, fh)
+    def save(self, key_to_ids_path, id_to_loc_path):
+        self._key_to_ids.save(key_to_ids_path)
+        self._id_to_loc.save(id_to_loc_path)
 
 
 class USCityIndex(Index):
 
-    @classmethod
-    def load(cls, path=US_CITY_PATH):
-        return super().load(path)
-
     def __init__(self, bare_name_blocklist=None):
         super().__init__()
         self.bare_name_blocklist = bare_name_blocklist
@@ -257,22 +254,24 @@ def build(self):
 
         logger.info('Indexing US cities.')
 
+        key_to_ids = defaultdict(set)
+        id_to_loc = dict()
+
         for row in tqdm(cities):
 
             # Key -> id(s)
             for key in map(keyify, iter_keys(row)):
-                self.add_key(key, row.wof_id)
+                key_to_ids[key].add(str(row.wof_id))
 
             # ID -> city
-            self.add_location(row.wof_id, CityMatch(row))
+            id_to_loc[str(row.wof_id)] = pickle.dumps(CityMatch(row))
+        
+        self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids])
+        self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items())
 
 
 class USStateIndex(Index):
 
-    @classmethod
-    def load(cls, path=US_STATE_PATH):
-        return super().load(path)
-
     def build(self):
         """Index all US states.
         """
@@ -280,11 +279,17 @@ def build(self):
 
         logger.info('Indexing US states.')
 
+        key_to_ids = defaultdict(set)
+        id_to_loc = dict()
+
         for row in tqdm(states):
 
             # Key -> id(s)
             for key in map(keyify, state_key_iter(row)):
-                self.add_key(key, row.wof_id)
+                key_to_ids[key].add(str(row.wof_id))
 
             # ID -> state
-            self.add_location(row.wof_id, StateMatch(row))
+            id_to_loc[str(row.wof_id)] = pickle.dumps(StateMatch(row))
+        
+        self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids])
+        self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items())

From 9bc934d1c6037a692fcea40c3b59d851aa969b73 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sun, 27 Sep 2020 14:56:40 -0700
Subject: [PATCH 04/12] combine all trie files into one

---
 litecoder/__init__.py     |   4 +-
 litecoder/usa.py          |  64 ++++++++++++-------
 speed_test.py             | 127 ++++++++++++++++++++++++++++++++++++++
 tests/prod_db/conftest.py |   8 ++-
 4 files changed, 174 insertions(+), 29 deletions(-)
 create mode 100644 speed_test.py

diff --git a/litecoder/__init__.py b/litecoder/__init__.py
index e67b0f9..52ad2ed 100644
--- a/litecoder/__init__.py
+++ b/litecoder/__init__.py
@@ -9,9 +9,7 @@
 
 DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
 
-US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.p')
-
-US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.p')
+DATA_PATH = os.path.join(DATA_DIR, 'trie.marisa')
 
 
 logging.basicConfig(
diff --git a/litecoder/usa.py b/litecoder/usa.py
index 8ef2345..21d7a57 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -12,7 +12,7 @@
 
 from sqlalchemy.inspection import inspect
 
-from . import logger, US_CITY_PATH, US_STATE_PATH
+from . import logger, DATA_PATH
 from .models import WOFRegion, WOFLocality
 
 
@@ -193,34 +193,39 @@ def __repr__(self):
 
 class Index:
 
-    def load(self, key_to_ids_path, id_to_loc_path):
-        self._key_to_ids.load(key_to_ids_path)
-        self._id_to_loc.load(id_to_loc_path)
+    # city keys -> ids = A
+    # city ids -> loc = B
+    # state keys -> ids = C
+    # state ids -> loc = D
 
-    def __init__(self):
-        self._key_to_ids = marisa_trie.BytesTrie()
-        self._id_to_loc = marisa_trie.BytesTrie()
+    def load(self, trie_path=DATA_PATH):
+        self._trie.load(trie_path)
+
+    def __init__(self, keys_key, ids_key):
+        self._trie = marisa_trie.BytesTrie()
+        self._keys_key = keys_key
+        self._ids_key = ids_key
 
     def __len__(self):
-        return len(self._key_to_ids)
+        return len(self._trie.keys(self._keys_key))
 
     def __repr__(self):
         return '%s<%d keys, %d entities>' % (
             self.__class__.__name__,
-            len(self._key_to_ids),
-            len(self._id_to_loc),
+            len(self._trie.keys(self._keys_key)),
+            len(self._trie.keys(self._ids_key)),
         )
 
     def __getitem__(self, text):
         """Get ids, map to records only if there is a match in the index
         """
-        normalized_key = keyify(text)
-        if normalized_key not in self._key_to_ids:
+        normalized_key = self._keys_key + keyify(text)
+        if normalized_key not in self._trie:
             return None
 
-        ids = pickle.loads(self._key_to_ids[normalized_key][0])
+        ids = pickle.loads(self._trie[normalized_key][0])
 
-        return [pickle.loads(self._id_to_loc[id][0]) for id in ids]
+        return [pickle.loads(self._trie[self._ids_key + id][0]) for id in ids]
 
     # def add_key(self, key, id):
     #     self._key_to_ids[key].add(id)
@@ -231,20 +236,20 @@ def __getitem__(self, text):
     def locations(self):
         return list(self._id_to_loc.values())
 
-    def save(self, key_to_ids_path, id_to_loc_path):
-        self._key_to_ids.save(key_to_ids_path)
-        self._id_to_loc.save(id_to_loc_path)
+    def save(self, path):
+        self._trie.save(path)
 
 
 class USCityIndex(Index):
 
     def __init__(self, bare_name_blocklist=None):
-        super().__init__()
+        super().__init__(u"A", u"B")
         self.bare_name_blocklist = bare_name_blocklist
 
     def build(self):
         """Index all US cities.
         """
+        
         allow_bare = AllowBareCityName(blocklist=self.bare_name_blocklist)
 
         iter_keys = CityKeyIter(allow_bare)
@@ -264,14 +269,21 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> city
-            id_to_loc[str(row.wof_id)] = pickle.dumps(CityMatch(row))
+            id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(CityMatch(row))
+
+        # In case the loaded trie already has states data
+        previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))]
+        key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
+        id_to_loc_trie_data = list(id_to_loc.items())
         
-        self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids])
-        self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items())
+        self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data)
 
 
 class USStateIndex(Index):
 
+    def __init__(self):
+        super().__init__(u"C", u"D")
+
     def build(self):
         """Index all US states.
         """
@@ -289,7 +301,11 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> state
-            id_to_loc[str(row.wof_id)] = pickle.dumps(StateMatch(row))
+            id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(StateMatch(row))
+
+        # In case the loaded trie already has states data
+        previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))]
+        key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
+        id_to_loc_trie_data = list(id_to_loc.items())
         
-        self._key_to_ids = marisa_trie.BytesTrie([(key, pickle.dumps(key_to_ids[key])) for key in key_to_ids])
-        self._id_to_loc = marisa_trie.BytesTrie(id_to_loc.items())
+        self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data)
\ No newline at end of file
diff --git a/speed_test.py b/speed_test.py
new file mode 100644
index 0000000..aa43c75
--- /dev/null
+++ b/speed_test.py
@@ -0,0 +1,127 @@
+from litecoder.usa import USCityIndex, USStateIndex
+import time
+
+print("Loading USCityIndex... ", end="")
+start_time = time.time()
+city_idx = USCityIndex()
+city_idx.load()
+print("finished: {}s!".format(time.time() - start_time))
+
+print("Loading USStateIndex... ", end="")
+start_time = time.time()
+state_idx = USStateIndex()
+state_idx.load()
+print("finished: {}s!".format(time.time() - start_time))
+
+city_tests = """Edinburg, Texas
+Lakeville , Minnesota
+Woodland, CA.
+Gary, IN
+Cornelius, NC
+Okeechobee, Fl
+Saginaw Township South, MI
+Lansdowne, PA
+Knoxville TN
+OAKLAND, CA
+suffolk va
+Port Orange, FL
+Sedona, AZ
+Cedar City UT 
+Cincinnati. 
+Huntington Beach CA
+Wooster,Ohio
+Lewisville, Texas
+traverse city mi
+Pennsauken, New Jersey
+Jonesboro, Arkansas
+Zephyrhills, FL
+West Jefferson, NC
+Escondido, CA 
+Lumberton, NC
+Cayce, SC
+Stratford, Connecticut, USA
+Avondale, AZ
+Coral Springs, FL 
+Gaithersburg, MD
+Westchester, IL
+Louisa, Virginia 
+Norway, ME
+Philadelphia PA, USA
+Fort worth, tx
+Eureka Springs, Arkansas
+Nashville , TN
+Ellenwood Ga
+Floral Park, NY
+Nashville Tennessee
+Malvern, AR
+Valdosta, Georgia
+Valley Center Ca
+St. Robert Mo. 
+Hollandale, MS
+New Castle, PA 
+Harlem, FL
+Kings Mills, OH
+knoxville Tennessee
+BrooklYn""".split("\n")
+for x in range (8):
+    city_tests += city_tests
+print("measuring time for {} cities... ".format(len(city_tests)), end="")
+start_time = time.time()
+for city in city_tests:
+    x = city_idx[city]
+print("finished: took {}s!".format(1000*(time.time() - start_time)))
+state_tests = """North Carolina, USA   
+District of Columbia
+Illinois, United States
+Georgia United States 
+north carolina
+texas
+iowa
+Florida, United States
+Vermont,  USA
+TX USA 
+FL U.S.A.
+pennsylvania usa
+nebraska 
+ Oregon 
+Pennsylvania 
+New Hampshire  USA
+Nebraska, USA
+New mexico
+Indiana    
+South Dakota 
+ Oklahoma
+Ohio,US
+Kansas, USA 
+indiana
+MA, USA
+ New York
+Ohio, United States
+NJ USA
+ohio usa
+Connecticut, USA
+MICHIGAN, United States
+Missouri
+New York
+California - USA
+Massachusetts, USA 
+ Missouri
+FL, United States of America
+New Hampshire
+Georgia.
+Nevada USA
+ PENNSYLVANIA
+Virginia, USA.
+Alabama, USA
+Indiana 
+Louisiana, United States
+New Mexico 
+Ohio USA
+Nevada, USA
+LOUISIANA
+New Jersey, us""".split("\n")
+print("measuring time for {} states... ".format(len(state_tests)), end="")
+start_time = time.time()
+for state in state_tests:
+    x = state_idx[state]
+print("finished: took {}s!".format(1000 * (time.time() - start_time)))
diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py
index 1005d12..a9420a6 100644
--- a/tests/prod_db/conftest.py
+++ b/tests/prod_db/conftest.py
@@ -7,9 +7,13 @@
 
 @pytest.fixture(scope='session')
 def city_idx():
-    return USCityIndex.load()
+    city_idx = USCityIndex()
+    city_idx.load()
+    return city_idx
 
 
 @pytest.fixture(scope='session')
 def state_idx():
-    return USStateIndex.load()
+    state_idx = USStateIndex()
+    state_idx.load()
+    return state_idx
\ No newline at end of file

From 19e557f441466f69d54a8db71a2fa047cf4c4a77 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sun, 27 Sep 2020 15:06:21 -0700
Subject: [PATCH 05/12] fix Index.locations() to read from the combined trie

---
 litecoder/usa.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/litecoder/usa.py b/litecoder/usa.py
index 21d7a57..12d1f1c 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -234,7 +234,7 @@ def __getitem__(self, text):
     #     self._id_to_loc[id] = location
 
     def locations(self):
-        return list(self._id_to_loc.values())
+        return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_key)]
 
     def save(self, path):
         self._trie.save(path)

From 59a46b36c44324e66c54911702d4a7b5a17b7120 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Wed, 30 Sep 2020 17:19:44 -0700
Subject: [PATCH 06/12] separate tries per index

---
 litecoder/__init__.py |  4 +++-
 litecoder/usa.py      | 53 +++++++++++++++++++++----------------------
 speed_test.py         |  4 ++--
 3 files changed, 31 insertions(+), 30 deletions(-)

diff --git a/litecoder/__init__.py b/litecoder/__init__.py
index 52ad2ed..ec7ce0a 100644
--- a/litecoder/__init__.py
+++ b/litecoder/__init__.py
@@ -9,7 +9,9 @@
 
 DATA_DIR = os.path.join(os.path.dirname(__file__), 'data')
 
-DATA_PATH = os.path.join(DATA_DIR, 'trie.marisa')
+US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.marisa')
+
+US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.marisa')
 
 
 logging.basicConfig(
diff --git a/litecoder/usa.py b/litecoder/usa.py
index 12d1f1c..ef325fe 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -12,7 +12,7 @@
 
 from sqlalchemy.inspection import inspect
 
-from . import logger, DATA_PATH
+from . import logger, US_CITY_PATH, US_STATE_PATH
 from .models import WOFRegion, WOFLocality
 
 
@@ -198,34 +198,34 @@ class Index:
     # state keys -> ids = C
     # state ids -> loc = D
 
-    def load(self, trie_path=DATA_PATH):
-        self._trie.load(trie_path)
+    def load(self, path):
+        self._trie.load(path)
 
-    def __init__(self, keys_key, ids_key):
+    def __init__(self):
         self._trie = marisa_trie.BytesTrie()
-        self._keys_key = keys_key
-        self._ids_key = ids_key
+        self._keys_prefix = "A"
+        self._ids_prefix = "B"
 
     def __len__(self):
-        return len(self._trie.keys(self._keys_key))
+        return len(self._trie.keys(self._keys_prefix))
 
     def __repr__(self):
         return '%s<%d keys, %d entities>' % (
             self.__class__.__name__,
-            len(self._trie.keys(self._keys_key)),
-            len(self._trie.keys(self._ids_key)),
+            len(self._trie.keys(self._keys_prefix)),
+            len(self._trie.keys(self._ids_prefix)),
         )
 
     def __getitem__(self, text):
         """Get ids, map to records only if there is a match in the index
         """
-        normalized_key = self._keys_key + keyify(text)
+        normalized_key = self._keys_prefix + keyify(text)
         if normalized_key not in self._trie:
             return None
 
         ids = pickle.loads(self._trie[normalized_key][0])
 
-        return [pickle.loads(self._trie[self._ids_key + id][0]) for id in ids]
+        return [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
 
     # def add_key(self, key, id):
     #     self._key_to_ids[key].add(id)
@@ -234,7 +234,7 @@ def __getitem__(self, text):
     #     self._id_to_loc[id] = location
 
     def locations(self):
-        return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_key)]
+        return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_prefix)]
 
     def save(self, path):
         self._trie.save(path)
@@ -242,8 +242,11 @@ def save(self, path):
 
 class USCityIndex(Index):
 
+    def load(self, path=US_CITY_PATH):
+        return super().load(path)
+
     def __init__(self, bare_name_blocklist=None):
-        super().__init__(u"A", u"B")
+        super().__init__()
         self.bare_name_blocklist = bare_name_blocklist
 
     def build(self):
@@ -269,20 +272,18 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> city
-            id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(CityMatch(row))
+            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row))
 
-        # In case the loaded trie already has states data
-        previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))]
-        key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
-        id_to_loc_trie_data = list(id_to_loc.items())
+        key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
+        id_to_loc_data = list(id_to_loc.items())
         
-        self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data)
+        self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data)
 
 
 class USStateIndex(Index):
 
-    def __init__(self):
-        super().__init__(u"C", u"D")
+    def load(self, path=US_STATE_PATH):
+        return super().load(path)
 
     def build(self):
         """Index all US states.
@@ -301,11 +302,9 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> state
-            id_to_loc[self._ids_key + str(row.wof_id)] = pickle.dumps(StateMatch(row))
+            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row))
 
-        # In case the loaded trie already has states data
-        previous_trie_data = [(key, value) for (key, value) in self._trie.items() if not (key.startswith(self._keys_key) or key.startswith(self._ids_key))]
-        key_to_ids_trie_data = [(self._keys_key + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
-        id_to_loc_trie_data = list(id_to_loc.items())
+        key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
+        id_to_loc_data = list(id_to_loc.items())
         
-        self._trie = marisa_trie.BytesTrie(previous_trie_data + key_to_ids_trie_data + id_to_loc_trie_data)
\ No newline at end of file
+        self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data)
\ No newline at end of file
diff --git a/speed_test.py b/speed_test.py
index aa43c75..5f8b435 100644
--- a/speed_test.py
+++ b/speed_test.py
@@ -63,8 +63,8 @@
 Kings Mills, OH
 knoxville Tennessee
 BrooklYn""".split("\n")
-for x in range (8):
-    city_tests += city_tests
+# for x in range (10):
+#     city_tests += city_tests
 print("measuring time for {} cities... ".format(len(city_tests)), end="")
 start_time = time.time()
 for city in city_tests:

From 81d3bb6d9b6223f7bdd47af9d2eecccc0aecf43d Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sat, 3 Oct 2020 21:54:56 -0700
Subject: [PATCH 07/12] some speed optimizations

---
 litecoder/__init__.py |  2 +-
 litecoder/usa.py      | 22 +++++++++++++---------
 speed_test.py         |  4 ++--
 3 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/litecoder/__init__.py b/litecoder/__init__.py
index ec7ce0a..d3249e5 100644
--- a/litecoder/__init__.py
+++ b/litecoder/__init__.py
@@ -11,7 +11,7 @@
 
 US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.marisa')
 
-US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.marisa')
+US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities-ujson.marisa')
 
 
 logging.basicConfig(
diff --git a/litecoder/usa.py b/litecoder/usa.py
index ef325fe..4908231 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -2,20 +2,20 @@
 
 import re
 import marisa_trie
-import pickle
+import _pickle as pickle
+import ujson
 
 from tqdm import tqdm
 from collections import defaultdict
 from itertools import product
 from cached_property import cached_property
 from box import Box
+import gc
 
 from sqlalchemy.inspection import inspect
 
 from . import logger, US_CITY_PATH, US_STATE_PATH
 from .models import WOFRegion, WOFLocality
-
-
 # TODO: Country alt-names YAML.
 USA_NAMES = (
     'USA',
@@ -199,6 +199,7 @@ class Index:
     # state ids -> loc = D
 
     def load(self, path):
+        print(path)
         self._trie.load(path)
 
     def __init__(self):
@@ -223,9 +224,12 @@ def __getitem__(self, text):
         if normalized_key not in self._trie:
             return None
 
-        ids = pickle.loads(self._trie[normalized_key][0])
+        ids = ujson.loads(self._trie[normalized_key][0])
 
-        return [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
+        gc.disable()
+        z= [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
+        gc.enable()
+        return z
 
     # def add_key(self, key, id):
     #     self._key_to_ids[key].add(id)
@@ -272,9 +276,9 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> city
-            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row))
+            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row), protocol=-1)
 
-        key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
+        key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
         id_to_loc_data = list(id_to_loc.items())
         
         self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data)
@@ -302,9 +306,9 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> state
-            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row))
+            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row), protocol=-1)
 
-        key_to_ids_data = [(self._keys_prefix + key, pickle.dumps(key_to_ids[key])) for key in key_to_ids]
+        key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
         id_to_loc_data = list(id_to_loc.items())
         
         self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data)
\ No newline at end of file
diff --git a/speed_test.py b/speed_test.py
index 5f8b435..74e026b 100644
--- a/speed_test.py
+++ b/speed_test.py
@@ -63,8 +63,8 @@
 Kings Mills, OH
 knoxville Tennessee
 BrooklYn""".split("\n")
-# for x in range (10):
-#     city_tests += city_tests
+for x in range (5):
+    city_tests += city_tests
 print("measuring time for {} cities... ".format(len(city_tests)), end="")
 start_time = time.time()
 for city in city_tests:

From 4963a3f9bbce0ca02bd4c1d8a2025dc2b946fb65 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Sun, 25 Oct 2020 16:50:57 -0700
Subject: [PATCH 08/12] use raw json

---
 litecoder/__init__.py |  2 +-
 litecoder/usa.py      | 70 +++++++------------------------------------
 2 files changed, 12 insertions(+), 60 deletions(-)

diff --git a/litecoder/__init__.py b/litecoder/__init__.py
index d3249e5..ec7ce0a 100644
--- a/litecoder/__init__.py
+++ b/litecoder/__init__.py
@@ -11,7 +11,7 @@
 
 US_STATE_PATH = os.path.join(DATA_DIR, 'us-states.marisa')
 
-US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities-ujson.marisa')
+US_CITY_PATH = os.path.join(DATA_DIR, 'us-cities.marisa')
 
 
 logging.basicConfig(
diff --git a/litecoder/usa.py b/litecoder/usa.py
index 4908231..758bee0 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -3,7 +3,7 @@
 import re
 import marisa_trie
 import _pickle as pickle
-import ujson
+import ujson as json
 
 from tqdm import tqdm
 from collections import defaultdict
@@ -148,49 +148,6 @@ def state_key_iter(row):
         yield ' '.join((abbr, usa))
 
 
-class Match:
-
-    def __init__(self, row):
-        """Set model class, PK, metadata.
-        """
-        state = inspect(row)
-
-        # Don't store the actual row, so we can serialize.
-        self._model_cls = state.class_
-        self._pk = state.identity
-
-        self.data = Box(dict(row))
-
-    @cached_property
-    def db_row(self):
-        """Hydrate database row, lazily.
-        """
-        return self._model_cls.query.get(self._pk)
-
-
-class CityMatch(Match):
-
-    def __repr__(self):
-        return '%s<%s, %s, %s, wof:%d>' % (
-            self.__class__.__name__,
-            self.data.name,
-            self.data.name_a1,
-            self.data.name_a0,
-            self.data.wof_id,
-        )
-
-
-class StateMatch(Match):
-
-    def __repr__(self):
-        return '%s<%s, %s, wof:%d>' % (
-            self.__class__.__name__,
-            self.data.name,
-            self.data.name_a0,
-            self.data.wof_id,
-        )
-
-
 class Index:
 
     # city keys -> ids = A
@@ -224,12 +181,9 @@ def __getitem__(self, text):
         if normalized_key not in self._trie:
             return None
 
-        ids = ujson.loads(self._trie[normalized_key][0])
+        ids = json.loads(self._trie[normalized_key][0])
 
-        gc.disable()
-        z= [pickle.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
-        gc.enable()
-        return z
+        return [json.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
 
     # def add_key(self, key, id):
     #     self._key_to_ids[key].add(id)
@@ -267,7 +221,7 @@ def build(self):
         logger.info('Indexing US cities.')
 
         key_to_ids = defaultdict(set)
-        id_to_loc = dict()
+        id_to_loc_items = list()
 
         for row in tqdm(cities):
 
@@ -276,12 +230,11 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> city
-            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(CityMatch(row), protocol=-1)
+            id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8")))
 
-        key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
-        id_to_loc_data = list(id_to_loc.items())
+        key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
         
-        self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data)
+        self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items)
 
 
 class USStateIndex(Index):
@@ -297,7 +250,7 @@ def build(self):
         logger.info('Indexing US states.')
 
         key_to_ids = defaultdict(set)
-        id_to_loc = dict()
+        id_to_loc_items = list()
 
         for row in tqdm(states):
 
@@ -306,9 +259,8 @@ def build(self):
                 key_to_ids[key].add(str(row.wof_id))
 
             # ID -> state
-            id_to_loc[self._ids_prefix + str(row.wof_id)] = pickle.dumps(StateMatch(row), protocol=-1)
+            id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8")))
 
-        key_to_ids_data = [(self._keys_prefix + key, bytes(ujson.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
-        id_to_loc_data = list(id_to_loc.items())
+        key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
         
-        self._trie = marisa_trie.BytesTrie(key_to_ids_data + id_to_loc_data)
\ No newline at end of file
+        self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items)
\ No newline at end of file

From 6eacada3c0eac767596188079fdd6306c0284453 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Wed, 28 Oct 2020 11:56:07 -0700
Subject: [PATCH 09/12] remove unused imports

---
 litecoder/usa.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/litecoder/usa.py b/litecoder/usa.py
index 758bee0..a0b5561 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -2,7 +2,6 @@
 
 import re
 import marisa_trie
-import _pickle as pickle
 import ujson as json
 
 from tqdm import tqdm
@@ -10,7 +9,6 @@
 from itertools import product
 from cached_property import cached_property
 from box import Box
-import gc
 
 from sqlalchemy.inspection import inspect
 

From cfff9b2c2c369835969ff2f894aef36ec16440bd Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Wed, 28 Oct 2020 17:09:22 -0700
Subject: [PATCH 10/12] cleanup

---
 litecoder/usa.py                     |  11 +--
 speed_test.py                        | 127 ---------------------------
 tests/runtime/concurrency_test.py    |  45 ++++++++++
 tests/runtime/speed_test.py          |  44 ++++++++++
 tests/runtime/test_city_lookups.txt  |  50 +++++++++++
 tests/runtime/test_state_lookups.txt |  50 +++++++++++
 6 files changed, 192 insertions(+), 135 deletions(-)
 delete mode 100644 speed_test.py
 create mode 100644 tests/runtime/concurrency_test.py
 create mode 100644 tests/runtime/speed_test.py
 create mode 100644 tests/runtime/test_city_lookups.txt
 create mode 100644 tests/runtime/test_state_lookups.txt

diff --git a/litecoder/usa.py b/litecoder/usa.py
index a0b5561..0afabf4 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -14,6 +14,8 @@
 
 from . import logger, US_CITY_PATH, US_STATE_PATH
 from .models import WOFRegion, WOFLocality
+
+
 # TODO: Country alt-names YAML.
 USA_NAMES = (
     'USA',
@@ -154,7 +156,6 @@ class Index:
     # state ids -> loc = D
 
     def load(self, path):
-        print(path)
         self._trie.load(path)
 
     def __init__(self):
@@ -183,12 +184,6 @@ def __getitem__(self, text):
 
         return [json.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
 
-    # def add_key(self, key, id):
-    #     self._key_to_ids[key].add(id)
-
-    # def add_location(self, id, location):
-    #     self._id_to_loc[id] = location
-
     def locations(self):
         return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_prefix)]
 
@@ -261,4 +256,4 @@ def build(self):
 
         key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
         
-        self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items)
\ No newline at end of file
+        self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items)
diff --git a/speed_test.py b/speed_test.py
deleted file mode 100644
index 74e026b..0000000
--- a/speed_test.py
+++ /dev/null
@@ -1,127 +0,0 @@
-from litecoder.usa import USCityIndex, USStateIndex
-import time
-
-print("Loading USCityIndex... ", end="")
-start_time = time.time()
-city_idx = USCityIndex()
-city_idx.load()
-print("finished: {}s!".format(time.time() - start_time))
-
-print("Loading USStateIndex... ", end="")
-start_time = time.time()
-state_idx = USStateIndex()
-state_idx.load()
-print("finished: {}s!".format(time.time() - start_time))
-
-city_tests = """Edinburg, Texas
-Lakeville , Minnesota
-Woodland, CA.
-Gary, IN
-Cornelius, NC
-Okeechobee, Fl
-Saginaw Township South, MI
-Lansdowne, PA
-Knoxville TN
-OAKLAND, CA
-suffolk va
-Port Orange, FL
-Sedona, AZ
-Cedar City UT 
-Cincinnati. 
-Huntington Beach CA
-Wooster,Ohio
-Lewisville, Texas
-traverse city mi
-Pennsauken, New Jersey
-Jonesboro, Arkansas
-Zephyrhills, FL
-West Jefferson, NC
-Escondido, CA 
-Lumberton, NC
-Cayce, SC
-Stratford, Connecticut, USA
-Avondale, AZ
-Coral Springs, FL 
-Gaithersburg, MD
-Westchester, IL
-Louisa, Virginia 
-Norway, ME
-Philadelphia PA, USA
-Fort worth, tx
-Eureka Springs, Arkansas
-Nashville , TN
-Ellenwood Ga
-Floral Park, NY
-Nashville Tennessee
-Malvern, AR
-Valdosta, Georgia
-Valley Center Ca
-St. Robert Mo. 
-Hollandale, MS
-New Castle, PA 
-Harlem, FL
-Kings Mills, OH
-knoxville Tennessee
-BrooklYn""".split("\n")
-for x in range (5):
-    city_tests += city_tests
-print("measuring time for {} cities... ".format(len(city_tests)), end="")
-start_time = time.time()
-for city in city_tests:
-    x = city_idx[city]
-print("finished: took {}s!".format(1000*(time.time() - start_time)))
-state_tests = """North Carolina, USA   
-District of Columbia
-Illinois, United States
-Georgia United States 
-north carolina
-texas
-iowa
-Florida, United States
-Vermont,  USA
-TX USA 
-FL U.S.A.
-pennsylvania usa
-nebraska 
- Oregon 
-Pennsylvania 
-New Hampshire  USA
-Nebraska, USA
-New mexico
-Indiana    
-South Dakota 
- Oklahoma
-Ohio,US
-Kansas, USA 
-indiana
-MA, USA
- New York
-Ohio, United States
-NJ USA
-ohio usa
-Connecticut, USA
-MICHIGAN, United States
-Missouri
-New York
-California - USA
-Massachusetts, USA 
- Missouri
-FL, United States of America
-New Hampshire
-Georgia.
-Nevada USA
- PENNSYLVANIA
-Virginia, USA.
-Alabama, USA
-Indiana 
-Louisiana, United States
-New Mexico 
-Ohio USA
-Nevada, USA
-LOUISIANA
-New Jersey, us""".split("\n")
-print("measuring time for {} states... ".format(len(state_tests)), end="")
-start_time = time.time()
-for state in state_tests:
-    x = state_idx[state]
-print("finished: took {}s!".format(1000 * (time.time() - start_time)))
diff --git a/tests/runtime/concurrency_test.py b/tests/runtime/concurrency_test.py
new file mode 100644
index 0000000..6adc618
--- /dev/null
+++ b/tests/runtime/concurrency_test.py
@@ -0,0 +1,66 @@
+"""Benchmark concurrent USCityIndex lookups across worker processes.
+
+Runs the same batch of city lookups once in each of NUM_PROCESSES worker
+processes, then runs the same total number of lookups serially in the parent
+process, printing wall-clock timings for both.
+"""
+
+import os
+import time
+from multiprocessing import Pool
+
+from litecoder.usa import USCityIndex, USStateIndex
+
+NUM_PROCESSES = 4
+
+# Resolve the fixture next to this script so the benchmark does not depend on
+# the current working directory.
+LOOKUPS_PATH = os.path.join(
+    os.path.dirname(os.path.abspath(__file__)), "test_city_lookups.txt"
+)
+
+# Load the 50 test city lookups (one query per line).
+with open(LOOKUPS_PATH, "r") as lookups_file:
+    city_tests = lookups_file.read().splitlines()
+
+# Increase the number of lookups for the speed test if necessary; each pass
+# doubles the list, so 10 passes give 1024x the original queries.
+for _ in range(10):
+    city_tests += city_tests
+num_tests_per_process = len(city_tests)
+num_tests = NUM_PROCESSES * num_tests_per_process
+
+# Load USCityIndex at module level: lookup_cities() reads it as a global from
+# inside the worker processes.
+city_idx = USCityIndex()
+city_idx.load()
+
+
+def lookup_cities(process_num):
+    """Run every query in city_tests once and print this worker's timing."""
+    print('Process {}: looking up {} cities'.format(process_num, num_tests_per_process))
+    start_time = time.perf_counter()
+    for city in city_tests:
+        city_idx[city]
+    ms = 1000*(time.perf_counter() - start_time)
+    print("Process {}: finished, took {}ms @ {} ms/lookup!".format(process_num, ms, ms/num_tests_per_process))
+
+
+if __name__ == '__main__':
+    print("Looking up {} cities on {} processes...".format(num_tests, NUM_PROCESSES))
+    start_time = time.perf_counter()
+    # Size the pool from NUM_PROCESSES so the worker count matches the
+    # number of tasks and the figure printed above.
+    with Pool(NUM_PROCESSES) as p:
+        p.map(lookup_cities, range(1, NUM_PROCESSES+1))
+    ms = 1000*(time.perf_counter() - start_time)
+    print("Fully finished: took {}ms @ {} ms/lookup!".format(ms, ms/num_tests))
+
+    print()
+    print("Looking up all {} cities on one process...".format(num_tests), end="")
+    start_time = time.perf_counter()
+    for i in range(NUM_PROCESSES):
+        for city in city_tests:
+            city_idx[city]
+    ms = 1000*(time.perf_counter() - start_time)
+    print("finished: took {}ms @ {} ms/lookup!".format(ms, ms/num_tests))
diff --git a/tests/runtime/speed_test.py b/tests/runtime/speed_test.py
new file mode 100644
index 0000000..de226d9
--- /dev/null
+++ b/tests/runtime/speed_test.py
@@ -0,0 +1,44 @@
+from litecoder.usa import USCityIndex, USStateIndex
+import time
+
+print("Loading USCityIndex... ", end="")
+start_time = time.time()
+city_idx = USCityIndex()
+city_idx.load()
+print("finished: {}s!".format(time.time() - start_time))
+
+# Load 50 test city lookups
+with open("tests/runtime/test_city_lookups.txt", "r") as lookups_file:
+    city_tests = lookups_file.read().splitlines()
+
+# Increase the number of lookups for the speed test if necessary
+for x in range (5):
+    city_tests += city_tests
+num_tests = len(city_tests)
+print("measuring time for {} cities... ".format(num_tests), end="")
+start_time = time.time()
+for city in city_tests:
+    city_idx[city]
+ms = 1000*(time.time() - start_time)
+print("finished: took {}ms at {} ms/lookup!".format(ms, float(ms/num_tests)))
+
+print("Loading USStateIndex... ", end="")
+start_time = time.time()
+state_idx = USStateIndex()
+state_idx.load()
+print("finished: {}s!".format(time.time() - start_time))
+
+# Load 50 test state lookups
+with open("tests/runtime/test_state_lookups.txt", "r") as lookups_file:
+    state_tests = lookups_file.read().splitlines()
+
+# Increase the number of lookups for the speed test if necessary
+for x in range (5):
+    state_tests += state_tests
+num_tests = len(state_tests)
+print("measuring time for {} states... ".format(num_tests), end="")
+start_time = time.time()
+for state in state_tests:
+    state_idx[state]
+ms = 1000*(time.time() - start_time)
+print("finished: took {}ms at {} ms/lookup!".format(ms, float(ms/num_tests)))
diff --git a/tests/runtime/test_city_lookups.txt b/tests/runtime/test_city_lookups.txt
new file mode 100644
index 0000000..43d1bd5
--- /dev/null
+++ b/tests/runtime/test_city_lookups.txt
@@ -0,0 +1,50 @@
+Edinburg, Texas
+Lakeville , Minnesota
+Woodland, CA.
+Gary, IN
+Cornelius, NC
+Okeechobee, Fl
+Saginaw Township South, MI
+Lansdowne, PA
+Knoxville TN
+OAKLAND, CA
+suffolk va
+Port Orange, FL
+Sedona, AZ
+Cedar City UT 
+Cincinnati. 
+Huntington Beach CA
+Wooster,Ohio
+Lewisville, Texas
+traverse city mi
+Pennsauken, New Jersey
+Jonesboro, Arkansas
+Zephyrhills, FL
+West Jefferson, NC
+Escondido, CA 
+Lumberton, NC
+Cayce, SC
+Stratford, Connecticut, USA
+Avondale, AZ
+Coral Springs, FL 
+Gaithersburg, MD
+Westchester, IL
+Louisa, Virginia 
+Norway, ME
+Philadelphia PA, USA
+Fort worth, tx
+Eureka Springs, Arkansas
+Nashville , TN
+Ellenwood Ga
+Floral Park, NY
+Nashville Tennessee
+Malvern, AR
+Valdosta, Georgia
+Valley Center Ca
+St. Robert Mo. 
+Hollandale, MS
+New Castle, PA 
+Harlem, FL
+Kings Mills, OH
+knoxville Tennessee
+BrooklYn
\ No newline at end of file
diff --git a/tests/runtime/test_state_lookups.txt b/tests/runtime/test_state_lookups.txt
new file mode 100644
index 0000000..fd8b3c9
--- /dev/null
+++ b/tests/runtime/test_state_lookups.txt
@@ -0,0 +1,50 @@
+North Carolina, USA   
+District of Columbia
+Illinois, United States
+Georgia United States 
+north carolina
+texas
+iowa
+Florida, United States
+Vermont,  USA
+TX USA 
+FL U.S.A.
+pennsylvania usa
+nebraska 
+ Oregon 
+Pennsylvania 
+New Hampshire  USA
+Nebraska, USA
+New mexico
+Indiana    
+South Dakota 
+ Oklahoma
+Ohio,US
+Kansas, USA 
+indiana
+MA, USA
+ New York
+Ohio, United States
+NJ USA
+ohio usa
+Connecticut, USA
+MICHIGAN, United States
+Missouri
+New York
+California - USA
+Massachusetts, USA 
+ Missouri
+FL, United States of America
+New Hampshire
+Georgia.
+Nevada USA
+ PENNSYLVANIA
+Virginia, USA.
+Alabama, USA
+Indiana 
+Louisiana, United States
+New Mexico 
+Ohio USA
+Nevada, USA
+LOUISIANA
+New Jersey, us
\ No newline at end of file

From a2f49156f7c98fcbef254ec2081a94517a2cdfd6 Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Wed, 28 Oct 2020 17:12:44 -0700
Subject: [PATCH 11/12] Update tests to match new match output format

---
 tests/prod_db/conftest.py            | 2 +-
 tests/prod_db/test_us_city_index.py  | 4 ++--
 tests/prod_db/test_us_state_index.py | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/prod_db/conftest.py b/tests/prod_db/conftest.py
index a9420a6..06583e3 100644
--- a/tests/prod_db/conftest.py
+++ b/tests/prod_db/conftest.py
@@ -16,4 +16,4 @@ def city_idx():
 def state_idx():
     state_idx = USStateIndex()
     state_idx.load()
-    return state_idx
\ No newline at end of file
+    return state_idx
diff --git a/tests/prod_db/test_us_city_index.py b/tests/prod_db/test_us_city_index.py
index aec57f0..e05cfa2 100644
--- a/tests/prod_db/test_us_city_index.py
+++ b/tests/prod_db/test_us_city_index.py
@@ -33,7 +33,7 @@ def test_cases(city_idx, query, matches, xfail):
 
     res = city_idx[query]
 
-    ids = [r.data.wof_id for r in res]
+    ids = [r["wof_id"] for r in res]
 
     # Exact id list match.
     assert sorted(ids) == sorted(matches)
@@ -49,6 +49,6 @@ def test_topn(city_idx, city):
     """Smoke test N most populous cities.
     """
     res = city_idx['%s, %s' % (city.name, city.name_a1)]
-    res_ids = [r.data.wof_id for r in res]
+    res_ids = [r["wof_id"] for r in res]
 
     assert city.wof_id in res_ids
diff --git a/tests/prod_db/test_us_state_index.py b/tests/prod_db/test_us_state_index.py
index 9adafc6..2ae1561 100644
--- a/tests/prod_db/test_us_state_index.py
+++ b/tests/prod_db/test_us_state_index.py
@@ -28,7 +28,7 @@ def test_cases(state_idx, query, matches):
 
     res = state_idx[query]
 
-    ids = [r.data.wof_id for r in res]
+    ids = [r["wof_id"] for r in res]
 
     assert sorted(ids) == sorted(matches)
 
@@ -41,6 +41,6 @@ def test_all(state_idx, state):
     """Smoke test N most populous cities.
     """
     res = state_idx[state.name]
-    res_ids = [r.data.wof_id for r in res]
+    res_ids = [r["wof_id"] for r in res]
 
     assert state.wof_id in res_ids

From 0ad83aece64cb1c3f96aad6f4b97dbf6cf6bf94f Mon Sep 17 00:00:00 2001
From: Sheshank Shankar <sheshank@outlook.com>
Date: Wed, 18 Nov 2020 14:50:06 -0800
Subject: [PATCH 12/12] Resolve merge request reviews

---
 Pipfile                     |  1 +
 litecoder/usa.py            | 34 +++++++++++++++++-----------------
 tests/runtime/speed_test.py |  4 ++--
 3 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/Pipfile b/Pipfile
index b65227a..576163e 100644
--- a/Pipfile
+++ b/Pipfile
@@ -35,6 +35,7 @@ PyYAML = "*"
 Shapely = "*"
 numpy = "*"
 scipy = "*"
+marisa_trie = "*"
 
 [dev-packages]
 
diff --git a/litecoder/usa.py b/litecoder/usa.py
index 0afabf4..362d485 100644
--- a/litecoder/usa.py
+++ b/litecoder/usa.py
@@ -150,16 +150,16 @@ def state_key_iter(row):
 
 class Index:
 
-    # city keys -> ids = A
-    # city ids -> loc = B
-    # state keys -> ids = C
-    # state ids -> loc = D
-
-    def load(self, path):
-        self._trie.load(path)
+    def load(self, path, mmap=False):
+        if mmap:
+            self._trie.mmap(path)
+        else:
+            self._trie.load(path)
 
     def __init__(self):
         self._trie = marisa_trie.BytesTrie()
+
+        # We use prefixes here to store the keys -> ids and ids -> loc "maps" as subtrees in one marisa trie.
         self._keys_prefix = "A"
         self._ids_prefix = "B"
 
@@ -177,15 +177,15 @@ def __getitem__(self, text):
         """Get ids, map to records only if there is a match in the index
         """
         normalized_key = self._keys_prefix + keyify(text)
-        if normalized_key not in self._trie:
+        val = self._trie.get(normalized_key, None)
+        if not val:
             return None
-
-        ids = json.loads(self._trie[normalized_key][0])
+        ids = json.loads(val[0])
 
         return [json.loads(self._trie[self._ids_prefix + id][0]) for id in ids]
 
     def locations(self):
-        return [loc for (id, loc) in self._trie.items() if id.startswith(self._ids_prefix)]
+        return self._trie.items(self._ids_prefix)
 
     def save(self, path):
         self._trie.save(path)
@@ -193,8 +193,8 @@ def save(self, path):
 
 class USCityIndex(Index):
 
-    def load(self, path=US_CITY_PATH):
-        return super().load(path)
+    def load(self, path=US_CITY_PATH, mmap=False):
+        return super().load(path, mmap)
 
     def __init__(self, bare_name_blocklist=None):
         super().__init__()
@@ -225,15 +225,15 @@ def build(self):
             # ID -> city
             id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8")))
 
-        key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
+        key_to_ids_items = [(self._keys_prefix + key, json.dumps(list(key_to_ids[key])).encode("utf-8")) for key in key_to_ids]
         
         self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items)
 
 
 class USStateIndex(Index):
 
-    def load(self, path=US_STATE_PATH):
-        return super().load(path)
+    def load(self, path=US_STATE_PATH, mmap=False):
+        return super().load(path, mmap)
 
     def build(self):
         """Index all US states.
@@ -254,6 +254,6 @@ def build(self):
             # ID -> state
             id_to_loc_items.append((self._ids_prefix + str(row.wof_id), bytes(json.dumps(dict(row)), encoding="utf-8")))
 
-        key_to_ids_items = [(self._keys_prefix + key, bytes(json.dumps(list(key_to_ids[key])), encoding="utf-8")) for key in key_to_ids]
+        key_to_ids_items = [(self._keys_prefix + key, json.dumps(list(key_to_ids[key])).encode("utf-8")) for key in key_to_ids]
         
         self._trie = marisa_trie.BytesTrie(key_to_ids_items + id_to_loc_items)
diff --git a/tests/runtime/speed_test.py b/tests/runtime/speed_test.py
index de226d9..17a7092 100644
--- a/tests/runtime/speed_test.py
+++ b/tests/runtime/speed_test.py
@@ -8,7 +8,7 @@
 print("finished: {}s!".format(time.time() - start_time))
 
 # Load 50 test city lookups
-with open("tests/runtime/test_city_lookups.txt", "r") as lookups_file:
+with open("test_city_lookups.txt", "r") as lookups_file:
     city_tests = lookups_file.read().splitlines()
 
 # Increase the number of lookups for the speed test if necessary
@@ -29,7 +29,7 @@
 print("finished: {}s!".format(time.time() - start_time))
 
 # Load 50 test state lookups
-with open("tests/runtime/test_state_lookups.txt", "r") as lookups_file:
+with open("test_state_lookups.txt", "r") as lookups_file:
     state_tests = lookups_file.read().splitlines()
 
 # Increase the number of lookups for the speed test if necessary