16 changes: 8 additions & 8 deletions dataset/dataset.py
@@ -79,7 +79,7 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
specifies the column containing the source for each "mention" of an
entity.
"""
tic = time.clock()
tic = time.time()
try:
# Do not include TID and source column as trainable attributes
exclude_attr_cols = ['_tid_']
@@ -121,7 +121,7 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
except Exception:
logging.error('loading data for table %s', name)
raise
toc = time.clock()
toc = time.time()
load_time = toc - tic
return status, load_time

@@ -218,9 +218,9 @@ def get_statistics(self):
"""
if not self.stats_ready:
logging.debug('computing frequency and co-occurrence statistics from raw data...')
tic = time.clock()
tic = time.time()
self.collect_stats()
logging.debug('DONE computing statistics in %.2fs', time.clock() - tic)
logging.debug('DONE computing statistics in %.2fs', time.time() - tic)

stats = (self.total_tuples, self.single_attr_stats, self.pair_attr_stats)
self.stats_ready = True
@@ -287,7 +287,7 @@ def get_domain_info(self):
return total_vars, classes

def get_inferred_values(self):
tic = time.clock()
tic = time.time()
# index into domain with inferred_val_idx + 1 since SQL arrays begin at index 1.
query = "SELECT t1._tid_, t1.attribute, domain[inferred_val_idx + 1] as rv_value " \
"FROM " \
@@ -298,12 +298,12 @@ def get_inferred_values(self):
self.generate_aux_table_sql(AuxTables.inf_values_dom, query, index_attrs=['_tid_'])
self.aux_table[AuxTables.inf_values_dom].create_db_index(self.engine, ['attribute'])
status = "DONE collecting the inferred values."
toc = time.clock()
toc = time.time()
total_time = toc - tic
return status, total_time

def get_repaired_dataset(self):
tic = time.clock()
tic = time.time()
init_records = self.raw_data.df.sort_values(['_tid_']).to_records(index=False)
t = self.aux_table[AuxTables.inf_values_dom]
repaired_vals = dictify_df(t.df.reset_index())
@@ -315,6 +315,6 @@ def get_repaired_dataset(self):
self.repaired_data = Table(name, Source.DF, df=repaired_df)
self.repaired_data.store_to_db(self.engine.engine)
status = "DONE generating repaired dataset"
toc = time.clock()
toc = time.time()
total_time = toc - tic
return status, total_time
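A note on the change applied throughout this file and the rest of the patch: `time.clock()` was deprecated in Python 3.3 and removed in Python 3.8, so the timers are switched to `time.time()`, which returns wall-clock seconds. A minimal sketch of the pattern, with `time.perf_counter()` shown as the usual alternative for measuring durations (the helper function below is made up, and `perf_counter` is not part of this patch):

```python
import time

def do_work():
    # stand-in for the code being timed
    sum(range(1_000_000))

# On Python 3.8+ the old call no longer exists:
#   tic = time.clock()   ->  AttributeError

tic = time.time()                      # wall-clock timestamp, as used in this patch
do_work()
print('%.2f secs' % (time.time() - tic))

# time.perf_counter() is a monotonic, high-resolution clock and is unaffected
# by system clock adjustments, which makes it the usual choice for durations.
tic = time.perf_counter()
do_work()
print('%.2f secs' % (time.perf_counter() - tic))
```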
28 changes: 14 additions & 14 deletions dataset/dbengine.py
@@ -35,9 +35,9 @@ def execute_queries(self, queries):
:param queries: (list[str]) list of SQL queries to be executed
"""
logging.debug('Preparing to execute %d queries.', len(queries))
tic = time.clock()
tic = time.time()
results = self._apply_func(partial(_execute_query, conn_args=self.conn_args), [(idx, q) for idx, q in enumerate(queries)])
toc = time.clock()
toc = time.time()
logging.debug('Time to execute %d queries: %.2f secs', len(queries), toc-tic)
return results

@@ -48,11 +48,11 @@ def execute_queries_w_backup(self, queries):
:param queries: (list[str]) list of SQL queries to be executed
"""
logging.debug('Preparing to execute %d queries.', len(queries))
tic = time.clock()
tic = time.time()
results = self._apply_func(
partial(_execute_query_w_backup, conn_args=self.conn_args, timeout=self.timeout),
[(idx, q) for idx, q in enumerate(queries)])
toc = time.clock()
toc = time.time()
logging.debug('Time to execute %d queries: %.2f secs', len(queries), toc-tic)
return results

@@ -62,23 +62,23 @@ def execute_query(self, query):

:param query: (str) SQL query to be executed
"""
tic = time.clock()
tic = time.time()
conn = self.engine.connect()
result = conn.execute(query).fetchall()
conn.close()
toc = time.clock()
toc = time.time()
logging.debug('Time to execute query: %.2f secs', toc-tic)
return result

def create_db_table_from_query(self, name, query):
tic = time.clock()
tic = time.time()
drop = drop_table_template.substitute(table=name)
create = create_table_template.substitute(table=name, stmt=query)
conn = self.engine.connect()
conn.execute(drop)
conn.execute(create)
conn.close()
toc = time.clock()
toc = time.time()
logging.debug('Time to create table: %.2f secs', toc-tic)
return True

@@ -95,11 +95,11 @@ def create_db_index(self, name, table, attr_list):
# We need to quote each attribute since Postgres auto-downcases unquoted column references
quoted_attrs = map(lambda attr: '"{}"'.format(attr), attr_list)
stmt = index_template.substitute(idx_title=name, table=table, attrs=','.join(quoted_attrs))
tic = time.clock()
tic = time.time()
conn = self.engine.connect()
result = conn.execute(stmt)
conn.close()
toc = time.clock()
toc = time.time()
logging.debug('Time to create index: %.2f secs', toc-tic)
return result
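Regarding the quoting comment in `create_db_index` above: PostgreSQL folds unquoted identifiers to lower case, so mixed-case column names must be double-quoted when the index statement is built. A standalone illustration with made-up table and column names, not taken from the repository:

```python
# Hypothetical inputs, only for illustration.
attr_list = ['City', 'ZipCode']
quoted_attrs = ['"{}"'.format(attr) for attr in attr_list]
stmt = 'CREATE INDEX "idx_example" ON "example_table" ({})'.format(','.join(quoted_attrs))
print(stmt)
# CREATE INDEX "idx_example" ON "example_table" ("City","ZipCode")
# Without the quotes, Postgres would look for columns named city and zipcode.
```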

@@ -113,13 +113,13 @@ def _execute_query(args, conn_args):
query_id = args[0]
query = args[1]
logging.debug("Starting to execute query %s with id %s", query, query_id)
tic = time.clock()
tic = time.time()
con = psycopg2.connect(conn_args)
cur = con.cursor()
cur.execute(query)
res = cur.fetchall()
con.close()
toc = time.clock()
toc = time.time()
logging.debug('Time to execute query with id %d: %.2f secs', query_id, (toc - tic))
return res

@@ -129,7 +129,7 @@ def _execute_query_w_backup(args, conn_args, timeout):
query = args[1][0]
query_backup = args[1][1]
logging.debug("Starting to execute query %s with id %s", query, query_id)
tic = time.clock()
tic = time.time()
con = psycopg2.connect(conn_args)
cur = con.cursor()
cur.execute("SET statement_timeout to %d;"%timeout)
@@ -151,6 +151,6 @@ def _execute_query_w_backup(args, conn_args, timeout):
cur.execute(query_backup)
res = cur.fetchall()
con.close()
toc = time.clock()
toc = time.time()
logging.debug('Time to execute query with id %d: %.2f secs', query_id, toc - tic)
return res
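For context on `_execute_query_w_backup`: the cursor sets a server-side `statement_timeout`, runs the primary query, and re-runs with `query_backup` when the server cancels the first attempt (the except branch is collapsed in this diff). A rough sketch of that control flow with psycopg2; the exception handling and rollback are assumptions about the hidden branch, not a copy of the repository code:

```python
import logging
import psycopg2
from psycopg2.extensions import QueryCanceledError   # raised when statement_timeout fires

def execute_with_backup(conn_args, query, query_backup, timeout_ms):
    """Run query under a server-side timeout, falling back to query_backup on cancellation."""
    con = psycopg2.connect(conn_args)
    cur = con.cursor()
    cur.execute("SET statement_timeout to %d;" % timeout_ms)
    try:
        cur.execute(query)
        res = cur.fetchall()
    except QueryCanceledError:
        logging.warning('query timed out after %d ms, running backup query', timeout_ms)
        con.rollback()                                # clear the aborted transaction
        cur.execute(query_backup)
        res = cur.fetchall()
    finally:
        con.close()
    return res
```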
6 changes: 3 additions & 3 deletions dcparser/dcparser.py
@@ -26,10 +26,10 @@ def load_denial_constraints(self, fpath):

:param fpath: filepath to TXT file containing denial constraints
"""
tic = time.clock()
tic = time.time()
if not self.ds.raw_data:
status = 'No dataset specified'
toc = time.clock()
toc = time.time()
return status, toc - tic
attrs = self.ds.raw_data.get_attributes()
try:
@@ -47,7 +47,7 @@ def load_denial_constraints(self, fpath):
except Exception:
logging.error('FAILED to load constraints from file %s', os.path.basename(fpath))
raise
toc = time.clock()
toc = time.time()
return status, toc - tic

def get_dcs(self):
8 changes: 4 additions & 4 deletions detect/detect.py
@@ -17,17 +17,17 @@ def detect_errors(self, detectors):
:param detectors: (list) of ErrorDetector objects
"""
errors = []
tic_total = time.clock()
tic_total = time.time()

# Initialize all error detectors.
for detector in detectors:
detector.setup(self.ds, self.env)

# Run detection using each detector.
for detector in detectors:
tic = time.clock()
tic = time.time()
error_df = detector.detect_noisy_cells()
toc = time.clock()
toc = time.time()
logging.debug("DONE with Error Detector: %s in %.2f secs", detector.name, toc-tic)
errors.append(error_df)

@@ -39,7 +39,7 @@ def detect_errors(self, detectors):
# Store errors to db.
self.store_detected_errors(errors_df)
status = "DONE with error detection."
toc_total = time.clock()
toc_total = time.time()
detect_time = toc_total - tic_total
return status, detect_time

20 changes: 10 additions & 10 deletions domain/domain.py
@@ -134,9 +134,9 @@ def setup_attributes(self):
self.total = total
self.single_stats = single_stats
logging.debug("preparing pruned co-occurring statistics...")
tic = time.clock()
tic = time.time()
self.pair_stats = self._pruned_pair_stats(pair_stats)
logging.debug("DONE with pruned co-occurring statistics in %.2f secs", time.clock() - tic)
logging.debug("DONE with pruned co-occurring statistics in %.2f secs", time.time() - tic)
self.setup_complete = True

def _pruned_pair_stats(self, pair_stats):
@@ -231,7 +231,7 @@ def generate_domain(self):
"Call <setup_attributes> to setup active attributes. Error detection should be performed before setup.")

logging.debug('generating initial set of un-pruned domain values...')
tic = time.clock()
tic = time.time()
# Iterate over dataset rows.
cells = []
vid = 0
@@ -297,7 +297,7 @@ def generate_domain(self):
"fixed": cell_status})
vid += 1
domain_df = pd.DataFrame(data=cells).sort_values('_vid_')
logging.debug('DONE generating initial set of domain values in %.2f', time.clock() - tic)
logging.debug('DONE generating initial set of domain values in %.2f', time.time() - tic)

# Skip estimator model since we do not require any weak labelling or domain
# pruning based on posterior probabilities.
@@ -307,18 +307,18 @@ def generate_domain(self):
# Run pruned domain values from correlated attributes above through
# posterior model for a naive probability estimation.
logging.debug('training posterior model for estimating domain value probabilities...')
tic = time.clock()
tic = time.time()
estimator = NaiveBayes(self.env, self.ds, domain_df, self.correlations)
logging.debug('DONE training posterior model in %.2fs', time.clock() - tic)
logging.debug('DONE training posterior model in %.2fs', time.time() - tic)

# Predict probabilities for all pruned domain values.
logging.debug('predicting domain value probabilities from posterior model...')
tic = time.clock()
tic = time.time()
preds_by_cell = estimator.predict_pp_batch()
logging.debug('DONE predictions in %.2f secs, re-constructing cell domain...', time.clock() - tic)
logging.debug('DONE predictions in %.2f secs, re-constructing cell domain...', time.time() - tic)

logging.debug('re-assembling final cell domain table...')
tic = time.clock()
tic = time.time()
# iterate through raw/current data and generate posterior probabilities for
# weak labelling
num_weak_labels = 0
@@ -365,7 +365,7 @@ def generate_domain(self):

# update our cell domain df with our new updated domain
domain_df = pd.DataFrame.from_records(updated_domain_df, columns=updated_domain_df[0].dtype.names).drop('index', axis=1).sort_values('_vid_')
logging.debug('DONE assembling cell domain table in %.2fs', time.clock() - tic)
logging.debug('DONE assembling cell domain table in %.2fs', time.time() - tic)

logging.info('number of (additional) weak labels assigned from posterior model: %d', num_weak_labels)

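The changes in domain.py are the same timer swap, repeated around each stage (pruned statistics, initial domain generation, posterior training, prediction, final assembly). Not part of this patch, but if the `tic`/`toc` bookkeeping ever needs consolidating, a small context manager captures the pattern; the label and body below are made up:

```python
import logging
import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Log the elapsed wall-clock time of the wrapped block."""
    tic = time.time()
    try:
        yield
    finally:
        logging.debug('DONE %s in %.2f secs', label, time.time() - tic)

# Hypothetical usage mirroring the logging calls in generate_domain():
with timed('generating initial set of domain values'):
    cells = [{'_vid_': i} for i in range(3)]    # stand-in for the real work
```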
4 changes: 2 additions & 2 deletions domain/estimators/logistic.py
@@ -71,7 +71,7 @@ def _gen_training_data(self):
used for training and prediction.
"""
logging.debug('Logistic: featurizing training data...')
tic = time.clock()
tic = time.time()
# Each row corresponds to a possible value for a given attribute
# and given TID
self._X = torch.zeros(self.n_samples, self.num_features)
@@ -120,7 +120,7 @@ def _gen_training_data(self):
# Convert this to a vector of indices rather than a vector mask.
self._train_idx = (self._train_idx == 1).nonzero()[:,0]

logging.debug('Logistic: DONE featurization in %.2fs', time.clock() - tic)
logging.debug('Logistic: DONE featurization in %.2fs', time.time() - tic)

def _gen_feat_tensor(self, init_row, attr, domain_vals):
"""
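On the mask-to-index conversion noted in `_gen_training_data` above: `(mask == 1).nonzero()` returns an N x 1 tensor of positions, and `[:, 0]` flattens it into a 1-D vector of indices. A standalone illustration, not taken from the repository:

```python
import torch

train_mask = torch.tensor([0., 1., 0., 1., 1.])     # 1.0 marks a labelled training cell
train_idx = (train_mask == 1).nonzero()[:, 0]        # tensor([1, 3, 4])
print(train_idx)

# Equivalent and a bit more explicit on recent PyTorch versions:
train_idx = torch.nonzero(train_mask == 1, as_tuple=True)[0]
```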
10 changes: 5 additions & 5 deletions evaluate/eval.py
@@ -48,7 +48,7 @@ def __init__(self, env, dataset):
self.ds = dataset

def load_data(self, name, fpath, tid_col, attr_col, val_col, na_values=None):
tic = time.clock()
tic = time.time()
try:
raw_data = pd.read_csv(fpath, na_values=na_values, encoding='utf-8')
# We drop any ground truth values that are NULLs since we follow
@@ -73,7 +73,7 @@ def load_data(self, name, fpath, tid_col, attr_col, val_col, na_values=None):
except Exception:
logging.error('load_data for table %s', name)
raise
toc = time.clock()
toc = time.time()
load_time = toc - tic
return status, load_time

@@ -98,7 +98,7 @@ def eval_report(self):
"""
Returns an EvalReport named tuple containing the experiment results.
"""
tic = time.clock()
tic = time.time()
try:
prec, rec, rep_recall, f1, rep_f1 = self.evaluate_repairs()
report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs on correct cells (Grdth present) = %d, Total Repairs on incorrect cells (Grdth present) = %d" % (
@@ -112,7 +112,7 @@ def eval_report(self):
logging.error("ERROR generating evaluation report %s" % e)
raise

toc = time.clock()
toc = time.time()
report_time = toc - tic
return report, report_time, eval_report

@@ -315,7 +315,7 @@ def log_weak_label_stats(self):
logging.debug("weak label statistics:")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', len(df_stats))
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_colwidth', None)
logging.debug("%s", df_stats)
pd.reset_option('display.max_columns')
pd.reset_option('display.max_rows')
Expand Down
Loading