diff --git a/dataset/dataset.py b/dataset/dataset.py
index f5eb3ef1a..2ab45b541 100644
--- a/dataset/dataset.py
+++ b/dataset/dataset.py
@@ -79,7 +79,7 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
             specifies the column containing the source for each "mention" of an entity.
         """
-        tic = time.clock()
+        tic = time.time()
         try:
             # Do not include TID and source column as trainable attributes
             exclude_attr_cols = ['_tid_']
@@ -121,7 +121,7 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
         except Exception:
             logging.error('loading data for table %s', name)
             raise
-        toc = time.clock()
+        toc = time.time()
         load_time = toc - tic
         return status, load_time
@@ -218,9 +218,9 @@ def get_statistics(self):
         """
         if not self.stats_ready:
             logging.debug('computing frequency and co-occurrence statistics from raw data...')
-            tic = time.clock()
+            tic = time.time()
             self.collect_stats()
-            logging.debug('DONE computing statistics in %.2fs', time.clock() - tic)
+            logging.debug('DONE computing statistics in %.2fs', time.time() - tic)
         stats = (self.total_tuples, self.single_attr_stats, self.pair_attr_stats)
         self.stats_ready = True
@@ -287,7 +287,7 @@ def get_domain_info(self):
         return total_vars, classes
 
     def get_inferred_values(self):
-        tic = time.clock()
+        tic = time.time()
         # index into domain with inferred_val_idx + 1 since SQL arrays begin at index 1.
         query = "SELECT t1._tid_, t1.attribute, domain[inferred_val_idx + 1] as rv_value " \
                 "FROM " \
@@ -298,12 +298,12 @@
         self.generate_aux_table_sql(AuxTables.inf_values_dom, query, index_attrs=['_tid_'])
         self.aux_table[AuxTables.inf_values_dom].create_db_index(self.engine, ['attribute'])
         status = "DONE collecting the inferred values."
-        toc = time.clock()
+        toc = time.time()
         total_time = toc - tic
         return status, total_time
 
     def get_repaired_dataset(self):
-        tic = time.clock()
+        tic = time.time()
         init_records = self.raw_data.df.sort_values(['_tid_']).to_records(index=False)
         t = self.aux_table[AuxTables.inf_values_dom]
         repaired_vals = dictify_df(t.df.reset_index())
@@ -315,6 +315,6 @@
         self.repaired_data = Table(name, Source.DF, df=repaired_df)
         self.repaired_data.store_to_db(self.engine.engine)
         status = "DONE generating repaired dataset"
-        toc = time.clock()
+        toc = time.time()
         total_time = toc - tic
         return status, total_time
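Note: time.clock() was deprecated in Python 3.3 and removed outright in Python 3.8, which is what forces every timer in this patch over to time.time(). The two clocks are not equivalent: on Linux, time.clock() reported CPU time, whereas time.time() reports wall-clock time, so the logged durations now also count time spent blocked on I/O and the database. A minimal standalone sketch of the tic/toc pattern used throughout, with time.sleep standing in for the timed work (time.perf_counter() is the stdlib's recommended clock for pure interval timing, though the patch standardizes on time.time()):

    import time

    tic = time.time()        # wall-clock start; unaffected by the Python 3.8 removal of time.clock()
    time.sleep(0.25)         # stand-in for the timed section, e.g. a load or a query
    toc = time.time()
    print('elapsed: %.2f secs' % (toc - tic))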
@@ -62,23 +62,23 @@ def execute_query(self, query):
         :param query: (str) SQL query to be executed
         """
-        tic = time.clock()
+        tic = time.time()
         conn = self.engine.connect()
         result = conn.execute(query).fetchall()
         conn.close()
-        toc = time.clock()
+        toc = time.time()
         logging.debug('Time to execute query: %.2f secs', toc-tic)
         return result
 
     def create_db_table_from_query(self, name, query):
-        tic = time.clock()
+        tic = time.time()
         drop = drop_table_template.substitute(table=name)
         create = create_table_template.substitute(table=name, stmt=query)
         conn = self.engine.connect()
         conn.execute(drop)
         conn.execute(create)
         conn.close()
-        toc = time.clock()
+        toc = time.time()
         logging.debug('Time to create table: %.2f secs', toc-tic)
         return True
@@ -95,11 +95,11 @@ def create_db_index(self, name, table, attr_list):
         # We need to quote each attribute since Postgres auto-downcases unquoted column references
         quoted_attrs = map(lambda attr: '"{}"'.format(attr), attr_list)
         stmt = index_template.substitute(idx_title=name, table=table, attrs=','.join(quoted_attrs))
-        tic = time.clock()
+        tic = time.time()
         conn = self.engine.connect()
         result = conn.execute(stmt)
         conn.close()
-        toc = time.clock()
+        toc = time.time()
         logging.debug('Time to create index: %.2f secs', toc-tic)
         return result
@@ -113,13 +113,13 @@ def _execute_query(args, conn_args):
     query_id = args[0]
     query = args[1]
     logging.debug("Starting to execute query %s with id %s", query, query_id)
-    tic = time.clock()
+    tic = time.time()
     con = psycopg2.connect(conn_args)
     cur = con.cursor()
     cur.execute(query)
     res = cur.fetchall()
     con.close()
-    toc = time.clock()
+    toc = time.time()
     logging.debug('Time to execute query with id %d: %.2f secs', query_id, (toc - tic))
     return res
@@ -129,7 +129,7 @@ def _execute_query_w_backup(args, conn_args, timeout):
     query = args[1][0]
     query_backup = args[1][1]
     logging.debug("Starting to execute query %s with id %s", query, query_id)
-    tic = time.clock()
+    tic = time.time()
     con = psycopg2.connect(conn_args)
     cur = con.cursor()
     cur.execute("SET statement_timeout to %d;"%timeout)
@@ -151,6 +151,6 @@
         cur.execute(query_backup)
         res = cur.fetchall()
     con.close()
-    toc = time.clock()
+    toc = time.time()
     logging.debug('Time to execute query with id %d: %.2f secs', query_id, toc - tic)
     return res
diff --git a/dcparser/dcparser.py b/dcparser/dcparser.py
index b764a3ebf..760b4a31a 100644
--- a/dcparser/dcparser.py
+++ b/dcparser/dcparser.py
@@ -26,10 +26,10 @@ def load_denial_constraints(self, fpath):
         :param fpath: filepath to TXT file containing denial constraints
         """
-        tic = time.clock()
+        tic = time.time()
         if not self.ds.raw_data:
             status = 'No dataset specified'
-            toc = time.clock()
+            toc = time.time()
             return status, toc - tic
         attrs = self.ds.raw_data.get_attributes()
         try:
@@ -47,7 +47,7 @@ def load_denial_constraints(self, fpath):
         except Exception:
             logging.error('FAILED to load constraints from file %s', os.path.basename(fpath))
             raise
-        toc = time.clock()
+        toc = time.time()
         return status, toc - tic
 
     def get_dcs(self):
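Note: the _execute_query_w_backup hunks above only touch the timers, but for context they sit inside a timeout-with-fallback pattern: the function sets a Postgres statement_timeout on the session, runs the primary query, and falls back to a cheaper backup query if the first one is cancelled. A condensed sketch under stated assumptions; the name execute_with_backup is hypothetical, and catching psycopg2.extensions.QueryCanceledError is my reading of how the timeout surfaces rather than something shown in these hunks:

    import psycopg2
    import psycopg2.extensions

    def execute_with_backup(conn_args, query, query_backup, timeout_ms):
        # Hypothetical condensation of dbengine._execute_query_w_backup.
        con = psycopg2.connect(conn_args)
        cur = con.cursor()
        cur.execute("SET statement_timeout to %d;" % timeout_ms)  # session limit, in milliseconds
        try:
            cur.execute(query)
            res = cur.fetchall()
        except psycopg2.extensions.QueryCanceledError:
            # Postgres cancels the statement once the timeout elapses; roll back
            # the aborted transaction before retrying with the backup query.
            con.rollback()
            cur.execute(query_backup)
            res = cur.fetchall()
        con.close()
        return res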
diff --git a/detect/detect.py b/detect/detect.py
index 776443966..cdf6e17e6 100644
--- a/detect/detect.py
+++ b/detect/detect.py
@@ -17,7 +17,7 @@ def detect_errors(self, detectors):
         :param detectors: (list) of ErrorDetector objects
         """
         errors = []
-        tic_total = time.clock()
+        tic_total = time.time()
 
         # Initialize all error detectors.
         for detector in detectors:
@@ -25,9 +25,9 @@
         # Run detection using each detector.
         for detector in detectors:
-            tic = time.clock()
+            tic = time.time()
             error_df = detector.detect_noisy_cells()
-            toc = time.clock()
+            toc = time.time()
             logging.debug("DONE with Error Detector: %s in %.2f secs", detector.name, toc-tic)
             errors.append(error_df)
@@ -39,7 +39,7 @@
         # Store errors to db.
         self.store_detected_errors(errors_df)
         status = "DONE with error detection."
-        toc_total = time.clock()
+        toc_total = time.time()
         detect_time = toc_total - tic_total
         return status, detect_time
diff --git a/domain/domain.py b/domain/domain.py
index 8c1a0948c..eb4930add 100644
--- a/domain/domain.py
+++ b/domain/domain.py
@@ -134,9 +134,9 @@ def setup_attributes(self):
         self.total = total
         self.single_stats = single_stats
         logging.debug("preparing pruned co-occurring statistics...")
-        tic = time.clock()
+        tic = time.time()
         self.pair_stats = self._pruned_pair_stats(pair_stats)
-        logging.debug("DONE with pruned co-occurring statistics in %.2f secs", time.clock() - tic)
+        logging.debug("DONE with pruned co-occurring statistics in %.2f secs", time.time() - tic)
         self.setup_complete = True
 
     def _pruned_pair_stats(self, pair_stats):
@@ -231,7 +231,7 @@ def generate_domain(self):
                 "Call to setup active attributes. Error detection should be performed before setup.")
 
         logging.debug('generating initial set of un-pruned domain values...')
-        tic = time.clock()
+        tic = time.time()
         # Iterate over dataset rows.
         cells = []
         vid = 0
@@ -297,7 +297,7 @@
                        "fixed": cell_status})
             vid += 1
         domain_df = pd.DataFrame(data=cells).sort_values('_vid_')
-        logging.debug('DONE generating initial set of domain values in %.2f', time.clock() - tic)
+        logging.debug('DONE generating initial set of domain values in %.2f', time.time() - tic)
 
         # Skip estimator model since we do not require any weak labelling or domain
         # pruning based on posterior probabilities.
@@ -307,18 +307,18 @@
         # Run pruned domain values from correlated attributes above through
         # posterior model for a naive probability estimation.
         logging.debug('training posterior model for estimating domain value probabilities...')
-        tic = time.clock()
+        tic = time.time()
         estimator = NaiveBayes(self.env, self.ds, domain_df, self.correlations)
-        logging.debug('DONE training posterior model in %.2fs', time.clock() - tic)
+        logging.debug('DONE training posterior model in %.2fs', time.time() - tic)
 
         # Predict probabilities for all pruned domain values.
         logging.debug('predicting domain value probabilities from posterior model...')
-        tic = time.clock()
+        tic = time.time()
         preds_by_cell = estimator.predict_pp_batch()
-        logging.debug('DONE predictions in %.2f secs, re-constructing cell domain...', time.clock() - tic)
+        logging.debug('DONE predictions in %.2f secs, re-constructing cell domain...', time.time() - tic)
 
         logging.debug('re-assembling final cell domain table...')
-        tic = time.clock()
+        tic = time.time()
         # iterate through raw/current data and generate posterior probabilities for
         # weak labelling
         num_weak_labels = 0
@@ -365,7 +365,7 @@
         # update our cell domain df with our new updated domain
         domain_df = pd.DataFrame.from_records(updated_domain_df, columns=updated_domain_df[0].dtype.names).drop('index', axis=1).sort_values('_vid_')
-        logging.debug('DONE assembling cell domain table in %.2fs', time.clock() - tic)
+        logging.debug('DONE assembling cell domain table in %.2fs', time.time() - tic)
         logging.info('number of (additional) weak labels assigned from posterior model: %d', num_weak_labels)
diff --git a/domain/estimators/logistic.py b/domain/estimators/logistic.py
index ace20547b..47639f1a5 100644
--- a/domain/estimators/logistic.py
+++ b/domain/estimators/logistic.py
@@ -71,7 +71,7 @@ def _gen_training_data(self):
         used for training and prediction.
         """
         logging.debug('Logistic: featurizing training data...')
-        tic = time.clock()
+        tic = time.time()
         # Each row corresponds to a possible value for a given attribute
         # and given TID
         self._X = torch.zeros(self.n_samples, self.num_features)
@@ -120,7 +120,7 @@
         # Convert this to a vector of indices rather than a vector mask.
         self._train_idx = (self._train_idx == 1).nonzero()[:,0]
 
-        logging.debug('Logistic: DONE featurization in %.2fs', time.clock() - tic)
+        logging.debug('Logistic: DONE featurization in %.2fs', time.time() - tic)
 
     def _gen_feat_tensor(self, init_row, attr, domain_vals):
         """
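Note: unrelated to the timing fix, the logistic.py context above includes the idiom for turning a 0/1 mask vector into a vector of row indices. A minimal sketch with made-up values:

    import torch

    mask = torch.tensor([1., 0., 1., 1., 0.])   # made-up training mask
    idx = (mask == 1).nonzero()[:, 0]           # nonzero() returns an (N, 1) matrix; [:, 0] flattens it
    print(idx)                                  # tensor([0, 2, 3])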
""" - tic = time.clock() + tic = time.time() try: prec, rec, rep_recall, f1, rep_f1 = self.evaluate_repairs() report = "Precision = %.2f, Recall = %.2f, Repairing Recall = %.2f, F1 = %.2f, Repairing F1 = %.2f, Detected Errors = %d, Total Errors = %d, Correct Repairs = %d, Total Repairs = %d, Total Repairs on correct cells (Grdth present) = %d, Total Repairs on incorrect cells (Grdth present) = %d" % ( @@ -112,7 +112,7 @@ def eval_report(self): logging.error("ERROR generating evaluation report %s" % e) raise - toc = time.clock() + toc = time.time() report_time = toc - tic return report, report_time, eval_report @@ -315,7 +315,7 @@ def log_weak_label_stats(self): logging.debug("weak label statistics:") pd.set_option('display.max_columns', None) pd.set_option('display.max_rows', len(df_stats)) - pd.set_option('display.max_colwidth', -1) + pd.set_option('display.max_colwidth', None) logging.debug("%s", df_stats) pd.reset_option('display.max_columns') pd.reset_option('display.max_rows') diff --git a/repair/repair.py b/repair/repair.py index cea79a076..da7892fff 100644 --- a/repair/repair.py +++ b/repair/repair.py @@ -14,41 +14,41 @@ def __init__(self, env, dataset): self.env = env def setup_featurized_ds(self, featurizers): - tic = time.clock() + tic = time.time() self.feat_dataset = FeaturizedDataset(self.ds, self.env, featurizers) - toc = time.clock() + toc = time.time() status = "DONE setting up featurized dataset." feat_time = toc - tic return status, feat_time def setup_repair_model(self): - tic = time.clock() + tic = time.time() feat_info = self.feat_dataset.featurizer_info output_dim = self.feat_dataset.classes self.repair_model = RepairModel(self.env, feat_info, output_dim, bias=self.env['bias']) - toc = time.clock() + toc = time.time() status = "DONE setting up repair model." setup_time = toc - tic return status, setup_time def fit_repair_model(self): - tic = time.clock() + tic = time.time() X_train, Y_train, mask_train = self.feat_dataset.get_training_data() logging.info('training with %d training examples (cells)', X_train.shape[0]) self.repair_model.fit_model(X_train, Y_train, mask_train) - toc = time.clock() + toc = time.time() status = "DONE training repair model." train_time = toc - tic return status, train_time def infer_repairs(self): - tic = time.clock() + tic = time.time() X_pred, mask_pred, infer_idx = self.feat_dataset.get_infer_data() Y_pred = self.repair_model.infer_values(X_pred, mask_pred) distr_df, infer_val_df = self.get_infer_dataframes(infer_idx, Y_pred) self.ds.generate_aux_table(AuxTables.cell_distr, distr_df, store=True, index_attrs=['_vid_']) self.ds.generate_aux_table(AuxTables.inf_values_idx, infer_val_df, store=True, index_attrs=['_vid_']) - toc = time.clock() + toc = time.time() status = "DONE inferring repairs." infer_time = toc - tic return status, infer_time @@ -84,8 +84,8 @@ def get_infer_dataframes(self, infer_idx, Y_pred): return distr_df, infer_val_df def get_featurizer_weights(self): - tic = time.clock() + tic = time.time() report = self.repair_model.get_featurizer_weights(self.feat_dataset.featurizer_info) - toc = time.clock() + toc = time.time() report_time = toc - tic return report, report_time