Changes from all commits (48 commits)
7c1a444  Relocated test data into subdirectories. (richardwu, Apr 5, 2019)
8e1d461  Move active attributes to right after error detection and inside (richardwu, Apr 5, 2019)
38db1c7  Remove 2.7 tests from Travis because of Python 3 features. (richardwu, Apr 5, 2019)
dcd9e0f  Refactor domain generation sort domain by co-occurrence probability and (richardwu, Apr 5, 2019)
c6c1941  Make co-occurrence featurizer only generate co-occurrence features for (richardwu, Apr 5, 2019)
83c941c  Implemented TupleEmbedding model as an estimator. (richardwu, Apr 5, 2019)
8929efc  Always load clean/ground truth as strings since we load/store raw dat… (richardwu, Apr 5, 2019)
1e240a6  Added featurizer for learned embeddings from TupleEmbedding model. (richardwu, Apr 5, 2019)
9c37148  Support multiple layers during repair and made TupleEmbedding dump/load (richardwu, Apr 7, 2019)
d5b4065  Quantization and handling of numerical/mixed data. (zaqthss, Apr 3, 2019)
2c24733  Improved validation logging and fixed a few bugs. (richardwu, Apr 8, 2019)
3fbe38f  Improve validation in TupleEmbedding using pandas dataframes. (richardwu, Apr 9, 2019)
22c903a  Support multi-dimensional quantization. (richardwu, Apr 9, 2019)
1c74de7  Quantize from dict rather than numerical attrs. (richardwu, Apr 18, 2019)
4627e38  Mean/var normalize numerical attributes in context and added (richardwu, Apr 19, 2019)
8d019db  Support specifying n-dimensional numerical attr groups vs splitting on (richardwu, Apr 20, 2019)
9a507d8  Fixed None numerical_attr_groups. (richardwu, Apr 20, 2019)
4bbe318  Fixed report RMS error and converting to floats for quantization. (richardwu, Apr 20, 2019)
d17442c  Added store_to_db flag to load_data, added LR schedule to (richardwu, Apr 28, 2019)
5a0af23  Pre-split domain and ground truth values. (richardwu, Apr 30, 2019)
211bab3  Fixed batch size argument in EmbeddingFeaturizer. (richardwu, May 1, 2019)
5c940a5  Removed numerical_attrs reference from Table. (richardwu, May 1, 2019)
e5e01d0  Fix to how multi-ground truth is handled. Use simplified numerical (richardwu, May 5, 2019)
34adeee  Max domain size need only be as large as largest for categorical (richardwu, May 7, 2019)
d7ade7f  Remove domain for numerical attributes in TupleEmbedding. (richardwu, May 7, 2019)
d9f453a  Fixed some reference issues and added infer all mode. (richardwu, May 17, 2019)
e1a4b88  Fixed _nan_ replacement, max_cat_domain being possibly nan, and (richardwu, May 20, 2019)
b6ba6a7  Do not weak label clean cells and fixed raw data in Logistic estimator. (richardwu, May 25, 2019)
bdd2742  Added ReLU after context for numerical targets in TupleEmbedding and (richardwu, May 28, 2019)
b726749  Use cosine annealing with restart LR schedule and use weak_label instead (richardwu, Jun 2, 2019)
6c86d20  Fixed memory issues with get_features and predict_pp_batch. (richardwu, Jun 4, 2019)
04a1653  Fixed bug in get_features. (richardwu, Jun 5, 2019)
5b28dc7  Added comment to EmbeddingFeat. (richardwu, Jun 5, 2019)
1c2216e  Finally fixed memory issues with torch.no_grad. (richardwu, Jun 8, 2019)
3a52fe6  ConstraintFeaturizer runs on un-quantized values. (richardwu, Jun 10, 2019)
598dd80  Do not drop single value cells (for evaluation). (richardwu, Jun 10, 2019)
6d94842  Do not generate queries/feature for DC that does not pertain to (richardwu, Jun 10, 2019)
592c7ec  Fixed ConstraintFeaturizer to handle no DCs. (richardwu, Jun 10, 2019)
0c4a3e6  Removed deprecated code and added dropout. (richardwu, Jun 11, 2019)
591fa02  Fixed calculation of num_batches in learning loop. (richardwu, Jun 12, 2019)
bbe68e7  Do not drop null init cells with dom(len) <= 1. (zaqthss, Jun 17, 2019)
ba1cc4b  Fixed z-scoring with 0 std and deleting e-notation numerical values. (richardwu, Jun 19, 2019)
2303c31  Do not quantize if bins > unique. (richardwu, Jun 20, 2019)
f0805ef  Fixed some things in domain. (richardwu, Jun 22, 2019)
ae336cc  Added notebook for using EmbeddingFeaturizer. (richardwu, Jun 22, 2019)
93e84e5  Fixed up notebook for mixed value repair. (richardwu, Jun 22, 2019)
ae30186  Merge hcq-embedding-3 (#97) (minafarid, Sep 26, 2019)
24e881a  Bug fix in embedding feat (#98) (zaqthss, Sep 27, 2019)
2 changes: 2 additions & 0 deletions .gitignore
@@ -8,7 +8,9 @@ _static
_templates
.DS_store
.venv
*.swo
*.swp
*.swn
*.pyc
.cache/

1 change: 0 additions & 1 deletion .travis.yml
@@ -1,6 +1,5 @@
language: python
python:
- "2.7"
Review comment: are we sure that this does not break other dependencies?

- "3.6"

addons:
4 changes: 3 additions & 1 deletion dataset/__init__.py
@@ -1,5 +1,7 @@
from .dataset import Dataset
from .dataset import AuxTables
from .dataset import CellStatus
from .dataset import Source
Review comment: what is a source?

from .dataset import Table

__all__ = ['Dataset', 'AuxTables', 'CellStatus']
__all__ = ['Dataset', 'AuxTables', 'CellStatus', 'Table', 'Source']
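
Note on the new exports (cf. the review question above): Source is the enum in dataset/dataset.py that tells Table where its contents come from, e.g. a CSV file versus an in-memory DataFrame (see dataset/table.py further down). A minimal sketch of the two exports used together; the CSV path is hypothetical:

from dataset import Source, Table

# Load a table from a CSV file (Source.FILE requires the fpath param);
# 'data/hospital.csv' is a made-up path for illustration.
tbl = Table('hospital', Source.FILE, fpath='data/hospital.csv')

# Or wrap an existing pandas DataFrame (Source.DF requires the df param).
# tbl = Table('hospital', Source.DF, df=some_dataframe)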
116 changes: 104 additions & 12 deletions dataset/dataset.py
@@ -3,6 +3,7 @@
import os
import time

import numpy as np
import pandas as pd

from .dbengine import DBengine
@@ -25,12 +26,12 @@ class CellStatus(Enum):
WEAK_LABEL = 1
SINGLE_VALUE = 2


class Dataset:
"""
This class keeps all dataframes and tables for a HC session.
"""
def __init__(self, name, env):
self.env = env
self.id = name
self.raw_data = None
self.repaired_data = None
@@ -58,9 +59,25 @@ def __init__(self, name, env):
self.single_attr_stats = {}
# Domain stats for attribute pairs
self.pair_attr_stats = {}
# Active attributes (attributes with errors)
self._active_attributes = None
# Attributes to train on
self.train_attrs = env["train_attrs"]

# Embedding model for learned embedding vectors of domain values and
# tuple context
self._embedding_model = None

# Numerical attribute list, all strings
self.numerical_attrs = None
self.categorical_attrs = None

self.quantized_data = None
self.do_quantization = False

# TODO(richardwu): load more than just CSV files
def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None,
exclude_attr_cols=None, numerical_attrs=None, store_to_db=True):
"""
load_data takes a CSV file of the initial data, adds tuple IDs (_tid_)
to each row to uniquely identify an 'entity', and generates unique
@@ -78,16 +95,22 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
:param src_col: (str) if not None, for fusion tasks
specifies the column containing the source for each "mention" of an
entity.
:param exclude_attr_cols:
Review comment: what are the types of these inputs? format them appropriately.

:param numerical_attrs:
Review comment: same as above.

"""
tic = time.clock()
try:
# Do not include TID and source column as trainable attributes
exclude_attr_cols = ['_tid_']
if exclude_attr_cols is None:
exclude_attr_cols = ['_tid_']
else:
exclude_attr_cols.append('_tid_')
if src_col is not None:
exclude_attr_cols.append(src_col)

# Load raw CSV file/data into a Postgres table 'name' (param).
self.raw_data = Table(name, Source.FILE, na_values=na_values, exclude_attr_cols=exclude_attr_cols, fpath=fpath)
self.raw_data = Table(name, Source.FILE, na_values=na_values,
exclude_attr_cols=exclude_attr_cols, fpath=fpath)

df = self.raw_data.df
# Add _tid_ column to dataset that uniquely identifies an entity.
@@ -100,19 +123,40 @@ def load_data(self, name, fpath, na_values=None, entity_col=None, src_col=None):
# use entity IDs as _tid_'s directly
df.rename({entity_col: '_tid_'}, axis='columns', inplace=True)

self.numerical_attrs = numerical_attrs or []
all_attrs = self.raw_data.get_attributes()
self.categorical_attrs = [attr for attr in all_attrs if attr not in self.numerical_attrs]

if store_to_db:
# df currently holds all values as strings; make a copy and then
# 1. replace the null values in categorical data
# 2. cast the numerical attrs to float
# 3. store the correct types in the db (categorical->str, numerical->float)
df_correct_type = df.copy()
for attr in self.categorical_attrs:
df_correct_type.loc[df_correct_type[attr].isnull(), attr] = NULL_REPR
for attr in self.numerical_attrs:
df_correct_type[attr] = df_correct_type[attr].astype(float)

df_correct_type.to_sql(self.raw_data.name, self.engine.engine, if_exists='replace', index=False,
index_label=None)

# df itself stays all-string;
# use NULL_REPR to represent NULL values
df.replace('', NULL_REPR, inplace=True)
df.fillna(NULL_REPR, inplace=True)

logging.info("Loaded %d rows with %d cells", self.raw_data.df.shape[0], self.raw_data.df.shape[0] * self.raw_data.df.shape[1])
logging.info("Loaded %d rows with %d cells", self.raw_data.df.shape[0],
self.raw_data.df.shape[0] * self.raw_data.df.shape[1])

# Call to store to database
self.raw_data.store_to_db(self.engine.engine)
status = 'DONE Loading {fname}'.format(fname=os.path.basename(fpath))

# Generate indexes on attribute columns for faster queries
for attr in self.raw_data.get_attributes():
# Generate index on attribute
self.raw_data.create_db_index(self.engine,[attr])
if store_to_db:
# Generate indexes on attribute columns for faster queries
for attr in self.raw_data.get_attributes():
# Generate index on attribute
self.raw_data.create_db_index(self.engine,[attr])

# Create attr_to_idx dictionary (assign unique index for each attribute)
# and attr_count (total # of attributes)
@@ -178,6 +222,15 @@ def get_raw_data(self):
raise Exception('ERROR No dataset loaded')
return self.raw_data.df

def get_quantized_data(self):
"""
get_quantized_data returns a pandas.DataFrame containing the data after quantization
:return: the data after quantization in pandas.DataFrame
"""
if self.quantized_data is None:
raise Exception('ERROR No dataset quantized')
Review comment: Fix the message. This is not proper English.

return self.quantized_data.df

def get_attributes(self):
"""
get_attributes return the trainable/learnable attributes (i.e. exclude meta
@@ -187,6 +240,29 @@ def get_attributes(self):
raise Exception('ERROR No dataset loaded')
return self.raw_data.get_attributes()

def get_active_attributes(self):
"""
get_active_attributes returns the attributes to be modeled.

If infer_mode = 'dk', these attributes correspond only to attributes that contain at least
one potentially erroneous cell. Otherwise all attributes are returned.

If applicable, in the provided :param:`train_attrs` variable.
Review comment: what does this second comment mean?

"""
if self.train_attrs is None:
self.train_attrs = self.get_attributes()

if self.env['infer_mode'] == 'dk':
if self._active_attributes is None:
raise Exception('ERROR no active attributes loaded. Run error detection first.')
attrs = self._active_attributes
elif self.env['infer_mode'] == 'all':
attrs = self.get_attributes()
else:
raise Exception('infer mode must be one of {dk, all}')

return sorted([attr for attr in attrs if attr in self.train_attrs])

def get_cell_id(self, tuple_id, attr_name):
"""
get_cell_id returns cell ID: a unique ID for every cell.
@@ -257,7 +333,7 @@ def get_stats_single(self, attr):
"""
# need to decode values into unicode strings since we do lookups via
# unicode strings from Postgres
data_df = self.get_raw_data()
data_df = self.get_quantized_data() if self.do_quantization else self.get_raw_data()
return data_df[[attr]].loc[data_df[attr] != NULL_REPR].groupby([attr]).size().to_dict()

def get_stats_pair(self, first_attr, second_attr):
@@ -268,7 +344,7 @@ def get_stats_pair(self, first_attr, second_attr):
<count>: frequency (# of entities) where first_attr=<first_val> AND second_attr=<second_val>
Filters out NULL values so no entries in the dictionary would have NULLs.
"""
data_df = self.get_raw_data()
data_df = self.get_quantized_data() if self.do_quantization else self.get_raw_data()
tmp_df = data_df[[first_attr, second_attr]]\
.loc[(data_df[first_attr] != NULL_REPR) & (data_df[second_attr] != NULL_REPR)]\
.groupby([first_attr, second_attr])\
@@ -318,3 +394,19 @@ def get_repaired_dataset(self):
toc = time.clock()
total_time = toc - tic
return status, total_time

def load_embedding_model(self, model):
"""
Memoize the TupleEmbedding model for retrieving learned embeddings
later (e.g. in EmbeddingFeaturizer).
"""
self._embedding_model = model

def get_embedding_model(self):
"""
Retrieve the memoized embedding model.
"""
if self._embedding_model is None:
raise Exception("cannot retrieve embedding model: it was never trained and loaded!")
return self._embedding_model

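Putting the new Dataset pieces together, a minimal usage sketch. The env dict here is trimmed to the two keys read in this diff, and the CSV path and column names are hypothetical; a real HoloClean session passes a fuller env:

env = {'train_attrs': None, 'infer_mode': 'dk'}  # trimmed, illustrative env
ds = Dataset('hospital', env)

ds.load_data('hospital', 'data/hospital.csv',
             exclude_attr_cols=['_notes_'],        # hypothetical metadata column
             numerical_attrs=['Score', 'Sample'],  # stored as float in the db
             store_to_db=True)

# With infer_mode='dk', this raises until error detection has populated
# ds._active_attributes (see detect/detect.py below); with 'all' it
# returns every trainable attribute, filtered by train_attrs.
active = ds.get_active_attributes()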
51 changes: 51 additions & 0 deletions dataset/quantization.py
@@ -0,0 +1,51 @@
import time

import numpy as np
from sklearn.cluster import KMeans
from utils import NULL_REPR


def quantize_km(env, df_raw, num_attr_groups_bins):
Review comment: the name is not informative. Switch to kmeans (it is not long). Also, I would expect to specify "k" here. Do bins refer to clusters? I would change the name to clusters instead of bins to follow common convention.

"""
Kmeans clustering using sklearn
https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
Clusters each numerical attribute group (1-D or n-D) with k-means.
:param df_raw: pandas.DataFrame
:param num_attr_groups_bins: list[tuple] where each tuple consists of
(# of bins, list[str]) where the list[str] is a group of attributes to be
treated as numerical.
Groups must be disjoint.

:return: pandas.dataframe after quantization
"""
tic = time.time()
df_quantized = df_raw.copy()

# Assert groups are disjoint
num_attrs = [attr for _, group in num_attr_groups_bins for attr in group]
assert len(set(num_attrs)) == len(num_attrs)

for bins, attrs in num_attr_groups_bins:
fil_notnull = (df_quantized[attrs] != NULL_REPR).all(axis=1)

df_group = df_quantized.loc[fil_notnull, attrs].reset_index(drop=True)
# Matrix of possibly n-dimension values
X_attrs = df_group.values.astype(np.float)

if bins >= np.unique(X_attrs, axis=0).shape[0]:
# No need to quantize since more bins than unique values.
continue

km = KMeans(n_clusters=bins)
km.fit(X_attrs)

label_pred = km.labels_
centroids = km.cluster_centers_

# Lookup cluster centroids and replace their values.
df_quantized.loc[fil_notnull, attrs] = np.array([centroids[label_pred[idx]]
for idx in df_group.index]).astype(str)

status = "DONE with quantization"
toc = time.time()
return status, toc - tic, df_quantized
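
A short, self-contained sketch of how quantize_km might be invoked with the group format described in the docstring (hypothetical data; the env argument is not used in the body shown above, so an empty dict suffices here):

import pandas as pd
from dataset.quantization import quantize_km

# Values arrive as strings (the loader reads CSVs with dtype=str).
df = pd.DataFrame({
    'temp':     ['20.1', '20.3', '35.0', '34.8', '20.2'],
    'humidity': ['0.50', '0.52', '0.90', '0.88', '0.51'],
    'age':      ['23',   '41',   '35',   '23',   '41'],
})

# 'temp' and 'humidity' are clustered jointly as one 2-D group into 2 bins;
# 'age' is clustered on its own into 2 bins. Groups must be disjoint.
status, elapsed, df_q = quantize_km({}, df, [(2, ['temp', 'humidity']),
                                             (2, ['age'])])
# Each numerical cell is replaced by its cluster centroid (as a string).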
10 changes: 6 additions & 4 deletions dataset/table.py
@@ -16,7 +16,7 @@ class Table:
A wrapper class for Dataset Tables.
"""
def __init__(self, name, src, na_values=None, exclude_attr_cols=['_tid_'],
fpath=None, df=None, schema_name=None, table_query=None, db_engine=None):
fpath=None, df=None, schema_name=None, table_query=None, db_engine=None):
"""
:param name: (str) name to assign to dataset.
:param na_values: (str or list[str]) values to interpret as NULL.
@@ -46,16 +46,18 @@ def __init__(self, name, src, na_values=None, exclude_attr_cols=['_tid_'],
if fpath is None:
raise Exception("ERROR while loading table. File path for CSV file name expected. Please provide <fpath> param.")
# TODO(richardwu): use COPY FROM instead of loading this into memory
# TODO(richardwu): No support for numerical values. To be added.
self.df = pd.read_csv(fpath, dtype=str, na_values=na_values, encoding='utf-8')

# Normalize the dataframe: drop null columns, convert to lowercase strings, and strip whitespaces.
for attr in self.df.columns.values:
if self.df[attr].isnull().all():
logging.warning("Dropping the following null column from the dataset: '%s'", attr)
self.df.drop(labels=[attr], axis=1, inplace=True)
continue
if attr not in exclude_attr_cols:
self.df[attr] = self.df[attr].str.strip().str.lower()
if attr in exclude_attr_cols:
continue

self.df[attr] = self.df[attr].str.strip().str.lower()
elif src == Source.DF:
if df is None:
raise Exception("ERROR while loading table. Dataframe expected. Please provide <df> param.")
11 changes: 6 additions & 5 deletions detect/detect.py
@@ -32,12 +32,13 @@ def detect_errors(self, detectors):
errors.append(error_df)

# Get unique errors only that might have been detected from multiple detectors.
errors_df = pd.concat(errors, ignore_index=True).drop_duplicates().reset_index(drop=True)
errors_df['_cid_'] = errors_df.apply(lambda x: self.ds.get_cell_id(x['_tid_'], x['attribute']), axis=1)
logging.info("detected %d potentially erroneous cells", errors_df.shape[0])
self.errors_df = pd.concat(errors, ignore_index=True).drop_duplicates().reset_index(drop=True)
if self.errors_df.shape[0]:
self.errors_df['_cid_'] = self.errors_df.apply(lambda x: self.ds.get_cell_id(x['_tid_'], x['attribute']), axis=1)
logging.info("detected %d potentially erroneous cells", self.errors_df.shape[0])

# Store errors to db.
self.store_detected_errors(errors_df)
self.store_detected_errors(self.errors_df)
status = "DONE with error detection."
toc_total = time.clock()
detect_time = toc_total - tic_total
@@ -48,4 +49,4 @@ def store_detected_errors(self, errors_df):
raise Exception("ERROR: Detected errors dataframe is empty.")
self.ds.generate_aux_table(AuxTables.dk_cells, errors_df, store=True)
self.ds.aux_table[AuxTables.dk_cells].create_db_index(self.ds.engine, ['_cid_'])

self.ds._active_attributes = sorted(errors_df['attribute'].unique())
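
The new if self.errors_df.shape[0]: guard matters because DataFrame.apply(..., axis=1) over an empty frame does not reliably yield the Series that the _cid_ assignment expects (the exact behavior varies across pandas versions). A self-contained illustration of the guard pattern, with hash() as a stand-in for get_cell_id:

import pandas as pd

errors_df = pd.DataFrame(columns=['_tid_', 'attribute'])  # no errors found

# Only derive _cid_ when rows exist; hash() is a stand-in for get_cell_id.
if errors_df.shape[0]:
    errors_df['_cid_'] = errors_df.apply(
        lambda x: hash((x['_tid_'], x['attribute'])), axis=1)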
2 changes: 1 addition & 1 deletion detect/detector.py
@@ -30,4 +30,4 @@ def detect_noisy_cells(self):

:return dataframe for the dk_cell
"""
raise NotImplementedError
raise NotImplementedError
4 changes: 2 additions & 2 deletions detect/nulldetector.py
@@ -26,12 +26,12 @@ def detect_noisy_cells(self):
_tid_: entity ID
attribute: attribute with NULL value for this entity
"""
attributes = self.ds.get_attributes()
errors = []
for attr in attributes:
for attr in self.ds.get_attributes():
tmp_df = self.df[self.df[attr] == NULL_REPR]['_tid_'].to_frame()
tmp_df.insert(1, "attribute", attr)
errors.append(tmp_df)

errors_df = pd.concat(errors, ignore_index=True)
return errors_df
