From 8f3754a2c5617dd4ea4d173314c17e6dab2f2963 Mon Sep 17 00:00:00 2001
From: Syler
Date: Mon, 26 Jun 2023 17:17:01 -0400
Subject: [PATCH 1/4] Add script to anonymize reflection templates

---
 scripts/anonymize.py | 68 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100755 scripts/anonymize.py

diff --git a/scripts/anonymize.py b/scripts/anonymize.py
new file mode 100755
index 0000000..37290d7
--- /dev/null
+++ b/scripts/anonymize.py
@@ -0,0 +1,68 @@
+import json
+import random
+import argparse
+from nltk.corpus import words, wordnet
+from pathlib import Path
+
+# Generate a list of nouns and adjectives
+nouns = {x.name().split(".", 1)[0] for x in wordnet.all_synsets("n")}
+adjectives = {x.name().split(".", 1)[0] for x in wordnet.all_synsets("a")}
+words_list = list(nouns | adjectives)
+
+
+def generate_random_word():
+    word = random.choice(words_list)
+    return " ".join(part.capitalize() for part in word.split("_"))
+
+
+def update_values(data, word_map):
+    if isinstance(data, list):
+        for i in range(len(data)):
+            if isinstance(data[i], str):
+                if data[i] not in word_map:
+                    word_map[data[i]] = generate_random_word()
+                data[i] = word_map[data[i]]
+            else:
+                update_values(data[i], word_map)
+    elif isinstance(data, dict):
+        for key, value in data.items():
+            if key in ["name", "group"] and value != "":
+                if value not in word_map:
+                    word_map[value] = generate_random_word()
+                data[key] = word_map[value]
+            elif key == "choices" and isinstance(value, list):
+                data[key] = [generate_random_word() for _ in value]
+            else:
+                update_values(value, word_map)
+    return data
+
+
+def main(input_file, output_file):
+    # Load the JSON file
+    with open(input_file) as f:
+        data = json.load(f)
+
+    group_map = {}
+    # Update the values
+    updated_data = update_values(data, group_map)
+
+    # Write to the new JSON file
+    with open(output_file, "w") as f:
+        json.dump(updated_data, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The path of the input JSON file")
+    parser.add_argument(
+        "-o", "--output_file", help="The path of the output JSON file"
+    )
+
+    args = parser.parse_args()
+
+    if args.output_file is None:
+        # If no output file is provided, use the input file name with '_anonymized' appended
+        input_path = Path(args.input_file)
+        args.output_file = input_path.stem + "_anonymized" + input_path.suffix
+
+    main(args.input_file, args.output_file)

From cd16f417f66bee9d2da813ea37ca140ab7282597 Mon Sep 17 00:00:00 2001
From: Syler
Date: Tue, 4 Jul 2023 15:07:20 -0400
Subject: [PATCH 2/4] Function to anonymize output CSV data

---
 reflect/conversions.py | 65 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 3 deletions(-)

diff --git a/reflect/conversions.py b/reflect/conversions.py
index 26d732b..a430eee 100644
--- a/reflect/conversions.py
+++ b/reflect/conversions.py
@@ -106,19 +106,20 @@ def get_post_metric_default(self, metric_kind: str):
            )
            raise

+
 def convert_timestamp(apple_timestamp: float) -> str:
     """
     Convert an Apple-style timestamp to a local time string.
-    
+
     Args:
         apple_timestamp (float): The Apple timestamp to convert.

     Returns:
-        str: The timestamp in local time as a string formatted as 
+        str: The timestamp in local time as a string formatted as
         "YYYY-MM-DD HH:MM:SS".
     """
     # Convert the Apple timestamp to a Python timestamp
-    # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and 
+    # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and
     # 2001-01-01T00:00:00Z
     timestamp = apple_timestamp + 978307200

@@ -352,6 +353,8 @@ def parse_json(
 def save_dataframes_to_csv(
     reflections_map: Dict[str, pd.DataFrame],
     output_folder: str,
+    anonymize: Optional[bool] = False,
+    set_all_nan: Optional[bool] = False,
     filter_list: Optional[List[str]] = None,
 ) -> None:
     """
@@ -373,4 +376,60 @@
             k: v for k, v in reflections_map.items() if k in filter_list
         }
     for name, df in reflections_map.items():
+        if anonymize:
+            df = anonymize(df, set_all_nan)
         df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)
+
+
+def anonymize(
+    df: pd.DataFrame, set_all_nan: Optional[bool] = False
+) -> pd.DataFrame:
+    """
+    Anonymize the input dataframe by removing specific columns, adding a random
+    time offset to the "Date" column, setting all other values to NaN if s
+    et_all_nan is True, and replacing other column names with a random
+    capitalized word.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Input dataframe to be anonymized.
+
+    set_all_nan : Optional[bool]
+        If True, set all values in the dataframe to NaN except for the "Date"
+        column.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The anonymized dataframe.
+    """
+    import nltk
+    import random
+
+    nltk.download("words")
+
+    # Step 1: Remove specified columns
+    df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")
+
+    # Step 2: Add random time offset to "Date" column
+    if "Date" in df.columns:
+        random_offset = pd.series.DateOffset(
+            years=np.random.randint(-1000, 1000)
+        )
+        df["Date"] += random_offset
+
+    # Step 3: Set all values in dataframe to NaN if set_all_nan is True
+    if set_all_nan:
+        for col in df.columns:
+            if col != "Date":
+                df[col] = np.nan
+
+    # Step 4: Replace other column names with random capitalized word
+    def random_word():
+        return random.choice(nltk.corpus.words.words()).capitalize()
+
+    new_columns = {col: random_word() for col in df.columns if col != "Date"}
+    df = df.rename(columns=new_columns)
+
+    return df

From d717cf232be771bcbaf011554d021db33af514dd Mon Sep 17 00:00:00 2001
From: Syler
Date: Tue, 4 Jul 2023 15:41:12 -0400
Subject: [PATCH 3/4] Pass anonymization options to json_to_csv.py

---
 reflect/conversions.py | 31 +++++++++++++---------
 scripts/json_to_csv.py | 58 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/reflect/conversions.py b/reflect/conversions.py
index a430eee..eec2133 100644
--- a/reflect/conversions.py
+++ b/reflect/conversions.py
@@ -353,7 +353,7 @@ def parse_json(
 def save_dataframes_to_csv(
     reflections_map: Dict[str, pd.DataFrame],
     output_folder: str,
-    anonymize: Optional[bool] = False,
+    do_anonymize: Optional[bool] = False,
     set_all_nan: Optional[bool] = False,
     filter_list: Optional[List[str]] = None,
 ) -> None:
@@ -371,23 +371,27 @@
     Returns:
         None
     """
+    if do_anonymize:
+        import nltk
+        nltk.download("words")
+
     if filter_list is not None:
         reflections_map = {
             k: v for k, v in reflections_map.items() if k in filter_list
         }
     for name, df in reflections_map.items():
-        if anonymize:
-            df = anonymize(df, set_all_nan)
+        if do_anonymize:
+            name, df = anonymize(df, set_all_nan)
         df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)


 def anonymize(
     df: pd.DataFrame, set_all_nan: Optional[bool] = False
-) -> pd.DataFrame:
+) -> Tuple[str, pd.DataFrame]:
     """
     Anonymize the input dataframe by removing specific columns, adding a random
-    time offset to the "Date" column, setting all other values to NaN if s
-    et_all_nan is True, and replacing other column names with a random
+    time offset to the "Date" column, setting all other values to NaN if
+    set_all_nan is True, and replacing other column names with a random
     capitalized word.

     Parameters
@@ -401,25 +405,26 @@ def anonymize(

     Returns
     -------
-    df : pd.DataFrame
-        The anonymized dataframe.
+    Tuple[str, pd.DataFrame]
+        A tuple where the first element is a random capitalized word and the second
+        element is the anonymized dataframe.
     """
     import nltk
     import random

-    nltk.download("words")
-
     # Step 1: Remove specified columns
     df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")

     # Step 2: Add random time offset to "Date" column
     if "Date" in df.columns:
-        random_offset = pd.series.DateOffset(
+        df["Date"] = pd.to_datetime(df["Date"])
+        random_offset = pd.tseries.offsets.DateOffset(
             years=np.random.randint(-1000, 1000)
         )
         df["Date"] += random_offset

     # Step 3: Set all values in dataframe to NaN if set_all_nan is True
+    # TODO(@syler): do some nicer data transformations here
     if set_all_nan:
         for col in df.columns:
             if col != "Date":
@@ -432,4 +437,6 @@ def random_word():
     new_columns = {col: random_word() for col in df.columns if col != "Date"}
     df = df.rename(columns=new_columns)

-    return df
+    name = random_word()
+
+    return name, df
diff --git a/scripts/json_to_csv.py b/scripts/json_to_csv.py
index cbeeb4b..5a0d694 100644
--- a/scripts/json_to_csv.py
+++ b/scripts/json_to_csv.py
@@ -8,37 +8,73 @@ def main():
     Each reflection in the JSON file will be parsed into a separate CSV file.
     The CSV files will be saved in the specified output directory.

-    An optional list of reflection names can be provided to filter which 
-    reflections to save. Additionally, parsing options can be loaded from a 
+    An optional list of reflection names can be provided to filter which
+    reflections to save. Additionally, parsing options can be loaded from a
     YAML file.

     Command-line arguments:
     - json_path: Path to the JSON reflections file.
     - output_dir: Output directory for CSV files.
-    - reflections: Optional list of reflection names to save. If not provided, 
+    - reflections: Optional list of reflection names to save. If not provided,
       all reflections will be saved.
     - options_file: Optional YAML file with parsing options.
     """
-    parser = argparse.ArgumentParser(description='Parse JSON reflections and save to CSV.')
-    parser.add_argument('json_path', type=str, help='Path to the JSON reflections file.')
-    parser.add_argument('output_dir', type=str, help='Output directory for CSV files.')
-    parser.add_argument('-r', '--reflections', nargs='*', help='Optional list of reflection names to save.')
-    parser.add_argument('-o', '--options_file', type=str, default=None, help='YAML file with parsing options.')
+    parser = argparse.ArgumentParser(
+        description="Parse JSON reflections and save to CSV."
+    )
+    parser.add_argument(
+        "json_path", type=str, help="Path to the JSON reflections file."
+    )
+    parser.add_argument(
+        "output_dir", type=str, help="Output directory for CSV files."
+    )
+    parser.add_argument(
+        "-r",
+        "--reflections",
+        nargs="*",
+        help="Optional list of reflection names to save.",
+    )
+    parser.add_argument(
+        "-o",
+        "--options_file",
+        type=str,
+        default=None,
+        help="YAML file with parsing options.",
+    )
+    parser.add_argument(
+        "-a",
+        "--anonymize",
+        action="store_true",
+        help="Anonymize output CSV files",
+    )
+    parser.add_argument(
+        "-n",
+        "--nan",
+        action="store_true",
+        help="Write all values of CSV to NaN when anonymizing",
+    )

     args = parser.parse_args()

-    with open(args.json_path, 'r') as f:
+    with open(args.json_path, "r") as f:
         json_string = f.read()

     if args.options_file:
-        with open(args.options_file, 'r') as f:
+        with open(args.options_file, "r") as f:
             options_yaml = f.read()
         options = conv.load_parsing_options_from_yaml(options_yaml)
     else:
         options = conv.ParsingOptions()

     reflections_map = conv.parse_json(json_string, options)
-    conv.save_dataframes_to_csv(reflections_map, args.output_dir, args.reflections)
+    conv.save_dataframes_to_csv(
+        reflections_map,
+        args.output_dir,
+        do_anonymize=args.anonymize,
+        set_all_nan=args.nan,
+        filter_list=args.reflections,
+    )
+

 if __name__ == "__main__":
     main()

From da6d8048875d55e4c41774bac9ec29a4672f9ecb Mon Sep 17 00:00:00 2001
From: Syler
Date: Tue, 4 Jul 2023 15:49:54 -0400
Subject: [PATCH 4/4] Anonymize strings, bool, linear transformation

---
 reflect/conversions.py | 53 ++++++++++++++++++++++++------------------
 scripts/json_to_csv.py |  7 -------
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/reflect/conversions.py b/reflect/conversions.py
index eec2133..25138c9 100644
--- a/reflect/conversions.py
+++ b/reflect/conversions.py
@@ -354,7 +354,6 @@ def save_dataframes_to_csv(
     reflections_map: Dict[str, pd.DataFrame],
     output_folder: str,
     do_anonymize: Optional[bool] = False,
-    set_all_nan: Optional[bool] = False,
     filter_list: Optional[List[str]] = None,
 ) -> None:
     """
@@ -373,6 +372,7 @@ def save_dataframes_to_csv(
     """
     if do_anonymize:
         import nltk
+
         nltk.download("words")

     if filter_list is not None:
@@ -381,33 +381,36 @@
         }
     for name, df in reflections_map.items():
         if do_anonymize:
-            name, df = anonymize(df, set_all_nan)
+            name, df = anonymize(df)
         df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)


-def anonymize(
-    df: pd.DataFrame, set_all_nan: Optional[bool] = False
-) -> Tuple[str, pd.DataFrame]:
+def anonymize(df: pd.DataFrame) -> Tuple[str, pd.DataFrame]:
     """
     Anonymize the input dataframe by removing specific columns, adding a random
-    time offset to the "Date" column, setting all other values to NaN if
-    set_all_nan is True, and replacing other column names with a random
-    capitalized word.
+    time offset to the "Date" column, and replacing other column names with a
+    random capitalized word.
+
+    Applies a random linear transformation of the form y = kx to any numerical
+    columns, where k is a random floating point number between 0.5 and 10 and a
+    sign flip is applied with 50% chance.
+
+    Any non-empty string values in the dataframe are replaced with a random
+    word.
+
+    If a column has boolean values, randomly flip the sign of the entire column
+    with a 50% chance.

     Parameters
     ----------
     df : pd.DataFrame
         Input dataframe to be anonymized.

-    set_all_nan : Optional[bool]
-        If True, set all values in the dataframe to NaN except for the "Date"
-        column.
-
     Returns
     -------
     Tuple[str, pd.DataFrame]
-        A tuple where the first element is a random capitalized word and the second
-        element is the anonymized dataframe.
+        A tuple where the first element is a random capitalized word and the
+        second element is the anonymized dataframe.
     """
     import nltk
     import random
@@ -423,17 +426,23 @@
         )
         df["Date"] += random_offset

-    # Step 3: Set all values in dataframe to NaN if set_all_nan is True
-    # TODO(@syler): do some nicer data transformations here
-    if set_all_nan:
-        for col in df.columns:
-            if col != "Date":
-                df[col] = np.nan
-
-    # Step 4: Replace other column names with random capitalized word
     def random_word():
         return random.choice(nltk.corpus.words.words()).capitalize()

+    # Step 3: Set all values in dataframe to NaN if set_all_nan is True
+    # and apply a random linear transformation to numerical columns
+    # and replace non-empty strings with a random word
+    for col in df.columns:
+        if df[col].dtype == "object":
+            df[col] = df[col].apply(lambda x: random_word() if x else x)
+        elif df[col].dtype in ["int64", "float64"]:
+            k = random.uniform(0.5, 10) * random.choice([-1, 1])
+            df[col] = df[col] * k
+        elif df[col].dtype == "bool":
+            if random.choice([True, False]):
+                df[col] = ~df[col]
+
+    # Step 4: Replace other column names with random capitalized word
     new_columns = {col: random_word() for col in df.columns if col != "Date"}
     df = df.rename(columns=new_columns)

diff --git a/scripts/json_to_csv.py b/scripts/json_to_csv.py
index 5a0d694..35ef12c 100644
--- a/scripts/json_to_csv.py
+++ b/scripts/json_to_csv.py
@@ -47,12 +47,6 @@ def main():
         action="store_true",
         help="Anonymize output CSV files",
     )
-    parser.add_argument(
-        "-n",
-        "--nan",
-        action="store_true",
-        help="Write all values of CSV to NaN when anonymizing",
-    )

     args = parser.parse_args()

@@ -71,7 +65,6 @@ def main():
         reflections_map,
         args.output_dir,
         do_anonymize=args.anonymize,
-        set_all_nan=args.nan,
         filter_list=args.reflections,
     )
