def save_dataframes_to_csv(
    reflections_map: Dict[str, pd.DataFrame],
    output_folder: str,
    do_anonymize: Optional[bool] = False,
    filter_list: Optional[List[str]] = None,
) -> None:
    """
    Save each reflection dataframe as ``<output_folder>/<name>.csv``.

    Args:
        reflections_map (Dict[str, pd.DataFrame]): Mapping of reflection name
            to dataframe.
        output_folder (str): Directory the CSV files are written into. It is
            not created here.
        do_anonymize (Optional[bool]): When truthy, every dataframe is passed
            through ``anonymize`` and written under the random name it
            returns instead of the reflection name.
        filter_list (Optional[List[str]]): If given, only reflections whose
            name is in this list are saved.

    Returns:
        None

    Raises:
        TypeError: If a list, tuple, set or string is passed as
            ``do_anonymize``.
    """
    # ``do_anonymize`` was inserted ahead of ``filter_list``; a legacy call
    # passing the filter positionally would otherwise be taken as a truthy
    # flag and anonymize every reflection without filtering.
    if isinstance(do_anonymize, (list, tuple, set, frozenset, str)):
        raise TypeError(
            "do_anonymize must be a bool; pass filter_list as a keyword "
            "argument"
        )

    if filter_list is not None:
        reflections_map = {
            k: v for k, v in reflections_map.items() if k in filter_list
        }

    if do_anonymize and reflections_map:
        import nltk

        # Download the corpus only when it is not installed yet.
        try:
            nltk.data.find("corpora/words")
        except LookupError:
            nltk.download("words")

    used_names = set()
    for name, df in reflections_map.items():
        if do_anonymize:
            name, df = anonymize(df)
            # Random names can repeat; never overwrite an earlier file.
            base, suffix = name, 1
            while name in used_names:
                suffix += 1
                name = f"{base}_{suffix}"
        used_names.add(name)
        df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)


def _is_missing(value) -> bool:
    """Return True for None, NaN, ``pd.NA`` and ``pd.NaT``."""
    if value is None or value is pd.NA or value is pd.NaT:
        return True
    return isinstance(value, float) and value != value


def _unique_word(draw_word, taken, max_redraws=20):
    """Draw a word not yet in ``taken``, record it there and return it."""
    candidate = draw_word()
    for _ in range(max_redraws):
        if candidate not in taken:
            break
        candidate = draw_word()
    else:
        # The source keeps repeating itself: fall back to a numeric suffix so
        # the result is unique and the loop always terminates.
        base, suffix = candidate, 2
        while candidate in taken:
            candidate = f"{base}_{suffix}"
            suffix += 1
    taken.add(candidate)
    return candidate


def anonymize(
    df: pd.DataFrame, word_source=None
) -> Tuple[str, pd.DataFrame]:
    """
    Anonymize the input dataframe by removing specific columns, adding a random
    whole-year offset to the "Date" column, and replacing other column names
    with distinct random words.

    Applies a random linear transformation of the form y = kx to any numerical
    columns, where k is a random floating point number between 0.5 and 10 and a
    sign flip is applied with 50% chance.

    Any non-empty, non-missing value in an object or string column is replaced
    with a random word. Missing values (None, NaN, NA, NaT) and empty values
    are left as they are.

    If a column has boolean values, the entire column is inverted with a 50%
    chance.

    The input dataframe is not modified.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe to be anonymized.
    word_source : callable, optional
        Zero-argument callable returning a replacement word. When omitted,
        capitalized words are drawn at random from the nltk "words" corpus,
        which must already be installed.

    Returns
    -------
    Tuple[str, pd.DataFrame]
        A tuple where the first element is a random word to use as the
        dataset name and the second element is the anonymized dataframe.
    """
    import random

    if word_source is None:
        import nltk

        # Load the corpus once rather than once per generated word.
        vocabulary = nltk.corpus.words.words()

        def draw_word():
            return random.choice(vocabulary).capitalize()

    else:
        draw_word = word_source

    def scrub(value):
        # Keep missing and empty cells so the sparsity pattern survives.
        if _is_missing(value) or not value:
            return value
        return draw_word()

    # Step 1: Remove specified columns (drop() returns a new dataframe)
    df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")

    # Step 2: Add random time offset to "Date" column
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"])
        known = df["Date"].dropna()
        if not known.empty:
            # Keep every shifted date inside the range a nanosecond Timestamp
            # can represent; an unconstrained +/-1000 years overflows it.
            min_year = pd.Timestamp.min.year + 1
            max_year = pd.Timestamp.max.year - 1
            low = max(-1000, min_year - int(known.dt.year.min()))
            high = min(999, max_year - int(known.dt.year.max()))
            if low <= high:
                shift = int(np.random.randint(low, high + 1))
                df["Date"] += pd.tseries.offsets.DateOffset(years=shift)

    # Step 3: Invert boolean columns at random, apply a random linear
    # transformation to numerical columns and replace text with random words
    for col in df.columns:
        column = df[col]
        # Booleans first: pandas also reports them as numeric.
        if pd.api.types.is_bool_dtype(column):
            if random.choice([True, False]):
                df[col] = ~column
        elif pd.api.types.is_numeric_dtype(column):
            k = random.uniform(0.5, 10) * random.choice([-1, 1])
            df[col] = column * k
        elif pd.api.types.is_object_dtype(
            column
        ) or pd.api.types.is_string_dtype(column):
            df[col] = column.apply(scrub)

    # Step 4: Replace other column names with distinct random words
    taken = {"Date"}
    new_columns = {
        col: _unique_word(draw_word, taken)
        for col in df.columns
        if col != "Date"
    }
    df = df.rename(columns=new_columns)

    name = draw_word()

    return name, df
# Filled on first use by _load_words(), so importing this module does not
# enumerate the WordNet corpus.
words_list = []


def _load_words():
    """Populate ``words_list`` with WordNet noun and adjective names once."""
    if not words_list:
        # Imported lazily: the corpus is only needed when a word is drawn.
        from nltk.corpus import wordnet

        lemmas = {
            synset.name().split(".", 1)[0]
            for pos in ("n", "a")
            for synset in wordnet.all_synsets(pos)
        }
        # Sorted so that a seeded ``random`` gives reproducible output.
        words_list.extend(sorted(lemmas))
    return words_list


def generate_random_word():
    """Return a random WordNet noun or adjective, e.g. "Ice Cream"."""
    word = random.choice(_load_words())
    return " ".join(part.capitalize() for part in word.split("_"))


def update_values(data, word_map, word_factory=None):
    """
    Recursively replace identifying strings in parsed JSON, in place.

    Rules:
      * non-empty strings that are direct elements of a list are replaced;
      * non-empty string values under the keys "name" and "group" are
        replaced;
      * a list under the key "choices" becomes a list of fresh words of the
        same length (these are not recorded in ``word_map``);
      * every other list or dict is descended into; other values are kept.

    Args:
        data: Parsed JSON value (dict, list or scalar). Containers are
            modified in place.
        word_map: Dict of original string -> replacement. It is read and
            extended, so equal strings always get the same replacement.
        word_factory: Optional zero-argument callable returning a replacement
            word. Defaults to ``generate_random_word``.

    Returns:
        The same ``data`` object that was passed in.
    """
    make_word = generate_random_word if word_factory is None else word_factory

    def replacement_for(text):
        if text not in word_map:
            word_map[text] = make_word()
        return word_map[text]

    if isinstance(data, list):
        for index, item in enumerate(data):
            if isinstance(item, str):
                # Empty strings carry no information; keep them, as the dict
                # branch does.
                if item:
                    data[index] = replacement_for(item)
            else:
                update_values(item, word_map, word_factory)
    elif isinstance(data, dict):
        # Only existing keys are reassigned, so the dict does not change size
        # while it is being iterated.
        for key, value in data.items():
            if key in ("name", "group") and isinstance(value, str) and value:
                data[key] = replacement_for(value)
            elif key == "choices" and isinstance(value, list):
                data[key] = [make_word() for _ in value]
            else:
                # Also reached for non-string "name"/"group" values, which
                # cannot be used as word_map keys when unhashable.
                update_values(value, word_map, word_factory)
    return data


def main(input_file, output_file):
    """Anonymize the JSON document at ``input_file`` into ``output_file``."""
    # Load the JSON file
    with open(input_file, encoding="utf-8") as f:
        data = json.load(f)

    # One shared map so a string is replaced consistently across the document
    word_map = {}
    updated_data = update_values(data, word_map)

    # Write to the new JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(updated_data, f, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="The path of the input JSON file")
    parser.add_argument(
        "-o", "--output_file", help="The path of the output JSON file"
    )

    args = parser.parse_args()

    if args.output_file is None:
        # Default to the input file name with "_anonymized" appended. Only
        # the file name is used, so the result is written to the current
        # working directory.
        input_path = Path(args.input_file)
        args.output_file = input_path.stem + "_anonymized" + input_path.suffix

    main(args.input_file, args.output_file)
The CSV files will be saved in the specified output directory. - An optional list of reflection names can be provided to filter which - reflections to save. Additionally, parsing options can be loaded from a + An optional list of reflection names can be provided to filter which + reflections to save. Additionally, parsing options can be loaded from a YAML file. Command-line arguments: - json_path: Path to the JSON reflections file. - output_dir: Output directory for CSV files. - - reflections: Optional list of reflection names to save. If not provided, + - reflections: Optional list of reflection names to save. If not provided, all reflections will be saved. - options_file: Optional YAML file with parsing options. """ - parser = argparse.ArgumentParser(description='Parse JSON reflections and save to CSV.') - parser.add_argument('json_path', type=str, help='Path to the JSON reflections file.') - parser.add_argument('output_dir', type=str, help='Output directory for CSV files.') - parser.add_argument('-r', '--reflections', nargs='*', help='Optional list of reflection names to save.') - parser.add_argument('-o', '--options_file', type=str, default=None, help='YAML file with parsing options.') + parser = argparse.ArgumentParser( + description="Parse JSON reflections and save to CSV." + ) + parser.add_argument( + "json_path", type=str, help="Path to the JSON reflections file." + ) + parser.add_argument( + "output_dir", type=str, help="Output directory for CSV files." 
+ ) + parser.add_argument( + "-r", + "--reflections", + nargs="*", + help="Optional list of reflection names to save.", + ) + parser.add_argument( + "-o", + "--options_file", + type=str, + default=None, + help="YAML file with parsing options.", + ) + parser.add_argument( + "-a", + "--anonymize", + action="store_true", + help="Anonymize output CSV files", + ) args = parser.parse_args() - with open(args.json_path, 'r') as f: + with open(args.json_path, "r") as f: json_string = f.read() if args.options_file: - with open(args.options_file, 'r') as f: + with open(args.options_file, "r") as f: options_yaml = f.read() options = conv.load_parsing_options_from_yaml(options_yaml) else: options = conv.ParsingOptions() reflections_map = conv.parse_json(json_string, options) - conv.save_dataframes_to_csv(reflections_map, args.output_dir, args.reflections) + conv.save_dataframes_to_csv( + reflections_map, + args.output_dir, + do_anonymize=args.anonymize, + filter_list=args.reflections, + ) + if __name__ == "__main__": main()