-
Notifications
You must be signed in to change notification settings - Fork 0
Script to anonymize JSON data #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -106,19 +106,20 @@ def get_post_metric_default(self, metric_kind: str): | |
| ) | ||
| raise | ||
|
|
||
|
|
||
| def convert_timestamp(apple_timestamp: float) -> str: | ||
| """ | ||
| Convert an Apple-style timestamp to a local time string. | ||
|
|
||
| Args: | ||
| apple_timestamp (float): The Apple timestamp to convert. | ||
|
|
||
| Returns: | ||
| str: The timestamp in local time as a string formatted as | ||
| str: The timestamp in local time as a string formatted as | ||
| "YYYY-MM-DD HH:MM:SS". | ||
| """ | ||
| # Convert the Apple timestamp to a Python timestamp | ||
| # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and | ||
| # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and | ||
| # 2001-01-01T00:00:00Z | ||
| timestamp = apple_timestamp + 978307200 | ||
|
|
||
|
|
@@ -352,6 +353,7 @@ def parse_json( | |
def save_dataframes_to_csv(
    reflections_map: Dict[str, pd.DataFrame],
    output_folder: str,
    do_anonymize: Optional[bool] = False,
    filter_list: Optional[List[str]] = None,
) -> None:
    """
    Save each dataframe in ``reflections_map`` to a CSV file.

    Args:
        reflections_map (Dict[str, pd.DataFrame]): Mapping of reflection
            names to their dataframes.
        output_folder (str): Folder where the CSV files are written.
            Created if it does not already exist.
        do_anonymize (Optional[bool]): If True, each dataframe (and its
            name) is passed through ``anonymize`` before saving.
        filter_list (Optional[List[str]]): If given, only reflections whose
            name appears in this list are saved.

    Returns:
        None
    """
    if do_anonymize:
        import nltk

        # quiet=True keeps repeated calls from spamming stdout; the
        # download is a no-op when the corpus is already present.
        nltk.download("words", quiet=True)

    # Make sure the destination exists so to_csv does not fail below.
    os.makedirs(output_folder, exist_ok=True)

    if filter_list is not None:
        reflections_map = {
            k: v for k, v in reflections_map.items() if k in filter_list
        }
    for name, df in reflections_map.items():
        if do_anonymize:
            # anonymize() also replaces the reflection name with a random
            # word so the output filename leaks nothing.
            name, df = anonymize(df)
        df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)
|
|
||
|
|
||
def anonymize(df: pd.DataFrame) -> Tuple[str, pd.DataFrame]:
    """
    Anonymize the input dataframe.

    The transformation:
      * drops the "Notes", "ID" and "Timestamp" columns if present;
      * shifts the "Date" column by a random offset of whole years;
      * applies a random linear transform y = k*x to numerical columns,
        where |k| is uniform in [0.5, 10) and the sign flips with 50%
        probability;
      * replaces truthy values in string (object) columns with a random
        dictionary word;
      * inverts boolean columns with 50% probability;
      * renames every non-"Date" column to a distinct random word.

    NOTE(review): the date offset is drawn independently per dataframe, so
    time correspondence between related dataframes is NOT preserved —
    callers needing aligned shifts must coordinate the offset themselves.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe to be anonymized. The input object is not mutated.

    Returns
    -------
    Tuple[str, pd.DataFrame]
        A random capitalized word (usable as an anonymous name) and the
        anonymized dataframe.
    """
    import random

    import nltk

    # Materialize the corpus once: nltk.corpus.words.words() is expensive,
    # and the original code rebuilt it on every random_word() call.
    corpus = nltk.corpus.words.words()

    def random_word() -> str:
        return random.choice(corpus).capitalize()

    # Step 1: remove identifying columns.
    df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")

    # Step 2: shift the "Date" column by a random number of years.
    # randint(-1000, 999) matches np.random.randint(-1000, 1000), whose
    # upper bound is exclusive.
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"])
        random_offset = pd.tseries.offsets.DateOffset(
            years=random.randint(-1000, 999)
        )
        df["Date"] += random_offset

    # Step 3: scramble values column by column.
    for col in df.columns:
        if df[col].dtype == "object":
            # Truthy values become a random word. NOTE(review): NaN is
            # truthy, so missing values also get replaced — confirm this
            # is intended.
            df[col] = df[col].apply(lambda x: random_word() if x else x)
        elif df[col].dtype in ["int64", "float64"]:
            # Random linear transform y = k*x with a 50% sign flip.
            # NOTE(review): fixed-range rating columns remain recognizable
            # after scaling (see PR discussion).
            k = random.uniform(0.5, 10) * random.choice([-1, 1])
            df[col] = df[col] * k
        elif df[col].dtype == "bool":
            # Invert the whole column with 50% probability.
            if random.choice([True, False]):
                df[col] = ~df[col]

    # Step 4: rename non-"Date" columns to *distinct* random words —
    # sampling without replacement prevents two columns from silently
    # colliding into the same label.
    other_cols = [col for col in df.columns if col != "Date"]
    picks = random.sample(corpus, len(other_cols))
    df = df.rename(
        columns={c: w.capitalize() for c, w in zip(other_cols, picks)}
    )

    name = random_word()

    return name, df
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| import json | ||
| import random | ||
| import argparse | ||
| from nltk.corpus import words, wordnet | ||
| from pathlib import Path | ||
|
|
||
# Build the anonymization vocabulary: every noun and adjective lemma name
# known to WordNet, deduplicated via sets.
def _lemma_names(pos):
    """Return the set of base lemma names for a WordNet part of speech."""
    return {synset.name().split(".", 1)[0] for synset in wordnet.all_synsets(pos)}


nouns = _lemma_names("n")
adjectives = _lemma_names("a")
words_list = list(nouns | adjectives)
|
|
||
|
|
||
def generate_random_word():
    """Pick a random vocabulary entry and title-case it, turning WordNet's
    underscore-joined multi-word lemmas into space-separated words."""
    parts = random.choice(words_list).split("_")
    capitalized = [part.capitalize() for part in parts]
    return " ".join(capitalized)
|
|
||
|
|
||
def update_values(data, word_map):
    """Recursively anonymize ``data`` in place.

    * Strings inside lists, and the non-empty values of ``"name"`` /
      ``"group"`` keys, are replaced via ``word_map`` so identical
      originals map to the same replacement everywhere.
    * A ``"choices"`` list is rebuilt from fresh random words (unmapped).
    * Everything else is traversed recursively; scalars are left alone.

    Returns the (mutated) ``data`` for convenience.
    """

    def mapped(original):
        # Reuse a prior replacement so repeated values stay consistent.
        if original not in word_map:
            word_map[original] = generate_random_word()
        return word_map[original]

    if isinstance(data, dict):
        for key, value in data.items():
            if key in ("name", "group") and value != "":
                data[key] = mapped(value)
            elif key == "choices" and isinstance(value, list):
                data[key] = [generate_random_word() for _ in value]
            else:
                update_values(value, word_map)
    elif isinstance(data, list):
        for index, item in enumerate(data):
            if isinstance(item, str):
                data[index] = mapped(item)
            else:
                update_values(item, word_map)
    return data
|
|
||
|
|
||
def main(input_file, output_file):
    """Anonymize the JSON document at ``input_file`` and write the result
    to ``output_file``."""
    with open(input_file) as source:
        document = json.load(source)

    # One shared map keeps identical original values consistent file-wide.
    replacements = {}
    anonymized = update_values(document, replacements)

    with open(output_file, "w") as sink:
        json.dump(anonymized, sink, indent=4)
|
|
||
|
|
||
if __name__ == "__main__":
    # Command-line entry point: anonymize a single JSON file.
    cli = argparse.ArgumentParser()
    cli.add_argument("input_file", help="The path of the input JSON file")
    cli.add_argument(
        "-o", "--output_file", help="The path of the output JSON file"
    )
    args = cli.parse_args()

    # Default output name: "<stem>_anonymized<suffix>". Note the parent
    # directory is dropped, so the file lands in the current directory.
    if args.output_file is None:
        src = Path(args.input_file)
        args.output_file = src.stem + "_anonymized" + src.suffix

    main(args.input_file, args.output_file)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be applied to all the dataframes equally so there is still time correspondence
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
all reflections get shifted by a random year offset?