-
Notifications
You must be signed in to change notification settings - Fork 0
Script to anonymize JSON data #8
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -106,19 +106,20 @@ def get_post_metric_default(self, metric_kind: str): | |
| ) | ||
| raise | ||
|
|
||
|
|
||
| def convert_timestamp(apple_timestamp: float) -> str: | ||
| """ | ||
| Convert an Apple-style timestamp to a local time string. | ||
|
|
||
| Args: | ||
| apple_timestamp (float): The Apple timestamp to convert. | ||
|
|
||
| Returns: | ||
| str: The timestamp in local time as a string formatted as | ||
| str: The timestamp in local time as a string formatted as | ||
| "YYYY-MM-DD HH:MM:SS". | ||
| """ | ||
| # Convert the Apple timestamp to a Python timestamp | ||
| # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and | ||
| # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and | ||
| # 2001-01-01T00:00:00Z | ||
| timestamp = apple_timestamp + 978307200 | ||
|
|
||
|
|
@@ -352,6 +353,7 @@ def parse_json( | |
def save_dataframes_to_csv(
    reflections_map: Dict[str, pd.DataFrame],
    output_folder: str,
    do_anonymize: Optional[bool] = False,
    filter_list: Optional[List[str]] = None,
) -> None:
    """
    Save each dataframe in ``reflections_map`` to a CSV file.

    Args:
        reflections_map (Dict[str, pd.DataFrame]): Mapping of reflection
            names to their dataframes.
        output_folder (str): Folder where the CSV files are written.
            Created if it does not already exist.
        do_anonymize (Optional[bool]): If True, each dataframe (and its
            name) is passed through ``anonymize`` before saving.
        filter_list (Optional[List[str]]): If given, only reflections whose
            name appears in this list are saved.

    Returns:
        None
    """
    if do_anonymize:
        import nltk

        # quiet=True keeps repeated calls from spamming stdout; the
        # download is a no-op when the corpus is already present.
        nltk.download("words", quiet=True)

    # Make sure the destination exists so to_csv does not fail below.
    os.makedirs(output_folder, exist_ok=True)

    if filter_list is not None:
        reflections_map = {
            k: v for k, v in reflections_map.items() if k in filter_list
        }
    for name, df in reflections_map.items():
        if do_anonymize:
            # anonymize() also replaces the reflection name with a random
            # word so the output filename leaks nothing.
            name, df = anonymize(df)
        df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)
|
|
||
|
|
||
def anonymize(df: pd.DataFrame) -> Tuple[str, pd.DataFrame]:
    """
    Anonymize the input dataframe.

    The transformation:
      * drops the "Notes", "ID" and "Timestamp" columns if present;
      * shifts the "Date" column by a random offset of whole years;
      * applies a random linear transform y = k*x to numerical columns,
        where |k| is uniform in [0.5, 10) and the sign flips with 50%
        probability;
      * replaces truthy values in string (object) columns with a random
        dictionary word;
      * inverts boolean columns with 50% probability;
      * renames every non-"Date" column to a distinct random word.

    NOTE(review): the date offset is drawn independently per dataframe, so
    time correspondence between related dataframes is NOT preserved —
    callers needing aligned shifts must coordinate the offset themselves.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe to be anonymized. The input object is not mutated.

    Returns
    -------
    Tuple[str, pd.DataFrame]
        A random capitalized word (usable as an anonymous name) and the
        anonymized dataframe.
    """
    import random

    import nltk

    # Materialize the corpus once: nltk.corpus.words.words() is expensive,
    # and the original code rebuilt it on every random_word() call.
    corpus = nltk.corpus.words.words()

    def random_word() -> str:
        return random.choice(corpus).capitalize()

    # Step 1: remove identifying columns.
    df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")

    # Step 2: shift the "Date" column by a random number of years.
    # randint(-1000, 999) matches np.random.randint(-1000, 1000), whose
    # upper bound is exclusive.
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"])
        random_offset = pd.tseries.offsets.DateOffset(
            years=random.randint(-1000, 999)
        )
        df["Date"] += random_offset

    # Step 3: scramble values column by column.
    for col in df.columns:
        if df[col].dtype == "object":
            # Truthy values become a random word. NOTE(review): NaN is
            # truthy, so missing values also get replaced — confirm this
            # is intended.
            df[col] = df[col].apply(lambda x: random_word() if x else x)
        elif df[col].dtype in ["int64", "float64"]:
            # Random linear transform y = k*x with a 50% sign flip.
            # NOTE(review): fixed-range rating columns remain recognizable
            # after scaling (see PR discussion).
            k = random.uniform(0.5, 10) * random.choice([-1, 1])
            df[col] = df[col] * k
        elif df[col].dtype == "bool":
            # Invert the whole column with 50% probability.
            if random.choice([True, False]):
                df[col] = ~df[col]

    # Step 4: rename non-"Date" columns to *distinct* random words —
    # sampling without replacement prevents two columns from silently
    # colliding into the same label.
    other_cols = [col for col in df.columns if col != "Date"]
    picks = random.sample(corpus, len(other_cols))
    df = df.rename(
        columns={c: w.capitalize() for c, w in zip(other_cols, picks)}
    )

    name = random_word()

    return name, df
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,68 @@ | ||
| import json | ||
| import random | ||
| import argparse | ||
| from nltk.corpus import words, wordnet | ||
| from pathlib import Path | ||
|
|
||
# Build the anonymization vocabulary: every noun and adjective lemma name
# known to WordNet, deduplicated via sets.
def _lemma_names(pos):
    """Return the set of base lemma names for a WordNet part of speech."""
    return {synset.name().split(".", 1)[0] for synset in wordnet.all_synsets(pos)}


nouns = _lemma_names("n")
adjectives = _lemma_names("a")
words_list = list(nouns | adjectives)
|
|
||
|
|
||
def generate_random_word():
    """Pick a random vocabulary entry and title-case it, turning WordNet's
    underscore-joined multi-word lemmas into space-separated words."""
    parts = random.choice(words_list).split("_")
    capitalized = [part.capitalize() for part in parts]
    return " ".join(capitalized)
|
|
||
|
|
||
def update_values(data, word_map):
    """Recursively anonymize ``data`` in place.

    * Strings inside lists, and the non-empty values of ``"name"`` /
      ``"group"`` keys, are replaced via ``word_map`` so identical
      originals map to the same replacement everywhere.
    * A ``"choices"`` list is rebuilt from fresh random words (unmapped).
    * Everything else is traversed recursively; scalars are left alone.

    Returns the (mutated) ``data`` for convenience.
    """

    def mapped(original):
        # Reuse a prior replacement so repeated values stay consistent.
        if original not in word_map:
            word_map[original] = generate_random_word()
        return word_map[original]

    if isinstance(data, dict):
        for key, value in data.items():
            if key in ("name", "group") and value != "":
                data[key] = mapped(value)
            elif key == "choices" and isinstance(value, list):
                data[key] = [generate_random_word() for _ in value]
            else:
                update_values(value, word_map)
    elif isinstance(data, list):
        for index, item in enumerate(data):
            if isinstance(item, str):
                data[index] = mapped(item)
            else:
                update_values(item, word_map)
    return data
|
|
||
|
|
||
def main(input_file, output_file):
    """Anonymize the JSON document at ``input_file`` and write the result
    to ``output_file``."""
    with open(input_file) as source:
        document = json.load(source)

    # One shared map keeps identical original values consistent file-wide.
    replacements = {}
    anonymized = update_values(document, replacements)

    with open(output_file, "w") as sink:
        json.dump(anonymized, sink, indent=4)
|
|
||
|
|
||
if __name__ == "__main__":
    # Command-line entry point: anonymize a single JSON file.
    cli = argparse.ArgumentParser()
    cli.add_argument("input_file", help="The path of the input JSON file")
    cli.add_argument(
        "-o", "--output_file", help="The path of the output JSON file"
    )
    args = cli.parse_args()

    # Default output name: "<stem>_anonymized<suffix>". Note the parent
    # directory is dropped, so the file lands in the current directory.
    if args.output_file is None:
        src = Path(args.input_file)
        args.output_file = src.stem + "_anonymized" + src.suffix

    main(args.input_file, args.output_file)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This should be applied to all the dataframes equally so there is still time correspondence
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
all reflections get shifted by a random year offset?