From 8f3754a2c5617dd4ea4d173314c17e6dab2f2963 Mon Sep 17 00:00:00 2001
From: Syler
Date: Mon, 26 Jun 2023 17:17:01 -0400
Subject: [PATCH 1/4] Add script to anonymize reflection templates

---
 scripts/anonymize.py | 68 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100755 scripts/anonymize.py

diff --git a/scripts/anonymize.py b/scripts/anonymize.py
new file mode 100755
index 0000000..37290d7
--- /dev/null
+++ b/scripts/anonymize.py
@@ -0,0 +1,68 @@
+import json
+import random
+import argparse
+from nltk.corpus import words, wordnet
+from pathlib import Path
+
+# Generate a list of nouns and adjectives
+nouns = {x.name().split(".", 1)[0] for x in wordnet.all_synsets("n")}
+adjectives = {x.name().split(".", 1)[0] for x in wordnet.all_synsets("a")}
+words_list = list(nouns | adjectives)
+
+
+def generate_random_word():
+    word = random.choice(words_list)
+    return " ".join(part.capitalize() for part in word.split("_"))
+
+
+def update_values(data, word_map):
+    if isinstance(data, list):
+        for i in range(len(data)):
+            if isinstance(data[i], str):
+                if data[i] not in word_map:
+                    word_map[data[i]] = generate_random_word()
+                data[i] = word_map[data[i]]
+            else:
+                update_values(data[i], word_map)
+    elif isinstance(data, dict):
+        for key, value in data.items():
+            if key in ["name", "group"] and value != "":
+                if value not in word_map:
+                    word_map[value] = generate_random_word()
+                data[key] = word_map[value]
+            elif key == "choices" and isinstance(value, list):
+                data[key] = [generate_random_word() for _ in value]
+            else:
+                update_values(value, word_map)
+    return data
+
+
+def main(input_file, output_file):
+    # Load the JSON file
+    with open(input_file) as f:
+        data = json.load(f)
+
+    group_map = {}
+    # Update the values
+    updated_data = update_values(data, group_map)
+
+    # Write to the new JSON file
+    with open(output_file, "w") as f:
+        json.dump(updated_data, f, indent=4)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_file", help="The path of the input JSON file")
+    parser.add_argument(
+        "-o", "--output_file", help="The path of the output JSON file"
+    )
+
+    args = parser.parse_args()
+
+    if args.output_file is None:
+        # If no output file is provided, use the input file name with '_anonymized' appended
+        input_path = Path(args.input_file)
+        args.output_file = input_path.stem + "_anonymized" + input_path.suffix
+
+    main(args.input_file, args.output_file)

From cd16f417f66bee9d2da813ea37ca140ab7282597 Mon Sep 17 00:00:00 2001
From: Syler
Date: Tue, 4 Jul 2023 15:07:20 -0400
Subject: [PATCH 2/4] Function to anonymize output CSV data

---
 reflect/conversions.py | 65 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 62 insertions(+), 3 deletions(-)

diff --git a/reflect/conversions.py b/reflect/conversions.py
index 26d732b..a430eee 100644
--- a/reflect/conversions.py
+++ b/reflect/conversions.py
@@ -106,19 +106,20 @@ def get_post_metric_default(self, metric_kind: str):
            )
            raise

+
 def convert_timestamp(apple_timestamp: float) -> str:
     """
     Convert an Apple-style timestamp to a local time string.
-    
+
     Args:
         apple_timestamp (float): The Apple timestamp to convert.

     Returns:
-        str: The timestamp in local time as a string formatted as 
+        str: The timestamp in local time as a string formatted as
         "YYYY-MM-DD HH:MM:SS".
     """
     # Convert the Apple timestamp to a Python timestamp
-    # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and 
+    # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and
     # 2001-01-01T00:00:00Z
     timestamp = apple_timestamp + 978307200

@@ -352,6 +353,8 @@ def parse_json(
 def save_dataframes_to_csv(
     reflections_map: Dict[str, pd.DataFrame],
     output_folder: str,
+    anonymize: Optional[bool] = False,
+    set_all_nan: Optional[bool] = False,
     filter_list: Optional[List[str]] = None,
 ) -> None:
     """
@@ -373,4 +376,60 @@
             k: v for k, v in reflections_map.items() if k in filter_list
         }
     for name, df in reflections_map.items():
+        if anonymize:
+            df = anonymize(df, set_all_nan)
         df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)
+
+
+def anonymize(
+    df: pd.DataFrame, set_all_nan: Optional[bool] = False
+) -> pd.DataFrame:
+    """
+    Anonymize the input dataframe by removing specific columns, adding a random
+    time offset to the "Date" column, setting all other values to NaN if s
+    et_all_nan is True, and replacing other column names with a random
+    capitalized word.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Input dataframe to be anonymized.
+
+    set_all_nan : Optional[bool]
+        If True, set all values in the dataframe to NaN except for the "Date"
+        column.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        The anonymized dataframe.
+    """
+    import nltk
+    import random
+
+    nltk.download("words")
+
+    # Step 1: Remove specified columns
+    df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")
+
+    # Step 2: Add random time offset to "Date" column
+    if "Date" in df.columns:
+        random_offset = pd.series.DateOffset(
+            years=np.random.randint(-1000, 1000)
+        )
+        df["Date"] += random_offset
+
+    # Step 3: Set all values in dataframe to NaN if set_all_nan is True
+    if set_all_nan:
+        for col in df.columns:
+            if col != "Date":
+                df[col] = np.nan
+
+    # Step 4: Replace other column names with random capitalized word
+    def random_word():
+        return random.choice(nltk.corpus.words.words()).capitalize()
+
+    new_columns = {col: random_word() for col in df.columns if col != "Date"}
+    df = df.rename(columns=new_columns)
+
+    return df

From d717cf232be771bcbaf011554d021db33af514dd Mon Sep 17 00:00:00 2001
From: Syler
Date: Tue, 4 Jul 2023 15:41:12 -0400
Subject: [PATCH 3/4] Pass anonymization options to json_to_csv.py

---
 reflect/conversions.py | 31 +++++++++++++---------
 scripts/json_to_csv.py | 58 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 66 insertions(+), 23 deletions(-)

diff --git a/reflect/conversions.py b/reflect/conversions.py
index a430eee..eec2133 100644
--- a/reflect/conversions.py
+++ b/reflect/conversions.py
@@ -353,7 +353,7 @@ def parse_json(
 def save_dataframes_to_csv(
     reflections_map: Dict[str, pd.DataFrame],
     output_folder: str,
-    anonymize: Optional[bool] = False,
+    do_anonymize: Optional[bool] = False,
     set_all_nan: Optional[bool] = False,
     filter_list: Optional[List[str]] = None,
 ) -> None:
@@ -371,23 +371,27 @@
     Returns:
         None
     """
+    if do_anonymize:
+        import nltk
+        nltk.download("words")
+
     if filter_list is not None:
         reflections_map = {
             k: v for k, v in reflections_map.items() if k in filter_list
         }
     for name, df in reflections_map.items():
-        if anonymize:
-            df = anonymize(df, set_all_nan)
+        if do_anonymize:
+            name, df = anonymize(df, set_all_nan)
         df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)


 def anonymize(
     df: pd.DataFrame, set_all_nan: Optional[bool] = False
-) -> pd.DataFrame:
+) -> Tuple[str, pd.DataFrame]:
     """
     Anonymize the input dataframe by removing specific columns, adding a random
-    time offset to the "Date" column, setting all other values to NaN if s
-    et_all_nan is True, and replacing other column names with a random
+    time offset to the "Date" column, setting all other values to NaN if
+    set_all_nan is True, and replacing other column names with a random
     capitalized word.

     Parameters
@@ -401,25 +405,26 @@ def anonymize(

     Returns
     -------
-    df : pd.DataFrame
-        The anonymized dataframe.
+    Tuple[str, pd.DataFrame]
+        A tuple where the first element is a random capitalized word and the second
+        element is the anonymized dataframe.
     """
     import nltk
     import random

-    nltk.download("words")
-
     # Step 1: Remove specified columns
     df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")

     # Step 2: Add random time offset to "Date" column
     if "Date" in df.columns:
-        random_offset = pd.series.DateOffset(
+        df["Date"] = pd.to_datetime(df["Date"])
+        random_offset = pd.tseries.offsets.DateOffset(
             years=np.random.randint(-1000, 1000)
         )
         df["Date"] += random_offset

     # Step 3: Set all values in dataframe to NaN if set_all_nan is True
+    # TODO(@syler): do some nicer data transformations here
     if set_all_nan:
         for col in df.columns:
             if col != "Date":
@@ -432,4 +437,6 @@ def random_word():
     new_columns = {col: random_word() for col in df.columns if col != "Date"}
     df = df.rename(columns=new_columns)

-    return df
+    name = random_word()
+
+    return name, df
diff --git a/scripts/json_to_csv.py b/scripts/json_to_csv.py
index cbeeb4b..5a0d694 100644
--- a/scripts/json_to_csv.py
+++ b/scripts/json_to_csv.py
@@ -8,37 +8,73 @@ def main():
     Each reflection in the JSON file will be parsed into a separate CSV file.
     The CSV files will be saved in the specified output directory.

-    An optional list of reflection names can be provided to filter which 
-    reflections to save. Additionally, parsing options can be loaded from a 
+    An optional list of reflection names can be provided to filter which
+    reflections to save. Additionally, parsing options can be loaded from a
     YAML file.

     Command-line arguments:
     - json_path: Path to the JSON reflections file.
     - output_dir: Output directory for CSV files.
-    - reflections: Optional list of reflection names to save. If not provided, 
+    - reflections: Optional list of reflection names to save. If not provided,
       all reflections will be saved.
     - options_file: Optional YAML file with parsing options.
     """
-    parser = argparse.ArgumentParser(description='Parse JSON reflections and save to CSV.')
-    parser.add_argument('json_path', type=str, help='Path to the JSON reflections file.')
-    parser.add_argument('output_dir', type=str, help='Output directory for CSV files.')
-    parser.add_argument('-r', '--reflections', nargs='*', help='Optional list of reflection names to save.')
-    parser.add_argument('-o', '--options_file', type=str, default=None, help='YAML file with parsing options.')
+    parser = argparse.ArgumentParser(
+        description="Parse JSON reflections and save to CSV."
+    )
+    parser.add_argument(
+        "json_path", type=str, help="Path to the JSON reflections file."
+    )
+    parser.add_argument(
+        "output_dir", type=str, help="Output directory for CSV files."
+    )
+    parser.add_argument(
+        "-r",
+        "--reflections",
+        nargs="*",
+        help="Optional list of reflection names to save.",
+    )
+    parser.add_argument(
+        "-o",
+        "--options_file",
+        type=str,
+        default=None,
+        help="YAML file with parsing options.",
+    )
+    parser.add_argument(
+        "-a",
+        "--anonymize",
+        action="store_true",
+        help="Anonymize output CSV files",
+    )
+    parser.add_argument(
+        "-n",
+        "--nan",
+        action="store_true",
+        help="Write all values of CSV to NaN when anonymizing",
+    )

     args = parser.parse_args()

-    with open(args.json_path, 'r') as f:
+    with open(args.json_path, "r") as f:
         json_string = f.read()

     if args.options_file:
-        with open(args.options_file, 'r') as f:
+        with open(args.options_file, "r") as f:
             options_yaml = f.read()
         options = conv.load_parsing_options_from_yaml(options_yaml)
     else:
         options = conv.ParsingOptions()

     reflections_map = conv.parse_json(json_string, options)
-    conv.save_dataframes_to_csv(reflections_map, args.output_dir, args.reflections)
+    conv.save_dataframes_to_csv(
+        reflections_map,
+        args.output_dir,
+        do_anonymize=args.anonymize,
+        set_all_nan=args.nan,
+        filter_list=args.reflections,
+    )
+

 if __name__ == "__main__":
     main()

From da6d8048875d55e4c41774bac9ec29a4672f9ecb Mon Sep 17 00:00:00 2001
From: Syler
Date: Tue, 4 Jul 2023 15:49:54 -0400
Subject: [PATCH 4/4] Anonymize strings, bool, linear transformation

---
 reflect/conversions.py | 53 ++++++++++++++++++++++++------------------
 scripts/json_to_csv.py |  7 -------
 2 files changed, 31 insertions(+), 29 deletions(-)

diff --git a/reflect/conversions.py b/reflect/conversions.py
index eec2133..25138c9 100644
--- a/reflect/conversions.py
+++ b/reflect/conversions.py
@@ -354,7 +354,6 @@ def save_dataframes_to_csv(
     reflections_map: Dict[str, pd.DataFrame],
     output_folder: str,
     do_anonymize: Optional[bool] = False,
-    set_all_nan: Optional[bool] = False,
     filter_list: Optional[List[str]] = None,
 ) -> None:
     """
@@ -373,6 +372,7 @@ def save_dataframes_to_csv(
     """
     if do_anonymize:
         import nltk
+
         nltk.download("words")

     if filter_list is not None:
@@ -381,33 +381,36 @@
         }
     for name, df in reflections_map.items():
         if do_anonymize:
-            name, df = anonymize(df, set_all_nan)
+            name, df = anonymize(df)
         df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)


-def anonymize(
-    df: pd.DataFrame, set_all_nan: Optional[bool] = False
-) -> Tuple[str, pd.DataFrame]:
+def anonymize(df: pd.DataFrame) -> Tuple[str, pd.DataFrame]:
     """
     Anonymize the input dataframe by removing specific columns, adding a random
-    time offset to the "Date" column, setting all other values to NaN if
-    set_all_nan is True, and replacing other column names with a random
-    capitalized word.
+    time offset to the "Date" column, and replacing other column names with a
+    random capitalized word.
+
+    Applies a random linear transformation of the form y = kx to any numerical
+    columns, where k is a random floating point number between 0.5 and 10 and a
+    sign flip is applied with 50% chance.
+
+    Any non-empty string values in the dataframe are replaced with a random
+    word.
+
+    If a column has boolean values, randomly flip the sign of the entire column
+    with a 50% chance.

     Parameters
     ----------
     df : pd.DataFrame
         Input dataframe to be anonymized.

-    set_all_nan : Optional[bool]
-        If True, set all values in the dataframe to NaN except for the "Date"
-        column.
-
     Returns
     -------
     Tuple[str, pd.DataFrame]
-        A tuple where the first element is a random capitalized word and the second
-        element is the anonymized dataframe.
+        A tuple where the first element is a random capitalized word and the
+        second element is the anonymized dataframe.
     """
     import nltk
     import random
@@ -423,17 +426,23 @@
         )
         df["Date"] += random_offset

-    # Step 3: Set all values in dataframe to NaN if set_all_nan is True
-    # TODO(@syler): do some nicer data transformations here
-    if set_all_nan:
-        for col in df.columns:
-            if col != "Date":
-                df[col] = np.nan
-
-    # Step 4: Replace other column names with random capitalized word
     def random_word():
         return random.choice(nltk.corpus.words.words()).capitalize()

+    # Step 3: Set all values in dataframe to NaN if set_all_nan is True
+    # and apply a random linear transformation to numerical columns
+    # and replace non-empty strings with a random word
+    for col in df.columns:
+        if df[col].dtype == "object":
+            df[col] = df[col].apply(lambda x: random_word() if x else x)
+        elif df[col].dtype in ["int64", "float64"]:
+            k = random.uniform(0.5, 10) * random.choice([-1, 1])
+            df[col] = df[col] * k
+        elif df[col].dtype == "bool":
+            if random.choice([True, False]):
+                df[col] = ~df[col]
+
+    # Step 4: Replace other column names with random capitalized word
     new_columns = {col: random_word() for col in df.columns if col != "Date"}
     df = df.rename(columns=new_columns)

diff --git a/scripts/json_to_csv.py b/scripts/json_to_csv.py
index 5a0d694..35ef12c 100644
--- a/scripts/json_to_csv.py
+++ b/scripts/json_to_csv.py
@@ -47,12 +47,6 @@ def main():
         action="store_true",
         help="Anonymize output CSV files",
     )
-    parser.add_argument(
-        "-n",
-        "--nan",
-        action="store_true",
-        help="Write all values of CSV to NaN when anonymizing",
-    )

     args = parser.parse_args()

@@ -71,7 +65,6 @@ def main():
         reflections_map,
         args.output_dir,
         do_anonymize=args.anonymize,
-        set_all_nan=args.nan,
         filter_list=args.reflections,
     )
