81 changes: 78 additions & 3 deletions reflect/conversions.py
@@ -106,19 +106,20 @@ def get_post_metric_default(self, metric_kind: str):
        )
        raise


def convert_timestamp(apple_timestamp: float) -> str:
    """
    Convert an Apple-style timestamp to a local time string.

    Args:
        apple_timestamp (float): The Apple timestamp to convert.

    Returns:
        str: The timestamp in local time as a string formatted as
        "YYYY-MM-DD HH:MM:SS".
    """
    # Convert the Apple timestamp to a Python timestamp.
    # 978307200 is the number of seconds between 1970-01-01T00:00:00Z and
    # 2001-01-01T00:00:00Z.
    timestamp = apple_timestamp + 978307200
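The rest of convert_timestamp is collapsed in the diff. A minimal sketch of how the conversion presumably finishes, assuming the standard library's datetime (illustrative, not the folded code):

    from datetime import datetime

    def convert_timestamp_sketch(apple_timestamp: float) -> str:
        # Shift from the Apple epoch (2001-01-01T00:00:00Z) to the Unix epoch.
        timestamp = apple_timestamp + 978307200
        # Render in local time using the documented format.
        return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")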

@@ -352,6 +353,7 @@ def parse_json(
def save_dataframes_to_csv(
    reflections_map: Dict[str, pd.DataFrame],
    output_folder: str,
    do_anonymize: Optional[bool] = False,
    filter_list: Optional[List[str]] = None,
) -> None:
    """
@@ -368,9 +370,82 @@ def save_dataframes_to_csv(
    Returns:
        None
    """
    if do_anonymize:
        import nltk

        nltk.download("words")

    if filter_list is not None:
        reflections_map = {
            k: v for k, v in reflections_map.items() if k in filter_list
        }
    for name, df in reflections_map.items():
        if do_anonymize:
            name, df = anonymize(df)
        df.to_csv(os.path.join(output_folder, f"{name}.csv"), index=False)
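For context, a hypothetical call that writes only selected reflections, anonymized; the reflection names here are illustrative:

    save_dataframes_to_csv(
        reflections_map,
        "output/",
        do_anonymize=True,
        filter_list=["Mood", "Sleep"],
    )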


def anonymize(df: pd.DataFrame) -> Tuple[str, pd.DataFrame]:
    """
    Anonymize the input dataframe by removing specific columns, adding a
    random time offset to the "Date" column, and replacing the remaining
    column names with random capitalized words.

    Applies a random linear transformation of the form y = kx to any
    numerical column, where k is a random floating-point number between
    0.5 and 10 whose sign is flipped with 50% probability.

    Any non-empty string values in the dataframe are replaced with a
    random word.

    If a column has boolean values, the entire column is negated with
    50% probability.

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe to be anonymized.

    Returns
    -------
    Tuple[str, pd.DataFrame]
        A tuple where the first element is a random capitalized word and the
        second element is the anonymized dataframe.
    """
    import nltk
    import random

    # Step 1: Remove specified columns
    df = df.drop(columns=["Notes", "ID", "Timestamp"], errors="ignore")

    # Step 2: Add random time offset to "Date" column
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"])
        random_offset = pd.tseries.offsets.DateOffset(
            years=np.random.randint(-1000, 1000)
        )
        df["Date"] += random_offset
Comment on lines +421 to +427 (Contributor Author): This should be applied to all the dataframes equally so there is still time correspondence.

Reply: all reflections get shifted by a random year offset?
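A sketch of the fix the thread is pointing at: draw one offset up front and reuse it for every reflection so relative timing is preserved. Function names here are illustrative, not from the PR:

    import numpy as np
    import pandas as pd

    def make_shared_offset() -> pd.tseries.offsets.DateOffset:
        # One offset for the whole export, drawn once.
        return pd.tseries.offsets.DateOffset(years=np.random.randint(-1000, 1000))

    def shift_dates(df: pd.DataFrame, offset: pd.tseries.offsets.DateOffset) -> pd.DataFrame:
        # Apply the same shared offset to each dataframe's "Date" column.
        if "Date" in df.columns:
            df = df.copy()
            df["Date"] = pd.to_datetime(df["Date"]) + offset
        return df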


    def random_word():
        return random.choice(nltk.corpus.words.words()).capitalize()

    # Step 3: Apply a random linear transformation to numerical columns
    # and replace non-empty strings with a random word
    for col in df.columns:
        if df[col].dtype == "object":
Comment: object means string? is there any way to narrow this down further? (One possible narrowing is sketched after this function.)

            df[col] = df[col].apply(lambda x: random_word() if x else x)
        elif df[col].dtype in ["int64", "float64"]:
            k = random.uniform(0.5, 10) * random.choice([-1, 1])
            df[col] = df[col] * k
Comment: if it's a rating type, won't this be obvious given our current fixed range? what is this protecting from?

        elif df[col].dtype == "bool":
            if random.choice([True, False]):
                df[col] = ~df[col]

    # Step 4: Replace other column names with random capitalized words
    new_columns = {col: random_word() for col in df.columns if col != "Date"}
    df = df.rename(columns=new_columns)

    name = random_word()

    return name, df
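Picking up the reviewer's question about the object dtype: pandas stores strings, and any other Python objects, under object, so the check can be narrowed by letting pandas inspect the actual values. A sketch, not part of the PR:

    import pandas as pd

    def is_string_column(s: pd.Series) -> bool:
        # infer_dtype examines the values and returns e.g. "string",
        # "integer", or "mixed", so only genuine text columns pass.
        return pd.api.types.infer_dtype(s, skipna=True) == "string"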
68 changes: 68 additions & 0 deletions scripts/anonymize.py
@@ -0,0 +1,68 @@
import argparse
import json
import random
from pathlib import Path

from nltk.corpus import wordnet

# Generate a list of nouns and adjectives from WordNet
nouns = {x.name().split(".", 1)[0] for x in wordnet.all_synsets("n")}
adjectives = {x.name().split(".", 1)[0] for x in wordnet.all_synsets("a")}
words_list = list(nouns | adjectives)


def generate_random_word():
    word = random.choice(words_list)
    return " ".join(part.capitalize() for part in word.split("_"))


def update_values(data, word_map):
    if isinstance(data, list):
        for i in range(len(data)):
            if isinstance(data[i], str):
                if data[i] not in word_map:
                    word_map[data[i]] = generate_random_word()
                data[i] = word_map[data[i]]
            else:
                update_values(data[i], word_map)
    elif isinstance(data, dict):
        for key, value in data.items():
            if key in ["name", "group"] and value != "":
                if value not in word_map:
                    word_map[value] = generate_random_word()
                data[key] = word_map[value]
            elif key == "choices" and isinstance(value, list):
                data[key] = [generate_random_word() for _ in value]
            else:
                update_values(value, word_map)
    return data
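For intuition, a quick example of update_values on a reflection-like dict, assuming the WordNet corpus is available; the keys "name" and "choices" come from the code above, the sample data is made up:

    sample = {"name": "Mood", "choices": ["bad", "ok", "good"], "entries": ["bad"]}
    word_map = {}
    out = update_values(sample, word_map)
    # "Mood" and the strings under "entries" are replaced consistently via
    # word_map; "choices" gets fresh random words each time.
    print(out, word_map)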


def main(input_file, output_file):
    # Load the JSON file
    with open(input_file) as f:
        data = json.load(f)

    word_map = {}
    # Update the values
    updated_data = update_values(data, word_map)

    # Write to the new JSON file
    with open(output_file, "w") as f:
        json.dump(updated_data, f, indent=4)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("input_file", help="The path of the input JSON file")
    parser.add_argument(
        "-o", "--output_file", help="The path of the output JSON file"
    )

    args = parser.parse_args()

    if args.output_file is None:
        # If no output file is provided, use the input file name with
        # '_anonymized' appended.
        input_path = Path(args.input_file)
        args.output_file = input_path.stem + "_anonymized" + input_path.suffix

    main(args.input_file, args.output_file)
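A hypothetical invocation (the input path is illustrative):

    python scripts/anonymize.py reflections.json -o reflections_anon.json

With no -o, the output defaults to the input name with _anonymized appended, e.g. reflections_anonymized.json.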
51 changes: 40 additions & 11 deletions scripts/json_to_csv.py
@@ -8,37 +8,66 @@ def main():

    Each reflection in the JSON file will be parsed into a separate CSV file.
    The CSV files will be saved in the specified output directory.
    An optional list of reflection names can be provided to filter which
    reflections to save. Additionally, parsing options can be loaded from a
    YAML file.

    Command-line arguments:
    - json_path: Path to the JSON reflections file.
    - output_dir: Output directory for CSV files.
    - reflections: Optional list of reflection names to save. If not provided,
      all reflections will be saved.
    - options_file: Optional YAML file with parsing options.
    - anonymize: Optionally anonymize the output CSV files.
    """
    parser = argparse.ArgumentParser(
        description="Parse JSON reflections and save to CSV."
    )
    parser.add_argument(
        "json_path", type=str, help="Path to the JSON reflections file."
    )
    parser.add_argument(
        "output_dir", type=str, help="Output directory for CSV files."
    )
    parser.add_argument(
        "-r",
        "--reflections",
        nargs="*",
        help="Optional list of reflection names to save.",
    )
    parser.add_argument(
        "-o",
        "--options_file",
        type=str,
        default=None,
        help="YAML file with parsing options.",
    )
    parser.add_argument(
        "-a",
        "--anonymize",
        action="store_true",
        help="Anonymize output CSV files",
    )

    args = parser.parse_args()

    with open(args.json_path, "r") as f:
        json_string = f.read()

    if args.options_file:
        with open(args.options_file, "r") as f:
            options_yaml = f.read()
        options = conv.load_parsing_options_from_yaml(options_yaml)
    else:
        options = conv.ParsingOptions()

    reflections_map = conv.parse_json(json_string, options)
    conv.save_dataframes_to_csv(
        reflections_map,
        args.output_dir,
        do_anonymize=args.anonymize,
        filter_list=args.reflections,
    )


if __name__ == "__main__":
    main()
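A hypothetical run combining the new flag with the existing options (paths and reflection names illustrative):

    python scripts/json_to_csv.py reflections.json out/ -r Mood Sleep -a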