-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathdata_preprocessing.py
More file actions
61 lines (50 loc) · 2.25 KB
/
data_preprocessing.py
File metadata and controls
61 lines (50 loc) · 2.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import pandas as pd
# Keyword strings that preprocess_data passes to filter_excluded_strings;
# they are omitted from the keyword-frequency report that function prints.
excluded_strings = [
    'normal fundus',
    'diabetes',
    'glaucoma',
    'cataract',
    'age related macular degeneration',
    'hypertension',
    'pathological myopia',
    'other diseases/abnormalities',
]
# Maps a diagnostic keyword (searched for case-insensitively inside the
# left-eye and right-eye keyword columns) to the name of the 0/1 indicator
# column that encode_disease_labels writes for it.
disease_labels = {
    "moderate non proliferative retinopathy": "MONR",
    "mild nonproliferative retinopathy": "MINR",
    "dry age-related macular degeneration": "DAMD",
    "severe nonproliferative retinopathy": "SNR",
    "drusen": "DR",
}
def clean_encoding_issues(df):
    """Normalize full-width commas in both diagnostic-keyword columns.

    Every full-width comma (U+FF0C) found in 'Left-Diagnostic Keywords' and
    'Right-Diagnostic Keywords' is replaced with an ASCII comma. Cells that
    are not strings (for example missing values) are passed through as-is.

    Args:
        df: DataFrame holding both diagnostic-keyword columns. It is
            modified in place.

    Returns:
        The same DataFrame object, with both keyword columns rewritten.

    Raises:
        KeyError: If either keyword column is absent from ``df``.
    """
    def replace_commas(disease_string):
        # Missing cells arrive as NaN/None; hand them back unchanged instead
        # of failing on a value that has no .replace method.
        if not isinstance(disease_string, str):
            return disease_string
        # Spelled as an escape so the full-width character cannot be silently
        # flattened into an ASCII comma, which would make this call a no-op.
        return disease_string.replace('\uff0c', ',')

    for column in ('Left-Diagnostic Keywords', 'Right-Diagnostic Keywords'):
        df[column] = df[column].apply(replace_commas)
    return df
def remove_duplicates(df):
    """Return a new DataFrame keeping only the first row for each 'ID'.

    Args:
        df: DataFrame with an 'ID' column. It is not modified.

    Returns:
        An independent copy of ``df`` without the rows whose 'ID' already
        appeared in an earlier row; the surviving rows keep their original
        index labels.
    """
    # True for every row whose 'ID' was already seen further up the frame.
    repeated_id = df.duplicated(subset='ID', keep='first')
    return df[~repeated_id].copy()
def filter_excluded_strings(df, excluded_strings):
    """Print keyword-cell frequencies, leaving out the excluded strings.

    The left-eye and right-eye keyword columns are stacked into one Series,
    each distinct cell value is counted, and the counts whose value is
    listed in ``excluded_strings`` are dropped before printing.

    Args:
        df: DataFrame with 'Left-Diagnostic Keywords' and
            'Right-Diagnostic Keywords' columns.
        excluded_strings: Cell values to omit from the printed counts.

    Returns:
        ``df`` itself, unchanged; this function only reports to stdout.
    """
    both_eyes = pd.concat(
        [df['Left-Diagnostic Keywords'], df['Right-Diagnostic Keywords']]
    )
    keyword_counts = both_eyes.value_counts()
    is_excluded = keyword_counts.index.isin(excluded_strings)
    print(keyword_counts[~is_excluded])
    return df
def encode_disease_labels(df, disease_labels):
    """Add one 0/1 indicator column per entry of ``disease_labels``.

    A row gets 1 in a label's column when the disease name occurs, ignoring
    case, as a substring of that row's left-eye or right-eye keywords, and 0
    otherwise. Missing keyword cells count as "not present".

    Args:
        df: DataFrame with 'Left-Diagnostic Keywords' and
            'Right-Diagnostic Keywords' columns. It is modified in place.
        disease_labels: Mapping of disease name (matched literally) to the
            name of the indicator column to write.

    Returns:
        The same DataFrame object, with one integer column per label.
    """
    for disease, label in disease_labels.items():
        # regex=False: disease names are plain text, so characters such as
        # parentheses or '+' must be matched literally, not as a pattern.
        in_left = df['Left-Diagnostic Keywords'].str.contains(
            disease, case=False, na=False, regex=False
        )
        in_right = df['Right-Diagnostic Keywords'].str.contains(
            disease, case=False, na=False, regex=False
        )
        df[label] = (in_left | in_right).astype(int)
    return df
def encode_patient_sex(df):
    """Recode the 'Patient Sex' column to 1 for 'Female' and 0 otherwise.

    Only the exact value 'Female' maps to 1; every other value, including
    differently-cased spellings and missing entries, maps to 0.

    Args:
        df: DataFrame with a 'Patient Sex' column. It is modified in place.

    Returns:
        The same DataFrame object, with 'Patient Sex' overwritten.
    """
    def _female_flag(sex):
        if sex == 'Female':
            return 1
        return 0

    df['Patient Sex'] = df['Patient Sex'].map(_female_flag)
    return df
def preprocess_data(df_path):
    """Load a CSV file and run the full preprocessing pipeline on it.

    The stages run in this order: clean_encoding_issues, remove_duplicates,
    filter_excluded_strings (which prints a keyword-frequency report),
    encode_disease_labels and encode_patient_sex. The module-level
    ``excluded_strings`` and ``disease_labels`` supply the stage settings.

    Args:
        df_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV
            file to load.

    Returns:
        The preprocessed DataFrame.
    """
    return (
        pd.read_csv(df_path)
        .pipe(clean_encoding_issues)
        .pipe(remove_duplicates)
        # Settings come straight from the module-level variables.
        .pipe(filter_excluded_strings, excluded_strings)
        .pipe(encode_disease_labels, disease_labels)
        .pipe(encode_patient_sex)
    )