Commit 5bbb29a

Refactor event log splits to improve performance

1 parent 7c61f5a · commit 5bbb29a
3 files changed: +82 -28 lines

pyproject.toml (1 addition, 1 deletion)

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "pix-framework"
-version = "0.14.1"
+version = "0.14.2"
 description = "Process Improvement Explorer Framework contains process discovery and improvement modules of the Process Improvement Explorer project."
 authors = [
     "David Chapela de la Campa <david.chapela.delacampa@gmail.com>",

src/pix_framework/io/event_log.py (44 additions, 27 deletions)
@@ -118,37 +118,52 @@ def read_csv_log(
 
 
 def split_log_training_validation_trace_wise(
-    event_log: pd.DataFrame, log_ids: EventLogIDs, training_percentage: float
+    event_log: pd.DataFrame,
+    log_ids: EventLogIDs,
+    training_percentage: float,
+    sort: bool = True,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Split the traces of [event_log] into two separate event logs (one for training and the other for validation). Split full traces in
-    order to achieve an approximate proportion of [training_percentage] events in the training set.
+    Split the traces of [event_log] into two separate event logs (one for training and the other for validation). Split
+    full traces in order to achieve an approximate proportion of [training_percentage] events in the training set.
 
     :param event_log: event log to split.
     :param log_ids: IDs for the columns of the event log.
     :param training_percentage: percentage of events (approx.) to retain in the training data.
+    :param sort: if true, sort the events in the log by start+end times (if the start time is available) or
+        by end time (otherwise).
 
     :return: a tuple with two datasets, the training and the validation ones.
     """
-    # Sort the event log
-    sorted_event_log = event_log.sort_values([log_ids.start_time, log_ids.end_time])
-    # Take the first traces until the number of events reaches [training_percentage] * total size
-    total_events = len(event_log)
-    training_case_ids = []
-    training_full = False
-    # Go over the case IDs (sorted by the start and end times of their events)
-    for case_id in sorted_event_log[log_ids.case].unique():
-        # The first traces, until the size limit is met, go to the training set
-        if not training_full:
-            training_case_ids += [case_id]
-            training_full = len(event_log[event_log[log_ids.case].isin(training_case_ids)]) >= (
-                training_percentage * total_events
-            )
+    # Sort if needed
+    if sort:
+        keys = [log_ids.start_time, log_ids.end_time] if log_ids.start_time in event_log.columns else [log_ids.end_time]
+        sorted_event_log = event_log.sort_values(keys)
+    else:
+        sorted_event_log = event_log
+    # Estimate the number of cases for training
+    case_ids = list(sorted_event_log[log_ids.case].unique())
+    num_cases_training = int(training_percentage * len(case_ids))
+    # Retain the first estimated cases
+    training_case_ids = case_ids[:num_cases_training]
+    # Loop, adjusting the number of cases, until the desired % of events is reached
+    well_distributed = False
+    training_log, validation_log = None, None
+    num_cases_history = []  # Store already considered num_cases to avoid repeating them
+    while not well_distributed and num_cases_training not in num_cases_history:
+        num_cases_history += [num_cases_training]
+        # Retain partitions
+        training_log = event_log[event_log[log_ids.case].isin(training_case_ids)]
+        validation_log = event_log[~event_log[log_ids.case].isin(training_case_ids)]
+        # If well distributed, stop; otherwise, adjust
+        diff = len(training_log) / len(sorted_event_log) - training_percentage
+        if abs(diff) < 0.01:  # < 1% difference
+            well_distributed = True
+        else:
+            num_cases_training += 1 if diff < 0 else -1
+            training_case_ids = case_ids[:num_cases_training]
     # Return the two splits
-    return (
-        event_log[event_log[log_ids.case].isin(training_case_ids)],
-        event_log[~event_log[log_ids.case].isin(training_case_ids)],
-    )
+    return training_log, validation_log
 
 
 def split_log_training_validation_event_wise(
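
The refactor replaces the old one-case-at-a-time scan, which re-filtered the whole log on every iteration, with a direct estimate of int(training_percentage * num_cases) cases plus a short adjustment loop: the cut moves by one case towards the target until the training share of events is within 1%, or until a previously tried size recurs (which stops oscillation on skewed logs). Below is a minimal standalone sketch of that loop; the trace_wise_split helper and the toy column name are ours for illustration, not part of pix_framework:

import pandas as pd

def trace_wise_split(log: pd.DataFrame, case_col: str, pct: float):
    # Guess a case count, then nudge it up or down until the training
    # partition holds roughly `pct` of all events (same loop as the commit).
    case_ids = list(log[case_col].unique())
    num_cases = int(pct * len(case_ids))
    train, val = log, log.iloc[0:0]
    tried = []  # sizes already attempted, to stop oscillation
    while num_cases not in tried:
        tried.append(num_cases)
        in_train = log[case_col].isin(case_ids[:num_cases])
        train, val = log[in_train], log[~in_train]
        diff = len(train) / len(log) - pct
        if abs(diff) < 0.01:  # within 1% of the target share
            break
        num_cases += 1 if diff < 0 else -1
    return train, val

# Toy log: case "a" holds 6 of 10 events, "b" and "c" hold 2 each.
toy = pd.DataFrame({"case": ["a"] * 6 + ["b"] * 2 + ["c"] * 2})
train, val = trace_wise_split(toy, "case", 0.6)
print(len(train), len(val))  # -> 6 4 (one case already covers 60%)

The num_cases_history guard matters because on unbalanced logs no cut may land within 1%, and without it the loop would bounce between two adjacent sizes forever.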
@@ -159,15 +174,17 @@ def split_log_training_validation_event_wise(
     remove_partial_traces_from_validation: bool = False,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Split the traces of [event_log] into two separate event logs (one for training and the other for validation). Split event-wise, retaining the
-    first [training_percentage] of events in the training set, and the remaining ones in the validation set.
+    Split the traces of [event_log] into two separate event logs (one for training and the other for validation). Split
+    event-wise, retaining the first [training_percentage] of events in the training set, and the remaining ones in the
+    validation set.
 
     :param event_log: event log to split.
     :param log_ids: IDs for the columns of the event log.
     :param training_percentage: percentage of events to retain in the training data.
-    :param sort: if true, sort the events in the log by start+end times (if the start time is available) or by end time (otherwise).
-    :param remove_partial_traces_from_validation: if true, remove from the validation set the traces that have been split, with some events in
-        training and some events in validation.
+    :param sort: if true, sort the events in the log by start+end times (if the start time is available) or
+        by end time (otherwise).
+    :param remove_partial_traces_from_validation: if true, remove from the validation set the traces that have been
+        split, with some events in training and some in validation.
 
     :return: a tuple with two datasets, the training and the validation ones.
     """
@@ -179,7 +196,7 @@ def split_log_training_validation_event_wise(
         sorted_event_log = event_log
     # Get the event splitting train and validation
     num_train_events = int(len(event_log) * training_percentage)
-    last_training_event = sorted_event_log.head(num_train_events).iloc[-1]
+    last_training_event = sorted_event_log.iloc[num_train_events - 1]
     # Split the log based on the timestamp of the splitting event
     if log_ids.start_time in event_log.columns:
         training_log = event_log[event_log[log_ids.start_time] <= last_training_event[log_ids.start_time]]
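
This one-line change is the event-wise performance fix: head(num_train_events) builds an intermediate slice of the first n rows just to read the last of them, while iloc[num_train_events - 1] fetches that boundary row directly. A quick check of the equivalence on a throwaway frame (the frame and names are ours):

import pandas as pd

df = pd.DataFrame({"end_time": range(100)})
n = 60
# Same boundary row either way; the second form skips the n-row slice.
assert df.head(n).iloc[-1].equals(df.iloc[n - 1])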

tests/pix_framework/io/log_split_test.py (37 additions)
@@ -95,3 +95,40 @@ def test_split_log_training_validation_trace_wise():
     # Assert expected result
     assert train.equals(data[data["case_id"].isin(["0", "1", "2"])])
     assert test.equals(data[data["case_id"].isin(["3"])])
+
+
+def test_split_log_training_validation_trace_wise_unbalanced():
+    # Create event log mock
+    data = pd.DataFrame(
+        {
+            "case_id": [
+                "0",
+                "0",
+                "0",
+                "1",
+                "0",
+                "1",
+                "2",
+                "3",
+                "1",
+                "2",
+                "3",
+                "2",
+                "3",
+                "3",
+                "2",
+                "3",
+                "3",
+                "3",
+                "3",
+                "3",
+            ],
+            "start_time": [1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 9, 10],
+            "end_time": [6, 2, 3, 5, 4, 6, 7, 8, 9, 8, 6, 1, 2, 4, 3, 2, 5, 32, 13, 25],
+        }
+    )
+    # Split it into 50-50
+    train, test = split_log_training_validation_trace_wise(data, DEFAULT_CSV_IDS, 0.5)
+    # Assert expected result
+    assert train.equals(data[data["case_id"].isin(["0", "1", "2"])])
+    assert test.equals(data[data["case_id"].isin(["3"])])
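
The new mock is unbalanced in exactly the way that exercises the adjustment loop: trace lengths differ sharply, so no initial guess lands on the target share. Counting events per case (a throwaway snippet, not part of the test file) shows case "3" alone holding nearly half the log:

from collections import Counter

case_ids = ["0", "0", "0", "1", "0", "1", "2", "3", "1", "2",
            "3", "2", "3", "3", "2", "3", "3", "3", "3", "3"]
print(Counter(case_ids))  # Counter({'3': 9, '0': 4, '2': 4, '1': 3})

Cases "0", "1", and "2" together hold 11 of the 20 events (55%), the closest trace-wise cut to the requested 50%, which is why the test asserts exactly that training split.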
