@@ -118,37 +118,52 @@ def read_csv_log(
 
 
 def split_log_training_validation_trace_wise(
-    event_log: pd.DataFrame, log_ids: EventLogIDs, training_percentage: float
+    event_log: pd.DataFrame,
+    log_ids: EventLogIDs,
+    training_percentage: float,
+    sort: bool = True
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Split the traces of [event_log] into two separated event logs (one for training and the other for validation). Split full traces in
-    order to achieve an approximate proportion of [training_percentage] events in the training set.
+    Split the traces of [event_log] into two separate event logs (one for training and the other for validation). Split
+    full traces in order to achieve an approximate proportion of [training_percentage] events in the training set.
 
     :param event_log: event log to split.
     :param log_ids: IDs for the columns of the event log.
     :param training_percentage: percentage of events (approx) to retain in the training data.
+    :param sort: if true, sort the events in the log by start+end (if start available) or
+        by end (otherwise).
 
     :return: a tuple with two datasets, the training and the validation ones.
     """
-    # Sort event log
-    sorted_event_log = event_log.sort_values([log_ids.start_time, log_ids.end_time])
-    # Take first trace until the number of events is [training_percentage] * total size
-    total_events = len(event_log)
-    training_case_ids = []
-    training_full = False
-    # Go over the case IDs (sorted by start and end time of its events)
-    for case_id in sorted_event_log[log_ids.case].unique():
-        # The first traces until the size limit is met goes to the training set
-        if not training_full:
-            training_case_ids += [case_id]
-            training_full = len(event_log[event_log[log_ids.case].isin(training_case_ids)]) >= (
-                training_percentage * total_events
-            )
+    # Sort if needed
+    if sort:
+        keys = [log_ids.start_time, log_ids.end_time] if log_ids.start_time in event_log.columns else [log_ids.end_time]
+        sorted_event_log = event_log.sort_values(keys)
+    else:
+        sorted_event_log = event_log
+    # Estimate the number of cases for training
+    case_ids = list(sorted_event_log[log_ids.case].unique())
+    num_cases_training = int(training_percentage * len(case_ids))
+    # Retain the first estimated cases
+    training_case_ids = case_ids[:num_cases_training]
+    # Loop, adjusting until the desired % of events is reached
+    well_distributed = False
+    training_log, validation_log = None, None
+    num_cases_history = []  # Store already considered num_cases to avoid repeats
+    while not well_distributed and num_cases_training not in num_cases_history:
+        num_cases_history += [num_cases_training]
+        # Retain partitions
+        training_log = event_log[event_log[log_ids.case].isin(training_case_ids)]
+        validation_log = event_log[~event_log[log_ids.case].isin(training_case_ids)]
+        # If well distributed, stop; otherwise adjust
+        diff = len(training_log) / len(sorted_event_log) - training_percentage
+        if abs(diff) < 0.01:  # < 1% difference
+            well_distributed = True
+        else:
+            num_cases_training += 1 if diff < 0 else -1
+            training_case_ids = case_ids[:num_cases_training]
     # Return the two splits
-    return (
-        event_log[event_log[log_ids.case].isin(training_case_ids)],
-        event_log[~event_log[log_ids.case].isin(training_case_ids)],
-    )
+    return training_log, validation_log
 
 
 def split_log_training_validation_event_wise(
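
The new trace-wise split starts from a case-proportional estimate and then nudges the training boundary one case at a time until the proportion of training events is within 1% of [training_percentage], keeping num_cases_history so the loop cannot cycle between two counts. A minimal usage sketch of the resulting API (illustrative only, not part of the commit; the EventLogIDs constructor arguments and CSV column names are assumptions based on the accesses in the diff):

    import pandas as pd

    # Hypothetical column mapping, inferred from the log_ids.case / .start_time /
    # .end_time accesses above; EventLogIDs is defined elsewhere in this package
    log_ids = EventLogIDs(case="case_id", start_time="start_time", end_time="end_time")
    event_log = pd.read_csv("event_log.csv", parse_dates=["start_time", "end_time"])

    # Roughly 80% of the events in training, split on whole-trace boundaries
    train, validation = split_log_training_validation_trace_wise(
        event_log, log_ids, training_percentage=0.8, sort=True
    )
    # Trace-wise guarantee: no case ID appears in both partitions
    assert set(train["case_id"]).isdisjoint(validation["case_id"])
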
@@ -159,15 +174,17 @@ def split_log_training_validation_event_wise(
     remove_partial_traces_from_validation: bool = False,
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """
-    Split the traces of [event_log] into two separated event logs (one for training and the other for validation). Split event-wise retaining the
-    first [training_percentage] of events in the training set, and the remaining ones in the validation set.
+    Split the traces of [event_log] into two separate event logs (one for training and the other for validation). Split
+    event-wise, retaining the first [training_percentage] of events in the training set, and the remaining ones in the
+    validation set.
 
     :param event_log: event log to split.
     :param log_ids: IDs for the columns of the event log.
     :param training_percentage: percentage of events to retain in the training data.
-    :param sort: if true, sort events in the log by start+end (if start available) or by end (otherwise).
-    :param remove_partial_traces_from_validation if true, remove from validation set the traces that has been split being some event in
-    training and some events in validation.
+    :param sort: if true, sort the events in the log by start+end (if start available) or
+        by end (otherwise).
+    :param remove_partial_traces_from_validation: if true, remove from the validation set the traces that have been
+        split, with some events in training and some in validation.
 
     :return: a tuple with two datasets, the training and the validation ones.
     """
@@ -179,7 +196,7 @@ def split_log_training_validation_event_wise(
         sorted_event_log = event_log
     # Get the event splitting train and validation
     num_train_events = int(len(event_log) * training_percentage)
-    last_training_event = sorted_event_log.head(num_train_events).iloc[-1]
+    last_training_event = sorted_event_log.iloc[num_train_events - 1]
     # Split the log based on the timestamp of the splitting event
     if log_ids.start_time in event_log.columns:
         training_log = event_log[event_log[log_ids.start_time] <= last_training_event[log_ids.start_time]]
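
The one-line change above replaces sorted_event_log.head(num_train_events).iloc[-1] with direct positional indexing: both expressions select the same splitting event (the num_train_events-th row of the sorted log), but the new form avoids materializing the intermediate head() slice. A quick illustrative check of the equivalence, using only pandas:

    import pandas as pd

    df = pd.DataFrame({"end_time": pd.date_range("2024-01-01", periods=10, freq="h")})
    n = 7
    # The last row of the first n rows is the row at position n - 1
    assert df.head(n).iloc[-1].equals(df.iloc[n - 1])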