# Imports
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Load CSV files
# Directory that contains the input CSV files; every read below is relative to it.
DATA_DIR = Path("/home/abdibasidadan/Téléchargements")

# Read the three tables in one pass. The order of the file names matches the
# order of the target names on the left-hand side.
teams, results, seed_round_slots = (
    pd.read_csv(DATA_DIR / csv_name)
    for csv_name in (
        "MTeams.csv",
        "MRegularSeasonCompactResults.csv",
        "MNCAATourneySeedRoundSlots.csv",
    )
)
# Compute team statistics
# Number of games each team lost per season (one row per Season/LTeamID).
team_stats_L = (
    results
    .groupby(['Season', 'LTeamID'])
    .size()
    .reset_index(name='L')
)

# Count wins per Season/WTeamID, then outer-join the loss counts so that teams
# that only ever won or only ever lost in a season are still present.
team_stats = (
    results
    .groupby(['Season', 'WTeamID'])
    .size()
    .reset_index(name='W')
    .merge(
        team_stats_L,
        left_on=['Season', 'WTeamID'],
        right_on=['Season', 'LTeamID'],
        how='outer',
    )
    .assign(
        # After the outer join one of the two ID columns may be missing;
        # take whichever side is populated.
        TeamID=lambda frame: frame['WTeamID'].combine_first(frame['LTeamID']),
        # A missing count means the team had no wins (or no losses).
        Wins=lambda frame: frame['W'].fillna(0),
        Losses=lambda frame: frame['L'].fillna(0),
    )
    .loc[:, ['Season', 'TeamID', 'Wins', 'Losses']]
)
# Prepare match dataset
def create_match_dataset(results):
    """Build a two-rows-per-game training table from compact game results.

    Each game in ``results`` produces two consecutive rows: one from the
    winner's perspective (``Team1`` = winner, ``Target`` = 1) followed by
    the mirrored row from the loser's perspective (``Team1`` = loser,
    ``Target`` = 0).

    The table is built column-wise instead of with ``DataFrame.iterrows``:
    row iteration is slow on large frames and does not preserve column
    dtypes, whereas this version keeps the dtypes of the source columns.

    Args:
        results: DataFrame with at least the columns ``Season``,
            ``WTeamID`` and ``LTeamID``.

    Returns:
        DataFrame with columns ``Season``, ``Team1``, ``Team2``, ``Target``
        and a fresh 0..2n-1 index, where n is the number of games.
    """
    # Normalise the index so the positional interleave below is well defined
    # even if the caller passes a filtered or re-indexed frame.
    games = results.reset_index(drop=True)

    winner_rows = pd.DataFrame({
        'Season': games['Season'],
        'Team1': games['WTeamID'],
        'Team2': games['LTeamID'],
        'Target': 1,
    })
    loser_rows = pd.DataFrame({
        'Season': games['Season'],
        'Team1': games['LTeamID'],
        'Team2': games['WTeamID'],
        'Target': 0,
    })

    # Both halves share the same index, so a stable sort on it interleaves
    # them as winner row, loser row, winner row, ... in game order.
    df = (
        pd.concat([winner_rows, loser_rows])
        .sort_index(kind='mergesort')
        .reset_index(drop=True)
    )
    return df


match_data = create_match_dataset(results)
# Merge team stats
# Attach the season win/loss totals twice: once for Team1 and once for Team2.
# The stat columns are renamed per side so the two joins do not collide.
for side in ('Team1', 'Team2'):
    match_data = (
        match_data
        .merge(
            team_stats,
            left_on=['Season', side],
            right_on=['Season', 'TeamID'],
            how='left',
        )
        .rename(columns={'Wins': f'{side}_Wins', 'Losses': f'{side}_Losses'})
        .drop(columns=['TeamID'])
    )

# Teams without a row in team_stats end up with 0 wins and 0 losses.
match_data = match_data.fillna(0)

# Feature matrix (four win/loss counts) and binary target (1 = Team1 won).
X = match_data[['Team1_Wins', 'Team1_Losses', 'Team2_Wins', 'Team2_Losses']]
y = match_data['Target']
# Split train/test
# Every game appears twice in match_data (once from each team's perspective,
# with the target flipped). A plain row-wise split can therefore place one
# copy in the training set and its mirror image in the test set, which leaks
# the outcome into evaluation. Split on matchups instead: all rows for the
# same pair of teams in the same season stay on the same side of the split.
# NOTE: metrics quoted for the earlier row-wise split may shift slightly.
team_pair = match_data[['Team1', 'Team2']]
matchup_id = (
    match_data
    .assign(
        # Order-independent key: (A, B) and (B, A) map to the same pair.
        _pair_lo=team_pair.min(axis=1),
        _pair_hi=team_pair.max(axis=1),
    )
    .groupby(['Season', '_pair_lo', '_pair_hi'], sort=False)
    .ngroup()
)

# Hold out 20% of the matchups (not 20% of the rows); same seed as before.
_, test_matchups = train_test_split(
    matchup_id.unique(), test_size=0.2, random_state=42
)
in_test = matchup_id.isin(test_matchups)

X_train, X_test = X.loc[~in_test], X.loc[in_test]
y_train, y_test = y.loc[~in_test], y.loc[in_test]
# Normalization
# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train model
# Logistic regression with default hyperparameters, fitted on the scaled
# training features; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train_scaled, y_train)
# Evaluation
# Column 1 of predict_proba is the probability of the positive class
# (Target == 1, i.e. Team1 wins).
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Score the held-out predictions with a calibration-sensitive metric
# (log loss) and a ranking metric (ROC-AUC).
roc_auc = roc_auc_score(y_test, y_pred_proba)
loss = log_loss(y_test, y_pred_proba)

print(
    f"Log Loss: {loss:.4f}",
    f"ROC-AUC: {roc_auc:.4f}",
    sep="\n",
)
# Plot ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)

# Draw on an explicit Axes object rather than the implicit pyplot state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
# Dashed diagonal = performance of a random classifier, for reference.
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend()
plt.show()
# Tournament example
# Two hand-picked matchups, one per row: (Season, Team1, Team2).
matchup_example = pd.DataFrame(
    [
        (2024, 1101, 1103),
        (2024, 1102, 1104),
    ],
    columns=['Season', 'Team1', 'Team2'],
)
# Merge stats
# Attach season win/loss totals for each side of the example matchups, using
# the same per-side column names as the training features.
for side in ('Team1', 'Team2'):
    matchup_example = (
        matchup_example
        .merge(
            team_stats,
            left_on=['Season', side],
            right_on=['Season', 'TeamID'],
            how='left',
        )
        .rename(columns={'Wins': f'{side}_Wins', 'Losses': f'{side}_Losses'})
        .drop(columns=['TeamID'])
    )

# Teams with no row in team_stats for that season get 0 wins and 0 losses.
matchup_example = matchup_example.fillna(0)

# Reuse the scaler fitted on the training data, then take the probability of
# the positive class (Team1 wins) for each example row.
X_tourney = matchup_example[['Team1_Wins', 'Team1_Losses', 'Team2_Wins', 'Team2_Losses']]
X_tourney_scaled = scaler.transform(X_tourney)
matchup_example['Prob_Team1_Win'] = model.predict_proba(X_tourney_scaled)[:, 1]

print(matchup_example[['Team1', 'Team2', 'Prob_Team1_Win']])
# The supervised logistic regression model demonstrates a reasonable
# ability to predict winners of college basketball matches. On the
# test set, it achieves a Log Loss of approximately 0.515 and an ROC-AUC
# of 0.822, indicating that the model can distinguish winning and losing
# teams based on historical win/loss records.
#
# Observations:
# 1. Predicted probabilities for the example tournament matches show
#    realistic but simplified trends, as only wins and losses are used
#    as features.
# 2. Matches with probabilities near 0.5 indicate balanced contests
#    or limited historical data.
# 3. For accurate predictions of real tournaments, it is necessary
#    to correctly map seeds to TeamIDs and simulate all tournament rounds.
# Data sources:
# - Files: MTeams.csv, MRegularSeasonCompactResults.csv, MNCAATourneySeedRoundSlots.csv
# - Source: Kaggle, NCAA Basketball dataset
# - URL: https://www.kaggle.com/c/march-mania-2023/data
# - Content: team information, regular season results, and NCAA tournament seeds
# - Accessed: December 2025