From 6995102d76336e1d351dd43884428d1b237e6870 Mon Sep 17 00:00:00 2001 From: Alexis Bruneteau Date: Wed, 1 Oct 2025 20:17:07 +0200 Subject: [PATCH] Remove map_wins features - they contain match outcome data MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The map_wins_1 and map_wins_2 columns represent maps won DURING the current match, not historical performance. This is data leakage as these values are only known during/after the match. Now using only truly pre-match features: - rank_1, rank_2: Team rankings before match - starting_ct: Which team starts CT side - rank_diff: Derived ranking difference This should finally give realistic model performance based solely on information available before the match begins. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/data/preprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/data/preprocess.py b/src/data/preprocess.py index 4b5619f..0c8c51c 100644 --- a/src/data/preprocess.py +++ b/src/data/preprocess.py @@ -22,16 +22,16 @@ def load_raw_data(): def engineer_features(df): """Create features for match prediction""" # Only use features that would be known BEFORE the match starts - # Removing result_1, result_2, ct_1, t_2, t_1, ct_2 (data leakage!) + # Removing ALL match outcome features (data leakage): + # - result_1, result_2, ct_1, t_2, t_1, ct_2 (round scores) + # - map_wins_1, map_wins_2 (maps won in THIS match, not historical) features = df[[ 'starting_ct', # Which team starts as CT (known before match) 'rank_1', 'rank_2', # Team rankings (known before match) - 'map_wins_1', 'map_wins_2' # Historical map performance (known before match) ]].copy() # Engineered features based on pre-match information features['rank_diff'] = features['rank_1'] - features['rank_2'] - features['map_wins_diff'] = features['map_wins_1'] - features['map_wins_2'] # Target: match_winner (1 or 2) -> convert to 0 or 1 target = df['match_winner'] - 1