From 6995102d76336e1d351dd43884428d1b237e6870 Mon Sep 17 00:00:00 2001
From: Alexis Bruneteau <alex@sortifal.dev>
Date: Wed, 1 Oct 2025 20:17:07 +0200
Subject: [PATCH] Remove map_wins features - they contain match outcome data
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The map_wins_1 and map_wins_2 columns represent maps won DURING
the current match, not historical performance. Using them as features
is data leakage, because their values are only known during or after
the match. The derived map_wins_diff feature is removed for the same
reason.

Now using only truly pre-match features:
- rank_1, rank_2: Team rankings before match
- starting_ct: Which team starts CT side
- rank_diff: Derived ranking difference (rank_1 - rank_2)

This should finally give realistic model performance based solely
on information available before the match begins.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 src/data/preprocess.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/data/preprocess.py b/src/data/preprocess.py
index 4b5619f..0c8c51c 100644
--- a/src/data/preprocess.py
+++ b/src/data/preprocess.py
@@ -22,16 +22,16 @@ def load_raw_data():
 def engineer_features(df):
     """Create features for match prediction"""
     # Only use features that would be known BEFORE the match starts
-    # Removing result_1, result_2, ct_1, t_2, t_1, ct_2 (data leakage!)
+    # Removing ALL match outcome features (data leakage):
+    # - result_1, result_2, ct_1, t_2, t_1, ct_2 (round scores)
+    # - map_wins_1, map_wins_2 (maps won in THIS match, not historical)
     features = df[[
         'starting_ct',  # Which team starts as CT (known before match)
         'rank_1', 'rank_2',  # Team rankings (known before match)
-        'map_wins_1', 'map_wins_2'  # Historical map performance (known before match)
     ]].copy()
 
     # Engineered features based on pre-match information
     features['rank_diff'] = features['rank_1'] - features['rank_2']
-    features['map_wins_diff'] = features['map_wins_1'] - features['map_wins_2']
 
     # Target: match_winner (1 or 2) -> convert to 0 or 1
     target = df['match_winner'] - 1