From efaf5ff0e18f31de8008193b750a3c7cdb69460c Mon Sep 17 00:00:00 2001
From: Alexis Bruneteau <alex@sortifal.dev>
Date: Wed, 1 Oct 2025 20:01:46 +0200
Subject: [PATCH] Fix critical data leakage in feature engineering
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removed features that contain match outcome information:
- result_1, result_2 (actual match scores - only known after match)
- ct_1, t_2, t_1, ct_2 (rounds won per side - only known after match)
- total_rounds, round_diff (derived from results)

These features encode the match outcome directly, so the model was
effectively "cheating": the perfect 1.0 accuracy it reported came from
reading the result, not from predicting it.

Now using only pre-match information:
- Team rankings (rank_1, rank_2)
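- Historical map performance (map_wins_1, map_wins_2), assumed to be
  computed only from matches played before this one. To verify: if
  these columns count maps won in the current series, they leak the
  outcome as well and must also be removed.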
- Starting side (starting_ct)
- Derived: rank_diff, map_wins_diff

Accuracy is expected to drop from 1.0 to a realistic level, since the
model now sees only what would actually be known before a match
starts.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 src/data/preprocess.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/src/data/preprocess.py b/src/data/preprocess.py
index 21d70d5..4b5619f 100644
--- a/src/data/preprocess.py
+++ b/src/data/preprocess.py
@@ -21,18 +21,17 @@ def load_raw_data():
 
 def engineer_features(df):
     """Create features for match prediction"""
-    # Basic features from results
+    # Only use features that would be known BEFORE the match starts
+    # Removing result_1, result_2, ct_1, t_2, t_1, ct_2 (data leakage!)
     features = df[[
-        'result_1', 'result_2', 'starting_ct',
-        'ct_1', 't_2', 't_1', 'ct_2',
-        'rank_1', 'rank_2', 'map_wins_1', 'map_wins_2'
+        'starting_ct',  # Which team starts as CT (known before match)
+        'rank_1', 'rank_2',  # Team rankings (known before match)
+        'map_wins_1', 'map_wins_2'  # Historical map performance (known before match)
     ]].copy()
 
-    # Engineered features
+    # Engineered features based on pre-match information
     features['rank_diff'] = features['rank_1'] - features['rank_2']
     features['map_wins_diff'] = features['map_wins_1'] - features['map_wins_2']
-    features['total_rounds'] = features['result_1'] + features['result_2']
-    features['round_diff'] = features['result_1'] - features['result_2']
 
     # Target: match_winner (1 or 2) -> convert to 0 or 1
     target = df['match_winner'] - 1