From a28a363dd9f217367edc452007a11401b56f88dd Mon Sep 17 00:00:00 2001
From: Alexis Bruneteau <alex@sortifal.dev>
Date: Wed, 1 Oct 2025 20:24:07 +0200
Subject: [PATCH] Add comprehensive pre-match features for better predictions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Enhanced feature engineering with legitimate pre-match information:

New features:
- Map one-hot encoding (Dust2, Mirage, Inferno, etc.)
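- rank_sum: Combined team strength indicator (rank_1 + rank_2)
- rank_ratio: Relative team strength (rank_1 / (rank_2 + 1))
- team1_is_favorite: 1 if team 1 has the better (numerically lower) ranking, else 0
- both_top_tier: 1 if both teams are ranked in the top 10, else 0
- underdog_matchup: 1 if the absolute ranking difference exceeds 50, else 0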

All features are known before the match starts, so they introduce no data leakage.
They are expected to improve model performance while maintaining integrity.

Current feature count: ~19 (3 base + 3 rank + ~10 maps + 3 indicators)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
---
 src/data/preprocess.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/src/data/preprocess.py b/src/data/preprocess.py
index 0c8c51c..0d3f745 100644
--- a/src/data/preprocess.py
+++ b/src/data/preprocess.py
@@ -22,16 +22,26 @@ def load_raw_data():
 def engineer_features(df):
     """Create features for match prediction"""
     # Only use features that would be known BEFORE the match starts
-    # Removing ALL match outcome features (data leakage):
-    # - result_1, result_2, ct_1, t_2, t_1, ct_2 (round scores)
-    # - map_wins_1, map_wins_2 (maps won in THIS match, not historical)
+
+    # Base features
     features = df[[
         'starting_ct',  # Which team starts as CT (known before match)
         'rank_1', 'rank_2',  # Team rankings (known before match)
     ]].copy()
 
-    # Engineered features based on pre-match information
+    # Rank-based features
     features['rank_diff'] = features['rank_1'] - features['rank_2']
+    features['rank_sum'] = features['rank_1'] + features['rank_2']
+    features['rank_ratio'] = features['rank_1'] / (features['rank_2'] + 1)  # +1 to avoid division by zero
+
+    # Map encoding (one-hot encoding for map types)
+    map_dummies = pd.get_dummies(df['_map'], prefix='map')
+    features = pd.concat([features, map_dummies], axis=1)
+
+    # Team strength indicators
+    features['team1_is_favorite'] = (features['rank_1'] < features['rank_2']).astype(int)
+    features['both_top_tier'] = ((features['rank_1'] <= 10) & (features['rank_2'] <= 10)).astype(int)
+    features['underdog_matchup'] = (abs(features['rank_diff']) > 50).astype(int)
 
     # Target: match_winner (1 or 2) -> convert to 0 or 1
     target = df['match_winner'] - 1