From a28a363dd9f217367edc452007a11401b56f88dd Mon Sep 17 00:00:00 2001 From: Alexis Bruneteau Date: Wed, 1 Oct 2025 20:24:07 +0200 Subject: [PATCH] Add comprehensive pre-match features for better predictions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enhanced feature engineering with legitimate pre-match information: New features: - Map one-hot encoding (Dust2, Mirage, Inferno, etc.) - rank_sum: Combined team strength indicator - rank_ratio: Relative team strength - team1_is_favorite: Whether team 1 has better ranking - both_top_tier: Both teams in top 10 - underdog_matchup: Large ranking difference (>50) All features are known before match starts - no data leakage. Expected to improve model performance while maintaining integrity. Current feature count: ~20 (3 base + 3 rank + ~10 maps + 3 indicators) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- src/data/preprocess.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/data/preprocess.py b/src/data/preprocess.py index 0c8c51c..0d3f745 100644 --- a/src/data/preprocess.py +++ b/src/data/preprocess.py @@ -22,16 +22,26 @@ def load_raw_data(): def engineer_features(df): """Create features for match prediction""" # Only use features that would be known BEFORE the match starts - # Removing ALL match outcome features (data leakage): - # - result_1, result_2, ct_1, t_2, t_1, ct_2 (round scores) - # - map_wins_1, map_wins_2 (maps won in THIS match, not historical) + + # Base features features = df[[ 'starting_ct', # Which team starts as CT (known before match) 'rank_1', 'rank_2', # Team rankings (known before match) ]].copy() - # Engineered features based on pre-match information + # Rank-based features features['rank_diff'] = features['rank_1'] - features['rank_2'] + features['rank_sum'] = features['rank_1'] + features['rank_2'] + features['rank_ratio'] = features['rank_1'] / (features['rank_2'] 
+ 1) # +1 to avoid division by zero + + # Map encoding (one-hot encoding for map types) + map_dummies = pd.get_dummies(df['_map'], prefix='map') + features = pd.concat([features, map_dummies], axis=1) + + # Team strength indicators + features['team1_is_favorite'] = (features['rank_1'] < features['rank_2']).astype(int) + features['both_top_tier'] = ((features['rank_1'] <= 10) & (features['rank_2'] <= 10)).astype(int) + features['underdog_matchup'] = (abs(features['rank_diff']) > 50).astype(int) # Target: match_winner (1 or 2) -> convert to 0 or 1 target = df['match_winner'] - 1