diff --git a/src/data/preprocess.py b/src/data/preprocess.py index 21d70d5..4b5619f 100644 --- a/src/data/preprocess.py +++ b/src/data/preprocess.py @@ -21,18 +21,17 @@ def load_raw_data(): def engineer_features(df): """Create features for match prediction""" - # Basic features from results + # Only use features that would be known BEFORE the match starts + # Removing result_1, result_2, ct_1, t_2, t_1, ct_2 (data leakage!) features = df[[ - 'result_1', 'result_2', 'starting_ct', - 'ct_1', 't_2', 't_1', 'ct_2', - 'rank_1', 'rank_2', 'map_wins_1', 'map_wins_2' + 'starting_ct', # Which team starts as CT (known before match) + 'rank_1', 'rank_2', # Team rankings (known before match) + 'map_wins_1', 'map_wins_2' # Historical map performance (known before match) ]].copy() - # Engineered features + # Engineered features based on pre-match information features['rank_diff'] = features['rank_1'] - features['rank_2'] features['map_wins_diff'] = features['map_wins_1'] - features['map_wins_2'] - features['total_rounds'] = features['result_1'] + features['result_2'] - features['round_diff'] = features['result_1'] - features['result_2'] # Target: match_winner (1 or 2) -> convert to 0 or 1 target = df['match_winner'] - 1