"""
Data preprocessing pipeline for CSGO match prediction.

Loads raw data, performs feature engineering, and splits into train/test sets.
"""

import json
from pathlib import Path

import pandas as pd

# NOTE: PyYAML and scikit-learn are imported inside the functions that need
# them (load_params / main) so that the pure feature-engineering helpers can
# be imported and unit-tested without those packages being installed.

# Default locations used by the pipeline; every function that touches the
# filesystem accepts an override so it can be exercised against other paths.
PARAMS_PATH = "params.yaml"
RAW_DATA_PATH = "data/raw/results.csv"
PROCESSED_DIR = "data/processed"

# Columns copied verbatim from the raw results table into the feature matrix.
BASE_FEATURE_COLUMNS = [
    'result_1', 'result_2', 'starting_ct',
    'ct_1', 't_2', 't_1', 'ct_2',
    'rank_1', 'rank_2',
    'map_wins_1', 'map_wins_2',
]

# Raw column holding the label; engineer_features() shifts it from 1/2 to 0/1.
TARGET_COLUMN = 'match_winner'


def load_params(path=PARAMS_PATH):
    """Load the ``preprocess`` section of the YAML parameter file.

    Args:
        path: Location of the parameter file. Defaults to ``params.yaml``
            in the current working directory.

    Returns:
        Whatever is stored under the top-level ``preprocess`` key.
        ``main()`` reads ``test_size`` and ``random_state`` from it.

    Raises:
        KeyError: If the file has no ``preprocess`` key.
    """
    import yaml  # deferred: only this function needs PyYAML

    with open(path, encoding="utf-8") as f:
        params = yaml.safe_load(f)
    return params["preprocess"]


def load_raw_data(path=RAW_DATA_PATH):
    """Load raw CSGO match data.

    Args:
        path: CSV file to read. Defaults to ``data/raw/results.csv``.

    Returns:
        The CSV contents as a ``pandas.DataFrame``.
    """
    return pd.read_csv(path)


def engineer_features(df):
    """Create features and the binary target for match prediction.

    Args:
        df: Raw results table. Must contain every column listed in
            ``BASE_FEATURE_COLUMNS`` plus ``match_winner``.

    Returns:
        A ``(features, target)`` tuple. ``features`` is a new DataFrame with
        the base columns plus ``rank_diff``, ``map_wins_diff``,
        ``total_rounds`` and ``round_diff``. ``target`` is
        ``match_winner - 1``, i.e. 0 or 1, on the same index as ``df``.

    Raises:
        KeyError: If a required column is missing from ``df``.
        ValueError: If ``match_winner`` holds anything other than 1 or 2.
    """
    # Fail early with one message naming every missing column instead of
    # whichever lookup pandas happens to hit first.
    required = BASE_FEATURE_COLUMNS + [TARGET_COLUMN]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"Raw data is missing required columns: {missing}")

    # Basic features from results
    features = df[BASE_FEATURE_COLUMNS].copy()

    # NOTE(review): result_1/result_2 (and total_rounds/round_diff derived
    # from them, possibly also the ct_*/t_* columns) look like post-match
    # outcomes. If so they leak the target into the features -- confirm
    # before training on this set. Left unchanged so downstream stages keep
    # receiving the same columns.

    # Engineered features
    features['rank_diff'] = features['rank_1'] - features['rank_2']
    features['map_wins_diff'] = features['map_wins_1'] - features['map_wins_2']
    features['total_rounds'] = features['result_1'] + features['result_2']
    features['round_diff'] = features['result_1'] - features['result_2']

    # Target: match_winner (1 or 2) -> convert to 0 or 1
    target = df[TARGET_COLUMN] - 1

    # Anything outside {0, 1} (including NaN) would otherwise flow silently
    # into the stratified split and be ignored by the class-balance metrics.
    if not target.isin([0, 1]).all():
        raise ValueError(
            f"'{TARGET_COLUMN}' must contain only the values 1 and 2"
        )

    return features, target


def save_metrics(X_train, X_test, y_train, y_test, output_dir=PROCESSED_DIR):
    """Write dataset size and class-balance metrics to ``data_metrics.json``.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training labels (0/1); used for the class-balance counts.
        y_test: Test labels. Accepted for signature symmetry; not used.
        output_dir: Directory that receives ``data_metrics.json``. Created
            (with parents) if it does not exist. Defaults to
            ``data/processed``.
    """
    metrics = {
        "n_samples": len(X_train) + len(X_test),
        "n_train": len(X_train),
        "n_test": len(X_test),
        "n_features": X_train.shape[1],
        "class_balance_train": {
            # int(): the pandas/NumPy sum is not JSON serialisable as-is.
            "class_0": int((y_train == 0).sum()),
            "class_1": int((y_train == 1).sum()),
        },
    }

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    with open(out_dir / "data_metrics.json", "w", encoding="utf-8") as f:
        json.dump(metrics, f, indent=2)


def _save_with_target(features, target, path):
    """Write ``features`` plus a ``target`` column to ``path`` as CSV."""
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    features.assign(target=target).to_csv(path, index=False)


def main():
    """Run the full preprocessing pipeline.

    Reads ``params.yaml`` and ``data/raw/results.csv``, then writes
    ``features.csv``, ``train.csv``, ``test.csv`` and ``data_metrics.json``
    into ``data/processed``.
    """
    from sklearn.model_selection import train_test_split  # deferred import

    print("Loading parameters...")
    params = load_params()

    print("Loading raw data...")
    df = load_raw_data()
    print(f"Loaded {len(df)} matches")

    print("Engineering features...")
    X, y = engineer_features(df)
    print(f"Created {X.shape[1]} features")

    print("Splitting data...")
    # stratify=y keeps the class ratio the same in both splits.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=params["test_size"],
        random_state=params["random_state"],
        stratify=y,
    )

    print("Saving processed data...")
    out_dir = Path(PROCESSED_DIR)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Save full features, train set and test set
    _save_with_target(X, y, out_dir / "features.csv")
    _save_with_target(X_train, y_train, out_dir / "train.csv")
    _save_with_target(X_test, y_test, out_dir / "test.csv")

    # Save metrics
    save_metrics(X_train, X_test, y_train, y_test)

    print("Preprocessing completed successfully!")
    print(f"Train set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")


if __name__ == "__main__":
    main()