# MLOps/src/data/preprocess.py
# Alexis Bruneteau f107164b51 maybe maybe not
# 2025-10-01 15:04:13 +02:00

# 107 lines
# 3.1 KiB
# Python

"""
Data preprocessing pipeline for CSGO match prediction.
Loads raw data, performs feature engineering, and splits into train/test sets.
"""
import pandas as pd
import yaml
import json
from pathlib import Path
from sklearn.model_selection import train_test_split
def load_params():
    """Return the ``preprocess`` section of ``params.yaml``.

    The file is looked up relative to the current working directory and
    parsed with ``yaml.safe_load``. ``main`` reads the ``test_size`` and
    ``random_state`` keys from the returned mapping.

    Raises:
        FileNotFoundError: if ``params.yaml`` does not exist.
        KeyError: if the file has no top-level ``preprocess`` key.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which is not UTF-8 on every platform (notably Windows).
    with open("params.yaml", encoding="utf-8") as f:
        params = yaml.safe_load(f)
    return params["preprocess"]
def load_raw_data():
    """Read ``data/raw/results.csv`` into a DataFrame and return it.

    The path is relative to the current working directory; the CSV is
    parsed with pandas' default options.
    """
    raw_csv = Path("data/raw/results.csv")
    return pd.read_csv(raw_csv)
def engineer_features(df):
    """Build the feature matrix and target vector from the raw results.

    Args:
        df: DataFrame holding the raw columns listed in ``base_columns``
            below, plus ``match_winner``.

    Returns:
        A ``(features, target)`` tuple. ``features`` is a new DataFrame with
        the base columns followed by ``rank_diff``, ``map_wins_diff``,
        ``total_rounds`` and ``round_diff``; ``target`` is ``match_winner``
        shifted down by one. ``df`` itself is not modified.

    Raises:
        KeyError: if any required column is missing from ``df``.
    """
    # NOTE(review): result_1/result_2 (and the round features derived from
    # them) look like final scores; if so they may leak the outcome into
    # the features -- confirm before training on them.
    base_columns = [
        'result_1', 'result_2', 'starting_ct',
        'ct_1', 't_2', 't_1', 'ct_2',
        'rank_1', 'rank_2', 'map_wins_1', 'map_wins_2',
    ]
    base = df[base_columns]

    # assign() returns a fresh frame and appends the new columns in the
    # order given, so the caller's DataFrame is left untouched.
    features = base.assign(
        rank_diff=base['rank_1'] - base['rank_2'],
        map_wins_diff=base['map_wins_1'] - base['map_wins_2'],
        total_rounds=base['result_1'] + base['result_2'],
        round_diff=base['result_1'] - base['result_2'],
    )

    # Presumably match_winner is coded 1/2, giving a 0/1 label after the shift.
    target = df['match_winner'].sub(1)
    return features, target
def save_metrics(X_train, X_test, y_train, y_test):
    """Write dataset size and class-balance figures to a JSON file.

    The output goes to ``data/processed/data_metrics.json`` (relative to
    the current working directory); the directory is created if needed.

    Args:
        X_train: training features; must support ``len()`` and ``.shape``.
        X_test: test features; only its length is used.
        y_train: training labels, compared element-wise against 0 and 1.
        y_test: accepted for a symmetric signature but not used.
    """
    n_train = len(X_train)
    n_test = len(X_test)

    # int() turns the summed boolean mask into a plain Python int so the
    # json module can serialise it.
    train_balance = {
        "class_0": int((y_train == 0).sum()),
        "class_1": int((y_train == 1).sum()),
    }
    summary = {
        "n_samples": n_train + n_test,
        "n_train": n_train,
        "n_test": n_test,
        "n_features": X_train.shape[1],
        "class_balance_train": train_balance,
    }

    Path("data/processed").mkdir(parents=True, exist_ok=True)
    Path("data/processed/data_metrics.json").write_text(
        json.dumps(summary, indent=2)
    )
def main():
    """Run the preprocessing pipeline end to end.

    Steps: load parameters, load the raw results, engineer features, make a
    stratified train/test split using ``test_size`` and ``random_state``
    from the parameters, write ``features.csv``, ``train.csv`` and
    ``test.csv`` under ``data/processed``, then save the dataset metrics.
    Progress is reported on stdout.
    """
    print("Loading parameters...")
    params = load_params()

    print("Loading raw data...")
    df = load_raw_data()
    print(f"Loaded {len(df)} matches")

    print("Engineering features...")
    X, y = engineer_features(df)
    print(f"Created {X.shape[1]} features")

    print("Splitting data...")
    split_options = {
        "test_size": params["test_size"],
        "random_state": params["random_state"],
        # Stratify on the label so both splits keep the same class ratio.
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    print("Saving processed data...")
    Path("data/processed").mkdir(parents=True, exist_ok=True)

    # Full feature table first, then the train and test splits. Each file
    # holds the feature columns followed by a 'target' column.
    outputs = (
        ("data/processed/features.csv", X, y),
        ("data/processed/train.csv", X_train, y_train),
        ("data/processed/test.csv", X_test, y_test),
    )
    for csv_path, inputs, labels in outputs:
        # assign() works on a copy, leaving the split frames untouched.
        inputs.assign(target=labels).to_csv(csv_path, index=False)

    # Save metrics
    save_metrics(X_train, X_test, y_train, y_test)

    print("Preprocessing completed successfully!")
    print(f"Train set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()