107 lines
3.1 KiB
Python
107 lines
3.1 KiB
Python
"""
Data preprocessing pipeline for CSGO match prediction.

Loads raw data, performs feature engineering, and splits into train/test sets.
"""
|
|
import pandas as pd
|
|
import yaml
|
|
import json
|
|
from pathlib import Path
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
def load_params():
    """Load the ``preprocess`` section of ``params.yaml``.

    The file is read from the current working directory.

    Returns:
        The value stored under the top-level ``preprocess`` key.

    Raises:
        FileNotFoundError: If ``params.yaml`` does not exist.
        KeyError: If the parsed document has no ``preprocess`` key.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, so the same params file parses identically on
    # every machine.
    with open("params.yaml", encoding="utf-8") as f:
        params = yaml.safe_load(f)
    return params["preprocess"]
|
|
|
|
def load_raw_data():
    """Read ``data/raw/results.csv`` and return it as a DataFrame.

    The path is relative to the current working directory.
    """
    return pd.read_csv("data/raw/results.csv")
|
|
|
|
def engineer_features(df):
    """Build the feature table and the binary target from raw results.

    Args:
        df: DataFrame containing the eleven raw columns selected below
            plus a ``match_winner`` column.

    Returns:
        A ``(features, target)`` tuple. ``features`` is a new DataFrame
        holding the selected raw columns followed by ``rank_diff``,
        ``map_wins_diff``, ``total_rounds`` and ``round_diff``; ``target``
        is ``match_winner`` shifted down by one. ``df`` is not modified.

    NOTE(review): ``result_1``/``result_2`` presumably hold the final
    scores; if so, they (and ``round_diff``) reveal the winner -- confirm
    that this is intended before training on these features.
    """
    raw_columns = [
        'result_1', 'result_2', 'starting_ct',
        'ct_1', 't_2', 't_1', 'ct_2',
        'rank_1', 'rank_2', 'map_wins_1', 'map_wins_2',
    ]
    selected = df[raw_columns]

    # assign() returns a fresh frame and appends the new columns in
    # keyword order, so the caller's frame is left untouched.
    features = selected.assign(
        rank_diff=selected['rank_1'] - selected['rank_2'],
        map_wins_diff=selected['map_wins_1'] - selected['map_wins_2'],
        total_rounds=selected['result_1'] + selected['result_2'],
        round_diff=selected['result_1'] - selected['result_2'],
    )

    # match_winner is coded 1 or 2; shift it to 0 or 1.
    target = df['match_winner'] - 1

    return features, target
|
|
|
|
def save_metrics(X_train, X_test, y_train, y_test):
    """Write split-size statistics to ``data/processed/data_metrics.json``.

    Records the total, train and test row counts, the number of feature
    columns in ``X_train``, and how many training labels equal 0 and 1.
    ``y_test`` is accepted but not used. The output directory is created
    if it does not exist.
    """
    train_count = len(X_train)
    test_count = len(X_test)
    zeros_in_train = int((y_train == 0).sum())
    ones_in_train = int((y_train == 1).sum())

    metrics = {
        "n_samples": train_count + test_count,
        "n_train": train_count,
        "n_test": test_count,
        "n_features": X_train.shape[1],
        "class_balance_train": {
            "class_0": zeros_in_train,
            "class_1": ones_in_train,
        },
    }
    serialized = json.dumps(metrics, indent=2)

    Path("data/processed").mkdir(parents=True, exist_ok=True)
    with open("data/processed/data_metrics.json", "w") as f:
        f.write(serialized)
|
|
|
|
def main():
    """Run the preprocessing pipeline end to end.

    Loads the parameters and raw matches, builds features and target,
    makes a stratified train/test split using the configured
    ``test_size`` and ``random_state``, then writes ``features.csv``,
    ``train.csv``, ``test.csv`` and the metrics JSON under
    ``data/processed``. Progress is reported on stdout.
    """
    print("Loading parameters...")
    config = load_params()

    print("Loading raw data...")
    matches = load_raw_data()
    print(f"Loaded {len(matches)} matches")

    print("Engineering features...")
    X, y = engineer_features(matches)
    print(f"Created {X.shape[1]} features")

    print("Splitting data...")
    split_options = {
        "test_size": config["test_size"],
        "random_state": config["random_state"],
        "stratify": y,
    }
    X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

    print("Saving processed data...")
    Path("data/processed").mkdir(parents=True, exist_ok=True)

    # Full set, train set and test set are each written as the feature
    # columns plus a trailing 'target' column, in that order.
    outputs = (
        ("data/processed/features.csv", X, y),
        ("data/processed/train.csv", X_train, y_train),
        ("data/processed/test.csv", X_test, y_test),
    )
    for destination, feature_part, target_part in outputs:
        table = feature_part.copy()
        table['target'] = target_part
        table.to_csv(destination, index=False)

    save_metrics(X_train, X_test, y_train, y_test)

    print("Preprocessing completed successfully!")
    print(f"Train set: {len(X_train)} samples")
    print(f"Test set: {len(X_test)} samples")
|
|
|
|
# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
|