Compare commits


No commits in common. "ff71d052e6c35bb9cb25d8a464c68eb912b55736" and "8dc524af223135377e72f6c9220f919f66a28fc9" have entirely different histories.

4 changed files with 36 additions and 365 deletions

View File

@@ -16,9 +16,9 @@ stages:
           cache: false
   train:
-    cmd: python src/models/train_multitask.py
+    cmd: python src/models/train.py
     deps:
-      - src/models/train_multitask.py
+      - src/models/train.py
       - data/processed/train.csv
       - data/processed/test.csv
     params:
@@ -26,12 +26,7 @@ stages:
       - train.max_depth
       - train.random_state
     outs:
-      - models/model_match_winner.pkl
-      - models/model_map_winner.pkl
-      - models/model_score_team1.pkl
-      - models/model_score_team2.pkl
-      - models/model_round_diff.pkl
-      - models/model_total_maps.pkl
+      - models/model.pkl
     metrics:
       - models/metrics.json:
           cache: false

View File

@@ -21,50 +21,23 @@ def load_raw_data():
def engineer_features(df):
"""Create features for match prediction"""
# Only use features that would be known BEFORE the match starts
# Base features
# Basic features from results
features = df[[
'starting_ct', # Which team starts as CT (known before match)
'rank_1', 'rank_2', # Team rankings (known before match)
'result_1', 'result_2', 'starting_ct',
'ct_1', 't_2', 't_1', 'ct_2',
'rank_1', 'rank_2', 'map_wins_1', 'map_wins_2'
]].copy()
# Rank-based features
# Engineered features
features['rank_diff'] = features['rank_1'] - features['rank_2']
features['rank_sum'] = features['rank_1'] + features['rank_2']
features['rank_ratio'] = features['rank_1'] / (features['rank_2'] + 1) # +1 to avoid division by zero
features['map_wins_diff'] = features['map_wins_1'] - features['map_wins_2']
features['total_rounds'] = features['result_1'] + features['result_2']
features['round_diff'] = features['result_1'] - features['result_2']
# Map encoding (one-hot encoding for map types)
map_dummies = pd.get_dummies(df['_map'], prefix='map')
features = pd.concat([features, map_dummies], axis=1)
# Target: match_winner (1 or 2) -> convert to 0 or 1
target = df['match_winner'] - 1
# Team strength indicators
features['team1_is_favorite'] = (features['rank_1'] < features['rank_2']).astype(int)
features['both_top_tier'] = ((features['rank_1'] <= 10) & (features['rank_2'] <= 10)).astype(int)
features['underdog_matchup'] = (abs(features['rank_diff']) > 50).astype(int)
# Multi-task targets
targets = {}
# Task 1: Match Winner (Binary Classification)
targets['match_winner'] = df['match_winner'] - 1 # Convert 1/2 to 0/1
# Task 2: Exact Score (Regression - two outputs)
targets['score_team1'] = df['result_1']
targets['score_team2'] = df['result_2']
# Task 3: Round Difference (Regression)
targets['round_diff'] = df['result_1'] - df['result_2']
# Task 4: Map Count for the match (Multi-class)
# Group by match_id to get total maps played
match_maps = df.groupby('match_id').size().to_dict()
targets['total_maps'] = df['match_id'].map(match_maps)
# Task 5: Map Winner (Binary Classification for this specific map)
targets['map_winner'] = df['map_winner'] - 1 # Convert 1/2 to 0/1
return features, targets
return features, target
def save_metrics(X_train, X_test, y_train, y_test):
"""Save dataset metrics"""
@@ -90,67 +63,44 @@ def main():
print("Loading raw data...")
df = load_raw_data()
print(f"Loaded {len(df)} maps")
print(f"Loaded {len(df)} matches")
print("Engineering features...")
X, targets = engineer_features(df)
X, y = engineer_features(df)
print(f"Created {X.shape[1]} features")
print(f"Created {len(targets)} prediction targets:")
for target_name in targets.keys():
print(f" - {target_name}")
print("Splitting data...")
# Use match_winner for stratification
X_train, X_test, idx_train, idx_test = train_test_split(
X, X.index,
X_train, X_test, y_train, y_test = train_test_split(
X, y,
test_size=params["test_size"],
random_state=params["random_state"],
stratify=targets['match_winner']
stratify=y
)
print("Saving processed data...")
Path("data/processed").mkdir(parents=True, exist_ok=True)
# Save train set with all targets
train_data = X_train.copy()
for target_name, target_values in targets.items():
train_data[f'target_{target_name}'] = target_values.iloc[idx_train].values
train_data.to_csv("data/processed/train.csv", index=False)
# Save test set with all targets
test_data = X_test.copy()
for target_name, target_values in targets.items():
test_data[f'target_{target_name}'] = target_values.iloc[idx_test].values
test_data.to_csv("data/processed/test.csv", index=False)
# Save full features with all targets
# Save full features
full_features = X.copy()
for target_name, target_values in targets.items():
full_features[f'target_{target_name}'] = target_values.values
full_features['target'] = y
full_features.to_csv("data/processed/features.csv", index=False)
# Save train set
train_data = X_train.copy()
train_data['target'] = y_train
train_data.to_csv("data/processed/train.csv", index=False)
# Save test set
test_data = X_test.copy()
test_data['target'] = y_test
test_data.to_csv("data/processed/test.csv", index=False)
# Save metrics
print("\nDataset statistics:")
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
print(f"Features: {X.shape[1]}")
metrics = {
"n_samples": len(X),
"n_train": len(X_train),
"n_test": len(X_test),
"n_features": X.shape[1],
"targets": list(targets.keys()),
"class_balance_match_winner": {
"class_0": int((targets['match_winner'] == 0).sum()),
"class_1": int((targets['match_winner'] == 1).sum())
}
}
with open("data/processed/data_metrics.json", "w") as f:
json.dump(metrics, f, indent=2)
save_metrics(X_train, X_test, y_train, y_test)
print("Preprocessing completed successfully!")
print(f"Train set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")
if __name__ == "__main__":
main()
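
Similarly, the reworked main() now performs a single stratified split and writes three CSVs with one target column. Below is a minimal self-contained sketch of that flow, wrapped here in a hypothetical split_and_save helper purely for illustration (the actual script keeps this logic inline in main()); parameter names follow params.yaml as shown in the diff.

from pathlib import Path
import pandas as pd
from sklearn.model_selection import train_test_split

def split_and_save(X: pd.DataFrame, y: pd.Series, params: dict) -> None:
    """Hypothetical wrapper mirroring the single-target split-and-save in main()."""
    # Stratify on the single target so both splits keep the match_winner class balance.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=params["test_size"],
        random_state=params["random_state"],
        stratify=y,
    )
    Path("data/processed").mkdir(parents=True, exist_ok=True)

    # Full feature table plus target, then the train/test splits.
    full_features = X.copy()
    full_features['target'] = y
    full_features.to_csv("data/processed/features.csv", index=False)

    train_data = X_train.copy()
    train_data['target'] = y_train
    train_data.to_csv("data/processed/train.csv", index=False)

    test_data = X_test.copy()
    test_data['target'] = y_test
    test_data.to_csv("data/processed/test.csv", index=False)
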

View File

@@ -14,20 +14,12 @@ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_sc
import pandas as pd
# Configure MLflow
# MLflow will automatically use MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD env vars
tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "https://mlflow.sortifal.dev")
mlflow.set_tracking_uri(tracking_uri)
if os.getenv("MLFLOW_TRACKING_USERNAME") and os.getenv("MLFLOW_TRACKING_PASSWORD"):
print(f"MLflow configured with authentication for {tracking_uri}")
else:
print(f"MLflow configured without authentication for {tracking_uri}")
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "https://mlflow.sortifal.dev"))
# Try to set experiment, but handle auth errors gracefully
USE_MLFLOW = True
try:
mlflow.set_experiment("csgo-match-prediction")
print(f"Connected to MLflow at {mlflow.get_tracking_uri()}")
except Exception as e:
print(f"Warning: Could not connect to MLflow: {e}")
print("Training will continue without MLflow tracking.")
@@ -137,13 +129,7 @@ def main():
# Try to log model to MLflow (if permissions allow)
try:
# Create input example for model signature
input_example = X_train.head(1)
mlflow.sklearn.log_model(
model,
artifact_path="model",
input_example=input_example
)
mlflow.sklearn.log_model(model, "model")
print("\nModel logged to MLflow successfully!")
except Exception as e:
print(f"\nWarning: Could not log model to MLflow: {e}")

View File

@@ -1,260 +0,0 @@
"""
Multi-task model training pipeline for CSGO match prediction.
Trains separate models for different prediction objectives and logs to MLflow.
"""
import mlflow
import mlflow.sklearn
import yaml
import json
import pickle
import os
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
mean_absolute_error, mean_squared_error, r2_score
)
import pandas as pd
import numpy as np
# Configure MLflow
# MLflow will automatically use MLFLOW_TRACKING_USERNAME and MLFLOW_TRACKING_PASSWORD env vars
tracking_uri = os.getenv("MLFLOW_TRACKING_URI", "https://mlflow.sortifal.dev")
mlflow.set_tracking_uri(tracking_uri)
if os.getenv("MLFLOW_TRACKING_USERNAME") and os.getenv("MLFLOW_TRACKING_PASSWORD"):
print(f"MLflow configured with authentication for {tracking_uri}")
else:
print(f"MLflow configured without authentication for {tracking_uri}")
# Try to set experiment, but handle auth errors gracefully
USE_MLFLOW = True
try:
mlflow.set_experiment("csgo-match-prediction-multitask")
print(f"Connected to MLflow at {mlflow.get_tracking_uri()}")
except Exception as e:
print(f"Warning: Could not connect to MLflow: {e}")
print("Training will continue without MLflow tracking.")
USE_MLFLOW = False
def load_params():
"""Load training parameters from params.yaml"""
with open("params.yaml") as f:
params = yaml.safe_load(f)
return params["train"]
def load_data():
"""Load preprocessed training and test data"""
train_df = pd.read_csv("data/processed/train.csv")
test_df = pd.read_csv("data/processed/test.csv")
# Separate features and targets
feature_cols = [col for col in train_df.columns if not col.startswith('target_')]
target_cols = [col for col in train_df.columns if col.startswith('target_')]
X_train = train_df[feature_cols]
X_test = test_df[feature_cols]
# Extract all targets
targets_train = {col.replace('target_', ''): train_df[col] for col in target_cols}
targets_test = {col.replace('target_', ''): test_df[col] for col in target_cols}
return X_train, X_test, targets_train, targets_test
def train_classification_model(X_train, y_train, params, task_name):
"""Train a classification model"""
print(f"\n[{task_name}] Training Random Forest Classifier...")
model = RandomForestClassifier(
n_estimators=params["n_estimators"],
max_depth=params["max_depth"],
random_state=params["random_state"],
n_jobs=-1
)
model.fit(X_train, y_train)
return model
def train_regression_model(X_train, y_train, params, task_name):
"""Train a regression model"""
print(f"\n[{task_name}] Training Random Forest Regressor...")
model = RandomForestRegressor(
n_estimators=params["n_estimators"],
max_depth=params["max_depth"],
random_state=params["random_state"],
n_jobs=-1
)
model.fit(X_train, y_train)
return model
def evaluate_classification(model, X_test, y_test, task_name):
"""Evaluate classification model"""
print(f"[{task_name}] Evaluating...")
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]
metrics = {
f"{task_name}_accuracy": float(accuracy_score(y_test, y_pred)),
f"{task_name}_precision": float(precision_score(y_test, y_pred, zero_division=0)),
f"{task_name}_recall": float(recall_score(y_test, y_pred, zero_division=0)),
f"{task_name}_f1_score": float(f1_score(y_test, y_pred, zero_division=0)),
f"{task_name}_roc_auc": float(roc_auc_score(y_test, y_pred_proba))
}
return metrics
def evaluate_regression(model, X_test, y_test, task_name):
"""Evaluate regression model"""
print(f"[{task_name}] Evaluating...")
y_pred = model.predict(X_test)
metrics = {
f"{task_name}_mae": float(mean_absolute_error(y_test, y_pred)),
f"{task_name}_mse": float(mean_squared_error(y_test, y_pred)),
f"{task_name}_rmse": float(np.sqrt(mean_squared_error(y_test, y_pred))),
f"{task_name}_r2": float(r2_score(y_test, y_pred))
}
return metrics
def save_models(models, all_metrics):
"""Save all models and metrics locally"""
Path("models").mkdir(parents=True, exist_ok=True)
# Save each model
for task_name, model in models.items():
model_path = f"models/model_{task_name}.pkl"
with open(model_path, "wb") as f:
pickle.dump(model, f)
print(f"Saved {task_name} model to {model_path}")
# Save all metrics
with open("models/metrics.json", "w") as f:
json.dump(all_metrics, f, indent=2)
print(f"Metrics saved to models/metrics.json")
def main():
"""Main multi-task training pipeline"""
print("=" * 70)
print("CSGO Match Prediction - Multi-Task Model Training")
print("=" * 70)
# Load parameters and data
params = load_params()
X_train, X_test, targets_train, targets_test = load_data()
print(f"\nDataset info:")
print(f" Training samples: {len(X_train)}")
print(f" Test samples: {len(X_test)}")
print(f" Features: {X_train.shape[1]}")
print(f" Prediction tasks: {len(targets_train)}")
# Define tasks
tasks = {
'match_winner': {'type': 'classification', 'description': 'Match Winner Prediction'},
'map_winner': {'type': 'classification', 'description': 'Map Winner Prediction'},
'score_team1': {'type': 'regression', 'description': 'Team 1 Score Prediction'},
'score_team2': {'type': 'regression', 'description': 'Team 2 Score Prediction'},
'round_diff': {'type': 'regression', 'description': 'Round Difference Prediction'},
'total_maps': {'type': 'regression', 'description': 'Total Maps Prediction'}
}
models = {}
all_metrics = {}
if USE_MLFLOW:
with mlflow.start_run(run_name="multitask-rf-csgo"):
# Log parameters
mlflow.log_params(params)
mlflow.log_param("n_features", X_train.shape[1])
mlflow.log_param("n_train_samples", len(X_train))
mlflow.log_param("n_test_samples", len(X_test))
mlflow.log_param("n_tasks", len(tasks))
# Train and evaluate each task
for task_name, task_config in tasks.items():
print(f"\n{'='*70}")
print(f"Task: {task_config['description']}")
print(f"{'='*70}")
if task_name not in targets_train:
print(f"Warning: {task_name} not found in training data, skipping...")
continue
y_train = targets_train[task_name]
y_test = targets_test[task_name]
# Train model based on task type
if task_config['type'] == 'classification':
model = train_classification_model(X_train, y_train, params, task_name)
metrics = evaluate_classification(model, X_test, y_test, task_name)
else:
model = train_regression_model(X_train, y_train, params, task_name)
metrics = evaluate_regression(model, X_test, y_test, task_name)
models[task_name] = model
all_metrics.update(metrics)
# Log metrics to MLflow
mlflow.log_metrics(metrics)
# Print results
print(f"\n{task_name} Results:")
for metric, value in metrics.items():
print(f" {metric}: {value:.4f}")
# Save models and metrics
save_models(models, all_metrics)
# Print summary
print("\n" + "=" * 70)
print("Training Summary:")
print("=" * 70)
print(f"Models trained: {len(models)}")
print(f"Total metrics: {len(all_metrics)}")
print("=" * 70)
print(f"\nMLflow run ID: {mlflow.active_run().info.run_id}")
print(f"View run at: {mlflow.get_tracking_uri()}")
else:
# Train without MLflow
for task_name, task_config in tasks.items():
print(f"\n{'='*70}")
print(f"Task: {task_config['description']}")
print(f"{'='*70}")
if task_name not in targets_train:
print(f"Warning: {task_name} not found in training data, skipping...")
continue
y_train = targets_train[task_name]
y_test = targets_test[task_name]
# Train model based on task type
if task_config['type'] == 'classification':
model = train_classification_model(X_train, y_train, params, task_name)
metrics = evaluate_classification(model, X_test, y_test, task_name)
else:
model = train_regression_model(X_train, y_train, params, task_name)
metrics = evaluate_regression(model, X_test, y_test, task_name)
models[task_name] = model
all_metrics.update(metrics)
# Print results
print(f"\n{task_name} Results:")
for metric, value in metrics.items():
print(f" {metric}: {value:.4f}")
# Save models and metrics
save_models(models, all_metrics)
print("\n" + "=" * 70)
print("Training Summary:")
print("=" * 70)
print(f"Models trained: {len(models)}")
print(f"Total metrics: {len(all_metrics)}")
print("=" * 70)
print("\nMulti-task training pipeline completed successfully!")
if __name__ == "__main__":
main()