Source code for py_predpurchase.function_model_cross_val

import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import SVC

[docs] def model_cross_validation(preprocessed_training_data, preprocessed_testing_data, target, k, gamma): """ Calculates the cross validation results for a four common off-the-shelf models (Dummy, KNN, SVM and RandomForests) using preprocessed and cleaned training and testing datasets. Random forests and Dummy hyperparameters are fixed for simplicity sake. Parameters: ---------- preprocessed_training_data : DataFrame Cleaned and preprocessed training data. preprocessed_testing_data : DataFrame Cleaned and preprocessed testing data. target : str Target column name in the dataset. k : int Hyperparameter 'k' value for KNearestNeighbours. gamma : float Hyperparameter 'gamma' value for SVM. Returns: ---------- dict Contains cross-validation results (mean and std of scores) for each specified model. Examples: -------- Assuming dataset is preprocessed and split into training and testing sets, with 'target' as the target column: >>> results = model_cross_validation(preprocessed_training_data, preprocessed_testing_data, 'target', k=5, gamma=0.1) >>> pd.DataFrame(results) This will output the cross-validation results for each model, displaying the mean and standard deviation of the scores (also includes train scores). Notes: ------- The function assumes that the input data is already scaled and encoded. """ train_data = preprocessed_training_data test_data = preprocessed_testing_data X_train = train_data.drop(target, axis=1) y_train = train_data[target] X_test = test_data.drop(target, axis=1) y_test = test_data[target] # Function to perform cross-validation and return mean/std scores def mean_std_cross_val_scores(model, X_train, y_train, **kwargs): scores = cross_validate(model, X_train, y_train, **kwargs) mean_scores = pd.DataFrame(scores).mean() std_scores = pd.DataFrame(scores).std() return pd.Series({col: f"{mean:.3f} (+/- {std:.3f})" for col, mean, std in zip(mean_scores.index, mean_scores, std_scores)}) results_dict = {} #Dummy Classifier dummy = DummyClassifier(strategy='most_frequent') results_dict["dummy"] = mean_std_cross_val_scores(dummy, X_train, y_train, cv=5, return_train_score=True) # kNN Classifier best_k = k knn = KNeighborsClassifier(n_neighbors=best_k) results_dict["knn"] = mean_std_cross_val_scores(knn, X_train, y_train, cv=5, return_train_score=True) # SVM Classifier svm = SVC(gamma= gamma) results_dict["SVM"] = mean_std_cross_val_scores(svm, X_train, y_train, cv=5, return_train_score=True) # Random Forest Classifier random_forest = RandomForestClassifier(n_estimators=50, max_depth=50, random_state=123) results_dict["random_forest"] = mean_std_cross_val_scores(random_forest, X_train, y_train, cv=5, return_train_score=True) return results_dict