# Refactoring code and randomized search CV for random forest hyperparameters

from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
def elbow(X_train):
    """Plot the within-cluster sum of squares (WCSS) for 1 to 19 clusters.

    A KMeans model is fitted on ``X_train`` for every candidate cluster count
    and its inertia is recorded. The bend ("elbow") of the resulting curve is
    a common heuristic for choosing the number of clusters.

    Args:
        X_train: Feature matrix to cluster; passed straight to ``KMeans.fit``.

    Returns:
        list: The WCSS value for each cluster count, ordered from 1 to 19.
    """
    cluster_counts = range(1, 20)
    wcss = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, random_state=0, n_init=10)
        # inertia_ only exists on a fitted estimator, so fit before reading it.
        kmeans.fit(X_train)
        wcss.append(kmeans.inertia_)  # Sum of squared distances of samples to their closest cluster center.
    plt.plot(cluster_counts, wcss)
    plt.title("The Elbow Method")
    # The cluster count is the independent variable, so it belongs on the x axis.
    plt.xlabel("no. of clusters")
    plt.ylabel("WCSS")
    plt.show()
    return wcss
def kmeans_fit_predict(X_train, X_test):
    """Fit a 10-cluster KMeans model on X_train and assign clusters to X_test.

    Args:
        X_train: Feature matrix used to fit the cluster centres.
        X_test: Feature matrix whose rows are assigned to the fitted clusters.

    Returns:
        The cluster index predicted by the fitted model for each row of
        ``X_test``.
    """
    kmeans = KMeans(n_clusters=10, init="k-means++", max_iter=1000, random_state=0, n_init=10)
    # predict() requires fitted cluster centres; learn them from the training data.
    kmeans.fit(X_train)
    return kmeans.predict(X_test)
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# def rf_fit_predict(X_train, y_train, X_test):
# classifier = RandomForestClassifier(n_estimators= 10, criterion="entropy", random_state=0)
# print(classifier.get_params())
# return classifier.predict(X_test)
def rf_fit_predict(X_train, y_train, X_test):
    """Train a 3000-tree random forest and predict labels for X_test.

    Each tree is grown with the entropy criterion on a bootstrap sample
    covering 95% of the training rows (``max_samples=0.95``).

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to classify.

    Returns:
        The predicted label for each row of ``X_test``.
    """
    classifier = RandomForestClassifier(
        max_samples=0.95,
        n_estimators=3000,
        bootstrap=True,
        min_samples_split=2,
        min_samples_leaf=1,
        criterion="entropy",
        random_state=0,
    )
    # NOTE: an ExtraTreesClassifier with max_samples=0.75 and otherwise identical
    # settings was previously tried here as an alternative.
    # The forest must be fitted before it can predict.
    classifier.fit(X_train, y_train)
    return classifier.predict(X_test)
def randomized_search_fold_size_rf_fit_predict(X_train, y_train, X_test):
    """Search the bootstrap sample fraction of a random forest and predict.

    Only ``max_samples`` is tuned (candidates 0.1 to 0.7); every other forest
    setting is fixed. Candidates are scored with 5-fold cross-validation.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to classify.

    Returns:
        tuple: ``(search, predictions)`` where ``search`` is the fitted
        ``RandomizedSearchCV`` object and ``predictions`` are its predictions
        for ``X_test``.
    """
    max_samples = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
    random_grid = {'max_samples': max_samples}
    classifier = RandomForestClassifier(
        n_estimators=5000,
        bootstrap=True,
        max_depth=40,
        min_samples_split=2,
        min_samples_leaf=1,
        criterion="entropy",
        random_state=0,
    )
    # NOTE(review): n_iter (100) exceeds the 7 candidate values; scikit-learn is
    # expected to cap the search at the grid size — confirm and consider lowering.
    rf_random = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=random_grid,
        n_iter=100,
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    # Run the search; predict() and best_params_ are only usable after fitting.
    rf_random.fit(X_train, y_train)
    return rf_random, rf_random.predict(X_test)
def randomized_search_cv_rf_fit_predict(X_train, y_train, X_test):
    """Run a randomized hyperparameter search for a random forest and predict.

    Samples 100 combinations of tree count, feature-subset rule, depth,
    split/leaf minimums and bootstrapping, scoring each with 3-fold
    cross-validation.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to classify.

    Returns:
        tuple: ``(search, predictions)`` where ``search`` is the fitted
        ``RandomizedSearchCV`` object and ``predictions`` are its predictions
        for ``X_test``.
    """
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    # 'auto' was an alias of 'sqrt' for classifiers and is rejected by recent
    # scikit-learn releases; search 'log2' instead so there are two distinct,
    # valid options.
    max_features = ['sqrt', 'log2']
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }
    classifier = RandomForestClassifier(criterion="entropy")
    rf_random = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    # Run the search; predict() and best_params_ are only usable after fitting.
    rf_random.fit(X_train, y_train)
    return rf_random, rf_random.predict(X_test)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
def load_scale_xy_with_25p_split():
    """Load the feature CSV, split it 75/25 and standardize the features.

    Columns 1 to 58 of ``data/features_30_sec.csv`` are used as features and
    column 59 as the label. The scaler is fitted on the training portion only
    and then applied to the test portion.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` with scaled feature
        arrays and unmodified label arrays.
    """
    # Importing the dataset
    dataset = pd.read_csv("data/features_30_sec.csv")
    X = dataset.iloc[:, 1:59].values
    y = dataset.iloc[:, 59].values
    # Split features and labels in a single call so the rows are guaranteed to
    # stay aligned, rather than relying on two calls sharing the same seed.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
    # Feature scaling
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    return X_train, X_test, y_train, y_test
def load_scale_x_encode_y():
    """Load the feature CSV, standardize all features and encode the labels.

    Columns 1 to 58 of ``data/features_30_sec.csv`` are used as features and
    column 59 as the label. The scaler is fitted on the full feature matrix
    (no train/test split is made here).

    Returns:
        tuple: ``(X, y)`` with the scaled feature array and the labels encoded
        by ``LabelEncoder``.
    """
    frame = pd.read_csv("data/features_30_sec.csv")
    raw_features = frame.iloc[:, 1:59].values
    raw_labels = frame.iloc[:, 59].values
    scaled_features = StandardScaler().fit_transform(raw_features)
    encoded_labels = LabelEncoder().fit_transform(raw_labels)
    return scaled_features, encoded_labels
def get_accuracy(cm):
    """Return the accuracy, in percent, described by a confusion matrix.

    Accuracy is the sum of the diagonal (correct predictions) divided by the
    sum of all entries.

    Args:
        cm: Confusion matrix as a 2-D array or nested sequence of counts.

    Returns:
        float: Accuracy in the range 0 to 100, or NaN when the matrix contains
        no samples.
    """
    matrix = np.asarray(cm)
    total = matrix.sum()
    if total == 0:
        # No samples at all: accuracy is undefined, so avoid dividing by zero.
        return float("nan")
    return 100 * (np.trace(matrix) / total)
def fit_predict_print(fit_predict_function, X_train, y_train, X_test, y_test):
    """Run a supervised fit/predict callable and print its test accuracy.

    Args:
        fit_predict_function: Callable invoked as
            ``fit_predict_function(X_train, y_train, X_test)`` that returns
            predictions for ``X_test``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
    """
    predictions = fit_predict_function(X_train, y_train, X_test)
    accuracy = get_accuracy(confusion_matrix(y_test, predictions))
    print("Accuracy: ", accuracy)
def fit_predict_print_unsupervised(fit_predict_function, X_train, X_test, y_test):
    """Run an unsupervised fit/predict callable and optionally print accuracy.

    The callable is always invoked; accuracy is only printed when reference
    labels are supplied.

    NOTE(review): the predictions are compared to ``y_test`` directly through
    a confusion matrix. For a clustering model the predicted ids are presumably
    arbitrary, so the printed number is only meaningful if they happen to line
    up with the label values — confirm this is intended.

    Args:
        fit_predict_function: Callable invoked as
            ``fit_predict_function(X_train, X_test)`` that returns predictions
            for ``X_test``.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``, or ``None`` to skip the report.
    """
    predictions = fit_predict_function(X_train, X_test)
    if y_test is None:
        return
    accuracy = get_accuracy(confusion_matrix(y_test, predictions))
    print("Accuracy: ", accuracy)
def search_fit_predict_print(fit_predict_function, X_train, y_train, X_test, y_test):
    """Run a hyperparameter-search callable and print accuracy and best params.

    Args:
        fit_predict_function: Callable invoked as
            ``fit_predict_function(X_train, y_train, X_test)`` that returns a
            ``(search, predictions)`` pair, where ``search`` exposes
            ``best_params_``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True labels for ``X_test``.
    """
    search, predictions = fit_predict_function(X_train, y_train, X_test)
    accuracy = get_accuracy(confusion_matrix(y_test, predictions))
    print("Accuracy: ", accuracy)
    print("Hyperparamters: ", search.best_params_)
