Commit c0bbd4e8 authored by Mahesha999's avatar Mahesha999

Refactoring code and randomized search cv for random forest hyperparams

parents
This diff is collapsed.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
def elbow(X_train, max_clusters=19):
    """Plot the elbow curve used to pick a cluster count for K-Means.

    Fits one K-Means model for every cluster count from 1 to
    ``max_clusters`` (inclusive), records each model's within-cluster sum of
    squares (WCSS) and shows the resulting curve.

    Args:
        X_train: Samples to cluster, passed straight to ``KMeans.fit``.
        max_clusters: Largest number of clusters to try. The default of 19
            keeps the originally hard-coded ``range(1, 20)``.
    """
    cluster_counts = range(1, max_clusters + 1)
    wcss = []
    for n_clusters in cluster_counts:
        kmeans = KMeans(n_clusters=n_clusters, init='k-means++', max_iter=300, random_state=0, n_init=10)
        kmeans.fit(X_train)
        # inertia_: sum of squared distances of samples to their closest cluster center.
        wcss.append(kmeans.inertia_)
    plt.plot(cluster_counts, wcss)
    plt.title("The Elbow Method")
    # The x axis carries the cluster count and the y axis the WCSS; the two
    # labels used to be swapped.
    plt.xlabel("no. of clusters")
    plt.ylabel("WCSS")
    plt.show()
def kmeans_fit_predict(X_train, X_test):
    """Cluster ``X_train`` into 10 groups and assign ``X_test`` to them.

    Args:
        X_train: Samples used to fit the K-Means model.
        X_test: Samples to assign to the fitted cluster centers.

    Returns:
        The cluster index predicted for every row of ``X_test``.
    """
    model = KMeans(
        n_clusters=10,
        init="k-means++",
        max_iter=1000,
        random_state=0,
        n_init=10,
    )
    # fit() returns the estimator itself, so the calls can be chained.
    return model.fit(X_train).predict(X_test)
\ No newline at end of file
This diff is collapsed.
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# def rf_fit_predict(X_train, y_train, X_test):
# classifier = RandomForestClassifier(n_estimators= 10, criterion="entropy", random_state=0)
# classifier.fit(X_train,y_train)
# print(classifier.get_params())
# return classifier.predict(X_test)
def rf_fit_predict(X_train, y_train, X_test):
    """Train a fixed-configuration random forest and predict ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    # An ExtraTreesClassifier with max_samples=0.75 and otherwise identical
    # settings was tried here as an alternative.
    forest_settings = {
        "max_samples": 0.95,
        "n_estimators": 3000,
        "bootstrap": True,
        "min_samples_split": 2,
        "min_samples_leaf": 1,
        "criterion": "entropy",
        "random_state": 0,
    }
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    return forest.predict(X_test)
def randomized_search_fold_size_rf_fit_predict(X_train, y_train, X_test):
    """Search the bootstrap sample fraction of a random forest and predict.

    Every candidate ``max_samples`` value is evaluated with 5-fold cross
    validation; the best parameters are printed and the refitted search
    object is used to predict ``X_test``.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict labels for.

    Returns:
        A ``(search, predictions)`` tuple: the fitted ``RandomizedSearchCV``
        and its predictions for ``X_test``.
    """
    max_samples = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
    random_grid = {'max_samples': max_samples}
    classifier = RandomForestClassifier(
        n_estimators=5000,
        bootstrap=True,
        max_depth=40,
        min_samples_split=2,
        min_samples_leaf=1,
        criterion="entropy",
        random_state=0,
    )
    rf_random = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=random_grid,
        # The grid only holds len(max_samples) candidates. Asking for more
        # iterations (previously 100) makes scikit-learn warn and fall back
        # to the grid size, so request exactly that many.
        n_iter=len(max_samples),
        cv=5,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    rf_random.fit(X_train, y_train)
    print(rf_random.best_params_)
    return rf_random, rf_random.predict(X_test)
def randomized_search_cv_rf_fit_predict(X_train, y_train, X_test):
    """Randomly search random-forest hyperparameters and predict ``X_test``.

    Samples 100 parameter combinations from the grid below, scores each with
    3-fold cross validation and predicts with the best refitted model.

    Args:
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict labels for.

    Returns:
        A ``(search, predictions)`` tuple: the fitted ``RandomizedSearchCV``
        and its predictions for ``X_test``.
    """
    n_estimators = [int(x) for x in np.linspace(start=200, stop=2000, num=10)]
    max_depth = [int(x) for x in np.linspace(10, 110, num=11)]
    max_depth.append(None)  # None lets the trees grow without a depth limit.
    # 'auto' was an alias of 'sqrt' for classifiers and has been removed from
    # scikit-learn, where it now raises a parameter validation error. Use
    # 'log2' so that the search has two distinct, valid options.
    max_features = ['sqrt', 'log2']
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {
        'n_estimators': n_estimators,
        'max_features': max_features,
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,
        'min_samples_leaf': min_samples_leaf,
        'bootstrap': bootstrap,
    }
    # Seed the forest like the other forest builders in this module so that
    # repeated searches score the same candidates identically.
    classifier = RandomForestClassifier(criterion="entropy", random_state=0)
    rf_random = RandomizedSearchCV(
        estimator=classifier,
        param_distributions=random_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )
    rf_random.fit(X_train, y_train)
    return rf_random, rf_random.predict(X_test)
\ No newline at end of file
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
def load_scale_xy_with_25p_split():
    """Load the feature CSV, split it 75/25 and standardize the features.

    Columns 1..58 of ``data/features_30_sec.csv`` are used as features and
    column 59 as the target. The scaler is fitted on the training part only
    and then applied to the test part.

    Returns:
        A ``(X_train, X_test, y_train, y_test)`` tuple.
    """
    # Importing the dataset.
    dataset = pd.read_csv("data/features_30_sec.csv")
    X = dataset.iloc[:, 1:59].values
    y = dataset.iloc[:, 59].values

    # Split features and labels in a single call so that the rows stay
    # aligned by construction, instead of relying on two independent calls
    # that merely share the same random_state.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Feature scaling.
    sc_X = StandardScaler()
    X_train = sc_X.fit_transform(X_train)
    X_test = sc_X.transform(X_test)
    return X_train, X_test, y_train, y_test
def load_scale_x_encode_y():
    """Load the feature CSV, standardize the features and encode the labels.

    Columns 1..58 of ``data/features_30_sec.csv`` are standardized over the
    whole dataset and column 59 is label-encoded.

    Returns:
        A ``(X, y)`` tuple of scaled features and encoded labels.
    """
    frame = pd.read_csv("data/features_30_sec.csv")
    raw_features = frame.iloc[:, 1:59].values
    raw_labels = frame.iloc[:, 59].values

    scaled_features = StandardScaler().fit_transform(raw_features)
    encoded_labels = LabelEncoder().fit_transform(raw_labels)
    return scaled_features, encoded_labels
def get_accuracy(cm):
    """Return the accuracy, in percent, described by a confusion matrix.

    Args:
        cm: Confusion matrix as a 2-D array or nested sequence; the diagonal
            holds the correctly classified counts.

    Returns:
        ``100 * trace(cm) / sum(cm)``.
    """
    # asarray lets plain nested lists work as well as numpy arrays.
    cm = np.asarray(cm)
    # np.trace sums the diagonal; this replaces a manual loop whose
    # accumulator shadowed the builtin ``sum``.
    return 100 * (np.trace(cm) / np.sum(cm))
def fit_predict_print(fit_predict_function, X_train, y_train, X_test, y_test):
    """Run a supervised fit/predict function and report how well it did.

    Prints the accuracy derived from the confusion matrix and displays the
    matrix as an image.

    Args:
        fit_predict_function: Callable taking ``(X_train, y_train, X_test)``
            and returning the predictions for ``X_test``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for ``X_test``.
    """
    predictions = fit_predict_function(X_train, y_train, X_test)
    matrix = confusion_matrix(y_test, predictions)
    accuracy = get_accuracy(matrix)
    print("Accuracy: ", accuracy)
    plt.matshow(matrix)
    plt.show()
def fit_predict_print_unsupervised(fit_predict_function, X_train, X_test, y_test):
    """Run an unsupervised fit/predict function and optionally evaluate it.

    When ``y_test`` is given, prints the accuracy derived from the confusion
    matrix and displays the matrix as an image; otherwise nothing is reported.

    Args:
        fit_predict_function: Callable taking ``(X_train, X_test)`` and
            returning the predictions for ``X_test``.
        X_train: Samples used for fitting.
        X_test: Samples to predict.
        y_test: Reference labels for ``X_test``, or ``None`` to skip the
            evaluation.
    """
    # NOTE(review): cluster ids are not guaranteed to line up with the
    # reference labels - confirm that this accuracy is meaningful.
    predictions = fit_predict_function(X_train, X_test)
    if y_test is None:
        return
    matrix = confusion_matrix(y_test, predictions)
    accuracy = get_accuracy(matrix)
    print("Accuracy: ", accuracy)
    plt.matshow(matrix)
    plt.show()
def search_fit_predict_print(fit_predict_function, X_train, y_train, X_test, y_test):
    """Run a hyperparameter-search fit/predict function and report on it.

    Prints the accuracy derived from the confusion matrix and the best
    hyperparameters found, then displays the matrix as an image.

    Args:
        fit_predict_function: Callable taking ``(X_train, y_train, X_test)``
            and returning ``(search, predictions)``, where ``search`` exposes
            ``best_params_``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: True labels for ``X_test``.
    """
    ensemble, y_pred = fit_predict_function(X_train, y_train, X_test)
    cm = confusion_matrix(y_test, y_pred)
    print("Accuracy: ", get_accuracy(cm))
    # The label was previously misspelled as "Hyperparamters".
    print("Hyperparameters: ", ensemble.best_params_)
    plt.matshow(cm)
    plt.show()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment