Refactor code and add randomized search CV for random forest hyperparameters

c0bbd4e8 · Mahesha999 · c0bbd4e8 · c0bbd4e8 · c0bbd4e8 · c0bbd4e8
Commit c0bbd4e8 authored Dec 06, 2020 by Mahesha999
5 changed files
--- a/data/features_30_sec.csv
+++ b/data/features_30_sec.csv
--- a/k_means.py
+++ b/k_means.py
+from sklearn.cluster import KMeans
+import matplotlib.pyplot as plt
+def elbow(X_train):
+    wcss = []
+    for i in range(1,20):
+        kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=300, random_state=0, n_init=10)
+        kmeans.fit(X_train)
+        wcss.append(kmeans.inertia_) # Sum of squared distances of samples to their closest cluster center.
+    plt.plot(range(1,20),wcss)
+    plt.title("The Elbow Method")
+    plt.ylabel("no. of clusters")
+    plt.xlabel("WCSS")
+    plt.show()
+def kmeans_fit_predict(X_train, X_test):
+    kmeans = KMeans(n_clusters=10, init="k-means++", max_iter=1000, random_state=0, n_init=10)
+    kmeans.fit(X_train)
+    return kmeans.predict(X_test)
\ No newline at end of file
--- a/music_genre_classification.ipynb
+++ b/music_genre_classification.ipynb
--- a/random_forest.py
+++ b/random_forest.py
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import ExtraTreesClassifier
+from sklearn.model_selection import RandomizedSearchCV
+import numpy as np
+# def rf_fit_predict(X_train, y_train, X_test):
+#   classifier = RandomForestClassifier(n_estimators= 10, criterion="entropy", random_state=0)
+#   classifier.fit(X_train,y_train)
+#   print(classifier.get_params())
+#   return classifier.predict(X_test)
+def rf_fit_predict(X_train, y_train, X_test):
+  classifier = RandomForestClassifier(max_samples=0.95, n_estimators= 3000, bootstrap=True, min_samples_split=2, min_samples_leaf=1, criterion="entropy", random_state=0)
+  # classifier = ExtraTreesClassifier(max_samples=0.75, n_estimators= 3000, bootstrap=True, min_samples_split=2, min_samples_leaf=1, criterion="entropy", random_state=0)
+  classifier.fit(X_train,y_train)
+  # print(classifier.get_params())
+  return classifier.predict(X_test)
+def randomized_search_fold_size_rf_fit_predict(X_train, y_train, X_test):
+    max_samples = [0.1,0.2,0.3,0.4,0.5,0.6,0.7]
+    random_grid = {'max_samples': max_samples}
+    classifier = RandomForestClassifier(n_estimators= 5000, bootstrap=True, max_depth=40, min_samples_split=2, min_samples_leaf=1, criterion="entropy", random_state=0)
+    rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 100, cv = 5, verbose=2, random_state=42, n_jobs = -1)
+    rf_random.fit(X_train, y_train)
+    print(rf_random.best_params_)
+    return rf_random, rf_random.predict(X_test)
+def randomized_search_cv_rf_fit_predict(X_train, y_train, X_test):
+    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
+    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
+    max_depth.append(None)
+    max_features = ['auto', 'sqrt']
+    min_samples_split = [2, 5, 10]
+    min_samples_leaf = [1, 2, 4]
+    bootstrap = [True, False]
+    random_grid = {'n_estimators': n_estimators,
+                'max_features': max_features,
+                'max_depth': max_depth,
+                'min_samples_split': min_samples_split,
+                'min_samples_leaf': min_samples_leaf,
+                'bootstrap': bootstrap}
+    classifier = RandomForestClassifier(criterion="entropy")
+    rf_random = RandomizedSearchCV(estimator = classifier, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
+    rf_random.fit(X_train, y_train)
+    #print("Hyperparameters: ", rf_random.best_params_)
+    return rf_random, rf_random.predict(X_test)
\ No newline at end of file
--- a/util.py
+++ b/util.py
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.metrics import confusion_matrix
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import LabelEncoder
+def load_scale_xy_with_25p_split():
+    dataset = pd.read_csv("data/features_30_sec.csv")
+    X = dataset.iloc[:, 1:59].values
+    y = dataset.iloc[:, 59].values
+    #importing the dataset
+    X_train, X_test = train_test_split(X, test_size=0.25, random_state= 0)
+    y_train, y_test = train_test_split(y, test_size=0.25, random_state= 0)
+    #feature scaling
+    sc_X = StandardScaler()
+    X_train = sc_X.fit_transform(X_train)
+    X_test = sc_X.transform(X_test)
+    return X_train, X_test, y_train, y_test
+def load_scale_x_encode_y():
+    dataset = pd.read_csv("data/features_30_sec.csv")
+    X = dataset.iloc[:, 1:59].values
+    y = dataset.iloc[:, 59].values
+    sc_X = StandardScaler()
+    X = sc_X.fit_transform(X)
+    encoder = LabelEncoder()
+    y = encoder.fit_transform(y)
+    return X, y
+def get_accuracy(cm):
+    sum = 0
+    for i in range(cm.shape[0]):
+        sum = sum + cm[i][i]
+    return 100*(sum/np.sum(cm))
+def fit_predict_print(fit_predict_function, X_train, y_train, X_test, y_test):
+    y_pred = fit_predict_function(X_train, y_train, X_test)
+    cm = confusion_matrix(y_test, y_pred)
+    #print(cm)
+    print("Accuracy: ", get_accuracy(cm))
+    plt.matshow(cm)
+    plt.show()
+def fit_predict_print_unsupervised(fit_predict_function, X_train, X_test, y_test):
+    y_pred = fit_predict_function(X_train, X_test)
+    if y_test is not None:
+        cm = confusion_matrix(y_test, y_pred)
+        #print(cm)
+        print("Accuracy: ", get_accuracy(cm))
+        plt.matshow(cm)
+        plt.show()
+def search_fit_predict_print(fit_predict_function, X_train, y_train, X_test, y_test):
+    ensemble, y_pred = fit_predict_function(X_train, y_train, X_test)
+    cm = confusion_matrix(y_test, y_pred)
+    #print(cm)
+    print("Accuracy: ", get_accuracy(cm))
+    print("Hyperparamters: ", ensemble.best_params_)
+    plt.matshow(cm)
+    plt.show()