diff --git a/SGDRegressor/SGDregressor_without_combinations.ipynb b/SGDRegressor/SGDregressor_without_combinations.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..63ee093d5af1cd313e934e5f483a78b3fe1816db --- /dev/null +++ b/SGDRegressor/SGDregressor_without_combinations.ipynb @@ -0,0 +1,2735 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "uxa8uaiWhPuw" + }, + "source": [ + "# Read data into a dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "pYYleLjJhr_o" + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import OneHotEncoder \n", + "import numpy as np\n", + "from sklearn.ensemble import RandomForestRegressor \n", + "from matplotlib import pyplot as plt " + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "RP4jhT3Chr_u" + }, + "outputs": [], + "source": [ + "#!unzip 'final_dataset_2.zip'\n", + "df = pd.read_csv(\"new_dataset.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['Unnamed: 0', 'Unnamed: 0.1', 'id', 'vendor_id', 'pickup_datetime',\n", + " 'dropoff_datetime', 'passenger_count', 'pickup_longitude',\n", + " 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',\n", + " 'store_and_fwd_flag', 'trip_duration', 'visi', 'vism', 'fog', 'rain',\n", + " 'snow', 'hail', 'thunder', 'tornado', 'holiday_or_not', 'turns'],\n", + " dtype='object')\n" + ] + } + ], + "source": [ + "print(df.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "80n8bJeDhPuy" + }, + "source": [ + "# Data cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 338 + }, + "id": "Cgw2yRgU5zBZ", + "outputId": 
"5d3a3e40-008f-4321-fee8-8b52ae89d257" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>Unnamed: 0.1</th>\n", + " <th>vendor_id</th>\n", + " <th>passenger_count</th>\n", + " <th>pickup_longitude</th>\n", + " <th>pickup_latitude</th>\n", + " <th>dropoff_longitude</th>\n", + " <th>dropoff_latitude</th>\n", + " <th>trip_duration</th>\n", + " <th>visi</th>\n", + " <th>vism</th>\n", + " <th>fog</th>\n", + " <th>rain</th>\n", + " <th>snow</th>\n", + " <th>hail</th>\n", + " <th>thunder</th>\n", + " <th>tornado</th>\n", + " <th>holiday_or_not</th>\n", + " <th>turns</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.406904e+06</td>\n", + " <td>1.406904e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1458644.0</td>\n", + " <td>1458644.0</td>\n", + " <td>1458644.0</td>\n", + " <td>1.458644e+06</td>\n", + " <td>1.458643e+06</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>7.293215e+05</td>\n", + " <td>7.293215e+05</td>\n", + " <td>1.534950e+00</td>\n", + " <td>1.664530e+00</td>\n", + " <td>-7.397349e+01</td>\n", + " <td>4.075092e+01</td>\n", + " <td>-7.397342e+01</td>\n", + " <td>4.075180e+01</td>\n", + " <td>9.594923e+02</td>\n", + " 
<td>9.083394e+00</td>\n", + " <td>1.462562e+01</td>\n", + " <td>6.512898e-03</td>\n", + " <td>9.604811e-02</td>\n", + " <td>2.387149e-02</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.868242e-02</td>\n", + " <td>7.547126e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>4.210744e+05</td>\n", + " <td>4.210744e+05</td>\n", + " <td>4.987772e-01</td>\n", + " <td>1.314242e+00</td>\n", + " <td>7.090186e-02</td>\n", + " <td>3.288119e-02</td>\n", + " <td>7.064327e-02</td>\n", + " <td>3.589056e-02</td>\n", + " <td>5.237432e+03</td>\n", + " <td>1.931623e+00</td>\n", + " <td>3.114066e+00</td>\n", + " <td>1.123877e-01</td>\n", + " <td>5.186301e-01</td>\n", + " <td>2.773470e-01</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.354009e-01</td>\n", + " <td>4.432504e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>-1.219333e+02</td>\n", + " <td>3.435970e+01</td>\n", + " <td>-1.219333e+02</td>\n", + " <td>3.218114e+01</td>\n", + " <td>1.000000e+00</td>\n", + " <td>2.000000e-01</td>\n", + " <td>4.000000e-01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000e+00</td>\n", + " <td>2.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>3.646608e+05</td>\n", + " <td>3.646608e+05</td>\n", + " <td>1.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>-7.399187e+01</td>\n", + " <td>4.073735e+01</td>\n", + " <td>-7.399133e+01</td>\n", + " <td>4.073588e+01</td>\n", + " <td>3.970000e+02</td>\n", + " <td>9.000000e+00</td>\n", + " <td>1.450000e+01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " 
<td>0.000000e+00</td>\n", + " <td>5.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>7.293215e+05</td>\n", + " <td>7.293215e+05</td>\n", + " <td>2.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>-7.398174e+01</td>\n", + " <td>4.075410e+01</td>\n", + " <td>-7.397975e+01</td>\n", + " <td>4.075452e+01</td>\n", + " <td>6.620000e+02</td>\n", + " <td>1.000000e+01</td>\n", + " <td>1.610000e+01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000e+00</td>\n", + " <td>6.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>1.093982e+06</td>\n", + " <td>1.093982e+06</td>\n", + " <td>2.000000e+00</td>\n", + " <td>2.000000e+00</td>\n", + " <td>-7.396733e+01</td>\n", + " <td>4.076836e+01</td>\n", + " <td>-7.396301e+01</td>\n", + " <td>4.076981e+01</td>\n", + " <td>1.075000e+03</td>\n", + " <td>1.000000e+01</td>\n", + " <td>1.610000e+01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.000000e+00</td>\n", + " <td>9.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>1.458643e+06</td>\n", + " <td>1.458643e+06</td>\n", + " <td>2.000000e+00</td>\n", + " <td>9.000000e+00</td>\n", + " <td>-6.133553e+01</td>\n", + " <td>5.188108e+01</td>\n", + " <td>-6.133553e+01</td>\n", + " <td>4.392103e+01</td>\n", + " <td>3.526282e+06</td>\n", + " <td>1.000000e+01</td>\n", + " <td>1.610000e+01</td>\n", + " <td>4.000000e+00</td>\n", + " <td>7.000000e+00</td>\n", + " <td>6.000000e+00</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>1.000000e+00</td>\n", + " <td>4.600000e+01</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 Unnamed: 0.1 vendor_id passenger_count \\\n", + "count 1.458644e+06 
1.458644e+06 1.458644e+06 1.458644e+06 \n", + "mean 7.293215e+05 7.293215e+05 1.534950e+00 1.664530e+00 \n", + "std 4.210744e+05 4.210744e+05 4.987772e-01 1.314242e+00 \n", + "min 0.000000e+00 0.000000e+00 1.000000e+00 0.000000e+00 \n", + "25% 3.646608e+05 3.646608e+05 1.000000e+00 1.000000e+00 \n", + "50% 7.293215e+05 7.293215e+05 2.000000e+00 1.000000e+00 \n", + "75% 1.093982e+06 1.093982e+06 2.000000e+00 2.000000e+00 \n", + "max 1.458643e+06 1.458643e+06 2.000000e+00 9.000000e+00 \n", + "\n", + " pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude \\\n", + "count 1.458644e+06 1.458644e+06 1.458644e+06 1.458644e+06 \n", + "mean -7.397349e+01 4.075092e+01 -7.397342e+01 4.075180e+01 \n", + "std 7.090186e-02 3.288119e-02 7.064327e-02 3.589056e-02 \n", + "min -1.219333e+02 3.435970e+01 -1.219333e+02 3.218114e+01 \n", + "25% -7.399187e+01 4.073735e+01 -7.399133e+01 4.073588e+01 \n", + "50% -7.398174e+01 4.075410e+01 -7.397975e+01 4.075452e+01 \n", + "75% -7.396733e+01 4.076836e+01 -7.396301e+01 4.076981e+01 \n", + "max -6.133553e+01 5.188108e+01 -6.133553e+01 4.392103e+01 \n", + "\n", + " trip_duration visi vism fog rain \\\n", + "count 1.458644e+06 1.406904e+06 1.406904e+06 1.458644e+06 1.458644e+06 \n", + "mean 9.594923e+02 9.083394e+00 1.462562e+01 6.512898e-03 9.604811e-02 \n", + "std 5.237432e+03 1.931623e+00 3.114066e+00 1.123877e-01 5.186301e-01 \n", + "min 1.000000e+00 2.000000e-01 4.000000e-01 0.000000e+00 0.000000e+00 \n", + "25% 3.970000e+02 9.000000e+00 1.450000e+01 0.000000e+00 0.000000e+00 \n", + "50% 6.620000e+02 1.000000e+01 1.610000e+01 0.000000e+00 0.000000e+00 \n", + "75% 1.075000e+03 1.000000e+01 1.610000e+01 0.000000e+00 0.000000e+00 \n", + "max 3.526282e+06 1.000000e+01 1.610000e+01 4.000000e+00 7.000000e+00 \n", + "\n", + " snow hail thunder tornado holiday_or_not \\\n", + "count 1.458644e+06 1458644.0 1458644.0 1458644.0 1.458644e+06 \n", + "mean 2.387149e-02 0.0 0.0 0.0 1.868242e-02 \n", + "std 2.773470e-01 0.0 0.0 0.0 
1.354009e-01 \n", + "min 0.000000e+00 0.0 0.0 0.0 0.000000e+00 \n", + "25% 0.000000e+00 0.0 0.0 0.0 0.000000e+00 \n", + "50% 0.000000e+00 0.0 0.0 0.0 0.000000e+00 \n", + "75% 0.000000e+00 0.0 0.0 0.0 0.000000e+00 \n", + "max 6.000000e+00 0.0 0.0 0.0 1.000000e+00 \n", + "\n", + " turns \n", + "count 1.458643e+06 \n", + "mean 7.547126e+00 \n", + "std 4.432504e+00 \n", + "min 2.000000e+00 \n", + "25% 5.000000e+00 \n", + "50% 6.000000e+00 \n", + "75% 9.000000e+00 \n", + "max 4.600000e+01 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "2z07A2gvqUeI" + }, + "outputs": [], + "source": [ + "df = df.drop('Unnamed: 0', axis=1)\n", + "df = df.drop('Unnamed: 0.1', axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "id": "xy5yhwjG58iQ" + }, + "outputs": [], + "source": [ + "df = df.drop('hail', axis=1)\n", + "df = df.drop('thunder', axis=1)\n", + "df = df.drop('tornado', axis=1)\n", + "df = df.drop('visi', axis=1)\n", + "# df = df.fillna(16.1)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "WoLvPXIXIrTl", + "outputId": "8ced79b1-47ac-471d-b388-4bcf30662c6b" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "vendor_id 0\n", + "pickup_datetime 0\n", + "dropoff_datetime 0\n", + "passenger_count 0\n", + "pickup_longitude 0\n", + "pickup_latitude 0\n", + "dropoff_longitude 0\n", + "dropoff_latitude 0\n", + "store_and_fwd_flag 0\n", + "trip_duration 0\n", + "vism 51740\n", + "fog 0\n", + "rain 0\n", + "snow 0\n", + "holiday_or_not 0\n", + "turns 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "missing_val_count_by_column = (df.isnull().sum())\n", + "print(missing_val_count_by_column)" + ] + }, + { + "cell_type": "code", + "execution_count": 
8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "wF7xm26tJlh0", + "outputId": "8d756971-fac1-4f15-9244-06a19fef23fb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "vendor_id 0\n", + "pickup_datetime 0\n", + "dropoff_datetime 0\n", + "passenger_count 0\n", + "pickup_longitude 0\n", + "pickup_latitude 0\n", + "dropoff_longitude 0\n", + "dropoff_latitude 0\n", + "store_and_fwd_flag 0\n", + "trip_duration 0\n", + "vism 0\n", + "fog 0\n", + "rain 0\n", + "snow 0\n", + "holiday_or_not 0\n", + "turns 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "values = {'vism': 16.1, 'turns': np.round(np.mean(df['turns']))}\n", + "df = df.fillna(value=values)\n", + "missing_val_count_by_column = (df.isnull().sum())\n", + "print(missing_val_count_by_column)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ymGBKkTlhPu1" + }, + "source": [ + "The maximum trip duration is 3526282 seconds (~41 days), which is implausible for a single taxi trip.\n", + "\n", + "Clearly there are some outliers in the data." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nahwlpHAKmMD" + }, + "source": [ + "# Outlier detection and removal" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 415 + }, + "id": "uTuo_KlphPu1", + "outputId": "594a720b-1ae3-4a15-9c91-a44b1f4499d1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'whiskers': [<matplotlib.lines.Line2D at 0x7f97877f68b0>,\n", + " <matplotlib.lines.Line2D at 0x7f97877f6c10>],\n", + " 'caps': [<matplotlib.lines.Line2D at 0x7f97877f6f70>,\n", + " <matplotlib.lines.Line2D at 0x7f9787809310>],\n", + " 'boxes': [<matplotlib.lines.Line2D at 0x7f97877f6550>],\n", + " 'medians': [<matplotlib.lines.Line2D at 0x7f9787809670>],\n", + " 'fliers': [<matplotlib.lines.Line2D at 0x7f9787809970>],\n", + " 'means': []}" + ] + }, + "execution_count": 9, + "metadata": {}, + 
"output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEDCAYAAAAlRP8qAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAQBUlEQVR4nO3dX4hc533G8efpemMH3FYXmmKhP1YoSrr2gv90UOxmKVqTgmQMunGLlhKDWbzIVUQCaSF0wU4u9q7kwpJrsWWFI0gnuCQ1ItLW+GKLtXXkeCUkVfLEsLgNXizQ2o6lqP4nmV8v5igZTWZ3zkgze3Zefz8w6Jzz/vbM7+rx8XveM8cRIQBA7/uDohsAAHQGgQ4AiSDQASARBDoAJIJAB4BEEOgAkIhCA932QdsXbJ/NWf83tt+wfc72v3a7PwDoJS5yHbrtv5R0WdKhiBhsUbtF0guSHoqIX9v+k4i4sBJ9AkAvKPQKPSJekfR+/THbf2r7P2yfsH3M9p9lQ09IejYifp39LWEOAHVW4xz6pKS9EfHnkv5e0j9nx78s6cu2/8v2cdvbC+sQAFahW4puoJ7t2yX9haR/s33t8K3Zv7dI2iJpm6QNko7ZHoyID1a6TwBYjVZVoKv2fwwfRMS9TcYWJB2PiCuS/sf2m6oF/Osr2SAArFarasolIi6pFtZ/LUmuuScbflHScHZ8rWpTMG8V0igArEJFL1usSPq5pK/YXrA9KulvJY3aPi3pnKSdWflLkt6z/YakGUn/EBHvFdE3AKxGhS5bBAB0zqqacgEA3LjCboquXbs2Nm/eXNTXA0BPOnHixLsRUWo2Vligb968WXNzc0V9PQD0JNu/WmqMKRcASASBDgCJINABIBEEOgAkgkAHgEQQ6ECdSqWiwcFB9fX1aXBwUJVKpeiWgNxW249zAYWpVCoaHx/X1NSUhoaGNDs7q9HRUUnSyMhIwd0BrbW8Qrd9m+1f2D6dvfrt+01qttm+aPtU9nmqO+0C3TMxMaGpqSkNDw+rv79fw8PDmpqa0sTERNGtAbnkuUL/RLXXvl223S9p1vZ0RBxvqDsWEY90vkVgZVSrVQ0NDV13bGhoSNVqtaCOgPa0vEKPmsvZbn/24Re9kJyBgQHNzs5ed2x2dlYDAwMFdQS0J9dNUdt9tk9JuiDp5Yh4rUnZg9m0zLTtu5c4z5jtOdtzi4uLN9E20Hnj4+MaHR3VzMyMrly5opmZGY2Ojmp8fLzo1oBcct0UjYjPJN1re42kf89e/Xa2ruSkpDuzaZmHVXsZxZYm55lU7Z2hKpfLXOVjVbl243Pv3r2qVqsaGBjQxMQEN0TRM9r+PXTbT0v6v4j4p2Vq/ldSOSLeXaqmXC4HP84FAO2xfSIiys3G8qxyKWVX5rL9RUlfl/TLhpo7nL3V2fbW7Ly8TQgAVlCeKZd1kn5ou0+1oH4hIn5me7ckRcQBSY9KetL2VUkfSdoVvAoJAFZUy0CPiDOS7mty/EDd9n5J+zvbGgCgHTz6DwCJINABIBEEOgAkgkAHgEQQ6ACQCAIdABJBoANAIgh0AEgEgQ4AiSDQASARBDoAJIJAB4BEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0AEgEQQ6ACSiZaDbvs32L2yftn3O9veb1Nj2M7bnbZ+xfX932gUALKXlS6IlfSLpoYi4bLtf0qzt6Yg4XlezQ9KW7PNVSc9l/wIAVkjLK/SouZzt9mefaCjbKelQVntc0hrb6zrbKgBgObnm0G332T4l6YKklyPitYaS9ZLerttfyI41nmfM9pztucXFxRvtGQDQRK5Aj4jPIuJeSRskbbU92FDiZn/W5DyTEVGOiHKpVGq/WwDAktpa5RIRH0j6T0nbG4YWJG2s298g6Z2b6gwA0JY8q1xKttd
k21+U9HVJv2woOyzpsWy1ywOSLkbE+Y53CwBYUp5VLusk/dB2n2r/AXghIn5me7ckRcQBSUclPSxpXtKHkh7vUr8AgCW0DPSIOCPpvibHD9Rth6Q9nW0NANAOnhQFgEQQ6ACQCAIdABJBoANAIgh0AEgEgQ4AiSDQASARBDoAJIJAB4BEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0AEgEQQ6ACSCQAeARBDoAJAIAh0AEtEy0G1vtD1ju2r7nO1vNanZZvui7VPZ56nutAsAWErLl0RLuirpOxFx0vYfSjph++WIeKOh7lhEPNL5FgEAebS8Qo+I8xFxMtv+jaSqpPXdbgwA0J625tBtb5Z0n6TXmgw/aPu07Wnbdy/x92O252zPLS4utt0sAGBpuQPd9u2SfiLp2xFxqWH4pKQ7I+IeSfskvdjsHBExGRHliCiXSqUb7RkA0ESuQLfdr1qY/ygifto4HhGXIuJytn1UUr/ttR3tFACwrDyrXCxpSlI1In6wRM0dWZ1sb83O+14nGwUALC/PKpevSfqGpP+2fSo79o+SNklSRByQ9KikJ21flfSRpF0REV3oFwCwhJaBHhGzktyiZr+k/Z1qCgDQPp4UBYBEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0IE6lUpFg4OD6uvr0+DgoCqVStEtAbnlebAI+FyoVCoaHx/X1NSUhoaGNDs7q9HRUUnSyMhIwd0BrbmoBzrL5XLMzc0V8t1AM4ODg9q3b5+Gh4d/e2xmZkZ79+7V2bNnC+wM+B3bJyKi3HSMQAdq+vr69PHHH6u/v/+3x65cuaLbbrtNn332WYGdAb+zXKAzhw5kBgYGNDs7e92x2dlZDQwMFNQR0B4CHciMj49rdHRUMzMzunLlimZmZjQ6Oqrx8fGiWwNy4aYokLl243Pv3r2qVqsaGBjQxMQEN0TRM5hDB4Aewhw6AHwOEOgAkAgCHajDk6LoZdwUBTI8KYpex01RIMOTougFN3VT1PZG2zO2q7bP2f5Wkxrbfsb2vO0ztu/vROPASqpWqxoaGrru2NDQkKrVakEdAe3JM4d+VdJ3ImJA0gOS9ti+q6Fmh6Qt2WdM0nMd7RJYATwpil7XMtAj4nxEnMy2fyOpKml9Q9lOSYei5rikNbbXdbxboIt4UhS9rq2borY3S7pP0msNQ+slvV23v5AdO9/w92OqXcFr06ZN7XUKdNnIyIheffVV7dixQ5988oluvfVWPfHEE9wQRc/IvWzR9u2SfiLp2xFxqXG4yZ/83t3WiJiMiHJElEulUnudAl1WqVR05MgRTU9P69NPP9X09LSOHDnC0kX0jFyBbrtftTD/UUT8tEnJgqSNdfsbJL1z8+0BK2diYkJTU1MaHh5Wf3+/hoeHNTU1pYmJiaJbA3LJs8rFkqYkVSPiB0uUHZb0WLba5QFJFyPi/BK1wKrEKhf0ujxX6F+T9A1JD9k+lX0etr3b9u6s5qiktyTNS/oXSX/XnXaB7mGVC3pdy5uiETGr5nPk9TUhaU+nmgKKcG2VS+OToky5oFfw6D+Q4ffQ0et49B8Aegi/hw4AnwMEOgAkgkAHgEQQ6ACQCAIdABJBoANAIgh0AEgEgQ4AiSDQASARBDoAJIJAB4BEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0AEgES0D3fZB2xdsn11ifJvti3UvkH6q820CAFrJ807R5yXtl3RomZpjEfFIRzoCANyQllfoEfGKpPdXoBcAwE3o1Bz6g7ZP2562ffdSRbbHbM/ZnltcXOzQVwMApM4E+klJd0bEPZL2SXpxqcKImIyIckSUS6VSB74aAHDNTQd6RFyKiMvZ9lFJ/bbX3nRnAIC23HSg277DtrPtrdk537vZ8wIA2tNylYvtiqRtktbaXpD0tKR+SYqIA5IelfSk7auSPpK0KyKiax0DAJpqGegRMdJifL9qyxoBAAXiSVEASASBDgCJINABIBEEOgAkgkA
HgEQQ6ACQCAIdABJBoANAIgh0AEgEgQ4AiSDQASARBDoAJIJAB4BEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0AEgES0D3fZB2xdsn11i3LafsT1v+4zt+zvfJgCglTxX6M9L2r7M+A5JW7LPmKTnbr4tAEC7WgZ6RLwi6f1lSnZKOhQ1xyWtsb2uUw0CAPLpxBz6eklv1+0vZMd+j+0x23O25xYXFzvw1QCAazoR6G5yLJoVRsRkRJQjolwqlTrw1QCAazoR6AuSNtbtb5D0TgfOCwBoQycC/bCkx7LVLg9IuhgR5ztwXgBAG25pVWC7ImmbpLW2FyQ9LalfkiLigKSjkh6WNC/pQ0mPd6tZAMDSWgZ6RIy0GA9JezrWEQDghvCkKAAkgkAHgEQQ6ACQCAIdABJBoANAIgh0AEgEgQ4AiSDQASARBDoAJIJAB4BEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0AEgEQQ6ACSCQAeARBDoAJCIXIFue7vtN23P2/5uk/Ftti/aPpV9nup8qwCA5bR8SbTtPknPSvorSQuSXrd9OCLeaCg9FhGPdKFHAEAOea7Qt0qaj4i3IuJTST+WtLO7bQEA2pUn0NdLertufyE71uhB26dtT9u+u9mJbI/ZnrM9t7i4eAPtAgCWkifQ3eRYNOyflHRnRNwjaZ+kF5udKCImI6IcEeVSqdRepwCAZeUJ9AVJG+v2N0h6p74gIi5FxOVs+6ikfttrO9YlAKClPIH+uqQttr9k+wuSdkk6XF9g+w7bzra3Zud9r9PNAgCW1nKVS0Rctf1NSS9J6pN0MCLO2d6djR+Q9KikJ21flfSRpF0R0TgtAwDoIheVu+VyOebm5gr5bgDoVbZPRES52RhPigJAIgh0AEgEgQ4AiSDQASARBDoAJIJAB4BEEOgAkAgCHQASQaADQCIIdABIBIEOAIkg0AEgEQQ6ACSCQAeARLT8PXTg8yR7T8t1+Gl/9Aqu0IFMszBf7jiw2hDoAJAIAh0AEkGgA0AicgW67e2237Q9b/u7TcZt+5ls/Izt+zvfKgBgOS0D3XafpGcl7ZB0l6QR23c1lO2QtCX7jEl6rsN9AgBayHOFvlXSfES8FRGfSvqxpJ0NNTslHYqa45LW2F7X4V4BAMvIsw59vaS36/YXJH01R816Sefri2yPqXYFr02bNrXbKyB974+7dup4+o9W/nu/d7E758XnUp5Ab7YIt/FJizw1iohJSZOSVC6XeVoD7etiAC633pyHi9AL8ky5LEjaWLe/QdI7N1ADAOiiPIH+uqQttr9k+wuSdkk63FBzWNJj2WqXByRdjIjzjScCVrOlrsK5OkevaDnlEhFXbX9T0kuS+iQdjIhztndn4wckHZX0sKR5SR9Kerx7LQPdQ3ijl+X6ca6IOKpaaNcfO1C3HZL2dLY1AEA7eFIUABJBoANAIgh0AEgEgQ4AiXBRd/VtL0r6VSFfDrS2VtK7RTcBNHFnRJSaDRQW6MBqZnsuIspF9wG0gykXAEgEgQ4AiSDQgeYmi24AaBdz6ACQCK7QASARBDoAJIJAB+rYPmj7gu2zRfcCtItAB673vKTtRTcB3AgCHagTEa9Ier/oPoAbQaADQCIIdABIBIEOAIkg0AEgEQQ6UMd2RdLPJX3F9oLt0aJ7AvLi0X8ASARX6ACQCAIdABJBoANAIgh0AEgEgQ4AiSDQASARBDoAJOL/AdDnmgN3xaePAAAAAElFTkSuQmCC\n", + "text/plain": [ + "<Figure size 432x288 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.boxplot(df['trip_duration'])" + ] + }, + { + "cell_type": 
"code", + "execution_count": 10, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "YMHAxF7h7W1m", + "outputId": "afb1f264-df51-48db-a4c1-a59fc9e551d8" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Instances with trip duration greater than 2092.0 are outliers as per Boxplot analysis.\n" + ] + } + ], + "source": [ + "Q1 = np.percentile(df['trip_duration'], 25, interpolation = 'midpoint') \n", + "Q2 = np.percentile(df['trip_duration'], 50, interpolation = 'midpoint') \n", + "Q3 = np.percentile(df['trip_duration'], 75, interpolation = 'midpoint') \n", + "IQR = Q3 - Q1\n", + "low_lim = Q1 - 1.5 * IQR \n", + "up_lim = Q3 + 1.5 * IQR\n", + "print(\"Instances with trip duration greater than {} are outliers as per Boxplot analysis.\".format(up_lim))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4igtTKF7hPu2" + }, + "source": [ + "The boxplot rule flags trips longer than 2092 seconds, but a looser cutoff is applied here: instances with a trip duration of 5900 seconds or more are treated as outliers.\n", + "A trip duration of 60 seconds (1 min) or less does not make sense either, so we will remove such instances as well." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "sxsY-b-IGWFl" + }, + "outputs": [], + "source": [ + "df = df[(df.trip_duration < 5900)]\n", + "df = df[(df.trip_duration > 60)]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KkVIECiehPu2" + }, + "source": [ + "Instances with passenger_count = 0 also need to be removed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "id": "YWyc3pQjhPu2" + }, + "outputs": [], + "source": [ + "df = df[(df.passenger_count > 0)]" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 320 + }, + "id": "TvMqWf8jT1ab", + "outputId": "0442f604-7179-41a3-87a4-13b02f2ec111" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>vendor_id</th>\n", + " <th>passenger_count</th>\n", + " <th>pickup_longitude</th>\n", + " <th>pickup_latitude</th>\n", + " <th>dropoff_longitude</th>\n", + " <th>dropoff_latitude</th>\n", + " <th>trip_duration</th>\n", + " <th>vism</th>\n", + " <th>fog</th>\n", + " <th>rain</th>\n", + " <th>snow</th>\n", + " <th>holiday_or_not</th>\n", + " <th>turns</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " <td>1.447222e+06</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>1.535028e+00</td>\n", + " <td>1.665402e+00</td>\n", + " <td>-7.397364e+01</td>\n", + " <td>4.075099e+01</td>\n", + " <td>-7.397355e+01</td>\n", + " <td>4.075186e+01</td>\n", + " <td>8.395405e+02</td>\n", + " 
<td>1.467807e+01</td>\n", + " <td>6.509022e-03</td>\n", + " <td>9.603572e-02</td>\n", + " <td>2.380699e-02</td>\n", + " <td>1.866680e-02</td>\n", + " <td>7.570809e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>4.987717e-01</td>\n", + " <td>1.314749e+00</td>\n", + " <td>7.060205e-02</td>\n", + " <td>3.254072e-02</td>\n", + " <td>7.032910e-02</td>\n", + " <td>3.559071e-02</td>\n", + " <td>6.472183e+02</td>\n", + " <td>3.070053e+00</td>\n", + " <td>1.124087e-01</td>\n", + " <td>5.185249e-01</td>\n", + " <td>2.769076e-01</td>\n", + " <td>1.353453e-01</td>\n", + " <td>4.424144e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>1.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>-1.219333e+02</td>\n", + " <td>3.435970e+01</td>\n", + " <td>-1.219333e+02</td>\n", + " <td>3.218114e+01</td>\n", + " <td>6.100000e+01</td>\n", + " <td>4.000000e-01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>2.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>1.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>-7.399187e+01</td>\n", + " <td>4.073742e+01</td>\n", + " <td>-7.399133e+01</td>\n", + " <td>4.073595e+01</td>\n", + " <td>4.010000e+02</td>\n", + " <td>1.450000e+01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>5.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>2.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>-7.398177e+01</td>\n", + " <td>4.075414e+01</td>\n", + " <td>-7.397977e+01</td>\n", + " <td>4.075456e+01</td>\n", + " <td>6.650000e+02</td>\n", + " <td>1.610000e+01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>6.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " 
<td>2.000000e+00</td>\n", + " <td>2.000000e+00</td>\n", + " <td>-7.396744e+01</td>\n", + " <td>4.076837e+01</td>\n", + " <td>-7.396310e+01</td>\n", + " <td>4.076983e+01</td>\n", + " <td>1.076000e+03</td>\n", + " <td>1.610000e+01</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>0.000000e+00</td>\n", + " <td>9.000000e+00</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>2.000000e+00</td>\n", + " <td>9.000000e+00</td>\n", + " <td>-6.133553e+01</td>\n", + " <td>5.188108e+01</td>\n", + " <td>-6.133553e+01</td>\n", + " <td>4.392103e+01</td>\n", + " <td>5.897000e+03</td>\n", + " <td>1.610000e+01</td>\n", + " <td>4.000000e+00</td>\n", + " <td>7.000000e+00</td>\n", + " <td>6.000000e+00</td>\n", + " <td>1.000000e+00</td>\n", + " <td>4.600000e+01</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " vendor_id passenger_count pickup_longitude pickup_latitude \\\n", + "count 1.447222e+06 1.447222e+06 1.447222e+06 1.447222e+06 \n", + "mean 1.535028e+00 1.665402e+00 -7.397364e+01 4.075099e+01 \n", + "std 4.987717e-01 1.314749e+00 7.060205e-02 3.254072e-02 \n", + "min 1.000000e+00 1.000000e+00 -1.219333e+02 3.435970e+01 \n", + "25% 1.000000e+00 1.000000e+00 -7.399187e+01 4.073742e+01 \n", + "50% 2.000000e+00 1.000000e+00 -7.398177e+01 4.075414e+01 \n", + "75% 2.000000e+00 2.000000e+00 -7.396744e+01 4.076837e+01 \n", + "max 2.000000e+00 9.000000e+00 -6.133553e+01 5.188108e+01 \n", + "\n", + " dropoff_longitude dropoff_latitude trip_duration vism \\\n", + "count 1.447222e+06 1.447222e+06 1.447222e+06 1.447222e+06 \n", + "mean -7.397355e+01 4.075186e+01 8.395405e+02 1.467807e+01 \n", + "std 7.032910e-02 3.559071e-02 6.472183e+02 3.070053e+00 \n", + "min -1.219333e+02 3.218114e+01 6.100000e+01 4.000000e-01 \n", + "25% -7.399133e+01 4.073595e+01 4.010000e+02 1.450000e+01 \n", + "50% -7.397977e+01 4.075456e+01 6.650000e+02 1.610000e+01 \n", + "75% -7.396310e+01 4.076983e+01 
1.076000e+03 1.610000e+01 \n", + "max -6.133553e+01 4.392103e+01 5.897000e+03 1.610000e+01 \n", + "\n", + " fog rain snow holiday_or_not turns \n", + "count 1.447222e+06 1.447222e+06 1.447222e+06 1.447222e+06 1.447222e+06 \n", + "mean 6.509022e-03 9.603572e-02 2.380699e-02 1.866680e-02 7.570809e+00 \n", + "std 1.124087e-01 5.185249e-01 2.769076e-01 1.353453e-01 4.424144e+00 \n", + "min 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 2.000000e+00 \n", + "25% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 5.000000e+00 \n", + "50% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 6.000000e+00 \n", + "75% 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 9.000000e+00 \n", + "max 4.000000e+00 7.000000e+00 6.000000e+00 1.000000e+00 4.600000e+01 " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nDl5UVbAhPu2" + }, + "source": [ + "# Feature Extraction" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "id": "i2SF-MV1hr_z" + }, + "outputs": [], + "source": [ + "y = df['trip_duration'] \n", + "X = df.drop(['trip_duration'], axis=1)\n", + "X = X.drop('id',axis=1)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UbsYDCWfsV6a" + }, + "source": [ + "### Encoding vendor_id" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rD0BmhbJhPu3", + "outputId": "71dc642c-3c0a-4fcb-cd3f-988aaad46bb2", + "scrolled": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Vendor list : [2 1]\n" + ] + } + ], + "source": [ + "vendor_id_list = pd.unique(X['vendor_id'])\n", + "print(\"Vendor list :\", vendor_id_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SYoAyI12hPu3" + }, + "source": [ + "There are two unique vendors in the dataset.\n", + 
"Since this is categorical data, we can perform one hot encoding on it." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "id": "rWHK4tinhr_8", + "scrolled": true + }, + "outputs": [], + "source": [ + "#encoding vendor_id ={1,2} to vendor_id_1 and vendor_id_2 columns\n", + "encoded_vendor_id=pd.get_dummies(X['vendor_id'], prefix='vendor_id')\n", + "# Drop column vendor_id as it is now encoded\n", + "X = X.drop('vendor_id',axis = 1)\n", + "# Join original with encoded \n", + "X = X.join(encoded_vendor_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_ualwbX7yq6k" + }, + "source": [ + "### Encoding store_and_fwd_flag" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GHWPtsgfhPu3", + "outputId": "fdf527bb-4770-4db3-8d8b-53350157d91d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Flag Values : ['N' 'Y']\n" + ] + } + ], + "source": [ + "flag_values = pd.unique(X['store_and_fwd_flag'])\n", + "print(\"Flag Values :\", flag_values)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "id": "R5LDruDZx7TM" + }, + "outputs": [], + "source": [ + "#encoding store_and_fwd_flag = {Y,N} to flag_N and flag_Y columns\n", + "encoded_flag_id=pd.get_dummies(X['store_and_fwd_flag'], prefix='flag')\n", + "# Drop column store_and_fwd_flag as it is now encoded\n", + "X = X.drop('store_and_fwd_flag',axis = 1)\n", + "# Join original with encoded \n", + "X = X.join(encoded_flag_id)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "O1WUDPE1hPu3" + }, + "source": [ + "### Calculating distance related features" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "id": "6GIJD3IGhsAA" + }, + "outputs": [], + "source": [ + "X['lat_diff'] = abs(X['pickup_latitude'] - X['dropoff_latitude'])\n", + "X['long_diff'] = abs(X['pickup_longitude'] - 
X['dropoff_longitude'])" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "id": "OhMPK8SZhPu3" + }, + "outputs": [], + "source": [ + "def get_euclidean_dist(p_lat, p_long, d_lat, d_long):\n", + " return np.sqrt(np.power(p_lat-d_lat, 2) + np.power(p_long-d_long, 2))" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "id": "pWybkDgQhPu3" + }, + "outputs": [], + "source": [ + "def get_haversine_dist(lat1, lng1, lat2, lng2):\n", + " lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))\n", + " AVG_EARTH_RADIUS = 6371 # in km\n", + " lat = lat2 - lat1\n", + " lng = lng2 - lng1\n", + " d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2\n", + " h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))\n", + " return(h)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "id": "ZBKo6Af6hPu3" + }, + "outputs": [], + "source": [ + "def get_manhattan_distance(lat1, lng1, lat2, lng2):\n", + " a = get_haversine_dist(lat1, lng1, lat1, lng2)\n", + " b = get_haversine_dist(lat1, lng1, lat2, lng1)\n", + " return a + b" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "id": "ooqpy-bghPu3" + }, + "outputs": [], + "source": [ + "X['euclidean_dist'] = get_euclidean_dist(X['pickup_latitude'].to_numpy(), X['pickup_longitude'].to_numpy(), X['dropoff_latitude'].to_numpy(), X['dropoff_longitude'].to_numpy())\n", + "X['haversine_dist'] = get_haversine_dist(X['pickup_latitude'].to_numpy(), X['pickup_longitude'].to_numpy(), X['dropoff_latitude'].to_numpy(), X['dropoff_longitude'].to_numpy())\n", + "X['manhattan_dist'] = get_manhattan_distance(X['pickup_latitude'].to_numpy(), X['pickup_longitude'].to_numpy(), X['dropoff_latitude'].to_numpy(), X['dropoff_longitude'].to_numpy())\n", + "# X['speed_haversine'] = X['manhattan_dist'] / y\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ON03y96nhPu3" + }, + "source": [ + "### 
Calculating time related features" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "Rt3E99CkhsAP" + }, + "outputs": [], + "source": [ + "X['pickup_datetime'] = pd.to_datetime(X['pickup_datetime'], \n", + "format = '%Y-%m-%d %H:%M:%S', \n", + " errors = 'coerce')" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "iBjPHG7m7g6v", + "outputId": "b31054ef-ec62-44f6-f067-2086cdd38436" + }, + "outputs": [], + "source": [ + "X['pickup_day_of_the_week'] = X['pickup_datetime'].dt.dayofweek\n", + "# X['week_sin'] = np.sin(X['pickup_day_of_the_week']*(2*np.pi/7))\n", + "# X['week_cos'] = np.cos(X['pickup_day_of_the_week']*(2*np.pi/7))\n", + "\n", + "\n", + "X['pickup_hour'] = X['pickup_datetime'].dt.hour\n", + "# X['hr_sin'] = np.sin(X['pickup_hour']*(2*np.pi/24))\n", + "# X['hr_cos'] = np.cos(X['pickup_hour']*(2*np.pi/24))\n", + "\n", + "\n", + "X['pickup_month'] = X['pickup_datetime'].dt.month\n", + "# X['month_sin'] = np.sin(X['pickup_month']*(2*np.pi/12))\n", + "# X['month_cos'] = np.cos(X['pickup_month']*(2*np.pi/12))\n", + "\n", + "X['pickup_day_of_year'] = X['pickup_datetime'].dt.dayofyear\n", + "X['pickup_week_of_year'] = X['pickup_datetime'].dt.weekofyear\n" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 383 + }, + "id": "Q5n8YUbKWQUU", + "outputId": "cc0e26e9-402d-4e2d-dd8a-ec333b67af03" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " 
<th></th>\n", + " <th>passenger_count</th>\n", + " <th>pickup_longitude</th>\n", + " <th>pickup_latitude</th>\n", + " <th>dropoff_longitude</th>\n", + " <th>dropoff_latitude</th>\n", + " <th>vism</th>\n", + " <th>fog</th>\n", + " <th>rain</th>\n", + " <th>snow</th>\n", + " <th>holiday_or_not</th>\n", + " <th>...</th>\n", + " <th>lat_diff</th>\n", + " <th>long_diff</th>\n", + " <th>euclidean_dist</th>\n", + " <th>haversine_dist</th>\n", + " <th>manhattan_dist</th>\n", + " <th>pickup_day_of_the_week</th>\n", + " <th>pickup_hour</th>\n", + " <th>pickup_month</th>\n", + " <th>pickup_day_of_year</th>\n", + " <th>pickup_week_of_year</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1</td>\n", + " <td>-73.982155</td>\n", + " <td>40.767937</td>\n", + " <td>-73.964630</td>\n", + " <td>40.765602</td>\n", + " <td>12.90</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.002335</td>\n", + " <td>0.017525</td>\n", + " <td>0.017680</td>\n", + " <td>1.498521</td>\n", + " <td>1.735433</td>\n", + " <td>0</td>\n", + " <td>17</td>\n", + " <td>3</td>\n", + " <td>74</td>\n", + " <td>11</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>-73.980415</td>\n", + " <td>40.738564</td>\n", + " <td>-73.999481</td>\n", + " <td>40.731152</td>\n", + " <td>16.10</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.007412</td>\n", + " <td>0.019066</td>\n", + " <td>0.020456</td>\n", + " <td>1.805507</td>\n", + " <td>2.430506</td>\n", + " <td>6</td>\n", + " <td>0</td>\n", + " <td>6</td>\n", + " <td>164</td>\n", + " <td>23</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1</td>\n", + " <td>-73.979027</td>\n", + " <td>40.763939</td>\n", + " <td>-74.005333</td>\n", + " <td>40.710087</td>\n", + " <td>16.10</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " 
<td>0</td>\n", + " <td>...</td>\n", + " <td>0.053852</td>\n", + " <td>0.026306</td>\n", + " <td>0.059934</td>\n", + " <td>6.385098</td>\n", + " <td>8.203575</td>\n", + " <td>1</td>\n", + " <td>11</td>\n", + " <td>1</td>\n", + " <td>19</td>\n", + " <td>3</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>-74.010040</td>\n", + " <td>40.719971</td>\n", + " <td>-74.012268</td>\n", + " <td>40.706718</td>\n", + " <td>2.64</td>\n", + " <td>0.0</td>\n", + " <td>3.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.013252</td>\n", + " <td>0.002228</td>\n", + " <td>0.013438</td>\n", + " <td>1.485498</td>\n", + " <td>1.661331</td>\n", + " <td>2</td>\n", + " <td>19</td>\n", + " <td>4</td>\n", + " <td>97</td>\n", + " <td>14</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>-73.973053</td>\n", + " <td>40.793209</td>\n", + " <td>-73.972923</td>\n", + " <td>40.782520</td>\n", + " <td>16.10</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.010689</td>\n", + " <td>0.000130</td>\n", + " <td>0.010690</td>\n", + " <td>1.188588</td>\n", + " <td>1.199457</td>\n", + " <td>5</td>\n", + " <td>13</td>\n", + " <td>3</td>\n", + " <td>86</td>\n", + " <td>12</td>\n", + " </tr>\n", + " <tr>\n", + " <th>5</th>\n", + " <td>6</td>\n", + " <td>-73.982857</td>\n", + " <td>40.742195</td>\n", + " <td>-73.992081</td>\n", + " <td>40.749184</td>\n", + " <td>16.10</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.006989</td>\n", + " <td>0.009224</td>\n", + " <td>0.011572</td>\n", + " <td>1.098942</td>\n", + " <td>1.554180</td>\n", + " <td>5</td>\n", + " <td>22</td>\n", + " <td>1</td>\n", + " <td>30</td>\n", + " <td>4</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6</th>\n", + " <td>4</td>\n", + " <td>-73.969017</td>\n", + " <td>40.757839</td>\n", + " <td>-73.957405</td>\n", + " 
<td>40.765896</td>\n", + " <td>16.10</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.008057</td>\n", + " <td>0.011612</td>\n", + " <td>0.014133</td>\n", + " <td>1.326279</td>\n", + " <td>1.873902</td>\n", + " <td>4</td>\n", + " <td>22</td>\n", + " <td>6</td>\n", + " <td>169</td>\n", + " <td>24</td>\n", + " </tr>\n", + " <tr>\n", + " <th>7</th>\n", + " <td>1</td>\n", + " <td>-73.969276</td>\n", + " <td>40.797779</td>\n", + " <td>-73.922470</td>\n", + " <td>40.760559</td>\n", + " <td>16.10</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.037220</td>\n", + " <td>0.046806</td>\n", + " <td>0.059801</td>\n", + " <td>5.714981</td>\n", + " <td>8.078684</td>\n", + " <td>5</td>\n", + " <td>7</td>\n", + " <td>5</td>\n", + " <td>142</td>\n", + " <td>20</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8</th>\n", + " <td>1</td>\n", + " <td>-73.999481</td>\n", + " <td>40.738400</td>\n", + " <td>-73.985786</td>\n", + " <td>40.732815</td>\n", + " <td>12.90</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.005585</td>\n", + " <td>0.013695</td>\n", + " <td>0.014790</td>\n", + " <td>1.310353</td>\n", + " <td>1.774804</td>\n", + " <td>4</td>\n", + " <td>23</td>\n", + " <td>5</td>\n", + " <td>148</td>\n", + " <td>21</td>\n", + " </tr>\n", + " <tr>\n", + " <th>9</th>\n", + " <td>1</td>\n", + " <td>-73.981049</td>\n", + " <td>40.744339</td>\n", + " <td>-73.973000</td>\n", + " <td>40.789989</td>\n", + " <td>14.50</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0</td>\n", + " <td>...</td>\n", + " <td>0.045650</td>\n", + " <td>0.008049</td>\n", + " <td>0.046355</td>\n", + " <td>5.121162</td>\n", + " <td>5.754187</td>\n", + " <td>3</td>\n", + " <td>21</td>\n", + " <td>3</td>\n", + " <td>70</td>\n", + " <td>10</td>\n", + " </tr>\n", + " </tbody>\n", + 
"</table>\n", + "<p>10 rows × 25 columns</p>\n", + "</div>" + ], + "text/plain": [ + " passenger_count pickup_longitude pickup_latitude dropoff_longitude \\\n", + "0 1 -73.982155 40.767937 -73.964630 \n", + "1 1 -73.980415 40.738564 -73.999481 \n", + "2 1 -73.979027 40.763939 -74.005333 \n", + "3 1 -74.010040 40.719971 -74.012268 \n", + "4 1 -73.973053 40.793209 -73.972923 \n", + "5 6 -73.982857 40.742195 -73.992081 \n", + "6 4 -73.969017 40.757839 -73.957405 \n", + "7 1 -73.969276 40.797779 -73.922470 \n", + "8 1 -73.999481 40.738400 -73.985786 \n", + "9 1 -73.981049 40.744339 -73.973000 \n", + "\n", + " dropoff_latitude vism fog rain snow holiday_or_not ... lat_diff \\\n", + "0 40.765602 12.90 0.0 0.0 0.0 0 ... 0.002335 \n", + "1 40.731152 16.10 0.0 0.0 0.0 0 ... 0.007412 \n", + "2 40.710087 16.10 0.0 0.0 0.0 0 ... 0.053852 \n", + "3 40.706718 2.64 0.0 3.0 0.0 0 ... 0.013252 \n", + "4 40.782520 16.10 0.0 0.0 0.0 0 ... 0.010689 \n", + "5 40.749184 16.10 0.0 0.0 0.0 0 ... 0.006989 \n", + "6 40.765896 16.10 0.0 0.0 0.0 0 ... 0.008057 \n", + "7 40.760559 16.10 0.0 0.0 0.0 0 ... 0.037220 \n", + "8 40.732815 12.90 0.0 0.0 0.0 0 ... 0.005585 \n", + "9 40.789989 14.50 0.0 0.0 0.0 0 ... 
0.045650 \n", + "\n", + " long_diff euclidean_dist haversine_dist manhattan_dist \\\n", + "0 0.017525 0.017680 1.498521 1.735433 \n", + "1 0.019066 0.020456 1.805507 2.430506 \n", + "2 0.026306 0.059934 6.385098 8.203575 \n", + "3 0.002228 0.013438 1.485498 1.661331 \n", + "4 0.000130 0.010690 1.188588 1.199457 \n", + "5 0.009224 0.011572 1.098942 1.554180 \n", + "6 0.011612 0.014133 1.326279 1.873902 \n", + "7 0.046806 0.059801 5.714981 8.078684 \n", + "8 0.013695 0.014790 1.310353 1.774804 \n", + "9 0.008049 0.046355 5.121162 5.754187 \n", + "\n", + " pickup_day_of_the_week pickup_hour pickup_month pickup_day_of_year \\\n", + "0 0 17 3 74 \n", + "1 6 0 6 164 \n", + "2 1 11 1 19 \n", + "3 2 19 4 97 \n", + "4 5 13 3 86 \n", + "5 5 22 1 30 \n", + "6 4 22 6 169 \n", + "7 5 7 5 142 \n", + "8 4 23 5 148 \n", + "9 3 21 3 70 \n", + "\n", + " pickup_week_of_year \n", + "0 11 \n", + "1 23 \n", + "2 3 \n", + "3 14 \n", + "4 12 \n", + "5 4 \n", + "6 24 \n", + "7 20 \n", + "8 21 \n", + "9 10 \n", + "\n", + "[10 rows x 25 columns]" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "X = X.drop('pickup_datetime',axis=1)\n", + "# X = X.drop('pickup_month',axis=1)\n", + "# X = X.drop('pickup_hour',axis=1)\n", + "# X = X.drop('pickup_day_of_the_week',axis=1)\n", + "X = X.drop('dropoff_datetime', axis=1)\n", + "X[:10]" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "JsXogqJlWQUW", + "outputId": "1026fa0e-3383-49ed-d082-1b66b5716558", + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "List of features : ['passenger_count' 'pickup_longitude' 'pickup_latitude'\n", + " 'dropoff_longitude' 'dropoff_latitude' 'vism' 'fog' 'rain' 'snow'\n", + " 'holiday_or_not' 'turns' 'vendor_id_1' 'vendor_id_2' 'flag_N' 'flag_Y'\n", + " 'lat_diff' 'long_diff' 'euclidean_dist' 'haversine_dist' 
'manhattan_dist'\n", + " 'pickup_day_of_the_week' 'pickup_hour' 'pickup_month'\n", + " 'pickup_day_of_year' 'pickup_week_of_year']\n" + ] + } + ], + "source": [ + "feature_list = X.columns.values\n", + "print(\"List of features : {}\".format(feature_list))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bQMrzdyGhPu4" + }, + "source": [ + "# Training" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "X3cvffRoUYJE" + }, + "source": [ + "**SGD Regressor**" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "id": "eEEy6Yqj7g6v" + }, + "outputs": [], + "source": [ + "# from sklearn.preprocessing import StandardScaler\n", + "# s = StandardScaler()\n", + "X_temp = X\n", + "X_temp = X_temp.drop('flag_Y', axis=1)\n", + "X_temp = X_temp.drop('flag_N', axis=1)\n", + "X_temp = X_temp.drop('pickup_month', axis=1)\n", + "X_temp = X_temp.drop('pickup_week_of_year', axis=1)\n", + "\n", + "# X_temp = s.fit_transform(X_temp)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.preprocessing import StandardScaler\n", + "s = StandardScaler()\n", + "#X_temp = X\n", + "X_temp = s.fit_transform(X_temp)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "id": "LRtGHpn_7g6x" + }, + "outputs": [], + "source": [ + "X_train, X_test, y_train, y_test = train_test_split(X_temp, y, test_size = 0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Analysing Performance of SGDRegressor with different initial learning_rates given by 'invscaling' with parameters eta0 = 0.001,0.01,0.1,0.5,0.8,1.0,1.5,2.0 and constant of regularization = 0.0001**" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "id": "cKgvCqtVa1KB" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loop 1\n", + "Loop 2\n", + "Loop 3\n", + "Loop 4\n", + "Loop 5\n", + "Loop 6\n", + 
"Loop 7\n", + "Loop 8\n", + "Done\n", + "[0.5018087889966956, 3.6664808497880257, 0.5295705430059086, 4.711339897366976, 5.694038052853398, 13.499750147610504, 16.216268050256677, 16.371195752967424]\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import SGDRegressor\n", + "\n", + "from sklearn.metrics import mean_squared_log_error\n", + "losses = []\n", + "loop = 0\n", + "eta0 = [0.001,0.01,0.1,0.5,0.8,1.0,1.5,2.0]\n", + "for i in eta0 :\n", + " loop = loop + 1\n", + " print(\"Loop \",loop)\n", + " model = SGDRegressor(max_iter = 5000,eta0=i,random_state = 42)\n", + " model.fit(X_train,y_train)\n", + " y_pred = model.predict(X_test)\n", + " y_temp_pred = y_pred.flatten()\n", + " losses.append(np.sqrt(mean_squared_log_error(y_test,abs(y_temp_pred))))\n", + "print(\"Done\")\n", + "print(losses)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "i = losses.index(min(losses))\n", + "min_eta = eta0[i]" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "PVHcFd-NWQUY", + "outputId": "e6a47f95-241e-4a81-a13c-ab8f97009de0", + "scrolled": true + }, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize = (10, 5))\n", + "ax = fig.add_axes([0,0,1,1])\n", + "ax.plot(eta0,losses,color ='maroon') \n", + "ax.set_ylabel(\"Loss(RMSLE)\") \n", + "ax.set_xlabel(\"Value of learning rate\") \n", + "ax.set_title(\"Performance of gradient descent with different values of learning rate\") \n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Analyzing the performance of SGDRegressor with different values of regularization constant and initial learning rate = 'invscaling' with default values 
(eta0 = 0.01,power_t = 0.25)**" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loop 1\n", + "Loop 2\n", + "Loop 3\n", + "Loop 4\n", + "Loop 5\n", + "Loop 6\n", + "Loop 7\n", + "Loop 8\n", + "Loop 9\n", + "Loop 10\n", + "Loop 11\n", + "Done\n", + "[3.6888212534807114, 3.686525112072632, 3.6664808497880257, 3.505660412205236, 2.9378016245296945, 0.9694406239700845, 0.536173854912625, 0.519936439881697, 0.5219244640520138, 0.538681300065003, 0.5530030358591688]\n" + ] + } + ], + "source": [ + "from sklearn.linear_model import SGDRegressor\n", + "\n", + "from sklearn.metrics import mean_squared_error\n", + "losses_1 = []\n", + "loop = 0\n", + "alpha = [1e-9,1e-5,0.0001,0.001,0.01,0.1,0.5,0.8,1.0,1.5,2.0]\n", + "for i in alpha :\n", + " loop = loop + 1\n", + " print(\"Loop \",loop)\n", + " model = SGDRegressor(max_iter = 5000,alpha = i,random_state = 42)\n", + " model.fit(X_train,y_train)\n", + " y_pred = model.predict(X_test)\n", + " y_temp_pred = y_pred.flatten()\n", + " losses_1.append(np.sqrt(mean_squared_log_error(y_test,abs(y_temp_pred))))\n", + "print(\"Done\")\n", + "print(losses_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "i = losses_1.index(min(losses_1))\n", + "min_alpha = alpha[i]" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "\n", + "text/plain": [ + "<Figure size 720x360 with 1 Axes>" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "fig = plt.figure(figsize = (10, 5))\n", + "ax = fig.add_axes([0,0,1,1])\n", + "ax.plot(alpha,losses_1,color ='maroon') \n", + "ax.set_ylabel(\"Loss(RMSLE)\") \n", + "ax.set_xlabel(\"Value of l2 regularizer\") \n", + "ax.set_title(\"Performance of gradient descent with different values of l2 
regularizer\") \n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Fixing the parameters to the best values found above (min_alpha, min_eta): in this run regularization constant = 0.8 and initial learning rate eta0 = 0.001, with the default 'invscaling' schedule (power_t = 0.25)**" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.linear_model import SGDRegressor\n", + "from sklearn.metrics import mean_squared_error,mean_squared_log_error\n", + "model = SGDRegressor(max_iter = 5000,alpha = min_alpha,eta0=min_eta,random_state = 42)\n", + "model.fit(X_train,y_train)\n", + "y_pred = model.predict(X_test)\n", + "y_temp_pred = y_pred.flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.546414874705318\n" + ] + } + ], + "source": [ + "from sklearn.metrics import mean_squared_error,mean_squared_log_error\n", + "print(np.sqrt(mean_squared_log_error(y_test,abs(y_temp_pred))))" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.001 0.8\n" + ] + } + ], + "source": [ + "print(min_eta,min_alpha)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "reocx_zPmchc" + }, + "source": [ + "# Feature Extraction on test data" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "id": "gyqmdO6IUln3" + }, + "outputs": [], + "source": [ + "# !unzip 'extended_test.csv.zip'\n", + "df_test = pd.read_csv('extended_test.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 338 + }, + "id": "wulg-B6jC-pQ", + "outputId": "14bbaea8-d09f-4e3b-8e82-b3b004536c3e" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", +
" vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>Unnamed: 0</th>\n", + " <th>Unnamed: 0.1</th>\n", + " <th>vendor_id</th>\n", + " <th>passenger_count</th>\n", + " <th>pickup_longitude</th>\n", + " <th>pickup_latitude</th>\n", + " <th>dropoff_longitude</th>\n", + " <th>dropoff_latitude</th>\n", + " <th>holiday_or_not</th>\n", + " <th>number_of_steps</th>\n", + " <th>visi</th>\n", + " <th>vism</th>\n", + " <th>fog</th>\n", + " <th>rain</th>\n", + " <th>snow</th>\n", + " <th>hail</th>\n", + " <th>thunder</th>\n", + " <th>tornado</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>602975.000000</td>\n", + " <td>602975.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.0</td>\n", + " <td>625134.0</td>\n", + " <td>625134.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>312566.500000</td>\n", + " <td>312566.500000</td>\n", + " <td>1.534884</td>\n", + " <td>1.661765</td>\n", + " <td>-73.973614</td>\n", + " <td>40.750927</td>\n", + " <td>-73.973458</td>\n", + " <td>40.751816</td>\n", + " <td>0.018710</td>\n", + " <td>7.545392</td>\n", + " <td>9.081615</td>\n", + " <td>14.622785</td>\n", + " <td>0.006514</td>\n", + " <td>0.094818</td>\n", + " <td>0.024284</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " 
<tr>\n", + " <th>std</th>\n", + " <td>180460.785927</td>\n", + " <td>180460.785927</td>\n", + " <td>0.498782</td>\n", + " <td>1.311293</td>\n", + " <td>0.073389</td>\n", + " <td>0.029848</td>\n", + " <td>0.072565</td>\n", + " <td>0.035824</td>\n", + " <td>0.135497</td>\n", + " <td>4.435818</td>\n", + " <td>1.931477</td>\n", + " <td>3.113787</td>\n", + " <td>0.111340</td>\n", + " <td>0.514837</td>\n", + " <td>0.280462</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-121.933128</td>\n", + " <td>37.389587</td>\n", + " <td>-121.933327</td>\n", + " <td>36.601322</td>\n", + " <td>0.000000</td>\n", + " <td>2.000000</td>\n", + " <td>0.200000</td>\n", + " <td>0.400000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>156283.250000</td>\n", + " <td>156283.250000</td>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>-73.991852</td>\n", + " <td>40.737392</td>\n", + " <td>-73.991318</td>\n", + " <td>40.736000</td>\n", + " <td>0.000000</td>\n", + " <td>5.000000</td>\n", + " <td>9.000000</td>\n", + " <td>14.500000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>312566.500000</td>\n", + " <td>312566.500000</td>\n", + " <td>2.000000</td>\n", + " <td>1.000000</td>\n", + " <td>-73.981743</td>\n", + " <td>40.754093</td>\n", + " <td>-73.979774</td>\n", + " <td>40.754543</td>\n", + " <td>0.000000</td>\n", + " <td>6.000000</td>\n", + " <td>10.000000</td>\n", + " <td>16.100000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + 
" <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>468849.750000</td>\n", + " <td>468849.750000</td>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>-73.967400</td>\n", + " <td>40.768394</td>\n", + " <td>-73.963013</td>\n", + " <td>40.769852</td>\n", + " <td>0.000000</td>\n", + " <td>9.000000</td>\n", + " <td>10.000000</td>\n", + " <td>16.100000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>625133.000000</td>\n", + " <td>625133.000000</td>\n", + " <td>2.000000</td>\n", + " <td>9.000000</td>\n", + " <td>-69.248917</td>\n", + " <td>42.814938</td>\n", + " <td>-67.496796</td>\n", + " <td>48.857597</td>\n", + " <td>1.000000</td>\n", + " <td>50.000000</td>\n", + " <td>10.000000</td>\n", + " <td>16.100000</td>\n", + " <td>4.000000</td>\n", + " <td>7.000000</td>\n", + " <td>6.000000</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " Unnamed: 0 Unnamed: 0.1 vendor_id passenger_count \\\n", + "count 625134.000000 625134.000000 625134.000000 625134.000000 \n", + "mean 312566.500000 312566.500000 1.534884 1.661765 \n", + "std 180460.785927 180460.785927 0.498782 1.311293 \n", + "min 0.000000 0.000000 1.000000 0.000000 \n", + "25% 156283.250000 156283.250000 1.000000 1.000000 \n", + "50% 312566.500000 312566.500000 2.000000 1.000000 \n", + "75% 468849.750000 468849.750000 2.000000 2.000000 \n", + "max 625133.000000 625133.000000 2.000000 9.000000 \n", + "\n", + " pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude \\\n", + "count 625134.000000 625134.000000 625134.000000 625134.000000 \n", + "mean -73.973614 40.750927 -73.973458 40.751816 \n", + "std 0.073389 0.029848 0.072565 0.035824 \n", + "min -121.933128 37.389587 -121.933327 36.601322 
\n", + "25% -73.991852 40.737392 -73.991318 40.736000 \n", + "50% -73.981743 40.754093 -73.979774 40.754543 \n", + "75% -73.967400 40.768394 -73.963013 40.769852 \n", + "max -69.248917 42.814938 -67.496796 48.857597 \n", + "\n", + " holiday_or_not number_of_steps visi vism \\\n", + "count 625134.000000 625134.000000 602975.000000 602975.000000 \n", + "mean 0.018710 7.545392 9.081615 14.622785 \n", + "std 0.135497 4.435818 1.931477 3.113787 \n", + "min 0.000000 2.000000 0.200000 0.400000 \n", + "25% 0.000000 5.000000 9.000000 14.500000 \n", + "50% 0.000000 6.000000 10.000000 16.100000 \n", + "75% 0.000000 9.000000 10.000000 16.100000 \n", + "max 1.000000 50.000000 10.000000 16.100000 \n", + "\n", + " fog rain snow hail thunder \\\n", + "count 625134.000000 625134.000000 625134.000000 625134.0 625134.0 \n", + "mean 0.006514 0.094818 0.024284 0.0 0.0 \n", + "std 0.111340 0.514837 0.280462 0.0 0.0 \n", + "min 0.000000 0.000000 0.000000 0.0 0.0 \n", + "25% 0.000000 0.000000 0.000000 0.0 0.0 \n", + "50% 0.000000 0.000000 0.000000 0.0 0.0 \n", + "75% 0.000000 0.000000 0.000000 0.0 0.0 \n", + "max 4.000000 7.000000 6.000000 0.0 0.0 \n", + "\n", + " tornado \n", + "count 625134.0 \n", + "mean 0.0 \n", + "std 0.0 \n", + "min 0.0 \n", + "25% 0.0 \n", + "50% 0.0 \n", + "75% 0.0 \n", + "max 0.0 " + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_test.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "xvUraj46DGNs", + "outputId": "1ad44957-b6f2-44dd-def9-b8d9925728bb" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "vendor_id 0\n", + "pickup_datetime 0\n", + "passenger_count 0\n", + "pickup_longitude 0\n", + "pickup_latitude 0\n", + "dropoff_longitude 0\n", + "dropoff_latitude 0\n", + "store_and_fwd_flag 0\n", + "pickup_datetime_temp 0\n", + "holiday_or_not 0\n", 
+ "number_of_steps 0\n", + "vism 22159\n", + "fog 0\n", + "rain 0\n", + "snow 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "df_test = df_test.drop('Unnamed: 0', axis=1)\n", + "df_test = df_test.drop('Unnamed: 0.1', axis=1)\n", + "df_test = df_test.drop('hail', axis=1)\n", + "df_test = df_test.drop('thunder', axis=1)\n", + "df_test = df_test.drop('tornado', axis=1)\n", + "df_test = df_test.drop('visi', axis=1)\n", + "# df = df.fillna(16.1)\n", + "missing_val_count_by_column = (df_test.isnull().sum())\n", + "print(missing_val_count_by_column)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "7_LUhtnwMMNB", + "outputId": "e86aac1f-a688-4157-bf48-6cbd75692b47" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0\n", + "vendor_id 0\n", + "pickup_datetime 0\n", + "passenger_count 0\n", + "pickup_longitude 0\n", + "pickup_latitude 0\n", + "dropoff_longitude 0\n", + "dropoff_latitude 0\n", + "store_and_fwd_flag 0\n", + "pickup_datetime_temp 0\n", + "holiday_or_not 0\n", + "number_of_steps 0\n", + "vism 0\n", + "fog 0\n", + "rain 0\n", + "snow 0\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "values = {'vism': 16.1}\n", + "df_test = df_test.fillna(value=values)\n", + "missing_val_count_by_column = (df_test.isnull().sum())\n", + "print(missing_val_count_by_column)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 320 + }, + "id": "kt-tZKzLMdFe", + "outputId": "2bfe76c7-7333-421d-e05a-d4d0ea475131" + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", 
+ "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>vendor_id</th>\n", + " <th>passenger_count</th>\n", + " <th>pickup_longitude</th>\n", + " <th>pickup_latitude</th>\n", + " <th>dropoff_longitude</th>\n", + " <th>dropoff_latitude</th>\n", + " <th>holiday_or_not</th>\n", + " <th>vism</th>\n", + " <th>fog</th>\n", + " <th>rain</th>\n", + " <th>snow</th>\n", + " <th>turns</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " <td>625134.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>1.534884</td>\n", + " <td>1.661765</td>\n", + " <td>-73.973614</td>\n", + " <td>40.750927</td>\n", + " <td>-73.973458</td>\n", + " <td>40.751816</td>\n", + " <td>0.018710</td>\n", + " <td>14.675148</td>\n", + " <td>0.006514</td>\n", + " <td>0.094818</td>\n", + " <td>0.024284</td>\n", + " <td>7.545392</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>0.498782</td>\n", + " <td>1.311293</td>\n", + " <td>0.073389</td>\n", + " <td>0.029848</td>\n", + " <td>0.072565</td>\n", + " <td>0.035824</td>\n", + " <td>0.135497</td>\n", + " <td>3.070276</td>\n", + " <td>0.111340</td>\n", + " <td>0.514837</td>\n", + " <td>0.280462</td>\n", + " <td>4.435818</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>1.000000</td>\n", + " <td>0.000000</td>\n", + " <td>-121.933128</td>\n", + " <td>37.389587</td>\n", + " <td>-121.933327</td>\n", + " <td>36.601322</td>\n", + " <td>0.000000</td>\n", + " <td>0.400000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " 
<td>2.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>1.000000</td>\n", + " <td>1.000000</td>\n", + " <td>-73.991852</td>\n", + " <td>40.737392</td>\n", + " <td>-73.991318</td>\n", + " <td>40.736000</td>\n", + " <td>0.000000</td>\n", + " <td>14.500000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>5.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>2.000000</td>\n", + " <td>1.000000</td>\n", + " <td>-73.981743</td>\n", + " <td>40.754093</td>\n", + " <td>-73.979774</td>\n", + " <td>40.754543</td>\n", + " <td>0.000000</td>\n", + " <td>16.100000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>6.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>2.000000</td>\n", + " <td>2.000000</td>\n", + " <td>-73.967400</td>\n", + " <td>40.768394</td>\n", + " <td>-73.963013</td>\n", + " <td>40.769852</td>\n", + " <td>0.000000</td>\n", + " <td>16.100000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>9.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>2.000000</td>\n", + " <td>9.000000</td>\n", + " <td>-69.248917</td>\n", + " <td>42.814938</td>\n", + " <td>-67.496796</td>\n", + " <td>48.857597</td>\n", + " <td>1.000000</td>\n", + " <td>16.100000</td>\n", + " <td>4.000000</td>\n", + " <td>7.000000</td>\n", + " <td>6.000000</td>\n", + " <td>50.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " vendor_id passenger_count pickup_longitude pickup_latitude \\\n", + "count 625134.000000 625134.000000 625134.000000 625134.000000 \n", + "mean 1.534884 1.661765 -73.973614 40.750927 \n", + "std 0.498782 1.311293 0.073389 0.029848 \n", + "min 1.000000 0.000000 -121.933128 37.389587 \n", + "25% 1.000000 1.000000 -73.991852 40.737392 \n", + "50% 2.000000 1.000000 -73.981743 40.754093 \n", + "75% 2.000000 2.000000 
# %% Rename `number_of_steps` to `turns` on the test frame and summarise it.
# Assigning a new column appends it, so `turns` ends up as the last column
# (a DataFrame.rename would leave it in the old position instead).
df_test['turns'] = df_test['number_of_steps']
df_test = df_test.drop(columns='number_of_steps')
df_test.describe()

# %% Build the test feature frame `Xt` from `df_test`.
# `id` is only needed for the submission file, not as a model input.
Xt = df_test.drop('id', axis=1)

# One-hot encode vendor_id into vendor_id_<value> columns, then replace the
# raw column with the encoded ones.
encoded_vendor_id = pd.get_dummies(Xt['vendor_id'], prefix='vendor_id')
Xt = Xt.drop('vendor_id', axis=1).join(encoded_vendor_id)

# One-hot encode store_and_fwd_flag into flag_<value> columns (the pruning
# cell further down refers to them as flag_Y / flag_N).
encoded_flag_id = pd.get_dummies(Xt['store_and_fwd_flag'], prefix='flag')
Xt = Xt.drop('store_and_fwd_flag', axis=1).join(encoded_flag_id)

# Absolute coordinate deltas between pickup and dropoff.
Xt['lat_diff'] = (Xt['pickup_latitude'] - Xt['dropoff_latitude']).abs()
Xt['long_diff'] = (Xt['pickup_longitude'] - Xt['dropoff_longitude']).abs()

# Distance features. The helpers are defined elsewhere in the notebook and are
# called with (pickup_lat, pickup_lon, dropoff_lat, dropoff_lon) NumPy arrays.
coords = tuple(
    Xt[col].to_numpy()
    for col in (
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    )
)
Xt['euclidean_dist'] = get_euclidean_dist(*coords)
Xt['haversine_dist'] = get_haversine_dist(*coords)
Xt['manhattan_dist'] = get_manhattan_distance(*coords)

# Parse the pickup timestamp; rows that do not match the format become NaT
# because of errors='coerce'.
Xt['pickup_datetime'] = pd.to_datetime(
    Xt['pickup_datetime'],
    format='%Y-%m-%d %H:%M:%S',
    errors='coerce',
)

# Calendar features derived from the pickup timestamp.
pickup_dt = Xt['pickup_datetime'].dt
Xt['pickup_day_of_the_week'] = pickup_dt.dayofweek
Xt['pickup_hour'] = pickup_dt.hour
Xt['pickup_month'] = pickup_dt.month
Xt['pickup_day_of_year'] = pickup_dt.dayofyear
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is the replacement. Fall back to the old accessor on
# pandas versions that predate `isocalendar`. The replacement yields a nullable
# UInt32 column; it is dropped again before scaling, so the dtype difference
# does not reach the model.
if hasattr(pickup_dt, 'isocalendar'):
    Xt['pickup_week_of_year'] = pickup_dt.isocalendar().week
else:
    Xt['pickup_week_of_year'] = pickup_dt.weekofyear

# The raw timestamp is no longer needed once the calendar features exist.
Xt = Xt.drop('pickup_datetime', axis=1)

# %% Sanity check: number of columns after feature engineering.
print(len(Xt.columns))

# %% Prune features that are not fed to the model.
Xt_temp = Xt.drop(
    columns=['flag_Y', 'flag_N', 'pickup_month', 'pickup_week_of_year']
)

# %% Sanity check: number of columns after pruning.
print(len(Xt_temp.columns))

# %% Drop the helper timestamp column.
# NOTE(review): `pickup_datetime_temp` is not created in this part of the
# notebook; presumably it was added to df_test in an earlier cell - confirm.
Xt_temp = Xt_temp.drop(columns='pickup_datetime_temp')

# %% Standardise the test features.
# NOTE(review): this fits a *new* StandardScaler on the test set. If `model`
# was trained on features scaled with a scaler fitted on the training data,
# that same fitted scaler should be reused here via `.transform(Xt_temp)` so
# train and test share one scale. The training cells are not visible from
# this section, so the behaviour is left unchanged - confirm and fix there.
from sklearn.preprocessing import StandardScaler
s = StandardScaler()
X_scaled = s.fit_transform(Xt_temp)

# %% Sanity check: (rows, features) of the scaled matrix.
print(X_scaled.shape)

# %% Predict on the scaled test features with the model fitted earlier.
yt = model.predict(X_scaled)
# %% Post-process the raw predictions.
# Negative predictions are mirrored to positive values via the absolute value.
# NOTE(review): clipping at zero may be the more natural choice for a
# duration - left as is to keep the submission unchanged.
ytfinal = np.abs(yt)
print(ytfinal)

# %% Write the submission file (id + predicted trip_duration).
# astype(int) truncates the fractional part of each prediction.
# NOTE(review): the file name looks inherited from another notebook; it is
# kept byte-for-byte so anything reading this path keeps working.
df_test['trip_duration'] = ytfinal.astype(int)
df_test.to_csv(
    'rf_nd_20est.csv',
    index=False,
    columns=['id', 'trip_duration'],
)

# %% Inspect the fitted coefficients, one per column of Xt_temp.
importance = model.coef_
for position, weight in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (position, weight))

# Horizontal bar chart of the coefficients, labelled with the feature names.
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
ax.barh(Xt_temp.columns, importance, color='maroon')
ax.set(
    ylabel="Features",
    xlabel="Importance",
    title="Feature Importance",
)
plt.show()
{ + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "FrCAAhDV78uc", + "outputId": "19fb8899-878c-44a3-aa70-bc6781833d43" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 282 + }, + "id": "6ErJA2PH6lub", + "outputId": "9f0bc0a5-11f3-457b-ace7-abde02cc6ec1" + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "id": "_9P7iZMGxMjK" + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "rf_new_data_20est.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +}