Commit 6bc2fe0d authored by Smit Gangurde's avatar Smit Gangurde

Added notebooks for preprocessing(resizing imgs)

parent 061123c6
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Dataset1.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "-jen3DITEVt2",
"outputId": "f74b2601-aca6-4b79-8042-a46066957545",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 86
}
},
"source": [
"#############Setup dataset#############\n",
"!pip install gdown --quiet\n",
"!gdown --id 1zTI002FEm0BcbXlUFpLs5zvurWGnx68v #id for dataset1.tar.gz\n",
"!tar -zxf dataset1.tar.gz\n",
"!rm dataset1.tar.gz\n",
"!rm -r sample_data #remove the default colab sample_data"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Downloading...\n",
"From: https://drive.google.com/uc?id=1zTI002FEm0BcbXlUFpLs5zvurWGnx68v\n",
"To: /content/dataset1.tar.gz\n",
"\r0.00B [00:00, ?B/s]\r19.4MB [00:00, 190MB/s]\r27.7MB [00:00, 169MB/s]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "wr6oah1TGi9R"
},
"source": [
"#some global variables\n",
"#Change any variable as required\n",
"dataset_dir = 'dataset1/'\n",
"train_dir = 'train/'\n",
"test_dir = 'test/'\n",
"train_csv = 'train.csv'\n",
"test_csv = 'test.csv'\n",
"###########################Image Variables###########################\n",
"std_ht = 224 #standard height -> height we want all images to have\n",
"std_wd = 224 #standard width -> width we want all images to have\n",
"#opencv image numpy array format -> (height, width, channels)"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "VzS-iaUkFqUJ",
"outputId": "b546a1f4-8f7d-45bf-f240-5b85a922b648",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
}
},
"source": [
"#Update versions if required\n",
"import cv2\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"print(\"opencv version: {}\".format(cv2.__version__))\n",
"print(\"numpy version: {}\".format(np.__version__))\n",
"print(\"pandas version: {}\".format(pd.__version__))"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"opencv version: 4.1.2\n",
"numpy version: 1.18.5\n",
"pandas version: 1.1.3\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "5P_niPzuHx8l"
},
"source": [
"#############Preprocessing#############\n",
"#Change the function as required\n",
"#Maybe if you are more comfortable with passing final image sizes\n",
"#as arguments, do that. Currently it is kept as a global variable.\n",
"#Or maybe you prefer, numpy array of 4 shapes instead of list of img numpy arrays\n",
"def preprocess_imgs(img_dir, csv_file, isTrain=False):\n",
"    '''\n",
"    Resizes the images to std_wd x std_ht (globals defined above).\n",
"    Order of iteration over images is the same as the csv file.\n",
"    Parameters:\n",
"        img_dir: sub-directory (under dataset_dir) holding the images\n",
"        csv_file: csv file (under dataset_dir) listing filenames in column 'Image'\n",
"        isTrain: when True, also collect the 'target' column as labels\n",
"    Returns (resized_imgs, labels) if isTrain=True,\n",
"    else just the list of resized imgs (numpy arrays).\n",
"    '''\n",
"    dir_path = dataset_dir + img_dir\n",
"    csv_df = pd.read_csv(dataset_dir + csv_file)\n",
"    labels = []\n",
"    resized_imgs = []\n",
"    for i in range(len(csv_df)):\n",
"        img_path = dir_path + csv_df['Image'][i]\n",
"        temp = cv2.imread(img_path)\n",
"        # cv2.imread returns None (it does not raise) for missing/corrupt files;\n",
"        # fail fast with a clear message instead of a cryptic cv2.resize error\n",
"        if temp is None:\n",
"            raise FileNotFoundError('Could not read image: {}'.format(img_path))\n",
"        # cv2.resize takes final size as (width, height)\n",
"        # Change cv2.INTER_CUBIC to any other interpolation, as required\n",
"        resized_imgs.append(cv2.resize(temp, dsize=(std_wd, std_ht), interpolation=cv2.INTER_CUBIC))\n",
"        if isTrain:\n",
"            labels.append(csv_df['target'][i])\n",
"    return (resized_imgs, labels) if isTrain else resized_imgs"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "nKEaLC9fH_Sd"
},
"source": [
"train_imgs, train_labels = preprocess_imgs(train_dir, train_csv, True)\n",
"test_imgs = preprocess_imgs(test_dir, test_csv, False)"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "-M60cZUviDvO",
"outputId": "bdd8586c-8c19-4b14-8df9-2c13bd9cefe2",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
}
},
"source": [
"#test\n",
"print(train_imgs[0].shape, train_labels[0])\n",
"#For further testing, you can save any img array as an image and check dimensions\n",
"#eg: cv2.imwrite('test.jpg', train_imgs[3])"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"(224, 224, 3) manipuri\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "gx9_F78BszSW"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
\ No newline at end of file
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "Dataset2.ipynb",
"provenance": [],
"collapsed_sections": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "code",
"metadata": {
"id": "LkfJiyyjjfwW",
"outputId": "0914f216-8efe-496f-abf7-30f2fb729d20",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 86
}
},
"source": [
"#############Setup dataset#############\n",
"!pip install gdown --quiet\n",
"!gdown --id 1I29xdtdJVd7FoT7Uyw1QHafKyRzsqPzK #id for dataset2.tar.gz\n",
"!tar -zxf dataset2.tar.gz\n",
"!rm dataset2.tar.gz\n",
"!rm -r sample_data #remove the default colab sample_data"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Downloading...\n",
"From: https://drive.google.com/uc?id=1I29xdtdJVd7FoT7Uyw1QHafKyRzsqPzK\n",
"To: /content/dataset2.tar.gz\n",
"235MB [00:01, 166MB/s]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "yRNAzxroj4hW"
},
"source": [
"#some global variables\n",
"#change as required\n",
"dataset_dir = 'dataset2/'\n",
"train_dir = 'train/'\n",
"test_dir = 'test/'\n",
"validation_dir = 'validation/'\n",
"train_csv = 'train.csv'\n",
"test_csv = 'test.csv'\n",
"###########################Image Variables###########################\n",
"std_ht = 224 #standard height -> height we want all images to have\n",
"std_wd = 224 #standard width -> width we want all images to have\n",
"#opencv image numpy array format -> (height, width, channels)"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "6ybstahmkxTB",
"outputId": "4ad3d21b-e4ad-4371-ce6a-d879aa167cd0",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
}
},
"source": [
"#Update versions as required\n",
"import os\n",
"import cv2\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"print(\"opencv version: {}\".format(cv2.__version__))\n",
"print(\"numpy version: {}\".format(np.__version__))\n",
"print(\"pandas version: {}\".format(pd.__version__))"
],
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"text": [
"opencv version: 4.1.2\n",
"numpy version: 1.18.5\n",
"pandas version: 1.1.3\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "C6xUzHwMlBD7"
},
"source": [
"#############Preprocessing#############\n",
"#Change the function as required\n",
"#Maybe if you are more comfortable with passing final image sizes\n",
"#as arguments, do that. Currently it is kept as a global variable.\n",
"#Or maybe you prefer, numpy array of 4 shapes instead of list of img numpy arrays\n",
"def preprocess_imgs(img_dir, isTrain=False):\n",
"    '''\n",
"    Resizes the images to std_wd x std_ht (globals defined above).\n",
"    Directory entries are visited in sorted order so the resulting image/label\n",
"    ordering is reproducible across runs (os.listdir order is arbitrary).\n",
"    Parameters:\n",
"        img_dir: sub-directory (under dataset_dir); when isTrain=True it must\n",
"                 contain one sub-directory per class label\n",
"        isTrain: when True, also collect the label (sub-directory name) per image\n",
"    Returns (resized_imgs, labels) if isTrain=True,\n",
"    else just the list of resized imgs (numpy arrays).\n",
"    '''\n",
"    def load_resized(path):\n",
"        # cv2.imread returns None (it does not raise) for missing/corrupt files;\n",
"        # fail fast with a clear message instead of a cryptic cv2.resize error\n",
"        temp = cv2.imread(path)\n",
"        if temp is None:\n",
"            raise FileNotFoundError('Could not read image: {}'.format(path))\n",
"        # cv2.resize takes final size as (width, height)\n",
"        # Change cv2.INTER_CUBIC to any other interpolation, as required\n",
"        return cv2.resize(temp, dsize=(std_wd, std_ht), interpolation=cv2.INTER_CUBIC)\n",
"    dir_path = dataset_dir + img_dir\n",
"    resized_imgs = []\n",
"    if isTrain:\n",
"        labels = []\n",
"        for label in sorted(os.listdir(dir_path)):\n",
"            for img in sorted(os.listdir(dir_path + label + '/')):\n",
"                resized_imgs.append(load_resized(dir_path + label + '/' + img))\n",
"                labels.append(label)\n",
"        return (resized_imgs, labels)\n",
"    else:\n",
"        for img in sorted(os.listdir(dir_path)):\n",
"            resized_imgs.append(load_resized(dir_path + img))\n",
"        return resized_imgs"
],
"execution_count": 4,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "Cn60kFfNqc2K"
},
"source": [
"train_imgs, train_labels = preprocess_imgs(train_dir, True)\n",
"dev_imgs, dev_labels = preprocess_imgs(validation_dir, True)\n",
"test_imgs = preprocess_imgs(test_dir, False)"
],
"execution_count": 5,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "hCQMlxehquo3",
"outputId": "5265b283-eda5-406b-99b3-0d15f1b1c1c6",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 69
}
},
"source": [
"#test\n",
"print(train_imgs[0].shape, train_labels[0], len(train_imgs), len(train_labels))\n",
"print(dev_imgs[0].shape, dev_labels[0], len(dev_imgs), len(dev_labels))\n",
"print(test_imgs[0].shape, len(test_imgs))\n",
"#For further validation you can save any img array as image and check the dimensions\n",
"#eg: cv2.imwrite('test.jpg', train_imgs[300])\n"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"(224, 224, 3) sattriya 5000 5000\n",
"(224, 224, 3) sattriya 364 364\n",
"(224, 224, 3) 156\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "03qTDNSesynZ"
},
"source": [
""
],
"execution_count": null,
"outputs": []
}
]
}
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment