Commit c0774f4f authored by Ankush's avatar Ankush

DLNLP assignment addition

parent 8a7e507f
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# reading dataset\n",
"train_data = pd.read_csv(\"train.csv\", index_col=0)#.iloc[:100,:]\n",
"test_data = pd.read_csv(\"test.csv\", index_col=0)#.iloc[:1000,:]\n",
"\n",
"# separate reviews, ratings\n",
"train_reviews = train_data.iloc[:, :-1]\n",
"train_ratings = train_data.iloc[:, -1]\n",
"train_ratings = pd.get_dummies(train_ratings)\n",
"\n",
"\n",
"batch_size, epochs = 32, 10 \n",
"tokenizer = Tokenizer(oov_token = '<oov>')\n",
"\n",
"# pre-process data\n",
"train_reviews, tokenizer = preprocess_data(tokenizer, train_data, preprocessing_training_data=True)\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_data, maxlen=train_reviews.shape[1])\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Reviews Shape: (50000, 31)\n",
"Ratings Shape: (50000, 5)\n",
"Epoch 1/10\n",
"1563/1563 [==============================] - 8s 5ms/step - loss: 317.8222 - accuracy: 0.4365\n",
"Epoch 2/10\n",
"1563/1563 [==============================] - 10s 6ms/step - loss: 9.7619 - accuracy: 0.5061\n",
"Epoch 3/10\n",
"1563/1563 [==============================] - 9s 6ms/step - loss: 6.6692 - accuracy: 0.5140\n",
"Epoch 4/10\n",
"1563/1563 [==============================] - 10s 6ms/step - loss: 6.4623 - accuracy: 0.5139\n",
"Epoch 5/10\n",
"1563/1563 [==============================] - 9s 6ms/step - loss: 6.3933 - accuracy: 0.5150\n",
"Epoch 6/10\n",
"1563/1563 [==============================] - 9s 6ms/step - loss: 6.2997 - accuracy: 0.5213\n",
"Epoch 7/10\n",
"1563/1563 [==============================] - 8s 5ms/step - loss: 6.2729 - accuracy: 0.5209\n",
"Epoch 8/10\n",
"1563/1563 [==============================] - 7s 4ms/step - loss: 6.3800 - accuracy: 0.5188\n",
"Epoch 9/10\n",
"1563/1563 [==============================] - 8s 5ms/step - loss: 6.3796 - accuracy: 0.5210\n",
"Epoch 10/10\n",
"1563/1563 [==============================] - 9s 6ms/step - loss: 6.3761 - accuracy: 0.5205\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Activation\n",
"from sklearn import datasets, model_selection, metrics\n",
"import keras\n",
"stopword = stopwords.words('english')\n",
"\n",
"'''\n",
"About the task:\n",
"\n",
"You are provided with a codeflow- which consists of functions to be implemented(MANDATORY).\n",
"\n",
"You need to implement each of the functions mentioned below, you may add your own function parameters if needed(not to main).\n",
"Execute your code using the provided auto.py script(NO EDITS PERMITTED) as your code will be evaluated using an auto-grader.\n",
"'''\n",
"\n",
"def encode_data(tokenizer, text, tokens, preprocessing_training_data = False):\n",
" # This function will be used to encode the reviews using a dictionary (created using corpus vocabulary) \n",
"\n",
" # Example of encoding: \"The food was fabulous but pricey\" has a vocabulary of 6 words, each one has to be mapped to an integer like: \n",
" # {'The':1,'food':2,'was':3,'fabulous':4,'but':5,'pricey':6}; this vocabulary has to be created for the entire corpus and then be used to \n",
" # encode the words into integers \n",
"\n",
" # return encoded examples\n",
" if preprocessing_training_data:\n",
" tokenizer = Tokenizer(oov_token = '<oov>')\n",
" tokenizer.fit_on_texts(tokens)\n",
"\n",
" sequences = tokenizer.texts_to_sequences(text)\n",
" return sequences, tokenizer\n",
"\n",
"def convert_to_lower(text):\n",
" # return the reviews after converting them to lowercase\n",
" lower_text = text.lower()\n",
" return lower_text\n",
"\n",
"def perform_tokenization(text):\n",
" # return the reviews after performing tokenization\n",
" token=nltk.word_tokenize(text)\n",
" return token\n",
"\n",
"def remove_stopwords(text):\n",
"\t# return the reviews after removing the stopwords\n",
"\tremoving_stopwords=[word for word in text if word not in stopword]\n",
"\treturn removing_stopwords\n",
"\t#print(removing_stopwords)\n",
"\n",
"def remove_punctuation(text):\n",
" # return the reviews after removing punctuations\n",
" removing_punctuation = [word for word in text if word.isalpha()]\n",
" return removing_punctuation\n",
"\n",
"def perform_padding(data, maxlen):\n",
" # return the reviews after padding the reviews to maximum length\n",
"\tpadded_data = pad_sequences(data, maxlen=maxlen, padding='post')\n",
"\treturn padded_data\n",
"\n",
"def preprocess_data(tokenizer, data, preprocessing_training_data=False, maxlen=None):\n",
" # make all the following function calls on your data\n",
" # EXAMPLE:->\n",
" '''\n",
" review = data[\"reviews\"]\n",
" review = convert_to_lower(review)\n",
" review = remove_punctuation(review)\n",
" review = remove_stopwords(review)\n",
" review = perform_tokenization(review)\n",
" review = encode_data(review)\n",
" review = perform_padding(review)\n",
" '''\n",
" # return processed data\n",
"\n",
" reviews = data[\"reviews\"]\n",
" list_of_reviews = list(reviews)\n",
" string_of_reviews = ' '.join(str(e) for e in list_of_reviews)\n",
"\n",
" lower_text = convert_to_lower(string_of_reviews)\n",
" # print(lower_text, end='\\n\\n\\n\\n')\n",
" tokens = perform_tokenization(lower_text)\n",
"# print(tokens, end='\\n\\n\\n\\n')\n",
" tokens = remove_stopwords(tokens)\n",
"# print(tokens, end='\\n\\n\\n\\n')\n",
" tokens = remove_punctuation(tokens)\n",
"# print(tokens, end='\\n\\n\\n\\n')\n",
" encoded_data, tokenizer = encode_data(tokenizer, reviews, tokens, preprocessing_training_data)\n",
"# print(encoded_data, end='\\n\\n\\n\\n')\n",
" reviews = perform_padding(encoded_data, maxlen)\n",
"# print(review, end='\\n\\n\\n\\n')\n",
"\n",
" # TODO: Use word embedding for better results \n",
" return reviews, tokenizer\n",
"\n",
"\n",
"# TODO: Change this function to support tensors\n",
"def softmax_activation(x):\n",
" # write your own implementation from scratch and return softmax values(using predefined softmax is prohibited)\n",
"# print(type(x))\n",
"# print(x)\n",
"# print(type(x))\n",
"# Exponent_calculation = tf.exp(x)\n",
"# print(type(Exponent_calculation))\n",
"# Normalization = tf.reduce_sum(Exponent_calculation)\n",
"# print(type(Normalization))\n",
"\n",
"\n",
" \n",
"# return tf.math.divide_no_nan(z,z2)\n",
"\n",
" from tensorflow.python.ops import math_ops\n",
"# tf.print(\"x\")\n",
"# tf.print(x)\n",
" e = math_ops.exp(x - math_ops.reduce_max(x, keepdims=True))\n",
"# tf.print(\"e\")\n",
"# tf.print(e)\n",
" s = math_ops.reduce_sum(e, keepdims=True)\n",
"# tf.print(\"s\")\n",
"# tf.print(s)\n",
" output = tf.math.divide_no_nan(e, s)\n",
"# tf.print(\"output\")\n",
"# tf.print(output)\n",
" epsilon = tf.constant(1e-16)\n",
" output = output + epsilon\n",
"# tf.print(\"output total\")\n",
"# tf.print(math_ops.reduce_sum(output))\n",
" \n",
" \n",
"# if rank == 2:\n",
"# output = tf.nn.softmax(x)\n",
"# elif rank > 2:\n",
"# e = math_ops.exp(x - math_ops.reduce_max(x, axis=axis, keepdims=True))\n",
"# s = math_ops.reduce_sum(e, axis=axis, keepdims=True)\n",
"# output = e / s\n",
"# else:\n",
"# raise ValueError('Cannot apply softmax to a tensor that is 1D. '\n",
"# 'Received input: %s' % (x,))\n",
"\n",
" # Cache the logits to use for crossentropy loss.\n",
" output._keras_logits = x # pylint: disable=protected-access\n",
"# return output\n",
" \n",
"# y1 = keras.backend.sigmoid(x)\n",
"# return exp\n",
" return output\n",
"# return Exponent_calculation/Normalization\n",
"\n",
"\n",
"class NeuralNet:\n",
"\n",
" def __init__(self, reviews, ratings):\n",
" self.reviews = reviews\n",
" self.ratings = ratings\n",
"\n",
" def build_nn(self):\n",
" #add the input and output layer here; you can use either tensorflow or pytorch\n",
" self.model = Sequential()\n",
" print(\"Reviews Shape: \", self.reviews.shape)\n",
" print(\"Ratings Shape: \", self.ratings.shape)\n",
" self.model.add(Dense(5, input_shape=(self.reviews.shape[1], )))\n",
" self.model.add(Activation('softmax'))\n",
" # TODO: Add our softmax function \n",
"# self.model.add(Activation(softmax_activation, name='Softmax'))\n",
" self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
" \n",
" def train_nn(self,batch_size,epochs):\n",
" # write the training loop here; you can use either tensorflow or pytorch\n",
" # print validation accuracy\n",
" self.model.fit(self.reviews, self.ratings, epochs=epochs, batch_size=batch_size, verbose=1)\n",
"\n",
" def predict(self, reviews):\n",
" # return a list containing all the ratings predicted by the trained model\n",
" predicted = self.model.predict(reviews)\n",
" return predicted\n",
"\n",
"\n",
"# TODO: Cross-validate\n",
"\n",
"# Build model\n",
"model = NeuralNet(train_reviews, train_ratings)\n",
"model.build_nn()\n",
"model.train_nn(batch_size, epochs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# predict\n",
"# test_predictions = model.predict(test_reviews)\n",
"# train_predictions = model.predict(train_reviews)\n",
"# \n",
"# # get ratings from probabilities\n",
"# train_predictions = np.argmax(train_predictions, axis=1)\n",
"# test_predictions = np.argmax(test_predictions, axis=1)\n",
"# train_ratings = np.argmax(np.array(train_ratings), axis=1)\n",
"\n",
"# # report generation on training data\n",
"# print(f\"Classification report:\\n{metrics.classification_report(train_ratings, train_predictions)}\\n\")\n",
"# metrics.confusion_matrix(train_ratings, train_predictions)\n",
"\n",
"# # reading gold-test data\n",
"# test_data = pd.read_csv(\"gold_test.csv\", index_col=0)\n",
"\n",
"# # separate reviews, ratings\n",
"# test_reviews = test_data.iloc[:, :-1]\n",
"# test_ratings = test_data.iloc[:, -1]\n",
"# test_ratings = pd.get_dummies(test_ratings)\n",
"\n",
"# # pre-process data\n",
"# test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
"\n",
"# # predict\n",
"# test_predictions = model.predict(test_reviews)\n",
"\n",
"# # get ratings from probabilities\n",
"# test_predictions = np.argmax(test_predictions, axis=1) + 1\n",
"# test_ratings = np.argmax(np.array(test_ratings), axis=1) + 1\n",
"\n",
"# # report generation on training data\n",
"# print(f\"Classification report:\\n{metrics.classification_report(test_ratings, test_predictions)}\\n\")\n",
"# metrics.confusion_matrix(test_ratings, test_predictions)\n",
"\n",
"# # Test input\n",
"# Test = ['this is bad', 'wow nice!', 'this is a great product']\n",
"# test_reviews = pd.DataFrame(Test, columns=['reviews'])\n",
"\n",
"# # pre-process data\n",
"# test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
"\n",
"# # predict\n",
"# test_predictions = model.predict(test_reviews)\n",
"# test_ratings = np.argmax(np.array(test_predictions), axis=1) + 1\n",
"\t\n",
"# # show probabilities and ratings\n",
"# print(test_predictions)\n",
"# print(test_ratings)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"WARNING:tensorflow:From /home/rohit/.local/lib/python3.8/site-packages/tensorflow/python/ops/resource_variable_ops.py:1813: calling BaseResourceVariable.__init__ (from tensorflow.python.ops.resource_variable_ops) with constraint is deprecated and will be removed in a future version.\n",
"Instructions for updating:\n",
"If using Keras pass *_constraint arguments to layers.\n",
"INFO:tensorflow:Assets written to: mymodel/assets\n"
]
}
],
"source": [
"model.model.save(\"mymodel\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"model = tf.keras.models.load_model(\"mymodel\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'tokenizer' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-5b8d9f44ad52>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mtest_ratings\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_dummies\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_ratings\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mtest_reviews\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenizer\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpreprocess_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtokenizer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtest_reviews\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmaxlen\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtrain_reviews\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mtest_predictions\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtest_reviews\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'tokenizer' is not defined"
]
}
],
"source": [
"test_data = pd.read_csv(\"gold_test.csv\", index_col=0)\n",
"\n",
"test_reviews = test_data.iloc[:, :-1]\n",
"test_ratings = test_data.iloc[:, -1]\n",
"test_ratings = pd.get_dummies(test_ratings)\n",
"\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
"\n",
"test_predictions = model.predict(test_reviews)\n",
"\n",
"test_predictions = np.argmax(test_predictions, axis=1) + 1\n",
"test_ratings = np.argmax(np.array(test_ratings), axis=1) + 1\n",
"\n",
"print(f\"Classification report:\\n{metrics.classification_report(test_ratings, test_predictions)}\\n\")\n",
"metrics.confusion_matrix(test_ratings, test_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Matplotlib is building the font cache; this may take a moment.\n"
]
},
{
"data": {
"text/plain": [
"<AxesSubplot:>"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 720x504 with 2 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"import seaborn as sn\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"array = metrics.confusion_matrix(test_ratings, test_predictions)\n",
"df_cm = pd.DataFrame(array, index = [i for i in range(1, 6)],\n",
" columns = [i for i in range(1, 6)])\n",
"plt.figure(figsize = (10,7))\n",
"sn.heatmap(df_cm, annot=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"def predict_rating(text):\n",
" test_reviews = pd.DataFrame([text], columns=['reviews'])\n",
"\n",
" test_reviews, _ = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
"\n",
" test_predictions = model.predict(test_reviews)\n",
" test_ratings = np.argmax(np.array(test_predictions), axis=1) + 1\n",
" \n",
" import tabulate\n",
" \n",
" str = f\"\\nPredicted Rating: {test_ratings}\\n\\n\\n\"\n",
" str += tabulate.tabulate(test_predictions, headers=[\"Rating-1\", \"Rating-2\", \"Rating-3\", \"Rating-4\", \"Rating-5\"])\n",
" return str"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"import tkinter as tk\n",
" \n",
"# Top level window \n",
"frame = tk.Tk() \n",
"frame.title(\"Rate reviews\") \n",
"frame.geometry('600x600') \n",
"\n",
"# Function for getting Input from textbox and printing it at label widget \n",
"def printInput(): \n",
" inp = inputtxt.get(1.0, \"end-1c\") \n",
" output = predict_rating(inp)\n",
" lbl.config(text = output) \n",
"\n",
"# TextBox Creation \n",
"inputtxt = tk.Text(frame, \n",
" height = 10, \n",
" width = 40, \n",
" font=(\"Courier\", 18)) \n",
" \n",
"inputtxt.pack() \n",
" \n",
"# Button Creation \n",
"printButton = tk.Button(frame, \n",
" text = \"Print\", \n",
" command = printInput, \n",
" font=(\"monospace\", 14)) \n",
"printButton.pack() \n",
" \n",
"# Label Creation \n",
"lbl = tk.Label(frame, text = \"\") \n",
"lbl.pack() \n",
"frame.mainloop() "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Instructions to run the code file for Assignment-2
1. Open code.ipynb file.
2. Run the cells in the file one by one.
3. Run the cell "To access pre-trained embeddings and dataset (train and test)", which downloads the pre-trained word embeddings and datasets from our Google Drive folder. To get access, please mail your ID to us; we will share the folder with you so that all embeddings (glove.6B.100d.txt, glove.6B.200d.txt, glove.6B.300d.txt, wiki-news-300d-1M.vec, and GoogleNews-vectors-negative300.bin) and datasets (train.csv, test.csv, gold_test.csv) can be accessed by you.
4. Run the Header files section.
5. Run the Pre-processing function section.
6. Run the Function for Neural Network section.
7. Run the "Required for word embedding" section.
8. There are 3 embedding sections in the file: GloVe, word2vec and fastText. All of these embeddings are in the Word Embeddings section.
9. Run any one of the embedding sections.
10. Run the section Train Neural Network to get train accuracy.
11. Run the Test accuracy model to get the gold_test data accuracy.
12. In summary: first run the section with headers, then the section with data preprocessing, then the section that initialises the Neural Network, then the required embedding (GloVe, word2vec or fastText), and then train the Neural Network. The train accuracy will be displayed; after that, run the test model.
13. To run the imbalanced data handling techniques (like Undersampling Technique-1 etc.), first run the Functions section, then run the Set Hyperparameter section (you can change the hyperparameters here), then run any of the following sections to use the corresponding technique. By default the 300d GloVe word embedding is used, i.e. embed==1; to change this, go to the Functions section under Data Imbalanced Handling and change the initialisation of embed. If embed==2, the fastText word embedding is used; any integer other than 1 and 2 selects the word2vec pre-trained embeddings.
Steps 1, 2, 3, 4, 5, 6 are necessary to execute step-12.
We have also attached a file containing the GUI (GUI.ipynb), from which the GUI can be run. code.ipynb contains the embeddings, whereas GUI.ipynb only provides the GUI, which takes input text and gives output with probabilities.
In this Python notebook, the last 2 cells should be run to launch the GUI. There will be one textbox where the user enters text; pressing the “Print” button then shows the predicted rating and the probabilities of each rating.
\ No newline at end of file
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "code.ipynb",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "jN5i-YL1dNqz"
},
"source": [
"#### Install python packages"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "n-6Y9V3GOPQq",
"outputId": "94cc23d1-dfc9-4339-ccb5-a7efa6c19d72"
},
"source": [
"!pip install PyDrive\n",
"!pip install --upgrade keras"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Requirement already satisfied: PyDrive in /usr/local/lib/python3.7/dist-packages (1.3.1)\n",
"Requirement already satisfied: google-api-python-client>=1.2 in /usr/local/lib/python3.7/dist-packages (from PyDrive) (1.12.8)\n",
"Requirement already satisfied: oauth2client>=4.0.0 in /usr/local/lib/python3.7/dist-packages (from PyDrive) (4.1.3)\n",
"Requirement already satisfied: PyYAML>=3.0 in /usr/local/lib/python3.7/dist-packages (from PyDrive) (3.13)\n",
"Requirement already satisfied: google-auth-httplib2>=0.0.3 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client>=1.2->PyDrive) (0.0.4)\n",
"Requirement already satisfied: uritemplate<4dev,>=3.0.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client>=1.2->PyDrive) (3.0.1)\n",
"Requirement already satisfied: google-auth>=1.16.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client>=1.2->PyDrive) (1.27.1)\n",
"Requirement already satisfied: httplib2<1dev,>=0.15.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client>=1.2->PyDrive) (0.17.4)\n",
"Requirement already satisfied: google-api-core<2dev,>=1.21.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client>=1.2->PyDrive) (1.26.1)\n",
"Requirement already satisfied: six<2dev,>=1.13.0 in /usr/local/lib/python3.7/dist-packages (from google-api-python-client>=1.2->PyDrive) (1.15.0)\n",
"Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.7/dist-packages (from oauth2client>=4.0.0->PyDrive) (0.4.8)\n",
"Requirement already satisfied: rsa>=3.1.4 in /usr/local/lib/python3.7/dist-packages (from oauth2client>=4.0.0->PyDrive) (4.7.2)\n",
"Requirement already satisfied: pyasn1-modules>=0.0.5 in /usr/local/lib/python3.7/dist-packages (from oauth2client>=4.0.0->PyDrive) (0.2.8)\n",
"Requirement already satisfied: cachetools<5.0,>=2.0.0 in /usr/local/lib/python3.7/dist-packages (from google-auth>=1.16.0->google-api-python-client>=1.2->PyDrive) (4.2.1)\n",
"Requirement already satisfied: setuptools>=40.3.0 in /usr/local/lib/python3.7/dist-packages (from google-auth>=1.16.0->google-api-python-client>=1.2->PyDrive) (54.0.0)\n",
"Requirement already satisfied: requests<3.0.0dev,>=2.18.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (2.23.0)\n",
"Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.6.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (1.53.0)\n",
"Requirement already satisfied: protobuf>=3.12.0 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (3.12.4)\n",
"Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (2018.9)\n",
"Requirement already satisfied: packaging>=14.3 in /usr/local/lib/python3.7/dist-packages (from google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (20.9)\n",
"Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (2.10)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (1.24.3)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (3.0.4)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests<3.0.0dev,>=2.18.0->google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (2020.12.5)\n",
"Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.7/dist-packages (from packaging>=14.3->google-api-core<2dev,>=1.21.0->google-api-python-client>=1.2->PyDrive) (2.4.7)\n",
"Requirement already up-to-date: keras in /usr/local/lib/python3.7/dist-packages (2.4.3)\n",
"Requirement already satisfied, skipping upgrade: pyyaml in /usr/local/lib/python3.7/dist-packages (from keras) (3.13)\n",
"Requirement already satisfied, skipping upgrade: numpy>=1.9.1 in /usr/local/lib/python3.7/dist-packages (from keras) (1.19.5)\n",
"Requirement already satisfied, skipping upgrade: h5py in /usr/local/lib/python3.7/dist-packages (from keras) (2.10.0)\n",
"Requirement already satisfied, skipping upgrade: scipy>=0.14 in /usr/local/lib/python3.7/dist-packages (from keras) (1.4.1)\n",
"Requirement already satisfied, skipping upgrade: six in /usr/local/lib/python3.7/dist-packages (from h5py->keras) (1.15.0)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nuyriDkqT_Sz"
},
"source": [
"#### To access pre-trained embeddings and dataset (train and test)"
]
},
{
"cell_type": "code",
"metadata": {
"id": "f068QkbOOpUC"
},
"source": [
"from pydrive.auth import GoogleAuth\n",
"from pydrive.drive import GoogleDrive\n",
"from google.colab import auth\n",
"from oauth2client.client import GoogleCredentials\n",
"\n",
"auth.authenticate_user()\n",
"gauth = GoogleAuth()\n",
"gauth.credentials = GoogleCredentials.get_application_default()\n",
"drive = GoogleDrive(gauth)\n",
"\n",
"downloaded = drive.CreateFile({'id':'1GH7dGh9ftQikz8KIDedxBooBzX_qw0zW'}) # replace the id with id of file you want to access\n",
"downloaded.GetContentFile('glove.6B.100d.txt') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'1gVExcJs31_mHWFGNvH3YpXBnS-A_XurA'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('train.csv') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'13Nc1ZZaJD7_kBup22ScZYXmxAYNg1TOT'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('test.csv') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'1xyw2FyO1RTOAK-h7V3PYF_Sxmn8ho5jh'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('gold_test.csv') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'1daXpP2rI4YDUHwl4I-kDolS9JfoTY_r4'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('glove.6B.200d.txt') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'15IB-nj1e-E6B9PRpJslkB1RMBEtiCA6l'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('glove.6B.300d.txt') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'1rH_EvmtKprvULcKj5ARNI3r3p6K9EGwO'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('wiki-news-300d-1M.vec') # replace the file name with your file\n",
"\n",
"download = drive.CreateFile({'id':'1mPXuUDiPAPid2nGjnhUQIzn8Qz-nHp6V'}) # replace the id with id of file you want to access\n",
"download.GetContentFile('GoogleNews-vectors-negative300.bin') # replace the file name with your file"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "fEC0XB9cUQfA"
},
"source": [
"#### Header files"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "r5NmV1ciOT0h",
"outputId": "dea55388-b535-4937-b479-417352850317"
},
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import string\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.text import one_hot\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten\n",
"from sklearn import datasets, model_selection, metrics\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.initializers import Constant\n",
"from nltk.tokenize import word_tokenize\n",
"from sklearn.model_selection import train_test_split \n",
"nltk.download('stopwords')\n",
"stopword = stopwords.words('english') \n",
"nltk.download('punkt')"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Unzipping corpora/stopwords.zip.\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Unzipping tokenizers/punkt.zip.\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 6
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1vS9WexsUVPZ"
},
"source": [
"#### Preprocessing function"
]
},
{
"cell_type": "code",
"metadata": {
"id": "wsu00ZJwPCb5"
},
"source": [
"def encode_data(tokenizer, text, tokens, preprocessing_training_data = False):\n",
" # This function will be used to encode the reviews using a dictionary (created using corpus vocabulary) \n",
"\n",
" # Example of encoding: \"The food was fabulous but pricey\" has a vocabulary of 6 words, each one has to be mapped to an integer like: \n",
" # {'The':1,'food':2,'was':3,'fabulous':4,'but':5,'pricey':6}; this vocabulary has to be created for the entire corpus and then be used to \n",
" # encode the words into integers \n",
"\n",
" # return encoded examples\n",
" if preprocessing_training_data:\n",
" tokenizer = Tokenizer(oov_token = '<oov>')\n",
" tokenizer.fit_on_texts(tokens)\n",
"\n",
" sequences = tokenizer.texts_to_sequences(text)\n",
" return sequences, tokenizer\n",
"\n",
"def convert_to_lower(text):\n",
" # return the reviews after converting them to lowercase\n",
" lower_text = text.lower()\n",
" return lower_text\n",
"\n",
"def perform_tokenization(text):\n",
" # return the reviews after performing tokenization\n",
" token=nltk.word_tokenize(text)\n",
" return token\n",
"\n",
"def remove_stopwords(text):\n",
"\t# return the reviews after removing the stopwords\n",
" stopword = [] # not any stopword\n",
" removing_stopwords=[word for word in text if word not in stopword]\n",
" return removing_stopwords\n",
"\t#print(removing_stopwords)\n",
"\n",
"def remove_punctuation(text):\n",
" # return the reviews after removing punctuations\n",
" removing_punctuation = [word for word in text if word.isalpha()]\n",
" return removing_punctuation\n",
"\n",
"def perform_padding(data, maxlen):\n",
" # return the reviews after padding the reviews to maximum length\n",
"\tpadded_data = pad_sequences(data, maxlen=maxlen, padding='post')\n",
"\treturn padded_data\n",
"\n",
"def preprocess_data(tokenizer, data, preprocessing_training_data=False, maxlen=None):\n",
" # make all the following function calls on your data\n",
" # EXAMPLE:->\n",
" '''\n",
" review = data[\"reviews\"]\n",
" review = convert_to_lower(review)\n",
" review = remove_punctuation(review)\n",
" review = remove_stopwords(review)\n",
" review = perform_tokenization(review)\n",
" review = encode_data(review)\n",
" review = perform_padding(review)\n",
" '''\n",
" # return processed data\n",
"\n",
" reviews = data[\"reviews\"]\n",
" list_of_reviews = list(reviews)\n",
" string_of_reviews = ' '.join(str(e) for e in list_of_reviews)\n",
"\n",
" lower_text = convert_to_lower(string_of_reviews)\n",
" # print(lower_text, end='\\n\\n\\n\\n')\n",
" tokens = perform_tokenization(lower_text)\n",
" #print(tokens, end='\\n\\n\\n\\n')\n",
" tokens = remove_stopwords(tokens)\n",
"# print(tokens, end='\\n\\n\\n\\n')\n",
" tokens = remove_punctuation(tokens)\n",
"# print(tokens, end='\\n\\n\\n\\n')\n",
" encoded_data, tokenizer = encode_data(tokenizer, reviews, tokens, preprocessing_training_data)\n",
"# print(encoded_data, end='\\n\\n\\n\\n')\n",
" reviews = perform_padding(encoded_data, maxlen)\n",
"# print(review, end='\\n\\n\\n\\n')\n",
"\n",
" # TODO: Use word embedding for better results \n",
" return pd.DataFrame(reviews), tokenizer"
],
"execution_count": 2,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "NFdxrcfdUldJ"
},
"source": [
"#### Function for Neural Network"
]
},
{
"cell_type": "code",
"metadata": {
"id": "yTu0_OclPIol"
},
"source": [
"def softmax_activation(x):\n",
" # write your own implementation from scratch and return softmax values (using a predefined softmax is prohibited)\n",
" Exponent_calculation=tf.exp(x-tf.reduce_max(x,axis=-1,keepdims=True))\n",
" Normalization=tf.reduce_sum(Exponent_calculation,axis=-1,keepdims=True)\n",
" return Exponent_calculation/Normalization\n",
"\n",
"\n",
"class NeuralNet:\n",
"\n",
" def __init__(self, reviews, ratings,e):\n",
" self.reviews = reviews\n",
" self.ratings = ratings\n",
" self.e = e\n",
" def build_nn(self):\n",
" #add the input and output layer here; you can use either tensorflow or pytorch\n",
" self.model = Sequential()\n",
" print(\"Reviews Shape: \", self.reviews.shape)\n",
" print(\"Ratings Shape: \", self.ratings.shape)\n",
" self.model.add(e)\n",
" self.model.add(Flatten())\n",
" #self.model.add(Dense(5 , activation=\"relu\" ))\n",
" #self.model.add(Dense(32, activation='sigmoid'))\n",
" self.model.add(Dense(5))\n",
" self.model.add(Activation(softmax_activation, name='Softmax'))\n",
"\n",
" self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
" \n",
" def train_nn(self,batch_size,epochs, ):\n",
" # write the training loop here; you can use either tensorflow or pytorch\n",
" # print validation accuracy\n",
" self.model.fit(self.reviews, self.ratings, epochs=epochs, batch_size=batch_size, verbose=1)\n",
"\n",
" def predict(self, reviews):\n",
" # return a list containing all the ratings predicted by the trained model\n",
" predicted = self.model.predict(reviews)\n",
" return predicted"
],
"execution_count": 3,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "IZctTDz-gqok"
},
"source": [
"#### Required for word embedding"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 229
},
"id": "iISfyYVQgktl",
"outputId": "721296e8-1dfd-45d6-9d85-cc41b9b0133b"
},
"source": [
"# reading dataset\n",
"train_data = pd.read_csv(\"train.csv\", index_col=0)\n",
"test_data = pd.read_csv(\"test.csv\", index_col=0)\n",
"\n",
"\n",
"# separate reviews, ratings\n",
"train_reviews = train_data.iloc[:, :-1]\n",
"train_ratings = train_data.iloc[:, -1]\n",
"train_ratings = pd.get_dummies(train_ratings)\n",
" \n",
"tokenizer = Tokenizer(oov_token = '<oov>')\n",
"\n",
"# pre-process data\n",
"train_reviews, tokenizer = preprocess_data(tokenizer, train_data, preprocessing_training_data=True)\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_data, maxlen=train_reviews.shape[1])\n",
"#print(len(train_reviews))\n",
"#print(len(test_reviews))\n",
"maxlen=train_reviews.shape[1]\n",
"vocab_size = len(tokenizer.word_index) + 1\n",
"print(vocab_size)"
],
"execution_count": 4,
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-53c4bc8326ea>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m# reading dataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtrain_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"train.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mtest_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"test.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MaNBH_r4UcRE"
},
"source": [
"### Handling Imbalanced Data\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cXdVILD7b9gr"
},
"source": [
"#### Functions"
]
},
{
"cell_type": "code",
"metadata": {
"id": "IgQH8sYKPPzr",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 229
},
"outputId": "0cf1242b-78e5-4d4c-a380-1c3624cb45a9"
},
"source": [
"class NeuralNet:\n",
"\n",
" def __init__(self, reviews, ratings,e):\n",
" self.reviews = reviews\n",
" self.ratings = ratings\n",
" self.e = e\n",
" def build_nn(self):\n",
" #add the input and output layer here; you can use either tensorflow or pytorch\n",
" self.model = Sequential()\n",
" print(\"Reviews Shape: \", self.reviews.shape)\n",
" print(\"Ratings Shape: \", self.ratings.shape)\n",
" print(\"Columns: \", self.ratings.columns)\n",
" self.model.add(e)\n",
" self.model.add(Flatten())\n",
" # self.model.add(Dense(30, activation='sigmoid', kernel_regularizer=tf.keras.regularizers.l2(l=0.1)))\n",
" self.model.add(Dense(5))\n",
" # self.model.add(Dense(5 , input_shape=(self.reviews.shape[1], )))\n",
" #self.model.add(Dense(5 , activation=\"relu\"))\n",
" self.model.add(Activation(softmax_activation, name='Softmax'))\n",
" # TODO: Add our softmax function \n",
"\n",
" self.model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])\n",
" \n",
" def train_nn(self,batch_size,epochs, cv_reviews, cv_ratings):\t\n",
"\t # write the training loop here; you can use either tensorflow or pytorch\t\n",
"\t # print validation accuracy\t\n",
"\t self.model.fit(self.reviews, self.ratings, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(cv_reviews, cv_ratings))\t\n",
"\n",
" def predict(self, reviews):\n",
" # return a list containing all the ratings predicted by the trained model\n",
" predicted = self.model.predict(reviews)\n",
" return predicted\n",
"\n",
"\n",
"# build the Embedding layer `e` first\n",
"def get_e(tokenizer, maxlen, embed=1):\n",
" if embed==1: #glove\n",
" # load the whole embedding into memory\n",
" embeddings_index = dict()\n",
" f = open('glove.6B.300d.txt')\n",
" for line in f:\n",
" values = line.split()\n",
" word = values[0]\n",
" coefs = np.asarray(values[1:], dtype='float32')\n",
" embeddings_index[word] = coefs\n",
" f.close()\n",
" print('Loaded %s word vectors.' % len(embeddings_index))\n",
"\n",
" # create a weight matrix for words in training docs\n",
" embedding_matrix = np.zeros((vocab_size, 300))\n",
" for word, i in tokenizer.word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" embedding_matrix[i] = embedding_vector\n",
" print(train_reviews.shape)\n",
" print(embedding_matrix.shape)\n",
" e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=False)\n",
" # TODO: Cross-validate\n",
" print(train_reviews.shape)\n",
" elif embed==2: # fasttext\n",
" # load the whole embedding into memory\n",
" embeddings_index = dict()\n",
" f = open('wiki-news-300d-1M.vec')\n",
" for line in f:\n",
" values = line.split()\n",
" word = values[0]\n",
" coefs = np.asarray(values[1:], dtype='float32')\n",
" embeddings_index[word] = coefs\n",
" f.close()\n",
" print('Loaded %s word vectors.' % len(embeddings_index))\n",
" #print(values)\n",
" # create a weight matrix for words in training docs\n",
" embedding_matrix = np.zeros((vocab_size, 300))\n",
" for word, i in tokenizer.word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" embedding_matrix[i] = embedding_vector\n",
" print(train_reviews.shape)\n",
" print(embedding_matrix.shape)\n",
" e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=True)\n",
" # TODO: Cross-validate\n",
" print(train_reviews.shape)\n",
" else: # word2vec\n",
" from gensim.models import KeyedVectors\n",
"\n",
" # load the whole embedding into memory\n",
" embeddings_index = dict()\n",
"\n",
"\n",
" #f = open('GoogleNews-vectors-negative300.bin')\n",
" model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)\n",
" print(type(model_w2v))\n",
"\n",
" # prepare embedding matrix\n",
" embedding_matrix = np.zeros((vocab_size, 300))\n",
" for word, i in tokenizer.word_index.items():\n",
" if word in model_w2v.vocab:\n",
" embedding_vector = model_w2v[word]\n",
" embedding_vector = np.array(embedding_vector)\n",
" if embedding_vector is not None:\n",
" embedding_matrix[i] = embedding_vector\n",
"\n",
" e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=False)\n",
" \n",
" return e\n",
"\n",
"# reading dataset\n",
"train_data = pd.read_csv(\"train.csv\", index_col=0)#.iloc[:1000,:]\n",
"test_data = pd.read_csv(\"test.csv\", index_col=0)#.iloc[:1000,:]\n",
"gold_test_data = pd.read_csv(\"gold_test.csv\", index_col=0)\n",
"\n",
"def prepare_data(is_preprocessed, train_data, test_data, gold_test_data=None, cv=0.2):\n",
" '''\n",
" Returns:\n",
" processed_train_data : [preprocessed-reviews ratings]\n",
" train_ratings : ratings (one hot encoded)\n",
" \n",
" processed_cv_data : [preprocessed-reviews ratings]\n",
" cv_ratings : ratings (one hot encoded)\n",
" \n",
" processed_test_data : [preprocessed-reviews ratings] ::- ratings is optional (it will be present if gold_test_data is given)\n",
" test_ratings : ratings (one hot encoded) (optional)\n",
"\n",
" vocab_size\n",
" '''\n",
" # initialize tokenizer\n",
" tokenizer = Tokenizer(oov_token = '<oov>')\n",
" \n",
" if not is_preprocessed:\n",
" # pre-process data\n",
" train_reviews, tokenizer = preprocess_data(tokenizer, train_data, preprocessing_training_data=True)\n",
" test_reviews, tokenizer = preprocess_data(tokenizer, test_data, maxlen=train_reviews.shape[1])\n",
" else:\n",
" train_reviews = pd.drop(train_data, columns=['ratings'])\n",
" test_reviews = test_data\n",
" \n",
" \n",
" # print(f\"train_reviews {train_reviews.shape}\")\n",
" # print(f\"test_reviews {test_reviews.shape}\")\n",
" \n",
" # print(pd.concat([train_reviews, train_data['ratings']], axis=1).shape)\n",
"\n",
" # split dataset into train & cross-validation set\n",
" train_reviews, cv_reviews, train_ratings, cv_ratings = train_test_split(train_reviews, train_data['ratings'], test_size=cv, random_state=0, stratify=train_data['ratings']) \n",
" \n",
" # print(f\"train_reviews {train_reviews.shape}\")\n",
" # print(f\"cv_reviews {cv_reviews.shape}\")\n",
" # print(f\"train_ratings {train_ratings.shape}\")\n",
" # print(f\"cv_ratings {cv_ratings.shape}\")\n",
" \n",
" processed_train_data = pd.concat([train_reviews, train_ratings], axis=1)\n",
" processed_cv_data = pd.concat([cv_reviews, cv_ratings], axis=1)\n",
" \n",
" # print(f\"processed_train_data {processed_train_data.shape}\")\n",
" # print(f\"processed_cv_data {processed_cv_data.shape}\")\n",
" \n",
" # One hot encode train ratings\n",
" train_ratings = pd.get_dummies(train_ratings)\n",
"\n",
" # One hot encode CV ratings\n",
" cv_ratings = pd.get_dummies(cv_ratings)\n",
"\n",
" # print(f\"OHE train_ratings {train_ratings.shape}\")\n",
" # print(f\"OHE cv_ratings {cv_ratings.shape}\")\n",
" \n",
" # One hot encode gold test ratings\n",
" if gold_test_data is not None:\n",
" gold_test_ratings = gold_test_data['ratings']\n",
" processed_gold_test_data = pd.concat([test_reviews, gold_test_ratings], axis=1)\n",
" \n",
" # One hot encode gold test ratings\n",
" gold_test_ratings = pd.get_dummies(gold_test_ratings)\n",
"\n",
" maxlen=train_reviews.shape[1] \n",
" e = get_e(tokenizer, maxlen)\n",
" return processed_train_data, train_ratings, processed_cv_data, cv_ratings, processed_gold_test_data, gold_test_ratings, e\n",
"\n",
" return processed_train_data, train_ratings, processed_cv_data, cv_ratings, test_reviews, None, None\n",
"\n",
"def train_model_and_check_test_accuracy(e, batch_size, epochs, train_reviews, train_ratings, cv_reviews, cv_ratings, test_reviews, test_ratings):\n",
" # Build model\n",
" train_ratings = pd.get_dummies(train_ratings)\n",
" cv_ratings = pd.get_dummies(cv_ratings)\n",
" test_ratings = pd.get_dummies(test_ratings)\n",
" \n",
" model = NeuralNet(train_reviews, train_ratings, e)\n",
" model.build_nn()\n",
" model.train_nn(batch_size, epochs, cv_reviews, cv_ratings)\n",
"\n",
" # predict\n",
" test_predictions = model.predict(test_reviews)\n",
" train_predictions = model.predict(train_reviews)\n",
"\n",
" # get ratings from probabilities\n",
" train_predictions = np.argmax(train_predictions, axis=1) + 1\n",
" # test_predictions = np.argmax(test_predictions, axis=1)\n",
" train_ratings = np.argmax(np.array(train_ratings), axis=1) + 1\n",
"\n",
" # report generation on training data\n",
" print(f\"Classification report:\\n{metrics.classification_report(train_ratings, train_predictions)}\\n\")\n",
" metrics.confusion_matrix(train_ratings, train_predictions)\n",
"\n",
" # predict\n",
" test_predictions = model.predict(test_reviews)\n",
"\n",
" # get ratings from probabilities\n",
" test_predictions = np.argmax(test_predictions, axis=1) + 1\n",
" test_ratings = np.argmax(np.array(test_ratings), axis=1) + 1\n",
"\n",
" # report generation on test data\n",
" print(f\"Classification report:\\n{metrics.classification_report(test_ratings, test_predictions)}\\n\")\n",
" metrics.confusion_matrix(test_ratings, test_predictions)\n",
"\n",
"def class_counts(ratings): \n",
" # find # of elements of each class\n",
" cnts = ratings.value_counts()\n",
" class1_count = cnts[1]\n",
" class2_count = cnts[2]\n",
" class3_count = cnts[3]\n",
" class4_count = cnts[4]\n",
" class5_count = cnts[5]\n",
" \n",
" return class1_count, class2_count, class3_count, class4_count, class5_count\n",
"\n",
"def minority_class_count(ratings):\n",
" # find # no of elements in the minority class\n",
" return min(class_counts(ratings))\n",
"\n",
"def majority_class_count(ratings):\n",
" # find the number of elements in the majority class\n",
" return max(class_counts(ratings))\n",
"\n",
"def average_class_count(ratings):\n",
" return int(sum(class_counts(ratings))/5) \n"
],
"execution_count": 5,
"outputs": [
{
"output_type": "error",
"ename": "NameError",
"evalue": "ignored",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-a4ea6d914189>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 106\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 107\u001b[0m \u001b[0;31m# reading dataset\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 108\u001b[0;31m \u001b[0mtrain_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"train.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#.iloc[:1000,:]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 109\u001b[0m \u001b[0mtest_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"test.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;31m#.iloc[:1000,:]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 110\u001b[0m \u001b[0mgold_test_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"gold_test.csv\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex_col\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: name 'pd' is not defined"
]
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "K_tcvDwJfcvU"
},
"source": [
"#### Set Hyperparameter"
]
},
{
"cell_type": "code",
"metadata": {
"id": "wPhgCP_ofgLC"
},
"source": [
"batch_size, epoch = 32, 1"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "kNFlkgEPcE7z"
},
"source": [
"#### Undersampling Technique-1"
]
},
{
"cell_type": "code",
"metadata": {
"id": "F6brLxAQbwwh"
},
"source": [
"# 1. reduce the number of ratings of each class to the 'minority_class_count' \n",
"preprocessed_train_data, train_ratings, processed_cv_data, cv_ratings, processed_gold_test_data, gold_test_ratings, e = prepare_data(False, train_data, test_data, gold_test_data)\n",
"class1_data = preprocessed_train_data[preprocessed_train_data['ratings']==1]\n",
"class2_data = preprocessed_train_data[preprocessed_train_data['ratings']==2]\n",
"class3_data = preprocessed_train_data[preprocessed_train_data['ratings']==3]\n",
"class4_data = preprocessed_train_data[preprocessed_train_data['ratings']==4]\n",
"class5_data = preprocessed_train_data[preprocessed_train_data['ratings']==5]\n",
"\n",
"minority_class_cnt = minority_class_count(preprocessed_train_data['ratings'])\n",
"class1_data = class1_data.sample(minority_class_cnt, random_state=1)\n",
"class2_data = class2_data.sample(minority_class_cnt, random_state=1)\n",
"class3_data = class3_data.sample(minority_class_cnt, random_state=1)\n",
"class4_data = class4_data.sample(minority_class_cnt, random_state=1)\n",
"class5_data = class5_data.sample(minority_class_cnt, random_state=1)\n",
"\n",
"train_data_undersample = pd.concat([class1_data, class2_data, class3_data, class4_data, class5_data], axis=0)\n",
"\n",
"train_model_and_check_test_accuracy(e, batch_size, epoch, train_data_undersample.drop(['ratings'], axis=1), train_data_undersample['ratings'], processed_cv_data.drop(['ratings'], axis=1), processed_cv_data['ratings'], processed_gold_test_data.drop(['ratings'], axis=1), processed_gold_test_data['ratings'])\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "jmCPQLwncOqL"
},
"source": [
"#### Undersampling technique-2"
]
},
{
"cell_type": "code",
"metadata": {
"id": "TMon7DHQbwnN"
},
"source": [
"# 2. reduce the number of ratings of each class to at most the 'avg_class_count' (smaller classes are left unchanged) \n",
"preprocessed_train_data, train_ratings, processed_cv_data, cv_ratings, processed_gold_test_data, gold_test_ratings, e = prepare_data(False, train_data, test_data, gold_test_data)\n",
"class1_data = preprocessed_train_data[preprocessed_train_data['ratings']==1]\n",
"class2_data = preprocessed_train_data[preprocessed_train_data['ratings']==2]\n",
"class3_data = preprocessed_train_data[preprocessed_train_data['ratings']==3]\n",
"class4_data = preprocessed_train_data[preprocessed_train_data['ratings']==4]\n",
"class5_data = preprocessed_train_data[preprocessed_train_data['ratings']==5]\n",
"\n",
"avg_class_count = average_class_count(preprocessed_train_data['ratings'])\n",
"class1_data = class1_data.sample(min(class1_data.shape[0], avg_class_count), random_state=1)\n",
"class2_data = class2_data.sample(min(class2_data.shape[0], avg_class_count), random_state=1)\n",
"class3_data = class3_data.sample(min(class3_data.shape[0], avg_class_count), random_state=1)\n",
"class4_data = class4_data.sample(min(class4_data.shape[0], avg_class_count), random_state=1)\n",
"class5_data = class5_data.sample(min(class5_data.shape[0], avg_class_count), random_state=1)\n",
"\n",
"train_data_undersample = pd.concat([class1_data, class2_data, class3_data, class4_data, class5_data], axis=0)\n",
"print(train_data_undersample['ratings'].value_counts())\n",
"print(processed_cv_data['ratings'].value_counts())\n",
"\n",
"train_model_and_check_test_accuracy(e, batch_size, epoch, train_data_undersample.drop(['ratings'], axis=1), train_data_undersample['ratings'], processed_cv_data.drop(['ratings'], axis=1), processed_cv_data['ratings'], processed_gold_test_data.drop(['ratings'], axis=1), processed_gold_test_data['ratings'])\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "R3l5LVS8cY1Q"
},
"source": [
"#### Oversampling Technique-1\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "pjv5TTLqccHH"
},
"source": [
"# 1. increase the number of ratings of each class to the 'majority_class_count' \n",
"preprocessed_train_data, train_ratings, processed_cv_data, cv_ratings, processed_gold_test_data, gold_test_ratings, e = prepare_data(False, train_data, test_data, gold_test_data)\n",
"class1_data = preprocessed_train_data[preprocessed_train_data['ratings']==1]\n",
"class2_data = preprocessed_train_data[preprocessed_train_data['ratings']==2]\n",
"class3_data = preprocessed_train_data[preprocessed_train_data['ratings']==3]\n",
"class4_data = preprocessed_train_data[preprocessed_train_data['ratings']==4]\n",
"class5_data = preprocessed_train_data[preprocessed_train_data['ratings']==5]\n",
"\n",
"majority_class_cnt = majority_class_count(preprocessed_train_data['ratings'])\n",
"class1_data = class1_data.sample(majority_class_cnt, random_state=1, replace=True)\n",
"class2_data = class2_data.sample(majority_class_cnt, random_state=1, replace=True)\n",
"class3_data = class3_data.sample(majority_class_cnt, random_state=1, replace=True)\n",
"class4_data = class4_data.sample(majority_class_cnt, random_state=1, replace=True)\n",
"class5_data = class5_data.sample(majority_class_cnt, random_state=1, replace=True)\n",
"\n",
"train_data_undersample = pd.concat([class1_data, class2_data, class3_data, class4_data, class5_data], axis=0)\n",
"\n",
"print(train_data_undersample['ratings'].value_counts())\n",
"print(processed_cv_data['ratings'].value_counts())\n",
"\n",
"train_model_and_check_test_accuracy(e, batch_size, epoch, train_data_undersample.drop(['ratings'], axis=1), train_data_undersample['ratings'], processed_cv_data.drop(['ratings'], axis=1), processed_cv_data['ratings'], processed_gold_test_data.drop(['ratings'], axis=1), processed_gold_test_data['ratings'])\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "nSr7yoTscium"
},
"source": [
"#### Oversampling Technique-2"
]
},
{
"cell_type": "code",
"metadata": {
"id": "uVuXnTI3cllj"
},
"source": [
"# 2. increase the number of ratings of each class to at least the 'avg_class_count' (sampling with replacement) \n",
"preprocessed_train_data, train_ratings, processed_cv_data, cv_ratings, processed_gold_test_data, gold_test_ratings, e = prepare_data(False, train_data, test_data, gold_test_data)\n",
"class1_data = preprocessed_train_data[preprocessed_train_data['ratings']==1]\n",
"class2_data = preprocessed_train_data[preprocessed_train_data['ratings']==2]\n",
"class3_data = preprocessed_train_data[preprocessed_train_data['ratings']==3]\n",
"class4_data = preprocessed_train_data[preprocessed_train_data['ratings']==4]\n",
"class5_data = preprocessed_train_data[preprocessed_train_data['ratings']==5]\n",
"\n",
"avg_class_count = average_class_count(preprocessed_train_data['ratings'])\n",
"class1_data = class1_data.sample(max(class1_data.shape[0], avg_class_count), random_state=1, replace=True)\n",
"class2_data = class2_data.sample(max(class2_data.shape[0], avg_class_count), random_state=1, replace=True)\n",
"class3_data = class3_data.sample(max(class3_data.shape[0], avg_class_count), random_state=1, replace=True)\n",
"class4_data = class4_data.sample(max(class4_data.shape[0], avg_class_count), random_state=1, replace=True)\n",
"class5_data = class5_data.sample(max(class5_data.shape[0], avg_class_count), random_state=1, replace=True)\n",
"\n",
"train_data_undersample = pd.concat([class1_data, class2_data, class3_data, class4_data, class5_data], axis=0)\n",
"print(train_data_undersample['ratings'].value_counts())\n",
"print(processed_cv_data['ratings'].value_counts())\n",
"\n",
"train_model_and_check_test_accuracy(e, batch_size, epoch, train_data_undersample.drop(['ratings'], axis=1), train_data_undersample['ratings'], processed_cv_data.drop(['ratings'], axis=1), processed_cv_data['ratings'], processed_gold_test_data.drop(['ratings'], axis=1), processed_gold_test_data['ratings'])\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "BXAOF039clD8"
},
"source": [
"#### Oversampling & Undersampling Technique-3\n"
]
},
{
"cell_type": "code",
"metadata": {
"id": "mMDNn5Sjcx3K"
},
"source": [
"# 3. change the number of ratings of each class to the 'avg_class_count' \n",
"preprocessed_train_data, train_ratings, processed_cv_data, cv_ratings, processed_gold_test_data, gold_test_ratings, e = prepare_data(False, train_data, test_data, gold_test_data)\n",
"class1_data = preprocessed_train_data[preprocessed_train_data['ratings']==1]\n",
"class2_data = preprocessed_train_data[preprocessed_train_data['ratings']==2]\n",
"class3_data = preprocessed_train_data[preprocessed_train_data['ratings']==3]\n",
"class4_data = preprocessed_train_data[preprocessed_train_data['ratings']==4]\n",
"class5_data = preprocessed_train_data[preprocessed_train_data['ratings']==5]\n",
"\n",
"avg_class_count = average_class_count(preprocessed_train_data['ratings'])\n",
"class1_data = class1_data.sample(avg_class_count, random_state=1, replace=True)\n",
"class2_data = class2_data.sample(avg_class_count, random_state=1, replace=True)\n",
"class3_data = class3_data.sample(avg_class_count, random_state=1, replace=True)\n",
"class4_data = class4_data.sample(avg_class_count, random_state=1, replace=True)\n",
"class5_data = class5_data.sample(avg_class_count, random_state=1, replace=True)\n",
"\n",
"train_data_undersample = pd.concat([class1_data, class2_data, class3_data, class4_data, class5_data], axis=0)\n",
"print(train_data_undersample['ratings'].value_counts())\n",
"print(processed_cv_data['ratings'].value_counts())\n",
"\n",
"train_model_and_check_test_accuracy(e, batch_size, epoch, train_data_undersample.drop(['ratings'], axis=1), train_data_undersample['ratings'], processed_cv_data.drop(['ratings'], axis=1), processed_cv_data['ratings'], processed_gold_test_data.drop(['ratings'], axis=1), processed_gold_test_data['ratings'])\n",
"\n"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "20O2-bhxemrB"
},
"source": [
"### Word Embeddings"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "Kr21JYWpPlBr"
},
"source": [
"#### Glove"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "iKLm2r3TPjYy",
"outputId": "86d795d2-31d5-4cd5-c564-2fa166e9cb5e"
},
"source": [
"# load the whole embedding into memory\n",
"embeddings_index = dict()\n",
"f = open('glove.6B.300d.txt')\n",
"for line in f:\n",
" values = line.split()\n",
" word = values[0]\n",
" coefs = np.asarray(values[1:], dtype='float32')\n",
" embeddings_index[word] = coefs\n",
"f.close()\n",
"print('Loaded %s word vectors.' % len(embeddings_index))\n",
"\n",
"# create a weight matrix for words in training docs\n",
"embedding_matrix = np.zeros((vocab_size, 300))\n",
"for word, i in tokenizer.word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" embedding_matrix[i] = embedding_vector\n",
"print(train_reviews.shape)\n",
"print(embedding_matrix.shape)\n",
"e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=False)\n",
"# TODO: Cross-validate\n",
"print(train_reviews.shape)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Loaded 400000 word vectors.\n",
"(50000, 31)\n",
"(15956, 300)\n",
"(50000, 31)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "8ETH36deQRfc"
},
"source": [
"#### Fasttext\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "34JQ-3RZQDcl",
"outputId": "c53ce3a6-0787-432d-8808-d25f946e99ea"
},
"source": [
"# load the whole embedding into memory\n",
"embeddings_index = dict()\n",
"f = open('wiki-news-300d-1M.vec')\n",
"for line in f:\n",
" values = line.split()\n",
" word = values[0]\n",
" coefs = np.asarray(values[1:], dtype='float32')\n",
" embeddings_index[word] = coefs\n",
"f.close()\n",
"print('Loaded %s word vectors.' % len(embeddings_index))\n",
"#print(values)\n",
"# create a weight matrix for words in training docs\n",
"embedding_matrix = np.zeros((vocab_size, 300))\n",
"for word, i in tokenizer.word_index.items():\n",
" embedding_vector = embeddings_index.get(word)\n",
" if embedding_vector is not None:\n",
" embedding_matrix[i] = embedding_vector\n",
"print(train_reviews.shape)\n",
"print(embedding_matrix.shape)\n",
"e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=True)\n",
"# TODO: Cross-validate\n",
"print(train_reviews.shape)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Loaded 999995 word vectors.\n",
"(50000, 31)\n",
"(15816, 300)\n",
"(50000, 31)\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WXjRde9FQVQF"
},
"source": [
"#### Word2vec"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "WKm4zt-bQY2D",
"outputId": "0427b5ce-8f8c-44c8-eeb3-6700dab476cb"
},
"source": [
"from gensim.models import KeyedVectors\n",
"\n",
"# load the whole embedding into memory\n",
"embeddings_index = dict()\n",
"\n",
"\n",
"#f = open('GoogleNews-vectors-negative300.bin')\n",
"model_w2v = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)\n",
"print(type(model_w2v))\n",
"\n",
"# prepare embedding matrix\n",
"embedding_matrix = np.zeros((vocab_size, 300))\n",
"for word, i in tokenizer.word_index.items():\n",
" if word in model_w2v.vocab:\n",
" embedding_vector = model_w2v[word]\n",
" embedding_vector = np.array(embedding_vector)\n",
" if embedding_vector is not None:\n",
" embedding_matrix[i] = embedding_vector\n",
"\n",
"e = Embedding(vocab_size, 300, embeddings_initializer=Constant(embedding_matrix), input_length=maxlen, trainable=False)\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"<class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "1rx4hyhmCpZ9"
},
"source": [
""
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"metadata": {
"id": "Cj7uViOAP8nZ"
},
"source": [
"### Train Neural Network\n"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "u8cPuyIaPcsI",
"outputId": "8432c74d-a9c8-47eb-8d81-a0149f9f9c7c"
},
"source": [
"batch_size, epochs = 32, 5\n",
"\n",
"train_ratings = pd.get_dummies(train_ratings) \n",
"\n",
"model = NeuralNet(train_reviews, train_ratings,e)\n",
"model.build_nn()\n",
"model.train_nn(batch_size, epochs)\n",
"\n",
"# predict\n",
"test_predictions = model.predict(test_reviews)\n",
"train_predictions = model.predict(train_reviews)\n",
"\n",
"# get ratings from probabilities\n",
"train_predictions = np.argmax(train_predictions, axis=1)\n",
"test_predictions = np.argmax(test_predictions, axis=1)\n",
"train_ratings = np.argmax(np.array(train_ratings), axis=1)\n",
"\n",
"# report generation on training data\n",
"print(f\"Classification report:\\n{metrics.classification_report(train_ratings, train_predictions)}\\n\")\n",
"metrics.confusion_matrix(train_ratings, train_predictions)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Reviews Shape: (50000, 31)\n",
"Ratings Shape: (50000, 5)\n",
"Epoch 1/5\n",
"1563/1563 [==============================] - 5s 3ms/step - loss: 0.9450 - accuracy: 0.6715\n",
"Epoch 2/5\n",
"1563/1563 [==============================] - 4s 3ms/step - loss: 0.7339 - accuracy: 0.7343\n",
"Epoch 3/5\n",
"1563/1563 [==============================] - 4s 3ms/step - loss: 0.6787 - accuracy: 0.7541\n",
"Epoch 4/5\n",
"1563/1563 [==============================] - 4s 2ms/step - loss: 0.6527 - accuracy: 0.7598\n",
"Epoch 5/5\n",
"1563/1563 [==============================] - 4s 3ms/step - loss: 0.6331 - accuracy: 0.7668\n",
"Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.81 0.67 0.73 4059\n",
" 1 0.68 0.52 0.59 2265\n",
" 2 0.72 0.44 0.55 3612\n",
" 3 0.62 0.32 0.42 6871\n",
" 4 0.82 0.96 0.88 33193\n",
"\n",
" accuracy 0.79 50000\n",
" macro avg 0.73 0.58 0.63 50000\n",
"weighted avg 0.77 0.79 0.77 50000\n",
"\n",
"\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[ 2703, 161, 103, 123, 969],\n",
" [ 197, 1173, 120, 114, 661],\n",
" [ 172, 164, 1600, 356, 1320],\n",
" [ 107, 107, 236, 2172, 4249],\n",
" [ 174, 111, 169, 755, 31984]])"
]
},
"metadata": {
"tags": []
},
"execution_count": 14
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "x2iF2egfUspg"
},
"source": [
"#### Test accuracy model"
]
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "foQ-CrQnQHyL",
"outputId": "fbaee72a-374c-4893-c0b4-c915ffb690c3"
},
"source": [
"test_data = pd.read_csv(\"gold_test.csv\", index_col=0)\n",
"\n",
"# separate reviews, ratings\n",
"test_reviews = test_data.iloc[:, :-1]\n",
"test_ratings = test_data.iloc[:, -1]\n",
"test_ratings = pd.get_dummies(test_ratings)\n",
"\n",
"\n",
"# pre-process data\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
"\n",
"# predict\n",
"test_predictions = model.predict(test_reviews)\n",
"\n",
"# get ratings from probabilities\n",
"test_predictions = np.argmax(test_predictions, axis=1)\n",
"test_ratings = np.argmax(np.array(test_ratings), axis=1)\n",
"\n",
"# report generation on test data\n",
"print(f\"Classification report:\\n{metrics.classification_report(test_ratings, test_predictions)}\\n\")\n",
"metrics.confusion_matrix(test_ratings, test_predictions)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Classification report:\n",
" precision recall f1-score support\n",
"\n",
" 0 0.59 0.42 0.49 1271\n",
" 1 0.21 0.15 0.17 630\n",
" 2 0.32 0.17 0.22 911\n",
" 3 0.29 0.15 0.20 1404\n",
" 4 0.72 0.92 0.81 5784\n",
"\n",
" accuracy 0.63 10000\n",
" macro avg 0.42 0.36 0.38 10000\n",
"weighted avg 0.57 0.63 0.59 10000\n",
"\n",
"\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"array([[ 531, 147, 85, 68, 440],\n",
" [ 146, 93, 67, 73, 251],\n",
" [ 95, 103, 159, 127, 427],\n",
" [ 56, 46, 121, 209, 972],\n",
" [ 79, 63, 71, 232, 5339]])"
]
},
"metadata": {
"tags": []
},
"execution_count": 15
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "uiB2cRSVQlZg"
},
"source": [
"#### Test input"
]
},
{
"cell_type": "code",
"metadata": {
"id": "d1gB_zAqQkrf",
"colab": {
"base_uri": "https://localhost:8080/"
},
"outputId": "4de0015a-6423-4177-c2f8-a7b766e66a4b"
},
"source": [
"# Test input\n",
"Test = ['this is bad', 'wow nice!', 'this is a great product']\n",
"test_reviews = pd.DataFrame(Test, columns=['reviews'])\n",
"\n",
"# pre-process data\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
"\n",
"# predict\n",
"test_predictions = model.predict(test_reviews)\n",
"\n",
"# show probabilities\n",
"print(test_predictions)"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"[[0.489183 0.03963117 0.05302086 0.06513943 0.35302556]\n",
" [0.00511781 0.01673113 0.02535018 0.1042458 0.84855515]\n",
" [0.01163882 0.00901665 0.00830102 0.03947002 0.9315735 ]]\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "DyY0ZJ_U9XV5"
},
"source": [
"#### Testing on custom input GUI"
]
},
{
"cell_type": "code",
"metadata": {
"id": "zoBnu4pB9eF4"
},
"source": [
"import tabulate\n",
"def predict_rating(text):\n",
" test_reviews = pd.DataFrame([text], columns=['reviews'])\n",
"\n",
" test_reviews, _ = preprocess_data(tokenizer, test_reviews, maxlen=maxlen)\n",
"\n",
" test_predictions = model.predict(test_reviews)\n",
" test_ratings = np.argmax(np.array(test_predictions), axis=1) + 1\n",
" print(tabulate.tabulate(test_predictions, headers=['Rating-1', 'Rating-2', 'Rating-3', 'Rating-4', 'Rating-5']))\n",
" str = f\"\\nPredicted Rating: {test_ratings}\"\n",
" return str"
],
"execution_count": 1,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"cellView": "form",
"id": "7Eg9rqGQ9ckJ",
"outputId": "e7fd0687-3c57-4f57-895a-a7d0e4ce9a12"
},
"source": [
"#@title Predict\n",
"InputText = 'this is good' #@param {type: 'string'}\n",
"output = predict_rating(InputText)\n",
"\n",
"print(output)\n"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
" Rating-1 Rating-2 Rating-3 Rating-4 Rating-5\n",
"---------- ---------- ---------- ---------- ----------\n",
" 0.0630881 0.0287269 0.0568154 0.129721 0.721648\n",
"\n",
"Predicted Rating: [5]\n"
],
"name": "stdout"
}
]
}
]
}
\ No newline at end of file
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /home/rohit/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /home/rohit/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import tensorflow as tf\n",
"import pickle \n",
"import pandas as pd\n",
"import numpy as np\n",
"import string\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.text import one_hot\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, SimpleRNN, LSTM, Bidirectional, GRU\n",
"from sklearn import datasets, model_selection, metrics\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.initializers import Constant\n",
"from nltk.tokenize import word_tokenize\n",
"from sklearn.model_selection import train_test_split \n",
"nltk.download('stopwords')\n",
"stopword = stopwords.words('english') \n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def encode_data(tokenizer, text, tokens, preprocessing_training_data = False):\n",
" # This function will be used to encode the reviews using a dictionary (created using corpus vocabulary) \n",
"\n",
" # Example of encoding: \"The food was fabulous but pricey\" has a vocabulary of 6 words, each one has to be mapped to an integer like: \n",
" # {'The':1, 'food':2, 'was':3, 'fabulous':4, 'but':5, 'pricey':6}; this vocabulary has to be created for the entire corpus and then be used to \n",
" # encode the words into integers \n",
"\n",
" # return encoded examples\n",
" if preprocessing_training_data:\n",
" tokenizer = Tokenizer(oov_token = '<oov>')\n",
" tokenizer.fit_on_texts(tokens)\n",
"\n",
" sequences = tokenizer.texts_to_sequences(text)\n",
" return sequences, tokenizer\n",
"\n",
"def convert_to_lower(text):\n",
" # return the reviews after converting them to lowercase\n",
" lower_text = text.lower()\n",
" return lower_text\n",
"\n",
"def perform_tokenization(text):\n",
" # return the reviews after performing tokenization\n",
" token=nltk.word_tokenize(text)\n",
" return token\n",
"\n",
"def remove_stopwords(text):\n",
" # return the reviews after removing the stopwords\n",
" stopword = [] # not any stopword\n",
" removing_stopwords=[word for word in text if word not in stopword]\n",
" return removing_stopwords\n",
"\n",
"def remove_punctuation(text):\n",
" # return the reviews after removing punctuations\n",
" removing_punctuation = [word for word in text if word.isalpha()]\n",
" return removing_punctuation\n",
"\n",
"def perform_padding(data, maxlen):\n",
" # return the reviews after padding the reviews to maximum length\n",
" padded_data = pad_sequences(data, maxlen=maxlen, padding='post')\n",
" return padded_data\n",
"\n",
"def preprocess_data(tokenizer, data, preprocessing_training_data=False, maxlen=None):\n",
" # make all the following function calls on your data\n",
" # EXAMPLE:->\n",
" '''\n",
" review = data[\"reviews\"]\n",
" review = convert_to_lower(review)\n",
" review = remove_punctuation(review)\n",
" review = remove_stopwords(review)\n",
" review = perform_tokenization(review)\n",
" review = encode_data(review)\n",
" review = perform_padding(review)\n",
" '''\n",
" # return processed data\n",
"\n",
" reviews = data[\"reviews\"]\n",
" list_of_reviews = list(reviews)\n",
" string_of_reviews = ' '.join(str(e) for e in list_of_reviews)\n",
"\n",
" lower_text = convert_to_lower(string_of_reviews)\n",
" tokens = perform_tokenization(lower_text)\n",
" tokens = remove_stopwords(tokens)\n",
" tokens = remove_punctuation(tokens)\n",
" encoded_data, tokenizer = encode_data(tokenizer, reviews, tokens, preprocessing_training_data)\n",
" reviews = perform_padding(encoded_data, maxlen)\n",
"\n",
" return pd.DataFrame(reviews), tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"maxlen = 31 # verified "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"! tar -xzf model.tar.gz\n",
"model = tf.keras.models.load_model(\"model\", compile = False)\n",
"\n",
"with open(r\"tokenizer.pkl\", \"rb\") as input_file:\n",
" tokenizer = pickle.load(input_file)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7.46410131e-01 1.02791116e-01 1.10829026e-01 2.15716772e-02\n",
" 1.83980539e-02]\n",
" [9.36661454e-05 1.18578457e-04 2.48400168e-03 7.10299909e-02\n",
" 9.26273704e-01]\n",
" [1.44877762e-03 4.65300196e-04 2.53088167e-03 4.58934791e-02\n",
" 9.49661493e-01]]\n"
]
}
],
"source": [
"# Test input\n",
"Test = ['this is bad', 'wow nice!', 'this is a great product']\n",
"test_reviews = pd.DataFrame(Test, columns=['reviews'])\n",
"\n",
"# pre-process data\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=maxlen)\n",
"\n",
"# predict\n",
"test_predictions = model.predict(test_reviews)\n",
"\n",
"# show probabilities\n",
"print(test_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def predict_rating(text):\n",
" test_reviews = pd.DataFrame([text], columns=['reviews'])\n",
"\n",
"# test_reviews, _ = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
" test_reviews, _ = preprocess_data(tokenizer, test_reviews, maxlen=31)\n",
"\n",
" test_predictions = model.predict(test_reviews)\n",
" test_ratings = np.argmax(np.array(test_predictions), axis=1) + 1\n",
" \n",
" import tabulate\n",
" \n",
" str = f\"\\nPredicted Rating: {test_ratings}\\n\\n\\n\"\n",
" str += tabulate.tabulate(test_predictions, headers=[\"Rating-1\", \"Rating-2\", \"Rating-3\", \"Rating-4\", \"Rating-5\"])\n",
" return str"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import tkinter as tk\n",
" \n",
"# Top level window \n",
"frame = tk.Tk() \n",
"frame.title(\"Rate reviews\") \n",
"frame.geometry('600x600') \n",
"\n",
"# Function for getting Input from textbox and printing it at label widget \n",
"def printInput(): \n",
" inp = inputtxt.get(1.0, \"end-1c\") \n",
" output = predict_rating(inp)\n",
" lbl.config(text = output) \n",
"\n",
"# TextBox Creation \n",
"inputtxt = tk.Text(frame, \n",
" height = 10, \n",
" width = 40, \n",
" font=(\"Courier\", 18)) \n",
" \n",
"inputtxt.pack() \n",
" \n",
"# Button Creation \n",
"printButton = tk.Button(frame, \n",
" text = \"Print\", \n",
" command = printInput, \n",
" font=(\"monospace\", 14)) \n",
"printButton.pack() \n",
" \n",
"# Label Creation \n",
"lbl = tk.Label(frame, text = \"\") \n",
"lbl.pack() \n",
"frame.mainloop() "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Instructions to run the code file for Assignment-3
WITHOUT PRE-TRAINED EMBEDDING:
1. Open code.ipynb file.
2. Run the following sections in the file one by one.
* Install python packages
* To access pre-trained embeddings and dataset (train and test)
* Header files
* Preprocessing function
* Define Models (without pre-trained embedding layer)
* Import Datasets
* Without pre-trained word embedding
3. If you want to run the data-imbalance handling techniques, go to the “Data Imbalanced Handling” section and run the first cell, “Utility Functions”. Then you can run any of the techniques (Undersampling Technique-1 and so on).
4. Run any of the models (FFNN, LSTM, RNN, Bi-LSTM, GRU, Bi-GRU) by running the section “Train X Model” (replace X with the model according to your need like “Train LSTM Model”).
---------------------------------------------------------------------------------------------------------------------------
WITH PRE-TRAINED EMBEDDING:
1. Open code.ipynb file.
2. Run the cells in the file one by one.
3. Run “Install python packages” section.
4. Then run the cell “To access pre-trained embeddings and dataset (train and test)”, which accesses the pre-trained word embeddings from Google Drive, mounted from our folder. To get access, please mail your ID to us and we will share the folder with you, so that all the embeddings (glove.6B.300d.txt, glove.6B.200d.txt, wiki-news-300d-1M.vec, and GoogleNews-vectors-negative300.bin) and datasets (train.csv, gold_test.csv) can be accessed by you. We are using word2vec for this assignment as it gives the best results.
5. Run the “Header files” section.
6. Run the “Preprocessing function” section to convert our train dataset into embeddings.
7. We will run the “Define Models (with pretrained embedding layer)” section which contains models for RNN, LSTM, Bi-LSTM, GRU and Bi-GRU. Run one at a time.
8. Run the “Import Datasets” section to import train and test dataset.
9. Then run the “Word Embeddings” section, which contains three embedding subsections: Glove, Word2vec and fasttext. We will run the Word2vec section, as it is the only one used in our assignment.
10. Move to the “Data Imbalanced Handling” section and use one of the sampling techniques (like Undersampling Technique-1 etc.) mentioned in the section. First run the “Utility Functions” section, then run any of the following sections to use the corresponding technique.
11. There are separate sections for our models (RNN, LSTM, Bi-LSTM, GRU and Bi-GRU). Run a model's section to train that model and test its accuracy.
----------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------
GUI:
We have also attached a file containing the GUI (GUI.ipynb). This file first loads the stored model and tokenizer and uses them throughout the code. To store them, run the section named “Save Model” in code.ipynb; it will generate two files, model.tar.gz and tokenizer.pkl, which should be present in the same directory as GUI.ipynb.
Libraries to be installed in your local PC:
Keras==2.4.3
tensorflow==2.2.0
numpy==1.17.4
nltk==3.4.5
tabulate==0.8.9
pandas==1.0.5
scikit_learn==0.24.1
All the cells should be run sequentially. The user will then see a window containing one textbox in which to enter the text; pressing the “Print” button shows the predicted rating and the probabilities of each rating.
Note: This GUI does not work in Colab (Colab runs remotely and has no display for the tkinter window) but works on local machines. If you want to test the GUI, one approach is to run code.ipynb in Google Colab and download the model.tar.gz and tokenizer.pkl files. Put those files in the same directory as GUI.ipynb and run all the cells of GUI.ipynb sequentially.
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /home/rohit/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /home/rohit/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import tensorflow as tf\n",
"import pickle \n",
"import pandas as pd\n",
"import numpy as np\n",
"import string\n",
"import nltk\n",
"from nltk.corpus import stopwords\n",
"import tensorflow as tf\n",
"from tensorflow.keras.preprocessing.text import Tokenizer\n",
"from tensorflow.keras.preprocessing.text import one_hot\n",
"from tensorflow.keras.preprocessing.sequence import pad_sequences\n",
"from tensorflow.keras import Sequential\n",
"from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten, SimpleRNN, LSTM, Bidirectional, GRU\n",
"from sklearn import datasets, model_selection, metrics\n",
"from keras.layers.embeddings import Embedding\n",
"from keras.initializers import Constant\n",
"from nltk.tokenize import word_tokenize\n",
"from sklearn.model_selection import train_test_split \n",
"nltk.download('stopwords')\n",
"stopword = stopwords.words('english') \n",
"nltk.download('punkt')"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def encode_data(tokenizer, text, tokens, preprocessing_training_data = False):\n",
" # This function will be used to encode the reviews using a dictionary (created using corpus vocabulary) \n",
"\n",
" # Example of encoding: \"The food was fabulous but pricey\" has a vocabulary of 6 words, each one has to be mapped to an integer like: \n",
" # {'The':1, 'food':2, 'was':3, 'fabulous':4, 'but':5, 'pricey':6}; this vocabulary has to be created for the entire corpus and then be used to \n",
" # encode the words into integers \n",
"\n",
" # return encoded examples\n",
" if preprocessing_training_data:\n",
" tokenizer = Tokenizer(oov_token = '<oov>')\n",
" tokenizer.fit_on_texts(tokens)\n",
"\n",
" sequences = tokenizer.texts_to_sequences(text)\n",
" return sequences, tokenizer\n",
"\n",
"def convert_to_lower(text):\n",
" # return the reviews after converting them to lowercase\n",
" lower_text = text.lower()\n",
" return lower_text\n",
"\n",
"def perform_tokenization(text):\n",
" # return the reviews after performing tokenization\n",
" token=nltk.word_tokenize(text)\n",
" return token\n",
"\n",
"def remove_stopwords(text):\n",
" # return the reviews after removing the stopwords\n",
" stopword = [] # not any stopword\n",
" removing_stopwords=[word for word in text if word not in stopword]\n",
" return removing_stopwords\n",
"\n",
"def remove_punctuation(text):\n",
" # return the reviews after removing punctuations\n",
" removing_punctuation = [word for word in text if word.isalpha()]\n",
" return removing_punctuation\n",
"\n",
"def perform_padding(data, maxlen):\n",
" # return the reviews after padding the reviews to maximum length\n",
" padded_data = pad_sequences(data, maxlen=maxlen, padding='post')\n",
" return padded_data\n",
"\n",
"def preprocess_data(tokenizer, data, preprocessing_training_data=False, maxlen=None):\n",
" # make all the following function calls on your data\n",
" # EXAMPLE:->\n",
" '''\n",
" review = data[\"reviews\"]\n",
" review = convert_to_lower(review)\n",
" review = remove_punctuation(review)\n",
" review = remove_stopwords(review)\n",
" review = perform_tokenization(review)\n",
" review = encode_data(review)\n",
" review = perform_padding(review)\n",
" '''\n",
" # return processed data\n",
"\n",
" reviews = data[\"reviews\"]\n",
" list_of_reviews = list(reviews)\n",
" string_of_reviews = ' '.join(str(e) for e in list_of_reviews)\n",
"\n",
" lower_text = convert_to_lower(string_of_reviews)\n",
" tokens = perform_tokenization(lower_text)\n",
" tokens = remove_stopwords(tokens)\n",
" tokens = remove_punctuation(tokens)\n",
" encoded_data, tokenizer = encode_data(tokenizer, reviews, tokens, preprocessing_training_data)\n",
" reviews = perform_padding(encoded_data, maxlen)\n",
"\n",
" return pd.DataFrame(reviews), tokenizer"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"maxlen = 31 # verified "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"! tar -xzf model.tar.gz\n",
"model = tf.keras.models.load_model(\"model\", compile = False)\n",
"\n",
"with open(r\"tokenizer.pkl\", \"rb\") as input_file:\n",
" tokenizer = pickle.load(input_file)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[7.46410131e-01 1.02791116e-01 1.10829026e-01 2.15716772e-02\n",
" 1.83980539e-02]\n",
" [9.36661454e-05 1.18578457e-04 2.48400168e-03 7.10299909e-02\n",
" 9.26273704e-01]\n",
" [1.44877762e-03 4.65300196e-04 2.53088167e-03 4.58934791e-02\n",
" 9.49661493e-01]]\n"
]
}
],
"source": [
"# Test input\n",
"Test = ['this is bad', 'wow nice!', 'this is a great product']\n",
"test_reviews = pd.DataFrame(Test, columns=['reviews'])\n",
"\n",
"# pre-process data\n",
"test_reviews, tokenizer = preprocess_data(tokenizer, test_reviews, maxlen=maxlen)\n",
"\n",
"# predict\n",
"test_predictions = model.predict(test_reviews)\n",
"\n",
"# show probabilities\n",
"print(test_predictions)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def predict_rating(text):\n",
" test_reviews = pd.DataFrame([text], columns=['reviews'])\n",
"\n",
"# test_reviews, _ = preprocess_data(tokenizer, test_reviews, maxlen=train_reviews.shape[1])\n",
" test_reviews, _ = preprocess_data(tokenizer, test_reviews, maxlen=31)\n",
"\n",
" test_predictions = model.predict(test_reviews)\n",
" test_ratings = np.argmax(np.array(test_predictions), axis=1) + 1\n",
" \n",
" import tabulate\n",
" \n",
" str = f\"\\nPredicted Rating: {test_ratings}\\n\\n\\n\"\n",
" str += tabulate.tabulate(test_predictions, headers=[\"Rating-1\", \"Rating-2\", \"Rating-3\", \"Rating-4\", \"Rating-5\"])\n",
" return str"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"import tkinter as tk\n",
" \n",
"# Top level window \n",
"frame = tk.Tk() \n",
"frame.title(\"Rate reviews\") \n",
"frame.geometry('600x600') \n",
"\n",
"# Function for getting Input from textbox and printing it at label widget \n",
"def printInput(): \n",
" inp = inputtxt.get(1.0, \"end-1c\") \n",
" output = predict_rating(inp)\n",
" lbl.config(text = output) \n",
"\n",
"# TextBox Creation \n",
"inputtxt = tk.Text(frame, \n",
" height = 10, \n",
" width = 40, \n",
" font=(\"Courier\", 18)) \n",
" \n",
"inputtxt.pack() \n",
" \n",
"# Button Creation \n",
"printButton = tk.Button(frame, \n",
" text = \"Print\", \n",
" command = printInput, \n",
" font=(\"monospace\", 14)) \n",
"printButton.pack() \n",
" \n",
"# Label Creation \n",
"lbl = tk.Label(frame, text = \"\") \n",
"lbl.pack() \n",
"frame.mainloop() "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Instructions to run the code file for Assignment-4
1. Open code.ipynb file.
2. Run the cells in the file one by one.
3. Run “Install python packages” section.
4. Then we will run the cell “To download dataset (train and test)” which accesses the datasets in the program.
5. Then run the 'To Install the ktrain package' section to install the ktrain package.
6. Then run 'import the packages'.
7. The 'Download the pre-trained DistilBERT-base-uncased Model' section downloads the pre-trained model.
8. In the next cell data preprocessing is being done.
9. The model is being trained in "Training the Model" cell.
10. Then the predictions are made in the next cell.
11. The "Explanability of the model" section is used for analysis of the model.
Run the section "To resolve confusion between adjacent classes" to test the overlapping of words between classes.
----------------------------------------------------------------------------------------------------------------------------
GUI:
We have also attached a file containing the GUI (GUI.ipynb). This file first loads the stored model and tokenizer and uses them throughout the code. To store them, run the section named “Save Model” in code.ipynb; it will generate two files, model.tar.gz and tokenizer.pkl, which should be present in the same directory as GUI.ipynb.
All the cells should be run sequentially. The user will then see a window containing one textbox in which to enter the text; pressing the “Print” button shows the predicted rating and the probabilities of each rating.
Note: This GUI does not work in Colab (Colab runs remotely and has no display for the tkinter window) but works on local machines. If you want to test the GUI, one approach is to run code.ipynb in Google Colab and download the model.tar.gz and tokenizer.pkl files. Put those files in the same directory as GUI.ipynb and run all the cells of GUI.ipynb sequentially.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment