Commit d645b63d authored by Nilesh Jagdish's avatar Nilesh Jagdish

Added project files

parent fcfae6f0
# -*- coding: utf-8 -*-
"""Combined_classification.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/18oDzOkExok8oXNpf4EuhMMqFtw4-uQdN
"""
import pandas as pd
import numpy as np
import pickle
import itertools
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
import matplotlib
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
drive.mount('/content/drive')
x_train_text = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/text_train.csv')
x_test_text = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/text_test.csv')
y_train_text = x_train_text['label']
y_test_text = x_test_text['label']
x_train_audio = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/audio_train.csv')
x_test_audio = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/audio_test.csv')
y_train_audio = x_train_audio['label']
y_test_audio = x_test_audio['label']
y_train = y_train_audio  # identical to y_train_text: both CSVs share the same row order
y_test = y_test_audio    # identical to y_test_text
print(x_train_text.shape, y_train_text.shape, x_train_audio.shape, y_train_audio.shape)
emotion_dict = {'ang': 0,
                'hap': 1,
                'sad': 2,
                'fea': 3,
                'sur': 4,
                'neu': 5}
emo_keys = ['ang', 'hap', 'sad', 'fea', 'sur', 'neu']
id_to_emotion = {0: 'ang', 1: 'hap', 2: 'sad', 3: 'fea', 4: 'sur', 5: 'neu'}
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # plt.figure(figsize=(8,8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def one_hot_encoder(true_labels, num_records, num_classes):
    temp = np.array(true_labels[:num_records])
    true_labels = np.zeros((num_records, num_classes))
    true_labels[np.arange(num_records), temp] = 1
    return true_labels

def display_results(y_test, pred_probs, cm=True):
    pred = np.argmax(pred_probs, axis=-1)
    one_hot_true = one_hot_encoder(y_test, len(pred), len(emotion_dict))
    print('Test Set Accuracy = {0:.3f}'.format(accuracy_score(y_test, pred)))
    print('Test Set F-score = {0:.3f}'.format(f1_score(y_test, pred, average='macro')))
    print('Test Set Precision = {0:.3f}'.format(precision_score(y_test, pred, average='macro')))
    print('Test Set Recall = {0:.3f}'.format(recall_score(y_test, pred, average='macro')))
    if cm:
        plot_confusion_matrix(confusion_matrix(y_test, pred), classes=emo_keys)
cl_weight = dict(pd.Series(x_train_audio['label']).value_counts(normalize=True))  # class priors (computed but not used below)
"""## Get Text Features"""
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
# TF-IDF features (an alternative to the BERT encodings below); kept under
# separate names so the BERT cell can still read the raw transcriptions
features_tfidf = tfidf.fit_transform(pd.concat([x_train_text, x_test_text]).transcription).toarray()
x_train_tfidf = features_tfidf[:x_train_text.shape[0]]
x_test_tfidf = features_tfidf[-x_test_text.shape[0]:]
print(features_tfidf.shape, x_train_tfidf.shape, x_test_tfidf.shape)
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer
bert = SentenceTransformer('bert-base-uncased')
print(x_train_text)
# features = x_train_text[:10].transcription.apply(bert.encode)
features_text = pd.concat([x_train_text, x_test_text]).transcription.apply(bert.encode)
x_train_text = features_text[:x_train_text.shape[0]]
x_test_text = features_text[-x_test_text.shape[0]:]
print(features_text.shape, x_train_text.shape, x_test_text.shape)
print(x_train_text)
x_train_text = np.array(x_train_text.values.tolist())
x_test_text = np.array(x_test_text.values.tolist())
print(features_text.shape, x_train_text.shape, x_test_text.shape)
"""## Combine Text + Audio Features"""
combined_x_train = np.concatenate((np.array(x_train_audio[x_train_audio.columns[2:]]), x_train_text), axis=1)
combined_x_test = np.concatenate((np.array(x_test_audio[x_test_audio.columns[2:]]), x_test_text), axis=1)
print(combined_x_train.shape, combined_x_test.shape)
combined_features_dict = {}
combined_features_dict['x_train'] = combined_x_train
combined_features_dict['x_test'] = combined_x_test
combined_features_dict['y_train'] = np.array(y_train)
combined_features_dict['y_test'] = np.array(y_test)
with open('/content/drive/MyDrive/CS626/iemocap/combined_features_bert.pkl', 'wb') as f:
    pickle.dump(combined_features_dict, f)
rf_classifier = RandomForestClassifier(n_estimators=600, min_samples_split=25)
rf_classifier.fit(combined_x_train, y_train)
# Predict
pred_probs = rf_classifier.predict_proba(combined_x_test)
# Results
display_results(y_test, pred_probs)
with open('/content/drive/MyDrive/CS626/iemocap/combined_rf_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)
with open('/content/drive/MyDrive/CS626/iemocap/RF_bert.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)
xgb_classifier = xgb.XGBClassifier(max_depth=7, learning_rate=0.008, objective='multi:softprob',
                                   n_estimators=600, subsample=0.8, num_class=len(emotion_dict),
                                   booster='gbtree', n_jobs=4)  # was `sub_sample`, which XGBoost silently ignores
xgb_classifier.fit(combined_x_train, y_train)
# Predict
pred_probs = xgb_classifier.predict_proba(combined_x_test)
# Results
display_results(y_test, pred_probs)
with open('/content/drive/MyDrive/CS626/iemocap/combined_xgb_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)
with open('/content/drive/MyDrive/CS626/iemocap/XGB_bert.pkl', 'wb') as f:
    pickle.dump(xgb_classifier, f)
svc_classifier = LinearSVC()
svc_classifier.fit(combined_x_train, y_train)
# Predict (LinearSVC has no predict_proba, so metrics are computed on hard labels)
pred = svc_classifier.predict(combined_x_test)
# Results
print('Test Set Accuracy = {0:.3f}'.format(accuracy_score(y_test, pred)))
print('Test Set F-score = {0:.3f}'.format(f1_score(y_test, pred, average='macro')))
print('Test Set Precision = {0:.3f}'.format(precision_score(y_test, pred, average='macro')))
print('Test Set Recall = {0:.3f}'.format(recall_score(y_test, pred, average='macro')))
plot_confusion_matrix(confusion_matrix(y_test, pred), classes=emo_keys)
with open('/content/drive/MyDrive/CS626/iemocap/combined_svc_classifier_model_bert.pkl', 'wb') as f:
    pickle.dump(pred, f)
with open('/content/drive/MyDrive/CS626/iemocap/SVC_bert.pkl', 'wb') as f:
    pickle.dump(svc_classifier, f)
# NOTE: MultinomialNB requires non-negative features; it will raise a ValueError
# on the signed BERT embeddings unless they are shifted/scaled first.
mnb_classifier = MultinomialNB()
mnb_classifier.fit(combined_x_train, y_train)
# Predict
pred_probs = mnb_classifier.predict_proba(combined_x_test)
# Results
display_results(y_test, pred_probs)
with open('/content/drive/MyDrive/CS626/iemocap/combined_mnb_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)
with open('/content/drive/MyDrive/CS626/iemocap/MNB_bert.pkl', 'wb') as f:
    pickle.dump(mnb_classifier, f)
mlp_classifier = MLPClassifier(hidden_layer_sizes=(1000,), activation='relu', solver='adam', alpha=0.0001,
                               batch_size='auto', learning_rate='adaptive', learning_rate_init=0.01,
                               power_t=0.5, max_iter=1000, shuffle=True, random_state=None, tol=0.0001,
                               verbose=False, warm_start=True, momentum=0.8, nesterovs_momentum=True,
                               early_stopping=False, validation_fraction=0.1, beta_1=0.9, beta_2=0.999,
                               epsilon=1e-08)
mlp_classifier.fit(combined_x_train, y_train)
# Predict
pred_probs = mlp_classifier.predict_proba(combined_x_test)
# Results
display_results(y_test, pred_probs)
with open('/content/drive/MyDrive/CS626/iemocap/combined_mlp_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)
with open('/content/drive/MyDrive/CS626/iemocap/MLP_bert.pkl', 'wb') as f:
    pickle.dump(mlp_classifier, f)
lr_classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)
lr_classifier.fit(combined_x_train, y_train)
# Predict
pred_probs = lr_classifier.predict_proba(combined_x_test)
# Results
display_results(y_test, pred_probs)
with open('/content/drive/MyDrive/CS626/iemocap/combined_lr_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)
with open('/content/drive/MyDrive/CS626/iemocap/LR_bert.pkl', 'wb') as f:
    pickle.dump(lr_classifier, f)
ax = xgb.plot_importance(xgb_classifier, max_num_features=10, height=0.5, show_values=False)
fig = ax.figure
fig.set_size_inches(8, 8)
contribution_scores = xgb_classifier.feature_importances_
print(contribution_scores)
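# The combined feature vector is the 8 audio statistics followed by the 768
# BERT dimensions, so importance indices can be mapped back to a modality.
# A small sketch of that mapping (the 8/768 split follows from the feature
# construction above; `n_audio` and `top` are illustrative names):
n_audio = x_train_audio.shape[1] - 2  # columns 2: of the audio CSV were used as features
top = np.argsort(contribution_scores)[::-1][:10]
for idx in top:
    modality = 'audio' if idx < n_audio else 'text (BERT dim {})'.format(idx - n_audio)
    print(idx, modality, contribution_scores[idx])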
# Load predicted probabilities
with open('/content/drive/MyDrive/CS626/iemocap/combined_rf_classifier_bert.pkl', 'rb') as f:
    rf_pred_probs = pickle.load(f)
with open('/content/drive/MyDrive/CS626/iemocap/combined_xgb_classifier_bert.pkl', 'rb') as f:
    xgb_pred_probs = pickle.load(f)
# with open('/content/drive/MyDrive/CS626/iemocap/combined_svc_classifier_model_bert.pkl', 'rb') as f:
#     svc_preds = pickle.load(f)
#     svc_preds = svc_preds.reshape(-1, 6)
# with open('/content/drive/MyDrive/CS626/iemocap/combined_mnb_classifier_bert.pkl', 'rb') as f:
#     mnb_pred_probs = pickle.load(f)
with open('/content/drive/MyDrive/CS626/iemocap/combined_mlp_classifier_bert.pkl', 'rb') as f:
    mlp_pred_probs = pickle.load(f)
with open('/content/drive/MyDrive/CS626/iemocap/combined_lr_classifier_bert.pkl', 'rb') as f:  # was combined_lr_classifier.pkl, a file never written above
    lr_pred_probs = pickle.load(f)
# with open('/content/drive/MyDrive/CS626/iemocap/combined_lstm_classifier.pkl', 'rb') as f:
#     lstm_pred_probs = pickle.load(f)
# Sum of the predicted probabilities (argmax of the sum equals argmax of the average)
ensemble_pred_probs = (xgb_pred_probs +
                       mlp_pred_probs +
                       rf_pred_probs +
                       # mnb_pred_probs +
                       lr_pred_probs)
# Show metrics
display_results(y_test, ensemble_pred_probs)
# Experimental: fine-tune a BERT classifier with the Hugging Face Trainer.
# As written this cell is incomplete: Trainer expects tokenized
# `datasets.Dataset` objects, not raw numpy matrices, and `num_labels=2`
# does not match the 6 emotion classes, so `trainer.train()` will fail.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)
from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer")
from transformers import Trainer
trainer = Trainer(
    model=model, args=training_args, train_dataset=combined_x_train, eval_dataset=combined_x_test
)
new_xtrain = np.concatenate((combined_x_train, np.array(y_train).reshape(-1, 1)), axis=1)  # unused; original call was missing the tuple and axis
trainer.train()
import numpy as np
from datasets import load_metric
metric = load_metric("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
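# A minimal sketch of what the Trainer cell above would need in order to run:
# tokenize the raw transcriptions into `datasets.Dataset` objects. The names
# `train_ds`/`test_ds` and the reuse of text_train.csv / text_test.csv here
# are illustrative assumptions, not part of the original notebook.
from datasets import Dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
train_ds = Dataset.from_pandas(pd.read_csv('/content/drive/MyDrive/CS626/iemocap/text_train.csv'))
test_ds = Dataset.from_pandas(pd.read_csv('/content/drive/MyDrive/CS626/iemocap/text_test.csv'))

def tokenize(batch):
    return tokenizer(batch['transcription'], padding='max_length', truncation=True)

# Drop the string columns so the default collator only sees tensors + 'label'
train_ds = train_ds.map(tokenize, batched=True, remove_columns=['wav_file', 'transcription'])
test_ds = test_ds.map(tokenize, batched=True, remove_columns=['wav_file', 'transcription'])

model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=len(emotion_dict))
trainer = Trainer(model=model, args=training_args,
                  train_dataset=train_ds, eval_dataset=test_ds,
                  compute_metrics=compute_metrics)
trainer.train()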
# -*- coding: utf-8 -*-
"""Feature_Extraction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/18_6nXwUaTJTNZRgLkqr1w4KVrjHahufb
# Extract labels from the evaluation files
"""
import re
import os
info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)
start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []
for sess in range(1, 6):
    emo_evaluation_dir = '/content/drive/MyDrive/CS626/iemocap/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess)
    evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]
    for file in evaluation_files:
        with open(emo_evaluation_dir + file) as f:
            content = f.read()
        info_lines = re.findall(info_line, content)
        for line in info_lines[1:]:  # the first line is a header
            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\t')
            start_time, end_time = start_end_time[1:-1].split('-')
            val, act, dom = val_act_dom[1:-1].split(',')
            val, act, dom = float(val), float(act), float(dom)
            start_time, end_time = float(start_time), float(end_time)
            start_times.append(start_time)
            end_times.append(end_time)
            wav_file_names.append(wav_file_name)
            emotions.append(emotion)
            vals.append(val)
            acts.append(act)
            doms.append(dom)
import pandas as pd
df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])
df_iemocap['start_time'] = start_times
df_iemocap['end_time'] = end_times
df_iemocap['wav_file'] = wav_file_names
df_iemocap['emotion'] = emotions
df_iemocap['val'] = vals
df_iemocap['act'] = acts
df_iemocap['dom'] = doms
df_iemocap.tail()
df_iemocap.to_csv('/content/drive/MyDrive/CS626/iemocap/df_iemocap.csv', index=False)
"""## Build Audio Vectors"""
# Commented out IPython magic to ensure Python compatibility.
import librosa
import os
import math  # needed for math.floor below
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as ms
from tqdm import tqdm
import pickle
import IPython.display
import librosa.display
ms.use('seaborn-muted')
# %matplotlib inline
labels_df = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/df_iemocap.csv')
iemocap_dir = '/content/drive/MyDrive/CS626/iemocap/IEMOCAP_full_release/'
sr = 44100
audio_vectors = {}
for sess in range(1, 6):  # all five sessions; shrink the range if memory is tight
    wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess)
    orig_wav_files = os.listdir(wav_file_path)
    for orig_wav_file in tqdm(orig_wav_files):
        try:
            orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)
            orig_wav_file, file_format = orig_wav_file.split('.')
            for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():
                start_time, end_time, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row['end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']
                start_frame = math.floor(start_time * sr)
                end_frame = math.floor(end_time * sr)
                truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1]
                audio_vectors[truncated_wav_file_name] = truncated_wav_vector
        except Exception:
            print('An exception occurred for {}'.format(orig_wav_file))
    with open('/content/drive/MyDrive/CS626/iemocap/audio_vectors_{}.pkl'.format(sess), 'wb') as f:
        pickle.dump(audio_vectors, f)
"""## Extract Audio Features"""
# Commented out IPython magic to ensure Python compatibility.
import os
import pickle
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as ms
from tqdm import tqdm
import librosa
import math
import random
import pandas as pd
import IPython.display
import librosa.display
ms.use('seaborn-muted')
# %matplotlib inline
columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std']
df_features = pd.DataFrame(columns=columns)
emotion_dict = {'ang': 0,
                'hap': 1,
                'exc': 2,
                'sad': 3,
                'fru': 4,
                'fea': 5,
                'sur': 6,
                'neu': 7,
                'xxx': 8,
                'oth': 8}
data_dir = '/content/drive/MyDrive/CS626/iemocap/'
labels_path = '{}df_iemocap.csv'.format(data_dir)
audio_vectors_path = '{}audio_vectors_'.format(data_dir)
labels_df = pd.read_csv(labels_path)
for sess in range(1, 6):
    audio_vectors = pickle.load(open('{}{}.pkl'.format(audio_vectors_path, sess), 'rb'))
    for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains('Ses0{}'.format(sess))].iterrows()):
        try:
            wav_file_name = row['wav_file']
            label = emotion_dict[row['emotion']]
            y = audio_vectors[wav_file_name]
            feature_list = [wav_file_name, label]  # wav_file, label
            sig_mean = np.mean(abs(y))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(y))  # sig_std
            rmse = librosa.feature.rms(y=y + 0.0001)[0]  # keyword arg: `y` is keyword-only in newer librosa
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std
            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence
            y_harmonic = librosa.effects.hpss(y)[0]
            feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)
            # based on the pitch detection algorithm mentioned here:
            # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001
            cl = 0.45 * sig_mean
            center_clipped = []
            for s in y:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                else:
                    center_clipped.append(0)
            auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            feature_list.append(1000 * np.max(auto_corrs) / len(auto_corrs))  # auto_corr_max (scaled by 1000)
            feature_list.append(np.std(auto_corrs))  # auto_corr_std
            df_features = pd.concat([df_features, pd.DataFrame(feature_list, index=columns).transpose()], ignore_index=True)
        except Exception:
            print('An exception occurred for {}'.format(row['wav_file']))
df_features.to_csv('/content/drive/MyDrive/CS626/iemocap/audio_features.csv', index=False)  # was audio_features_4.csv; the Prepare Data step below reads audio_features.csv
"""## Prepare Data"""
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
# %matplotlib inline
df = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/audio_features.csv')
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])]
print(df.shape)
display(df.head())
# collapse the 8 raw labels into 6 classes (exc -> hap, fru -> sad)
df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})
df.head()
df.to_csv('/content/drive/MyDrive/CS626/iemocap/no_sample_df.csv')
# oversample the rare classes: fear (label 3) and surprise (label 4)
# NOTE: oversampling before the train/test split lets duplicates land in both sets
fear_df = df[df['label'] == 3]
for i in range(30):
    df = pd.concat([df, fear_df], ignore_index=True)
sur_df = df[df['label'] == 4]
for i in range(10):
    df = pd.concat([df, sur_df], ignore_index=True)
df.to_csv('/content/drive/MyDrive/CS626/iemocap/modified_df.csv')
# NOTE: this 4-class dict is stale and unused; the 6-class mapping above is
# what the rest of the pipeline uses.
emotion_dict = {'ang': 0,
                'hap': 1,
                'sad': 2,
                'neu': 3}
scalar = MinMaxScaler()
df[df.columns[2:]] = scalar.fit_transform(df[df.columns[2:]])
df.head()
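# The test-time notebook (Feature_Extraction_for_test_file) loads a pickled
# scaler from test_pkl/scalar.pkl, but nothing in this file saves one. A
# minimal sketch, assuming that pickle should be this fitted MinMaxScaler
# and that the test_pkl directory already exists (path copied from the
# loading side):
with open('/content/drive/MyDrive/CS626/test_pkl/scalar.pkl', 'wb') as f:
    pickle.dump(scalar, f)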
x_train, x_test = train_test_split(df, test_size=0.20)
x_train.to_csv('/content/drive/MyDrive/CS626/iemocap/audio_train.csv', index=False)
x_test.to_csv('/content/drive/MyDrive/CS626/iemocap/audio_test.csv', index=False)
print(x_train.shape, x_test.shape)
"""### Define preprocessing functions for text"""
import re
import os
import pickle
useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)
file2transcriptions = {}
for sess in range(1, 6):
    transcripts_path = '/content/drive/MyDrive/CS626/iemocap/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    transcript_files = os.listdir(transcripts_path)
    for fname in transcript_files:  # renamed: the loop variable used to shadow the file handle `f`
        with open('{}{}'.format(transcripts_path, fname), 'r') as f:
            all_lines = f.readlines()
        for l in all_lines:
            audio_code = useful_regex.match(l).group()
            transcription = l.split(':')[-1].strip()
            # assuming all keys are unique, so no `try` needed
            file2transcriptions[audio_code] = transcription
# save dict
with open('/content/drive/MyDrive/CS626/iemocap/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions)
audiocode2text = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/audiocode2text.pkl', 'rb'))
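# `normalizeString` is used below but never defined in this repo. A minimal
# sketch, assuming the usual lowercase/strip-punctuation cleanup (the helper
# of the same name from the PyTorch seq2seq tutorial):
def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)      # pad sentence-final punctuation with a space
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  # drop everything except letters and . ! ?
    return s.strip()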
text_train = pd.DataFrame()
text_train['wav_file'] = x_train['wav_file']
text_train['label'] = x_train['label']
text_train['transcription'] = [normalizeString(audiocode2text[code]) for code in x_train['wav_file']]
text_test = pd.DataFrame()
text_test['wav_file'] = x_test['wav_file']
text_test['label'] = x_test['label']
text_test['transcription'] = [normalizeString(audiocode2text[code]) for code in x_test['wav_file']]
text_train.to_csv('/content/drive/MyDrive/CS626/iemocap/text_train.csv', index=False)
text_test.to_csv('/content/drive/MyDrive/CS626/iemocap/text_test.csv', index=False)
print(text_train.shape, text_test.shape)
# -*- coding: utf-8 -*-
"""Feature_Extraction_for_test_file.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1I93vu44ZmJ-bLjyLf4r0jWV6LX8XlIHO
# Get Audio File
"""
!sudo apt-get install -q -y timidity libsndfile1
!pip install pydub numba==0.48 librosa music21
!pip install SpeechRecognition pydub
!pip install sentence_transformers
from google.colab import drive
drive.mount('/content/drive')
# Commented out IPython magic to ensure Python compatibility.
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa import display as librosadisplay
import logging
import math
import statistics
import sys
from IPython.display import Audio, Javascript
from scipy.io import wavfile
from base64 import b64decode
import music21
from pydub import AudioSegment
import speech_recognition as sp_r
import re
import pandas as pd
logger = logging.getLogger()
logger.setLevel(logging.ERROR)
print("tensorflow: %s" % tf.__version__)
info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)
import os
import pickle
import soundfile as sf
import matplotlib.pyplot as plt
import matplotlib.style as ms
from tqdm import tqdm
import math
import random
import IPython.display
import librosa.display
ms.use('seaborn-muted')
# %matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
const reader = new FileReader()
reader.onloadend = e => resolve(e.srcElement.result)
reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
stream = await navigator.mediaDevices.getUserMedia({ audio: true })
recorder = new MediaRecorder(stream)
chunks = []
recorder.ondataavailable = e => chunks.push(e.data)
recorder.start()
await sleep(time)
recorder.onstop = async ()=>{
blob = new Blob(chunks)
text = await b2text(blob)
resolve(text)
}
recorder.stop()
})
"""
def record(sec=10):
    try:
        from google.colab import output
    except ImportError:
        print('Could not import output from google.colab')
        return ''
    else:
        print('Recording')
        display(Javascript(RECORD))
        s = output.eval_js('record(%d)' % (sec * 1000))
        fname = 'recorded_audio.wav'
        print('Saving to', fname)
        b = b64decode(s.split(',')[1])
        with open(fname, 'wb') as f:
            f.write(b)
        return fname
#@title Select how to input your audio { run: "auto" }
INPUT_SOURCE = 'RECORD' #@param ["https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav", "RECORD", "UPLOAD", "./drive/My Drive/YOUR_MUSIC_FILE.wav"] {allow-input: true}
audio_vectors = {}
sr = 44100
print('You selected', INPUT_SOURCE)
def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file
# NOTE: the helpers used below (get_text_from_audio, get_feature_frame,
# get_features, save_features, apply_scaling, get_text_encoding, get_probs)
# are defined further down in this file; run those cells first (notebook
# cell order).
if INPUT_SOURCE == 'RECORD':
    uploaded_file_name = record(7)
    EXPECTED_SAMPLE_RATE = 16000
    converted_audio_file = convert_audio_for_model(uploaded_file_name)
    sample_rate, audio_samples = wavfile.read(converted_audio_file)  # dropped stray 'rb' second argument
    orig_wav_vector, _sr = librosa.load(converted_audio_file, sr=sr)
    orig_wav_file = "converted_audio_file"
    audio_vectors[orig_wav_file] = orig_wav_vector
    text = get_text_from_audio()
    df_features, columns = get_feature_frame()
    df_features = get_features(audio_vectors, df_features, columns)
    save_features(audio_vectors)
    df_features = apply_scaling(df_features)
    encoding = get_text_encoding(text)
    print(text)
    probs = get_probs(encoding)
    print(probs)
elif INPUT_SOURCE == 'UPLOAD':
    wav_file_path = '/content/drive/MyDrive/CS626/test/'
    orig_wav_files = os.listdir(wav_file_path)
    for orig_wav_file in tqdm(orig_wav_files):
        orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)
        orig_wav_file, file_format = orig_wav_file.split('.')
        audio_vectors[orig_wav_file] = orig_wav_vector
    df_features, columns = get_feature_frame()
    df_features = get_features(audio_vectors, df_features, columns)
    save_features(audio_vectors)
    df_features = apply_scaling(df_features)
    with open('/content/drive/MyDrive/CS626/test_pkl/sample_text.txt', 'r') as f:
        text_example = f.readline()
    encoding = get_text_encoding(text_example)
    probs = get_probs(encoding)
    print(probs)
"""# Get Text file"""
def get_text_from_audio():
    filename = "converted_audio_file.wav"
    r = sp_r.Recognizer()
    with sp_r.AudioFile(filename) as source:
        audio_data = r.record(source)
    text = r.recognize_google(audio_data)
    print(text)
    return text
"""## Build Audio Vectors"""
def save_features(audio_vectors):
    with open('/content/drive/MyDrive/CS626/test_pkl/audio_vector_1.pkl', 'wb') as f:
        pickle.dump(audio_vectors, f)
"""## Extract Audio Features"""
def get_feature_frame():
    columns = ['wav_file', 'sig_mean', 'sig_std', 'rmse_mean', 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std']
    df_features = pd.DataFrame(columns=columns)
    return df_features, columns
def get_features(audio_vectors, df_features, columns):
    for wav_file_name in audio_vectors.keys():
        try:
            y = audio_vectors[wav_file_name]
            feature_list = [wav_file_name]
            sig_mean = np.mean(abs(y))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(y))  # sig_std
            rmse = librosa.feature.rms(y=y + 0.0001)[0]  # keyword arg: `y` is keyword-only in newer librosa
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std
            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence
            y_harmonic = librosa.effects.hpss(y)[0]
            feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)
            cl = 0.45 * sig_mean
            center_clipped = []
            for s in y:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                else:
                    center_clipped.append(0)
            auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            feature_list.append(1000 * np.max(auto_corrs) / len(auto_corrs))  # auto_corr_max (scaled by 1000)
            feature_list.append(np.std(auto_corrs))  # auto_corr_std
            df_features = pd.concat([df_features, pd.DataFrame(feature_list, index=columns).transpose()], ignore_index=True)
        except Exception:
            print('An exception occurred for {}'.format(wav_file_name))
    df_features.to_csv('/content/drive/MyDrive/CS626/test_pkl/audio_features_1.csv', index=False)
    return df_features
"""## Prepare Data"""
def apply_scaling(df):
    with open('/content/drive/MyDrive/CS626/test_pkl/scalar.pkl', 'rb') as f:
        scalar = pickle.load(f)
    df[df.columns[1:]] = scalar.transform(df[df.columns[1:]])
    return df
"""### Define preprocessing functions for text"""
def get_text_encoding(text_example):
    from sentence_transformers import SentenceTransformer
    bert = SentenceTransformer('bert-base-uncased')
    encoding = bert.encode(text_example)
    x_test_text = encoding.reshape(-1, 768)
    return x_test_text
def get_probs(x_test_text):
    x_test_audio = pd.read_csv('/content/drive/MyDrive/CS626/test_pkl/audio_features_1.csv')
    combined_x_test = np.concatenate((np.array(x_test_audio[x_test_audio.columns[1:]]), x_test_text), axis=1)
    xgb = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/XGB_bert.pkl', 'rb'))
    probs_xgb = xgb.predict_proba(combined_x_test)
    mlp = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/MLP_bert.pkl', 'rb'))
    probs_mlp = mlp.predict_proba(combined_x_test)
    rf = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/RF_bert.pkl', 'rb'))
    probs_rf = rf.predict_proba(combined_x_test)
    lr = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/LR_bert.pkl', 'rb'))  # was LR.pkl; training saves LR_bert.pkl
    probs_lr = lr.predict_proba(combined_x_test)  # was mlp.predict_proba: a copy-paste bug
    probs = probs_xgb + probs_mlp + probs_rf + probs_lr
    emotion_dict = {0: 'Angry',
                    1: 'Happy',
                    2: 'Sad',
                    3: 'Fear',
                    4: 'Surprised',
                    5: 'Neutral'}
    index = np.argmax(probs)  # assumes a single test utterance (one-row prob matrix)
    print("\n\nDetected Emotion : ", emotion_dict[index])
    print("Emotion Dictionary : ", emotion_dict)
    return probs
Instructions for running the code:
1. Run the feature_extraction file first:
   python3 feature_extraction.py
2. Run the combined_classification file to train the models:
   python3 combined_classification.py
3. Run the feature_extraction_for_test_file file to evaluate the models on a single file:
   python3 feature_extraction_for_test_file.py
The dataset is hosted on Drive.
Link : https://drive.google.com/drive/folders/12ELUg5aEfyd9BLcje6nzH10U5KwAA8tP?usp=sharing
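Dependencies are not pinned in the repo; judging from the imports, the scripts assume at least:
pip install pandas numpy scikit-learn xgboost librosa soundfile sentence-transformers transformers datasets matplotlib seaborn tqdm pydub SpeechRecognition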