Commit d645b63d authored Aug 18, 2022 by Nilesh Jagdish

Added project files

parent fcfae6f0

Showing 5 changed files with 880 additions and 0 deletions
CS626 FinalProjectPresentation.pdf   +0 −0
combined_classification.py           +336 −0
feature_extraction.py                +272 −0
feature_extraction_for_test_file.py  +260 −0
readme.txt                           +12 −0
CS626 FinalProjectPresentation.pdf  0 → 100644
File added
combined_classification.py  0 → 100644
# -*- coding: utf-8 -*-
"""Combined_classification.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/18oDzOkExok8oXNpf4EuhMMqFtw4-uQdN
"""
import pandas as pd
import numpy as np
import pickle
import itertools
import xgboost as xgb
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, precision_score, recall_score
import matplotlib
from sklearn.manifold import TSNE
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
# Mount Google Drive first so the CSVs below are readable (in the original notebook
# this cell appeared after the read_csv calls).
from google.colab import drive
drive.mount('/content/drive')

x_train_text = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/text_train.csv')
x_test_text = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/text_test.csv')
y_train_text = x_train_text['label']
y_test_text = x_test_text['label']

x_train_audio = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/audio_train.csv')
x_test_audio = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/audio_test.csv')
y_train_audio = x_train_audio['label']
y_test_audio = x_test_audio['label']

y_train = y_train_audio  # since y_train_audio == y_train_text
y_test = y_test_audio  # since y_test_audio == y_test_text

print(x_train_text.shape, y_train_text.shape, x_train_audio.shape, y_train_audio.shape)
emotion_dict = {'ang': 0, 'hap': 1, 'sad': 2, 'fea': 3, 'sur': 4, 'neu': 5}
emo_keys = list(['ang', 'hap', 'sad', 'fea', 'sur', 'neu'])
id_to_emotion = {0: 'ang', 1: 'hap', 2: 'sad', 3: 'fea', 4: 'sur', 5: 'neu'}
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # plt.figure(figsize=(8,8))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
def one_hot_encoder(true_labels, num_records, num_classes):
    temp = np.array(true_labels[:num_records])
    true_labels = np.zeros((num_records, num_classes))
    true_labels[np.arange(num_records), temp] = 1
    return true_labels
def display_results(y_test, pred_probs, cm=True):
    pred = np.argmax(pred_probs, axis=-1)
    one_hot_true = one_hot_encoder(y_test, len(pred), len(emotion_dict))
    print('Test Set Accuracy =  {0:.3f}'.format(accuracy_score(y_test, pred)))
    print('Test Set F-score =  {0:.3f}'.format(f1_score(y_test, pred, average='macro')))
    print('Test Set Precision =  {0:.3f}'.format(precision_score(y_test, pred, average='macro')))
    print('Test Set Recall =  {0:.3f}'.format(recall_score(y_test, pred, average='macro')))
    if cm:
        plot_confusion_matrix(confusion_matrix(y_test, pred), classes=emo_keys)
cl_weight = dict(pd.Series(x_train_audio['label']).value_counts(normalize=True))  # label frequencies (not used further in this script)

"""## Get Text Features"""
# Option 1: TF-IDF features over the transcriptions (alternative to the BERT
# sentence embeddings computed in the next cell).
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1',
                        ngram_range=(1, 2), stop_words='english')
features_text = tfidf.fit_transform(x_train_text.append(x_test_text).transcription).toarray()
x_train_text = features_text[:x_train_text.shape[0]]
x_test_text = features_text[-x_test_text.shape[0]:]
print(features_text.shape, x_train_text.shape, x_test_text.shape)
# Option 2: BERT sentence embeddings (expects x_train_text / x_test_text to still be
# the original DataFrames, so run this instead of, not after, the TF-IDF cell).
!pip install sentence_transformers
from sentence_transformers import SentenceTransformer

bert = SentenceTransformer('bert-base-uncased')
print(x_train_text)

# features = x_train_text[:10].transcription.apply(bert.encode)
features_text = x_train_text.append(x_test_text).transcription.apply(bert.encode)
x_train_text = features_text[:x_train_text.shape[0]]
x_test_text = features_text[-x_test_text.shape[0]:]
print(features_text.shape, x_train_text.shape, x_test_text.shape)
print(x_train_text)

x_train_text = np.array(x_train_text.values.tolist())
x_test_text = np.array(x_test_text.values.tolist())
print(features_text.shape, x_train_text.shape, x_test_text.shape)
"""## Combine Text + Audio Features"""
combined_x_train = np.concatenate((np.array(x_train_audio[x_train_audio.columns[2:]]), x_train_text), axis=1)
combined_x_test = np.concatenate((np.array(x_test_audio[x_test_audio.columns[2:]]), x_test_text), axis=1)
print(combined_x_train.shape, combined_x_test.shape)
combined_features_dict = {}
combined_features_dict['x_train'] = combined_x_train
combined_features_dict['x_test'] = combined_x_test
combined_features_dict['y_train'] = np.array(y_train)
combined_features_dict['y_test'] = np.array(y_test)

with open('/content/drive/MyDrive/CS626/iemocap/combined_features_bert.pkl', 'wb') as f:
    pickle.dump(combined_features_dict, f)
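# The saved dict can be reloaded later to skip recomputing the BERT encodings — a
# usage sketch (not part of the original notebook):
# with open('/content/drive/MyDrive/CS626/iemocap/combined_features_bert.pkl', 'rb') as f:
#     d = pickle.load(f)
# combined_x_train, y_train = d['x_train'], d['y_train']
# combined_x_test, y_test = d['x_test'], d['y_test']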
rf_classifier = RandomForestClassifier(n_estimators=600, min_samples_split=25)
rf_classifier.fit(combined_x_train, y_train)

# Predict
pred_probs = rf_classifier.predict_proba(combined_x_test)

# Results
display_results(y_test, pred_probs)

with open('/content/drive/MyDrive/CS626/iemocap/combined_rf_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)

with open('/content/drive/MyDrive/CS626/iemocap/RF_bert.pkl', 'wb') as f:
    pickle.dump(rf_classifier, f)
xgb_classifier = xgb.XGBClassifier(max_depth=7, learning_rate=0.008, objective='multi:softprob',
                                   n_estimators=600, subsample=0.8, num_class=len(emotion_dict),
                                   booster='gbtree', n_jobs=4)
xgb_classifier.fit(combined_x_train, y_train)

# Predict
pred_probs = xgb_classifier.predict_proba(combined_x_test)

# Results
display_results(y_test, pred_probs)

with open('/content/drive/MyDrive/CS626/iemocap/combined_xgb_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)

with open('/content/drive/MyDrive/CS626/iemocap/XGB_bert.pkl', 'wb') as f:
    pickle.dump(xgb_classifier, f)
svc_classifier = LinearSVC()
svc_classifier.fit(combined_x_train, y_train)

# Predict (LinearSVC has no predict_proba, so metrics are computed from hard predictions)
pred = svc_classifier.predict(combined_x_test)

# Results
one_hot_true = one_hot_encoder(y_test, len(pred), len(emotion_dict))
print('Test Set Accuracy =  {0:.3f}'.format(accuracy_score(y_test, pred)))
print('Test Set F-score =  {0:.3f}'.format(f1_score(y_test, pred, average='macro')))
print('Test Set Precision =  {0:.3f}'.format(precision_score(y_test, pred, average='macro')))
print('Test Set Recall =  {0:.3f}'.format(recall_score(y_test, pred, average='macro')))
plot_confusion_matrix(confusion_matrix(y_test, pred), classes=emo_keys)

with open('/content/drive/MyDrive/CS626/iemocap/combined_svc_classifier_model_bert.pkl', 'wb') as f:
    pickle.dump(pred, f)

with open('/content/drive/MyDrive/CS626/iemocap/SVC_bert.pkl', 'wb') as f:
    pickle.dump(svc_classifier, f)
# Note: MultinomialNB requires non-negative inputs, so this cell only works with the
# TF-IDF features, not the (signed) BERT embeddings.
mnb_classifier = MultinomialNB()
mnb_classifier.fit(combined_x_train, y_train)

# Predict
pred_probs = mnb_classifier.predict_proba(combined_x_test)

# Results
display_results(y_test, pred_probs)

with open('/content/drive/MyDrive/CS626/iemocap/combined_mnb_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)

with open('/content/drive/MyDrive/CS626/iemocap/MNB_bert.pkl', 'wb') as f:
    pickle.dump(mnb_classifier, f)
mlp_classifier = MLPClassifier(hidden_layer_sizes=(1000,), activation='relu', solver='adam',
                               alpha=0.0001, batch_size='auto', learning_rate='adaptive',
                               learning_rate_init=0.01, power_t=0.5, max_iter=1000,
                               shuffle=True, random_state=None, tol=0.0001, verbose=False,
                               warm_start=True, momentum=0.8, nesterovs_momentum=True,
                               early_stopping=False, validation_fraction=0.1,
                               beta_1=0.9, beta_2=0.999, epsilon=1e-08)
mlp_classifier.fit(combined_x_train, y_train)

# Predict
pred_probs = mlp_classifier.predict_proba(combined_x_test)

# Results
display_results(y_test, pred_probs)

with open('/content/drive/MyDrive/CS626/iemocap/combined_mlp_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)

with open('/content/drive/MyDrive/CS626/iemocap/MLP_bert.pkl', 'wb') as f:
    pickle.dump(mlp_classifier, f)
lr_classifier = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=1000)
lr_classifier.fit(combined_x_train, y_train)

# Predict
pred_probs = lr_classifier.predict_proba(combined_x_test)

# Results
display_results(y_test, pred_probs)

with open('/content/drive/MyDrive/CS626/iemocap/combined_lr_classifier_bert.pkl', 'wb') as f:
    pickle.dump(pred_probs, f)

with open('/content/drive/MyDrive/CS626/iemocap/LR_bert.pkl', 'wb') as f:
    pickle.dump(lr_classifier, f)
ax = xgb.plot_importance(xgb_classifier, max_num_features=10, height=0.5, show_values=False)
fig = ax.figure
fig.set_size_inches(8, 8)

contribution_scores = xgb_classifier.feature_importances_
print(contribution_scores)
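# contribution_scores is indexed by column position in combined_x_train. A sketch of
# mapping indices back to names, assuming the audio feature columns come first,
# followed by the 768 BERT dimensions (sketch only, not in the original notebook):
# feature_names = list(x_train_audio.columns[2:]) + ['bert_{}'.format(i) for i in range(768)]
# top10 = sorted(zip(feature_names, contribution_scores), key=lambda t: -t[1])[:10]
# print(top10)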
# Load predicted probabilities
with open('/content/drive/MyDrive/CS626/iemocap/combined_rf_classifier_bert.pkl', 'rb') as f:
    rf_pred_probs = pickle.load(f)
with open('/content/drive/MyDrive/CS626/iemocap/combined_xgb_classifier_bert.pkl', 'rb') as f:
    xgb_pred_probs = pickle.load(f)
# with open('/content/drive/MyDrive/CS626/iemocap/combined_svc_classifier_model_bert.pkl', 'rb') as f:
#     svc_preds = pickle.load(f)
#     svc_preds = svc_preds.reshape(-1, 6)
# with open('/content/drive/MyDrive/CS626/iemocap/combined_mnb_classifier_bert.pkl', 'rb') as f:
#     mnb_pred_probs = pickle.load(f)
with open('/content/drive/MyDrive/CS626/iemocap/combined_mlp_classifier_bert.pkl', 'rb') as f:
    mlp_pred_probs = pickle.load(f)
with open('/content/drive/MyDrive/CS626/iemocap/combined_lr_classifier_bert.pkl', 'rb') as f:
    lr_pred_probs = pickle.load(f)
# with open('/content/drive/MyDrive/CS626/iemocap/combined_lstm_classifier.pkl', 'rb') as f:
#     lstm_pred_probs = pickle.load(f)

# Sum of the predicted probabilities (proportional to the average, so the argmax is unchanged)
ensemble_pred_probs = (xgb_pred_probs +
                       mlp_pred_probs +
                       rf_pred_probs +
                       # mnb_pred_probs +
                       lr_pred_probs)

# Show metrics
display_results(y_test, ensemble_pred_probs)
# Experimental: fine-tune a BERT classifier with the HF Trainer. Note this cell is
# incomplete as committed — Trainer expects tokenized `datasets.Dataset` objects,
# not raw feature matrices.
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=2)

from transformers import TrainingArguments
training_args = TrainingArguments("test_trainer")

from transformers import Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=combined_x_train, eval_dataset=combined_x_test)

new_xtrain = np.concatenate((combined_x_train, np.array(y_train).reshape(-1, 1)), axis=1)

trainer.train()

import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)
\ No newline at end of file
feature_extraction.py  0 → 100644
# -*- coding: utf-8 -*-
"""Feature_Extraction.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/18_6nXwUaTJTNZRgLkqr1w4KVrjHahufb
# Extract labels from the evaluation files
"""
import re
import os

info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)

start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []

for sess in range(1, 6):
    emo_evaluation_dir = '/content/drive/MyDrive/CS626/iemocap/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess)
    evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]
    for file in evaluation_files:
        with open(emo_evaluation_dir + file) as f:
            content = f.read()
        info_lines = re.findall(info_line, content)
        for line in info_lines[1:]:  # the first line is a header
            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\t')
            start_time, end_time = start_end_time[1:-1].split('-')
            val, act, dom = val_act_dom[1:-1].split(',')
            val, act, dom = float(val), float(act), float(dom)
            start_time, end_time = float(start_time), float(end_time)
            start_times.append(start_time)
            end_times.append(end_time)
            wav_file_names.append(wav_file_name)
            emotions.append(emotion)
            vals.append(val)
            acts.append(act)
            doms.append(dom)
import pandas as pd

df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])
df_iemocap['start_time'] = start_times
df_iemocap['end_time'] = end_times
df_iemocap['wav_file'] = wav_file_names
df_iemocap['emotion'] = emotions
df_iemocap['val'] = vals
df_iemocap['act'] = acts
df_iemocap['dom'] = doms

df_iemocap.tail()
df_iemocap.to_csv('/content/drive/MyDrive/CS626/iemocap/df_iemocap.csv', index=False)
"""## Build Audio Vectors"""
# Commented out IPython magic to ensure Python compatibility.
import librosa
import os
import math
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as ms
from tqdm import tqdm
import pickle
import IPython.display
import librosa.display
ms.use('seaborn-muted')
# %matplotlib inline
labels_df = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/df_iemocap.csv')
iemocap_dir = '/content/drive/MyDrive/CS626/iemocap/IEMOCAP_full_release/'

sr = 44100
audio_vectors = {}
for sess in range(1, 6):  # replace range(1, 6) with e.g. [5] to use a single session if memory is tight
    wav_file_path = '{}Session{}/dialog/wav/'.format(iemocap_dir, sess)
    orig_wav_files = os.listdir(wav_file_path)
    for orig_wav_file in tqdm(orig_wav_files):
        try:
            orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)
            orig_wav_file, file_format = orig_wav_file.split('.')
            for index, row in labels_df[labels_df['wav_file'].str.contains(orig_wav_file)].iterrows():
                start_time, end_time, truncated_wav_file_name, emotion, val, act, dom = row['start_time'], row['end_time'], row['wav_file'], row['emotion'], row['val'], row['act'], row['dom']
                start_frame = math.floor(start_time * sr)
                end_frame = math.floor(end_time * sr)
                truncated_wav_vector = orig_wav_vector[start_frame:end_frame + 1]
                audio_vectors[truncated_wav_file_name] = truncated_wav_vector
        except:
            print('An exception occurred for {}'.format(orig_wav_file))
    with open('/content/drive/MyDrive/CS626/iemocap/audio_vectors_{}.pkl'.format(sess), 'wb') as f:
        pickle.dump(audio_vectors, f)
"""## Extract Audio Features"""
# Commented out IPython magic to ensure Python compatibility.
import os
import pickle
import soundfile as sf
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style as ms
from tqdm import tqdm
import librosa
import math
import random
import pandas as pd
import IPython.display
import librosa.display
ms.use('
seaborn
-
muted
')
#
%
matplotlib inline
columns = ['wav_file', 'label', 'sig_mean', 'sig_std', 'rmse_mean', 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std']
df_features = pd.DataFrame(columns=columns)

emotion_dict = {'ang': 0,
                'hap': 1,
                'exc': 2,
                'sad': 3,
                'fru': 4,
                'fea': 5,
                'sur': 6,
                'neu': 7,
                'xxx': 8,
                'oth': 8}
data_dir = '/content/drive/MyDrive/CS626/iemocap/'
labels_path = '{}df_iemocap.csv'.format(data_dir)
audio_vectors_path = '{}audio_vectors_'.format(data_dir)

labels_df = pd.read_csv(labels_path)

for sess in range(1, 6):
    audio_vectors = pickle.load(open('{}{}.pkl'.format(audio_vectors_path, sess), 'rb'))
    for index, row in tqdm(labels_df[labels_df['wav_file'].str.contains('Ses0{}'.format(sess))].iterrows()):
        try:
            wav_file_name = row['wav_file']
            label = emotion_dict[row['emotion']]
            y = audio_vectors[wav_file_name]

            feature_list = [wav_file_name, label]  # wav_file, label
            sig_mean = np.mean(abs(y))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(y))  # sig_std

            rmse = librosa.feature.rms(y + 0.0001)[0]
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std

            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence

            y_harmonic = librosa.effects.hpss(y)[0]
            feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)

            # based on the pitch detection algorithm mentioned here:
            # http://access.feld.cvut.cz/view.php?cisloclanku=2009060001
            cl = 0.45 * sig_mean
            center_clipped = []
            for s in y:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                elif np.abs(s) < cl:
                    center_clipped.append(0)
            auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            feature_list.append(1000 * np.max(auto_corrs) / len(auto_corrs))  # auto_corr_max (scaled by 1000)
            feature_list.append(np.std(auto_corrs))  # auto_corr_std

            df_features = df_features.append(pd.DataFrame(feature_list, index=columns).transpose(), ignore_index=True)
        except:
            print('Some exception occurred')

df_features.to_csv('/content/drive/MyDrive/CS626/iemocap/audio_features_4.csv', index=False)  # note: the "Prepare Data" section below reads audio_features.csv
"""## Prepare Data"""
# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
# %matplotlib inline

df = pd.read_csv('/content/drive/MyDrive/CS626/iemocap/audio_features.csv')
df = df[df['label'].isin([0, 1, 2, 3, 4, 5, 6, 7])]
print(df.shape)
display(df.head())

# merge the 8 raw labels into 6 classes (exc merged into hap, fru merged into sad)
df['label'] = df['label'].map({0: 0, 1: 1, 2: 1, 3: 2, 4: 2, 5: 3, 6: 4, 7: 5})
df.head()

df.to_csv('/content/drive/MyDrive/CS626/iemocap/no_sample_df.csv')

# oversample fear
fear_df = df[df['label'] == 3]
for i in range(30):
    df = df.append(fear_df)

sur_df = df[df['label'] == 4]
for i in range(10):
    df = df.append(sur_df)

df.to_csv('/content/drive/MyDrive/CS626/iemocap/modified_df.csv')

emotion_dict = {'ang': 0,
                'hap': 1,
                'sad': 2,
                'neu': 3}

scalar = MinMaxScaler()
df[df.columns[2:]] = scalar.fit_transform(df[df.columns[2:]])
df.head()

x_train, x_test = train_test_split(df, test_size=0.20)
x_train.to_csv('/content/drive/MyDrive/CS626/iemocap/audio_train.csv', index=False)
x_test.to_csv('/content/drive/MyDrive/CS626/iemocap/audio_test.csv', index=False)
print(x_train.shape, x_test.shape)
"""### Define preprocessing functions for text"""
import re
import os
import pickle

useful_regex = re.compile(r'^(\w+)', re.IGNORECASE)

file2transcriptions = {}
for sess in range(1, 6):
    transcripts_path = '/content/drive/MyDrive/CS626/iemocap/IEMOCAP_full_release/Session{}/dialog/transcriptions/'.format(sess)
    transcript_files = os.listdir(transcripts_path)
    for f in transcript_files:
        with open('{}{}'.format(transcripts_path, f), 'r') as f:
            all_lines = f.readlines()
        for l in all_lines:
            audio_code = useful_regex.match(l).group()
            transcription = l.split(':')[-1].strip()
            # assuming that all the keys would be unique and hence no `try`
            file2transcriptions[audio_code] = transcription

# save dict
with open('/content/drive/MyDrive/CS626/iemocap/audiocode2text.pkl', 'wb') as file:
    pickle.dump(file2transcriptions, file)
len(file2transcriptions)

audiocode2text = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/audiocode2text.pkl', 'rb'))
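# normalizeString is used below but is not defined anywhere in this commit. A minimal
# sketch, assuming it mirrors the common lowercase/trim/punctuation-stripping helper
# from the PyTorch seq2seq tutorial:
def normalizeString(s):
    s = s.lower().strip()
    s = re.sub(r"([.!?])", r" \1", s)      # pad sentence-final punctuation with a space
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)  # keep only letters and . ! ?
    return s.strip()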
text_train = pd.DataFrame()
text_train['wav_file'] = x_train['wav_file']
text_train['label'] = x_train['label']
text_train['transcription'] = [normalizeString(audiocode2text[code]) for code in x_train['wav_file']]

text_test = pd.DataFrame()
text_test['wav_file'] = x_test['wav_file']
text_test['label'] = x_test['label']
text_test['transcription'] = [normalizeString(audiocode2text[code]) for code in x_test['wav_file']]

text_train.to_csv('/content/drive/MyDrive/CS626/iemocap/text_train.csv', index=False)
text_test.to_csv('/content/drive/MyDrive/CS626/iemocap/text_test.csv', index=False)
print(text_train.shape, text_test.shape)
\ No newline at end of file
feature_extraction_for_test_file.py  0 → 100644
# -*- coding: utf-8 -*-
"""Feature_Extraction_for_test_file.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1I93vu44ZmJ-bLjyLf4r0jWV6LX8XlIHO
# Get Audio File
"""
!sudo apt-get install -q -y timidity libsndfile1
!pip install pydub numba==0.48 librosa music21
!pip install SpeechRecognition pydub
!pip install sentence_transformers

from google.colab import drive
drive.mount('/content/drive')
# Commented out IPython magic to ensure Python compatibility.
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
import librosa
from librosa import display as librosadisplay
import logging
import math
import statistics
import sys
from IPython.display import Audio, Javascript
from scipy.io import wavfile
from base64 import b64decode
import music21
from pydub import AudioSegment
import speech_recognition as sp_r
import re
import pandas as pd

logger = logging.getLogger()
logger.setLevel(logging.ERROR)

print("tensorflow: %s" % tf.__version__)

info_line = re.compile(r'\[.+\]\n', re.IGNORECASE)

import os
import pickle
import soundfile as sf
import matplotlib.style as ms
from tqdm import tqdm
import random
import IPython.display
import librosa.display
ms.use('seaborn-muted')
# %matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from IPython.display import display
RECORD = """
const sleep = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
const reader = new FileReader()
reader.onloadend = e => resolve(e.srcElement.result)
reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
stream = await navigator.mediaDevices.getUserMedia({ audio: true })
recorder = new MediaRecorder(stream)
chunks = []
recorder.ondataavailable = e => chunks.push(e.data)
recorder.start()
await sleep(time)
recorder.onstop = async ()=>{
blob = new Blob(chunks)
text = await b2text(blob)
resolve(text)
}
recorder.stop()
})
"""
def record(sec=10):
    try:
        from google.colab import output
    except ImportError:
        print('Not possible to import output from google.colab')
        return ''
    else:
        print('Recording')
        display(Javascript(RECORD))
        s = output.eval_js('record(%d)' % (sec * 1000))
        fname = 'recorded_audio.wav'
        print('Saving to', fname)
        b = b64decode(s.split(',')[1])
        with open(fname, 'wb') as f:
            f.write(b)
        return fname
#@title Select how to input your audio { run: "auto" }
INPUT_SOURCE = 'RECORD'  #@param ["https://storage.googleapis.com/download.tensorflow.org/data/c-scale-metronome.wav", "RECORD", "UPLOAD", "./drive/My Drive/YOUR_MUSIC_FILE.wav"] {allow-input: true}

audio_vectors = {}
sr = 44100
print('You selected', INPUT_SOURCE)
def convert_audio_for_model(user_file, output_file='converted_audio_file.wav'):
    audio = AudioSegment.from_file(user_file)
    audio = audio.set_frame_rate(EXPECTED_SAMPLE_RATE).set_channels(1)
    audio.export(output_file, format="wav")
    return output_file
# NOTE: the helper functions called below (get_text_from_audio, get_feature_frame,
# get_features, save_features, apply_scaling, get_text_encoding, get_probs) are
# defined later in this file, reflecting the original notebook's cell order.
if INPUT_SOURCE == 'RECORD':
    uploaded_file_name = record(7)
    EXPECTED_SAMPLE_RATE = 16000
    converted_audio_file = convert_audio_for_model(uploaded_file_name)
    sample_rate, audio_samples = wavfile.read(converted_audio_file)
    orig_wav_vector, _sr = librosa.load(converted_audio_file, sr=sr)
    orig_wav_file = "converted_audio_file"
    audio_vectors[orig_wav_file] = orig_wav_vector
    text = get_text_from_audio()
    df_features, columns = get_feature_frame()
    df_features = get_features(audio_vectors, df_features, columns)
    save_features(audio_vectors)
    df_features = apply_scaling(df_features)
    encoding = get_text_encoding(text)
    print(text)
    probs = get_probs(encoding)
    print(probs)
elif INPUT_SOURCE == 'UPLOAD':
    wav_file_path = '/content/drive/MyDrive/CS626/test/'
    orig_wav_files = os.listdir(wav_file_path)
    for orig_wav_file in tqdm(orig_wav_files):
        # try:
        orig_wav_vector, _sr = librosa.load(wav_file_path + orig_wav_file, sr=sr)
        orig_wav_file, file_format = orig_wav_file.split('.')
        audio_vectors[orig_wav_file] = orig_wav_vector
        df_features, columns = get_feature_frame()
        df_features = get_features(audio_vectors, df_features, columns)
        save_features(audio_vectors)
        df_features = apply_scaling(df_features)
        with open('/content/drive/MyDrive/CS626/test_pkl/sample_text.txt', 'r') as f:
            text_example = f.readline()
        encoding = get_text_encoding(text_example)
        probs = get_probs(encoding)
        print(probs)
        # except:
        #     print('An exception occurred for {}'.format(orig_wav_file))
"""# Get Text file"""
def get_text_from_audio():
    filename = "converted_audio_file.wav"
    r = sp_r.Recognizer()
    with sp_r.AudioFile(filename) as source:
        audio_data = r.record(source)
        text = r.recognize_google(audio_data)
        print(text)
    return text
"""## Build Audio Vectors"""
def save_features(audio_vectors):
    with open('/content/drive/MyDrive/CS626/test_pkl/audio_vector_1.pkl', 'wb') as f:
        pickle.dump(audio_vectors, f)
"""## Extract Audio Features"""
def get_feature_frame():
    columns = ['wav_file', 'sig_mean', 'sig_std', 'rmse_mean', 'rmse_std', 'silence', 'harmonic', 'auto_corr_max', 'auto_corr_std']
    df_features = pd.DataFrame(columns=columns)
    return df_features, columns
def get_features(audio_vectors, df_features, columns):
    for wav_file_name in audio_vectors.keys():
        try:
            y = audio_vectors[wav_file_name]

            feature_list = [wav_file_name]
            sig_mean = np.mean(abs(y))
            feature_list.append(sig_mean)  # sig_mean
            feature_list.append(np.std(y))  # sig_std

            rmse = librosa.feature.rms(y + 0.0001)[0]
            feature_list.append(np.mean(rmse))  # rmse_mean
            feature_list.append(np.std(rmse))  # rmse_std

            silence = 0
            for e in rmse:
                if e <= 0.4 * np.mean(rmse):
                    silence += 1
            silence /= float(len(rmse))
            feature_list.append(silence)  # silence

            y_harmonic = librosa.effects.hpss(y)[0]
            feature_list.append(np.mean(y_harmonic) * 1000)  # harmonic (scaled by 1000)

            cl = 0.45 * sig_mean
            center_clipped = []
            for s in y:
                if s >= cl:
                    center_clipped.append(s - cl)
                elif s <= -cl:
                    center_clipped.append(s + cl)
                elif np.abs(s) < cl:
                    center_clipped.append(0)
            auto_corrs = librosa.core.autocorrelate(np.array(center_clipped))
            feature_list.append(1000 * np.max(auto_corrs) / len(auto_corrs))  # auto_corr_max (scaled by 1000)
            feature_list.append(np.std(auto_corrs))  # auto_corr_std

            df_features = df_features.append(pd.DataFrame(feature_list, index=columns).transpose(), ignore_index=True)
        except:
            print('Some exception occurred')
    df_features.to_csv('/content/drive/MyDrive/CS626/test_pkl/audio_features_1.csv', index=False)
    return df_features
"""## Prepare Data"""
def apply_scaling(df):
    with open('/content/drive/MyDrive/CS626/test_pkl/scalar.pkl', 'rb') as f:
        scalar = pickle.load(f)
    df[df.columns[1:]] = scalar.transform(df[df.columns[1:]])
    return df
"""### Define preprocessing functions for text"""
def get_text_encoding(text_example):
    from sentence_transformers import SentenceTransformer
    bert = SentenceTransformer('bert-base-uncased')
    encoding = bert.encode(text_example)
    x_test_text = encoding
    x_test_text = x_test_text.reshape(-1, 768)
    return x_test_text
def get_probs(x_test_text):
    x_test_audio = pd.read_csv('/content/drive/MyDrive/CS626/test_pkl/audio_features_1.csv')
    combined_x_test = np.concatenate((np.array(x_test_audio[x_test_audio.columns[1:]]), x_test_text), axis=1)

    xgb = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/XGB_bert.pkl', 'rb'))
    probs_xgb = xgb.predict_proba(combined_x_test)
    mlp = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/MLP_bert.pkl', 'rb'))
    probs_mlp = mlp.predict_proba(combined_x_test)
    rf = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/RF_bert.pkl', 'rb'))
    probs_rf = rf.predict_proba(combined_x_test)
    lr = pickle.load(open('/content/drive/MyDrive/CS626/iemocap/LR_bert.pkl', 'rb'))
    probs_lr = lr.predict_proba(combined_x_test)

    probs = probs_xgb + probs_mlp + probs_rf + probs_lr
    # print(probs_xgb + probs_mlp + probs_rf + probs_lr)
    emotion_dict = {0: 'Angry', 1: 'Happy', 2: 'Sad', 3: 'Fear', 4: 'Surprised', 5: 'Neutral'}
    index = np.argmax(probs)
    print("\n\nDetected Emotion : ", emotion_dict[index])
    print("Emotion Dictionary : ", emotion_dict)
    return probs
\ No newline at end of file
readme.txt  0 → 100644
Instructions for running the code :
1. Run feature_extraction file first.
python3 feature_extraction.py
2. Run combined_classification file to train the model.
python3 combined_classification.py
3. Run feature_extraction_for_test_file file to evaluate the model on a single file.
python3 feature_extraction_for_test_file.py
The dataset is hosted on drive.
Link : https://drive.google.com/drive/folders/12ELUg5aEfyd9BLcje6nzH10U5KwAA8tP?usp=sharing
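Note : all three scripts assume the dataset and generated files live under /content/drive/MyDrive/CS626/ (a Google Drive mounted in Colab); update the hardcoded paths before running elsewhere.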