#!/usr/bin/env python
# coding: utf-8

# In[283]:
import warnings
warnings.filterwarnings("ignore")

import codecs
import csv
import glob
import os
import random as r
import shutil
from itertools import chain, product
from multiprocessing import Process

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, log_loss
from sklearn.model_selection import train_test_split
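
# Note: the layout below -- trainLabels.csv plus paired .bytes/.asm files
# with nine class labels -- matches the Kaggle Microsoft Malware
# Classification Challenge (BIG 2015) training data; all paths are assumed
# to be relative to the working directory.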

# In[106]:
# Separate the two file types: move every .bytes file into byteFiles/ and
# leave the .asm files behind in the renamed asmFiles/ folder.
source = 'trainData'
destination = 'byteFiles'
if not os.path.exists(destination):
    os.makedirs(destination)
if os.path.exists(source):
    os.rename(source, 'asmFiles')
source = 'asmFiles'
bytes_files = glob.glob(source + "/*.bytes")
for file in bytes_files:
    shutil.move(file, destination)

# In[90]:
# Class distribution of the training labels, as a percentage per class.
Y = pd.read_csv("trainLabels.csv")
total = len(Y) * 1.
# value_counts + sort_index gives per-class counts ordered by class label;
# this avoids relying on the column names reset_index() produces, which
# changed across pandas versions.
percent = Y['Class'].value_counts().sort_index().div(total).mul(100).round(2)
ax = percent.plot.bar(figsize=(12, 8))
ax.set_xlabel('Class')
ax.set_ylabel('count_percent')
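
# Note: the bar chart typically shows a skewed class distribution, which is
# why the splits below stratify on the label and the logistic regression
# uses class_weight='balanced'.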

# In[157]:
# Byte-level unigram features: for every .bytes file, count occurrences of
# each of the 256 byte values (plus the '??' token used for unreadable
# bytes) and record the file size and class label.
files = glob.glob('byteFiles' + '/*.bytes')
filenames = Y['Id'].tolist()
class_y = Y['Class'].tolist()
# All 256 two-character hex codes: '00', '01', ..., 'FF'.
byte_codes = list(map(lambda x: "".join(x), product("0123456789ABCDEF", repeat=2)))
file_header = list(chain(['ID'], byte_codes, ['??', 'size', 'Class']))
count = 1
with open('bytesoutputfile.csv', 'w+') as wfp:
    csv_writer = csv.DictWriter(wfp, file_header)
    csv_writer.writeheader()
    for file in files:
        statinfo = os.stat(file)
        file_id = os.path.splitext(os.path.basename(file))[0]
        # Start every counter at 0 so unseen byte values are written as 0
        # rather than left blank in the CSV.
        dict_file = dict.fromkeys(file_header, 0)
        dict_file['ID'] = file_id
        if file_id in filenames:
            dict_file['Class'] = class_y[filenames.index(file_id)]
        dict_file['size'] = statinfo.st_size / (1024.0 * 1024.0)  # size in MB
        with open(file, 'r') as rfp:
            for line in rfp:
                # Drop the leading address token; the rest are byte values.
                data = line.strip().split(" ")[1:]
                for hex_code in data:
                    if hex_code in dict_file:
                        dict_file[hex_code] += 1
        csv_writer.writerow(dict_file)
        count += 1
        if count % 1086 == 0:
            # One progress line per ~10%, assuming the ~10,868-file train set.
            print(f"-----{(count * 10) / 1086}% completed--------")

# In[207]:
# Spread the .asm files evenly across five folders so five worker processes
# can parse them in parallel (see preprocessasmdata below).
folder_name = ['first', 'second', 'third', 'fourth', 'fifth']
for i in folder_name:
    if not os.path.exists(i):
        os.makedirs(i)
# The .asm files were left behind in asmFiles/ by the earlier cell that
# moved the .bytes files out.
source = 'asmFiles'
asm_files = glob.glob(source + '/*.asm')
r.shuffle(asm_files)
folder_number = 0
for file in asm_files:
    shutil.move(file, folder_name[folder_number])
    folder_number = (folder_number + 1) % 5

# In[223]:
# Features extracted from each .asm file: counts of section prefixes,
# opcodes, registers (inside code sections only), and a few keywords.
prefixes = ['HEADER:', '.text:', '.Pav:', '.idata:', '.data:', '.bss:', '.rdata:', '.edata:', '.rsrc:', '.tls:', '.reloc:', '.BSS:', '.CODE']
opcodes = ['jmp', 'mov', 'retf', 'push', 'pop', 'xor', 'retn', 'nop', 'sub', 'inc', 'dec', 'add', 'imul', 'xchg', 'or', 'shr', 'cmp', 'call', 'shl', 'ror', 'rol', 'jnb', 'jz', 'rtn', 'lea', 'movzx']
keywords = ['.dll', 'std::', ':dword']
registers = ['edx', 'esi', 'eax', 'ebx', 'ecx', 'edi', 'ebp', 'esp', 'eip']

def process(folder):
    # Parse every .asm file in one folder and append its feature counts to a
    # per-folder temp CSV (merged into a single file by preprocessasmdata).
    file_header = list(chain(['ID'], prefixes, opcodes, registers, keywords, ['Class', 'size']))
    file_list = glob.glob(folder + '/*.asm')
    with open(f"asmtemp{folder}outputfile.csv", "w") as wfp:
        csv_writer = csv.DictWriter(wfp, file_header)
        # No header row here; a single header is written during the merge.
        for file in file_list:
            statinfo = os.stat(file)
            file_id = os.path.splitext(os.path.basename(file))[0]
            dict_row = dict.fromkeys(file_header, 0)
            dict_row['ID'] = file_id
            if file_id in filenames:
                dict_row['Class'] = class_y[filenames.index(file_id)]
            dict_row['size'] = statinfo.st_size / (1024.0 * 1024.0)  # size in MB
            with codecs.open(file, encoding='cp1252', errors='replace') as fp:
                for line in fp:
                    line = line.strip().split()
                    if not line:
                        continue
                    # Each line starts with 'SECTION:ADDRESS'; count the
                    # section, skipping names that are not in the header.
                    prefix = line[0].split(':')[0] + ':'
                    if prefix in dict_row:
                        dict_row[prefix] += 1
                    line = line[1:]
                    for op in opcodes:
                        if op in line:
                            dict_row[op] += 1
                    for reg in registers:
                        for li in line:
                            # Registers are counted only inside code sections.
                            if reg in li and prefix in ('.text:', '.CODE:'):
                                dict_row[reg] += 1
                    for kw in keywords:
                        for li in line:
                            if kw in li:
                                dict_row[kw] += 1
            csv_writer.writerow(dict_row)

def preprocessasmdata():
    # One worker per folder; relies on fork-style multiprocessing so the
    # workers see the module-level filenames/class_y lists.
    workers = [Process(target=process, args=(folder,)) for folder in folder_name]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    # Merge the five temp files into a single CSV with one header row.
    multipleasmoutputfiles = glob.glob("asmtemp*.csv")
    file_header = list(chain(['ID'], prefixes, opcodes, registers, keywords, ['Class', 'size']))
    with open('asmoutputfile.csv', 'w') as wfp:
        csv_writer = csv.writer(wfp)
        csv_writer.writerow(file_header)
        for file in multipleasmoutputfiles:
            with open(file, 'r') as rfp:
                for row in csv.reader(rfp):
                    csv_writer.writerow(row)
            os.remove(file)

preprocessasmdata()
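
# Optional sanity check, not part of the original pipeline: the merged CSV
# should have exactly one column per header entry.
check_asm = pd.read_csv('asmoutputfile.csv', nrows=5)
assert check_asm.shape[1] == len(list(chain(['ID'], prefixes, opcodes, registers, keywords, ['Class', 'size'])))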

# In[285]:
# Min-max scale every feature column to [0, 1], leaving ID and Class alone.
# Adapted from https://stackoverflow.com/a/29651514
def normalize(df):
    result1 = df.copy()
    for feature_name in df.columns:
        if feature_name not in ('ID', 'Class'):
            max_value = df[feature_name].max()
            min_value = df[feature_name].min()
            if max_value > min_value:
                result1[feature_name] = (df[feature_name] - min_value) / (max_value - min_value)
            else:
                # A constant column would divide by zero; pin it to 0.
                result1[feature_name] = 0.0
    return result1
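
# Illustration: min-max scaling maps a column like [2.0, 4.0, 6.0] to
# [0.0, 0.5, 1.0] -- the ordering is preserved while every feature lands on
# the same scale, so no single raw count dominates the models below.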

# In[286]:
def plot_confusion_matrix(test_y, predict_y):
    C = confusion_matrix(test_y, predict_y)
    # np.trace(C) is the number of correctly classified points.
    print("Percentage of misclassified points ", (len(test_y) - np.trace(C)) / len(test_y) * 100)
    # Recall matrix A: each row of C divided by its row sum.
    A = (((C.T) / (C.sum(axis=1))).T)
    # Precision matrix B: each column of C divided by its column sum.
    B = (C / C.sum(axis=0))
    labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
    cmap = sns.light_palette("green")
    print("-" * 50, "Confusion matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(C, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("-" * 50, "Precision matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(B, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of columns in precision matrix", B.sum(axis=0))
    print("-" * 50, "Recall matrix", "-" * 50)
    plt.figure(figsize=(10, 5))
    sns.heatmap(A, annot=True, cmap=cmap, fmt=".3f", xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Class')
    plt.ylabel('Original Class')
    plt.show()
    print("Sum of rows in recall matrix", A.sum(axis=1))

# In[252]:
result = pd.read_csv("bytesoutputfile.csv")
result_asm = pd.read_csv("asmoutputfile.csv")

# In[265]:
result = normalize(result)
result_asm = normalize(result_asm)
# Left-join the asm features onto the byte features by file ID. Both frames
# carry a 'size' column, so the merge yields size_x and size_y.
result_x = pd.merge(result, result_asm.drop(['Class'], axis=1), on='ID', how='left')
result_y = result_x['Class']
# Drop the ID and the label, plus three discarded features.
result_x = result_x.drop(['ID', 'rtn', '.BSS:', '.CODE', 'Class'], axis=1)
# Cast to a plain float dtype (np.float128 is unavailable on some platforms).
result_x = result_x.astype(np.float64)
result_x.head()

# In[269]:
# Hold out 20% for test, then carve 20% of the remainder out as a
# cross-validation set; both splits stratify to preserve class proportions.
X_train, X_test_merge, y_train, y_test_merge = train_test_split(result_x, result_y, stratify=result_y, test_size=0.20)
X_train_merge, X_cv_merge, y_train_merge, y_cv_merge = train_test_split(X_train, y_train, stratify=y_train, test_size=0.20)
# Zero-fill the NaNs the left merge leaves for IDs with no asm row.
X_train.fillna(0, inplace=True)
X_test_merge.fillna(0, inplace=True)
X_cv_merge.fillna(0, inplace=True)
X_train_merge.fillna(0, inplace=True)

# In[281]:
# Tune the number of trees for a calibrated random forest; 'alpha' is the
# script's generic name for the tuned hyperparameter.
alpha = [10, 50, 100, 500, 1000, 2000, 3000]
cv_log_error_array = []
for i in alpha:
    r_cfl = RandomForestClassifier(n_estimators=i, random_state=42, n_jobs=-1)
    r_cfl.fit(X_train_merge, y_train_merge)
    sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
    sig_clf.fit(X_train_merge, y_train_merge)
    predict_y = sig_clf.predict_proba(X_cv_merge)
    cv_log_error_array.append(log_loss(y_cv_merge, predict_y, labels=r_cfl.classes_, eps=1e-15))
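
# Note: with its default cv, CalibratedClassifierCV clones and refits the
# base estimator on internal folds, so the separate r_cfl.fit above mainly
# serves to expose r_cfl.classes_ for the log_loss call.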
for i in range(len(cv_log_error_array)):
    print('log_loss for n_estimators =', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("n_estimators (alpha)")
plt.ylabel("Error measure")
plt.show()

# Refit with the best setting and report train/CV/test log loss.
r_cfl = RandomForestClassifier(n_estimators=alpha[best_alpha], random_state=42, n_jobs=-1)
r_cfl.fit(X_train_merge, y_train_merge)
sig_clf = CalibratedClassifierCV(r_cfl, method="sigmoid")
sig_clf.fit(X_train_merge, y_train_merge)
predict_y = sig_clf.predict_proba(X_train_merge)
print('For best alpha =', alpha[best_alpha], 'the train log loss is:', log_loss(y_train_merge, predict_y))
predict_y = sig_clf.predict_proba(X_cv_merge)
print('For best alpha =', alpha[best_alpha], 'the cross validation log loss is:', log_loss(y_cv_merge, predict_y))
predict_y = sig_clf.predict_proba(X_test_merge)
print('For best alpha =', alpha[best_alpha], 'the test log loss is:', log_loss(y_test_merge, predict_y))
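
# A large gap between the train loss and the CV/test losses above would
# signal overfitting to the training split.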

# In[287]:
# Confusion, precision, and recall matrices for the calibrated random
# forest on the held-out test set.
plot_confusion_matrix(y_test_merge, sig_clf.predict(X_test_merge))

# In[289]:
# Tune the inverse regularization strength C for L2 logistic regression,
# again with sigmoid calibration on top.
alpha = [10 ** x for x in range(-5, 4)]
cv_log_error_array = []
for i in alpha:
    logisticR = LogisticRegression(penalty='l2', C=i, class_weight='balanced')
    logisticR.fit(X_train_merge, y_train_merge)
    sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
    sig_clf.fit(X_train_merge, y_train_merge)
    predict_y = sig_clf.predict_proba(X_cv_merge)
    cv_log_error_array.append(log_loss(y_cv_merge, predict_y, labels=logisticR.classes_, eps=1e-15))
for i in range(len(cv_log_error_array)):
    print('log_loss for C =', alpha[i], 'is', cv_log_error_array[i])
best_alpha = np.argmin(cv_log_error_array)
fig, ax = plt.subplots()
ax.plot(alpha, cv_log_error_array, c='g')
for i, txt in enumerate(np.round(cv_log_error_array, 3)):
    ax.annotate((alpha[i], np.round(txt, 3)), (alpha[i], cv_log_error_array[i]))
plt.grid()
plt.title("Cross Validation Error for each alpha")
plt.xlabel("C (alpha)")
plt.ylabel("Error measure")
plt.show()

# Refit with the best C and report train/CV/test log loss.
logisticR = LogisticRegression(penalty='l2', C=alpha[best_alpha], class_weight='balanced')
logisticR.fit(X_train_merge, y_train_merge)
sig_clf = CalibratedClassifierCV(logisticR, method="sigmoid")
sig_clf.fit(X_train_merge, y_train_merge)
predict_y = sig_clf.predict_proba(X_train_merge)
print('log loss for train data', log_loss(y_train_merge, predict_y, labels=logisticR.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_cv_merge)
print('log loss for cv data', log_loss(y_cv_merge, predict_y, labels=logisticR.classes_, eps=1e-15))
predict_y = sig_clf.predict_proba(X_test_merge)
print('log loss for test data', log_loss(y_test_merge, predict_y, labels=logisticR.classes_, eps=1e-15))

Extract the dataset train.7z, which contains a folder named "train".
Rename that folder to "trainData", then run Project_malware_detection.py.