mse final leaderboard

1b4341db · SHREYANSH JAIN · 13f77136 · 1b4341db · 1b4341db · 1b4341db
Commit 1b4341db authored Sep 11, 2019 by SHREYANSH JAIN
Showing with 11626 additions and 11527 deletions

Assignment1/error.log Assignment1/error.log +8000 -8000

Assignment1/main.py Assignment1/main.py +125 -26

Assignment1/output.csv Assignment1/output.csv +3501 -3501

No files found.
--- a/Assignment1/error.log
+++ b/Assignment1/error.log
--- a/Assignment1/main.py
+++ b/Assignment1/main.py
@@ -91,10 +91,11 @@ def root_mean_squared_gradient(xdata, ydata, weights):

 class LinearRegressor:

-	def __init__(self,dims):
+	def __init__(self, dims):
 		
 		self.dims = dims
-		self.W = np.zeros(dims) 
+		self.W = np.random.rand(dims)
+		#self.W = np.random.uniform(low=0.0, high=1.0, size=dims)
 		return

 		raise NotImplementedError
@@ -140,16 +141,89 @@ def read_dataset(trainfile, testfile):

 	return np.array(xtrain), np.array(ytrain), np.array(xtest)

+def one_hot_encoding(value_list, classes):
+    res = np.eye(classes)[value_list.reshape(-1)]
+    return res.reshape(list(value_list.shape)+[classes])
+
+norm_dict = {}
+
+dictionary_of_classes_for_features = {
+	2 : 5,
+	3 : 25,
+	5: 8,
+	7: 5
+}
+
+dictionary_of_days = {
+	'Monday' : 1,
+	'Tuesday': 2,
+	'Wednesday': 3,
+	'Thursday' : 4,
+	'Friday' : 5,
+	'Saturday': 6,
+	'Sunday' : 7
+}
+
+def slicer(arr, beg, end):
+	return np.array([i[beg:end] for i in arr]).reshape(-1, 1)
+"""	
+# To normalize the 'wind speed' and 'humidity' parameters, uncomment this block
+def normalize(arr):
+	arr = arr
+	if not norm_dict: # make dictionary once at training to be used later during test
+		# for i in range(arr.shape[1]):
+		norm_dict['init'] = [np.min(arr), np.max(arr)]
+		#norm_dict['init'] = [np.mean(arr), np.std(arr)]
+	# for i in range(arr.shape[1]):
+	arr = np.array([(x - norm_dict['init'][0])/(norm_dict['init'][1] - norm_dict['init'][0]) for x in arr]) # min-max
+	#arr = np.array([(x - norm_dict['init'][0])/(norm_dict['init'][1]) for x in arr]) # standardization
+		
+	return arr
+"""
+
 def preprocess_dataset(xdata, ydata=None):
-	xdata = xdata[:,[2,3,4,7,9]]
-	xdata = xdata.astype('float32') 
-	bias = np.ones((np.shape(xdata)[0],1))
-	xdata = np.concatenate((bias,xdata),axis=1)
-	if ydata is None:
-		return xdata
-	ydata = ydata.astype('float32')
-	return xdata,ydata
-	raise NotImplementedError
+	
+	# converting weekdays to numeric for one_hot_encoding
+    """
+
+	# To normalize the 'wind speed' and 'humidity' parameters, uncomment this block
+	xdata[:, 10] = normalize(xdata[:, 10].astype('float'))# normalized
+	xdata[:, 11] = normalize(xdata[:, 10].astype('float'))"""
+    xdata[:, 5] = [dictionary_of_days[i] for i in xdata[:, 5]]
+
+    cat_cols = [2, 3, 5, 7]
+
+	
+    for i in cat_cols:
+		# dropping 2 columns for C-1 encoding and removing additional 0 column
+        t = one_hot_encoding(xdata[:, i].astype('int'), dictionary_of_classes_for_features[i])[:, 2:]
+        xdata = np.concatenate((xdata, t),axis=1)
+	
+    xdata = np.delete(xdata, cat_cols, 1) # removing useless columns
+    xdata = np.delete(xdata, 6, 1)
+    xdata = np.delete(xdata, 8, 1)
+	
+    # extracting features from date
+    month = slicer(xdata[:, 1], 5,7)
+    t = one_hot_encoding(month[:,0].astype('int'), 13)[:, 2:]
+    xdata = np.concatenate((xdata, t), axis=1)
+    date = slicer(xdata[:, 1], 8, 10)
+    week = np.ceil(date.astype('int') / 7)  # week of month
+    t = one_hot_encoding(week[:,0].astype('int'), 6)[:, 2:]
+    xdata = np.concatenate((xdata, t), axis=1)
+
+
+    xdata = xdata[:,2:] # dropping first 2 unnecessary columns
+	
+    xdata = xdata.astype('float32')
+    bias = np.ones((np.shape(xdata)[0],1))
+    xdata = np.concatenate((bias,xdata),axis=1)
+
+    if ydata is None:
+        return xdata
+    ydata = ydata.astype('float32')
+    return xdata,ydata
+    raise NotImplementedError

 dictionary_of_losses = {
 	'mse':(mean_squared_loss, mean_squared_gradient),
@@ -158,26 +232,51 @@ dictionary_of_losses = {
 	'logcosh':(mean_log_cosh_loss, mean_log_cosh_gradient),
 }

-def main():
+"""
+# To remove outliers from the wind speed column, uncomment this block
+def out(x, std, mean):
+    if ((x < mean + 2 * std)and (x > mean - 2 * std)):
+        return 0
+    else:
+        return 1
+
+
+def outlier(xtrain, ytrain, std, mean):
+    a =[]
+    for i in xtrain[:, 11].astype('float32'):
+        a.append(out(i,std, mean))
+    a = np.array(a)
+    xdata = np.concatenate((xtrain, a.reshape(-1, 1)), axis=1)
+    ytrain = np.delete(ytrain, np.argwhere(xdata[:, -1].astype('int') > 0), 0)
+    xdata = np.delete(xdata, np.argwhere(xdata[:, -1].astype('int') > 0), 0)
+    xdata = np.delete(xdata, -1, 1)
+    return (xdata, ytrain)"""

-	# You are free to modify the main function as per your requirements.
+def main():
+    # You are free to modify the main function as per your requirements.
 	# Uncomment the below lines and pass the appropriate value

-	xtrain, ytrain, xtest = read_dataset(args.train_file, args.test_file)
-	xtrainprocessed, ytrainprocessed = preprocess_dataset(xtrain, ytrain)
-	xtestprocessed = preprocess_dataset(xtest)
+    xtrain, ytrain, xtest = read_dataset(args.train_file, args.test_file)
+
+    """
+    # To remove outliers from the wind speed column, uncomment this block
+    std = np.std(xtrain[:, 11].astype('float32'))
+    mean = np.mean(xtrain[:, 11].astype('float32'))
+    xtrain, ytrain =outlier(xtrain, ytrain, std, mean)"""
+    xtrainprocessed, ytrainprocessed = preprocess_dataset(xtrain, ytrain)
+    xtestprocessed = preprocess_dataset(xtest)
 	
-	model = LinearRegressor(np.shape(xtrainprocessed)[1])
+    model = LinearRegressor(np.shape(xtrainprocessed)[1])

-	# The loss function is provided by command line argument	
-	loss_fn, loss_grad = dictionary_of_losses[args.loss]
+    # The loss function is provided by command line argument
+    loss_fn, loss_grad = dictionary_of_losses[args.loss]

-	errlog = model.train(xtrainprocessed, ytrainprocessed, loss_fn, loss_grad, args.epoch, args.lr)
-	ytest = model.predict(xtestprocessed)
-	ytest = ytest.astype('int')
-	output = [(i,np.absolute(ytest[i])) for i in range(len(ytest))]
-	np.savetxt("output.csv",output,delimiter=',',fmt="%d",header="instance (id),count",comments='')
-	np.savetxt("error.log",errlog,delimiter='\n',fmt="%f")
+    errlog = model.train(xtrainprocessed, ytrainprocessed, loss_fn, loss_grad, args.epoch, args.lr)
+    ytest = model.predict(xtestprocessed)
+    ytest = ytest.astype('int')
+    output = [(i,np.absolute(ytest[i])) for i in range(len(ytest))]
+    np.savetxt("output.csv",output,delimiter=',',fmt="%d",header="instance (id),count",comments='')
+    np.savetxt("error.log",errlog,delimiter='\n',fmt="%f")


 if __name__ == '__main__':
@@ -192,4 +291,4 @@ if __name__ == '__main__':

 	args = parser.parse_args()

-	main()
+	main()
\ No newline at end of file
--- a/Assignment1/output.csv
+++ b/Assignment1/output.csv