DataScienceNigeria · oyewunmio · Apr 18, 2020 · Apr 18, 2020 · Apr 19, 2020 · Apr 19, 2020
diff --git a/DSN_logo.png b/DSN_logo.png
diff --git a/Logistic_model.py b/Logistic_model.py
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+from math import exp
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score
+
+# creating the logistic model class
+class logisticRegression():
+    """Single-feature logistic regression fitted by gradient descent on an L2 loss.
+
+    The model is p(x) = 1 / (1 + exp(-(b0 + b1 * x))).
+
+    Parameters
+    ----------
+    iteration_no : int
+        Number of gradient-descent epochs run by fit().
+    L : float
+        Learning rate (step size of each update).
+    b0, b1 : float
+        Initial intercept and slope.
+    """
+
+    def __init__(self, iteration_no=300, L=0.001, b0 = 0,b1 = 0):
+        self.iteration_no = iteration_no
+        self.L = L
+        self.b0 = b0
+        self.b1 = b1
+        # Mean of the training feature; set by fit() so callers can centre new data the same way.
+        self.x_mean = None
+
+    def normalize(self, X, mean=None):
+        """Centre X by subtracting `mean`; when `mean` is None, X's own mean is used."""
+        if mean is None:
+            mean = X.mean()
+        return X - mean
+
+    # Method to make predictions
+    def predict_prob(self, X, b0=None, b1=None):
+        """Return P(Y=1) for every value in X as a float array.
+
+        b0 and b1 default to the coefficients stored on the model.
+        """
+        b0 = self.b0 if b0 is None else b0
+        b1 = self.b1 if b1 is None else b1
+        z = b0 + b1 * np.asarray(X, dtype=float)
+        # Clip the exponent: exp() overflows a float64 a little above 709.
+        return 1.0 / (1.0 + np.exp(-np.clip(z, -700.0, 700.0)))
+
+    # Method to train the model
+    def fit(self, X, Y):
+        """Estimate b0 and b1 from feature values X and 0/1 labels Y.
+
+        X is centred on its mean (stored in self.x_mean) before training, so data
+        passed to predict() must be centred with that same mean.
+
+        Raises ValueError if X is empty or X and Y differ in shape.
+        """
+        X = np.asarray(X, dtype=float)
+        Y = np.asarray(Y, dtype=float)
+        if X.size == 0:
+            raise ValueError("fit() needs at least one training sample")
+        if X.shape != Y.shape:
+            raise ValueError("X and Y must have the same shape")
+
+        self.x_mean = X.mean()
+        X = X - self.x_mean
+
+        for _ in range(self.iteration_no):
+            y_pred = self.predict_prob(X, self.b0, self.b1)
+            # Factor shared by both partial derivatives of the squared error
+            common = (Y - y_pred) * y_pred * (1 - y_pred)
+            D_b0 = -2 * np.sum(common)  # Derivative of loss wrt b0
+            D_b1 = -2 * np.sum(X * common)  # Derivative of loss wrt b1
+            # Update b0 and b1
+            self.b0 = self.b0 - self.L * D_b0
+            self.b1 = self.b1 - self.L * D_b1
+
+    def predict(self, x_test, b0=None, b1=None):
+        """Return a list of 0/1 class labels for x_test, thresholding P(Y=1) at 0.5.
+
+        x_test must already be centred the same way as the training data;
+        b0 and b1 default to the coefficients stored on the model.
+        """
+        probabilities = self.predict_prob(x_test, b0, b1)
+        return [1 if p >= 0.5 else 0 for p in probabilities]
+
+# Training the model
+# NOTE: X_train, Y_train and X_test are not defined anywhere in this module; they
+# must already exist (e.g. from train_test_split) before these lines run.
+Log = logisticRegression()
+Log.fit(X_train, Y_train)
+
+# Making predictions
+# Centre the test data with the training mean, the same shift fit() applied to X_train.
+X_test_norm = X_test - X_train.mean()
+# The fitted coefficients live on the model instance, not in module-level names.
+y_pred = Log.predict(X_test_norm, Log.b0, Log.b1)
diff --git a/README.md b/README.md
@@ -1,41 +1,44 @@
-# ML-Logistic-regression-algorithm-challenge
+Designing the logistic regression from scratch
 
+Logistic regression is a technique used to explain the relationship between the input (independent) variables and the output (dependent) variable. What differentiates it from ordinary linear regression is that the dependent variable can take only a fixed set of values (0 and 1); these values correspond to the classes of a classification problem.
 
-![DSN logo](DSN_logo.png)|DSN Algorithm Challenge|
-|---|---|
+In logistic regression our goal is to identify the relationship between the independent variables and the dependent variable, and we do this by estimating probabilities using a logistic function (a sigmoid curve whose shape is determined by the model's parameters).
 
-A lot of data scientists or machine learning enthusiasts do use various machine learning algorithms as a black box without knowing how they work or the mathematics behind it. The purpose of this challenge is to encourage the mathematical understanding of machine learning algorithms, their break and yield point. 
+Building the logistic model
 
-In summary, participants are encouraged to understand the fundamental concepts behind machine learning algorithms/models.
+focus:
+We are building a model which takes in features x1, x2, x3, x4, ..., xn and returns a binary output denoted by Y.
 
+statistics:
+Let p be the probability of Y being 1 (i.e. p = Prob(Y=1)).
+The relationship between the variables can be written as
 
-The rules and guidelines for this challenge are as follows:
+ln(p/(1-p)) = b0 + b1x1 + b2x2 + b3x3 + b4x4 + ... + bnxn
 
-1. Ensure to register at https://bit.ly/dsnmlhack 
+where
+p/(1-p) denotes the odds of the event taking place.
 
-2. The algorithm challenge is open to all.
+ln(p/(1-p)) is the log of the odds (also called the logit) of the event taking place; it maps a probability p that lies between 0 and 1 onto the whole real line.
 
-3. Participants are expected to design and develop the Logistic Regression algorithm from scratch using Python or R programming.
+while terms b0, b1, b2, b3, b4,...,bn are the parameters that we are trying to estimate during training.
 
-4. For python developers (numpy is advisable).
+Note: our main interest is in obtaining the value of the probability p in the above equation.
 
-5. To push your solution to us, make a [pull request](https://help.github.com/en/github/collaborating-with-issues-and-pull-requests/about-pull-requests) to DSN's GitHub page at  https://www.github.com/datasciencenigeria/ML-Logistic-regression-algorithm-challenge. Ensure to add your readme file to understand your code.
+Solution:
+1>> remove the log term on the LHS of the equation by raising e to the power of the RHS (i.e. exponentiating both sides)
+p/(1-p) = e^(b0 + b1x1 + b2x2 + b3x3 + b4x4 + ... + bnxn)
 
-6. The top 3 optimized code will be compensated as follows:
+2>> simplify by cross-multiplying to obtain the value of p
+p = e^(b0 + b1x1 + b2x2 + b3x3 + b4x4 + ... + bnxn) / (1 + e^(b0 + b1x1 + b2x2 + b3x3 + b4x4 + ... + bnxn))
 
-- **1st position**: 20GB data plan.
-- **2nd position**: 15GB data plan.
-- **3rd position**: 10GB data plan.
+The equation above is also known as the sigmoid function mentioned earlier, and we shall use it to make our predictions.
 
-7. Add your scripts and readme.MD file as a folder saved as your full name (surname_first_middle name) by making a pull request to the repository.
+Implementation
+An L2 loss function was implemented to calculate the error, and the gradient descent algorithm was used to estimate the parameters.
 
----
-For issues on this challenge kindly reach out to the AI+campus/city managers
+We shall look at the relationship between the age of some patients and their diabetes status to test the model created.
 
-**Twitter**: [@DataScienceNIG](https://twitter.com/DataScienceNIG), [@elishatofunmi](https://twitter.com/Elishatofunmi), [@o_funminiyi](https://twitter.com/o_funminiyi), [@gbganalyst](https://twitter.com/gbganalyst) 
+Source of dataset
+Microsoft: DAT263x Introduction to Artificial Intelligence (AI) Lab files
+wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1VlglKoqggJKRM6EJP_opFRF3bW41iTBP' -O data.csv
 
-or
-
-**Call**: +2349062000119,+2349080564419.
-
-Good luck!
diff --git a/use_of_Logistic_Regression_model.py b/use_of_Logistic_Regression_model.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+# In[27]:
+
+import os
+from math import exp
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score, accuracy_score
+from sklearn.model_selection import train_test_split
+get_ipython().run_line_magic('matplotlib', 'inline')
+
+
+# In[2]:
+
+# loading data
+# The default below is the author's local copy of the lab file; set the
+# DIABETES_CSV environment variable to read the CSV from another location.
+input_file = os.environ.get('DIABETES_CSV', '/home/oyewunmi/Documents/Codes/data_science/Books_materials/MicrosoftLabFiles/Lab01/diabetes.csv')
+data = pd.read_csv(input_file, delimiter=',')
+data.head(5)
+
+# In[3]:
+
+plt.scatter(data['Age'], data['Diabetic'])
+plt.show()
+
+# dividing the datasets into training and testing datasets
+# random_state pins the shuffle so the reported scores are reproducible between runs
+X_train, X_test, Y_train, Y_test = train_test_split(data['Age'], data['Diabetic'], test_size = 0.2, random_state = 42)
+
+
+# In[38]:
+
+# creating the logistic model class
+class logisticRegression():
+    """Single-feature logistic regression fitted by gradient descent on an L2 loss.
+
+    The model is p(x) = 1 / (1 + exp(-(b0 + b1 * x))).
+
+    Parameters
+    ----------
+    iteration_no : int
+        Number of gradient-descent epochs run by fit().
+    L : float
+        Learning rate (step size of each update).
+    b0, b1 : float
+        Initial intercept and slope.
+    """
+
+    def __init__(self, iteration_no=300, L=0.001, b0 = 0,b1 = 0):
+        self.iteration_no = iteration_no
+        self.L = L
+        self.b0 = b0
+        self.b1 = b1
+        # Mean of the training feature; set by fit() so callers can centre new data the same way.
+        self.x_mean = None
+
+    def normalize(self, X, mean=None):
+        """Centre X by subtracting `mean`; when `mean` is None, X's own mean is used."""
+        if mean is None:
+            mean = X.mean()
+        return X - mean
+
+    # Method to make predictions
+    def predict_prob(self, X, b0=None, b1=None):
+        """Return P(Y=1) for every value in X as a float array.
+
+        b0 and b1 default to the coefficients stored on the model.
+        """
+        b0 = self.b0 if b0 is None else b0
+        b1 = self.b1 if b1 is None else b1
+        z = b0 + b1 * np.asarray(X, dtype=float)
+        # Clip the exponent: exp() overflows a float64 a little above 709.
+        return 1.0 / (1.0 + np.exp(-np.clip(z, -700.0, 700.0)))
+
+    # Method to train the model
+    def fit(self, X, Y):
+        """Estimate b0 and b1 from feature values X and 0/1 labels Y.
+
+        X is centred on its mean (stored in self.x_mean) before training, so data
+        passed to predict() must be centred with that same mean.
+
+        Raises ValueError if X is empty or X and Y differ in shape.
+        """
+        X = np.asarray(X, dtype=float)
+        Y = np.asarray(Y, dtype=float)
+        if X.size == 0:
+            raise ValueError("fit() needs at least one training sample")
+        if X.shape != Y.shape:
+            raise ValueError("X and Y must have the same shape")
+
+        self.x_mean = X.mean()
+        X = X - self.x_mean
+
+        for _ in range(self.iteration_no):
+            y_pred = self.predict_prob(X, self.b0, self.b1)
+            # Factor shared by both partial derivatives of the squared error
+            common = (Y - y_pred) * y_pred * (1 - y_pred)
+            D_b0 = -2 * np.sum(common)  # Derivative of loss wrt b0
+            D_b1 = -2 * np.sum(X * common)  # Derivative of loss wrt b1
+            # Update b0 and b1
+            self.b0 = self.b0 - self.L * D_b0
+            self.b1 = self.b1 - self.L * D_b1
+
+    def predict(self, x_test, b0=None, b1=None):
+        """Return a list of 0/1 class labels for x_test, thresholding P(Y=1) at 0.5.
+
+        x_test must already be centred the same way as the training data;
+        b0 and b1 default to the coefficients stored on the model.
+        """
+        probabilities = self.predict_prob(x_test, b0, b1)
+        return [1 if p >= 0.5 else 0 for p in probabilities]
+
+
+# In[39]:
+
+
+# Training the model
+Log = logisticRegression()
+Log.fit(X_train, Y_train)
+
+# Making predictions
+# Centre the test data with the training mean, the same shift fit() applied to X_train.
+X_test_norm = X_test - X_train.mean()
+# The fitted coefficients live on the model instance, not in module-level names.
+y_pred = Log.predict(X_test_norm, Log.b0, Log.b1)
+
+# Overlay the predicted labels (red) on the true labels for the test ages
+plt.clf()
+plt.scatter(X_test, Y_test)
+plt.scatter(X_test, y_pred, c="red")
+plt.show()
+
+# The accuracy
+# Count the positions where the prediction matches the true label
+accuracy = sum(1 for predicted, actual in zip(y_pred, Y_test) if predicted == actual)
+print(f"Accuracy = {accuracy / len(y_pred)}")
+
+
+# In[40]:
+
+
+# Cross-check with scikit-learn's accuracy metric
+print("accuracy:", accuracy_score(Y_test, y_pred))
+# f1 score
+print("f1 score:", f1_score(Y_test, y_pred))
+
+