Preface

Here's something that might surprise you: neural networks aren't that complicated! The term "neural network" gets used as a buzzword a lot, but in reality they're often much simpler than people imagine.

This post is intended for complete beginners and assumes ZERO prior knowledge of machine learning. We'll understand how neural networks work while implementing one from scratch in Python.

Let's get started!

Deep Learning is a branch of Machine Learning that lets machines automatically learn good features, removing the need for manual feature selection.

The basic idea of Deep Learning comes from neurobiology, which suggests that the brain's nervous system processes information hierarchically: higher-level features are combinations of lower-level ones, and from lower to higher layers the feature representations become increasingly abstract and better at capturing semantics or intent. Deep learning borrows this layered style of information processing: a model contains multiple layers, and each layer's output serves as the next layer's input, so the network builds up a hierarchical representation of the input and extracts features level by level.
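
As a rough sketch of this layer-stacking idea (my own illustration with made-up layer sizes, not code from either appendix file), each layer applies a linear map followed by a nonlinearity, and its output becomes the next layer's input:

import numpy as np

def layer(x, W, b):
    # one layer: linear transform followed by a ReLU nonlinearity
    return np.maximum(0, np.dot(W, x) + b)

# three stacked layers: each layer's output feeds the next layer's input,
# so the representation becomes progressively more abstract
x = np.random.rand(8)                               # raw input features
h1 = layer(x, np.random.randn(6, 8), np.zeros(6))   # low-level combinations
h2 = layer(h1, np.random.randn(4, 6), np.zeros(4))  # more abstract features
h3 = layer(h2, np.random.randn(2, 4), np.zeros(2))  # most abstract representation
print(h3)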

The first file is a brief description of the parts of a neural network; the second file covers the theory and code for a vectorized backpropagation algorithm.

Study notes (LaTeX version):

An Introduction to Neural Network study notes,

Vectorized backpropagation algorithm study notes

Code

Appendix code for the first file:

import numpy as np

def sigmoid(x):
  # Sigmoid activation function: f(x) = 1 / (1 + e^(-x))
  return 1 / (1 + np.exp(-x))

def deriv_sigmoid(x):
  # Derivative of sigmoid: f'(x) = f(x) * (1 - f(x))
  fx = sigmoid(x)
  return fx * (1 - fx)

def mse_loss(y_true, y_pred):
  # y_true and y_pred are numpy arrays of the same length.
  return ((y_true - y_pred) ** 2).mean()

class OurNeuralNetwork:
  '''
  A neural network with:
    - 2 inputs
    - a hidden layer with 2 neurons (h1, h2)
    - an output layer with 1 neuron (o1)

  *** DISCLAIMER ***:
  The code below is intended to be simple and educational, NOT optimal.
  Real neural net code looks nothing like this. DO NOT use this code.
  Instead, read/run it to understand how this specific network works.
  '''
  def __init__(self):
    # Weights
    self.w1 = np.random.normal()
    self.w2 = np.random.normal()
    self.w3 = np.random.normal()
    self.w4 = np.random.normal()
    self.w5 = np.random.normal()
    self.w6 = np.random.normal()

    # Biases
    self.b1 = np.random.normal()
    self.b2 = np.random.normal()
    self.b3 = np.random.normal()

  def feedforward(self, x):
    # x is a numpy array with 2 elements.
    h1 = sigmoid(self.w1 * x[0] + self.w2 * x[1] + self.b1)
    h2 = sigmoid(self.w3 * x[0] + self.w4 * x[1] + self.b2)
    o1 = sigmoid(self.w5 * h1 + self.w6 * h2 + self.b3)
    return o1

  def train(self, data, all_y_trues):
    '''
    - data is a (n x 2) numpy array, n = # of samples in the dataset.
    - all_y_trues is a numpy array with n elements.
      Elements in all_y_trues correspond to those in data.
    '''
    learn_rate = 0.1
    epochs = 1000 # number of times to loop through the entire dataset

    for epoch in range(epochs):
      for x, y_true in zip(data, all_y_trues):
        # --- Do a feedforward (we'll need these values later)
        sum_h1 = self.w1 * x[0] + self.w2 * x[1] + self.b1
        h1 = sigmoid(sum_h1)

        sum_h2 = self.w3 * x[0] + self.w4 * x[1] + self.b2
        h2 = sigmoid(sum_h2)

        sum_o1 = self.w5 * h1 + self.w6 * h2 + self.b3
        o1 = sigmoid(sum_o1)
        y_pred = o1

        # --- Calculate partial derivatives.
        # --- Naming: d_L_d_w1 represents "partial L / partial w1"
        d_L_d_ypred = -2 * (y_true - y_pred)

        # Neuron o1
        d_ypred_d_w5 = h1 * deriv_sigmoid(sum_o1)
        d_ypred_d_w6 = h2 * deriv_sigmoid(sum_o1)
        d_ypred_d_b3 = deriv_sigmoid(sum_o1)

        d_ypred_d_h1 = self.w5 * deriv_sigmoid(sum_o1)
        d_ypred_d_h2 = self.w6 * deriv_sigmoid(sum_o1)

        # Neuron h1
        d_h1_d_w1 = x[0] * deriv_sigmoid(sum_h1)
        d_h1_d_w2 = x[1] * deriv_sigmoid(sum_h1)
        d_h1_d_b1 = deriv_sigmoid(sum_h1)

        # Neuron h2
        d_h2_d_w3 = x[0] * deriv_sigmoid(sum_h2)
        d_h2_d_w4 = x[1] * deriv_sigmoid(sum_h2)
        d_h2_d_b2 = deriv_sigmoid(sum_h2)

        # --- Update weights and biases
        # Neuron h1
        self.w1 -= learn_rate * d_L_d_ypred * d_ypred_d_h1 * d_h1_d_w1
        self.w2 -= learn_rate * d_L_d_ypred * d_ypred_d_h1 * d_h1_d_w2
        self.b1 -= learn_rate * d_L_d_ypred * d_ypred_d_h1 * d_h1_d_b1

        # Neuron h2
        self.w3 -= learn_rate * d_L_d_ypred * d_ypred_d_h2 * d_h2_d_w3
        self.w4 -= learn_rate * d_L_d_ypred * d_ypred_d_h2 * d_h2_d_w4
        self.b2 -= learn_rate * d_L_d_ypred * d_ypred_d_h2 * d_h2_d_b2

        # Neuron o1
        self.w5 -= learn_rate * d_L_d_ypred * d_ypred_d_w5
        self.w6 -= learn_rate * d_L_d_ypred * d_ypred_d_w6
        self.b3 -= learn_rate * d_L_d_ypred * d_ypred_d_b3

      # --- Calculate total loss every 10 epochs
      if epoch % 10 == 0:
        y_preds = np.apply_along_axis(self.feedforward, 1, data)
        loss = mse_loss(all_y_trues, y_preds)
        print("Epoch %d loss: %.3f" % (epoch, loss))

# Define dataset
data = np.array([
  [-2, -1],  # Alice
  [25, 6],   # Bob
  [17, 4],   # Charlie
  [-15, -6], # Diana
])
all_y_trues = np.array([
  1, # Alice
  0, # Bob
  0, # Charlie
  1, # Diana
])

# Train our neural network!
network = OurNeuralNetwork()
network.train(data, all_y_trues)
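
Once trained, the network can be queried on a new input. A hypothetical example (the sample values below are made up for illustration and are not part of the original code):

# hypothetical new 2-feature sample; the sigmoid output lies in (0, 1),
# with values near 1 corresponding to the class labeled 1 above
sample = np.array([-7, -3])
print("Prediction: %.3f" % network.feedforward(sample))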

Appendix code for the second file:

import numpy as np

# generate random placeholder data: 12288 features x 200 examples
np.random.seed(1)
X = np.random.rand(12288, 200)  # X has shape (12288, 200)
Y = np.random.rand(1, 200)      # Y has shape (1, 200)

# set up the configuration of the network
n0, m = X.shape
n1 = 20
n2 = 7
n3 = 5
n4 = 1
layers_dims = [n0, n1, n2, n3, n4] #[12288, 20, 7, 5, 1]
L = len(layers_dims) - 1 # the number of layers, excluding the input layer

# activation function
def sigmoid(z):
    '''
    z is the pre-activation, with shape (n_l, m)
    '''
    return 1 / (1 + np.exp(-z))

# the relu
def relu(z):
    '''
    z is the pre-activation, with shape (n_l, m)
    '''
    return np.maximum(0,z)

# per-layer parameter lists indexed by layer number (index 0 is unused)
param_w = [i for i in range(L+1)]
param_b = [i for i in range(L+1)]

# neural network model
def neural_network(X, Y, learning_rate=0.01, num_iterations=2000, lambd=0):
    m = X.shape[1]
    # initialize parameters layer by layer
    np.random.seed(10)
    for l in range(1, L+1):
        if l < L:
            # use He initialization
            param_w[l] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
        if l == L:
            param_w[l] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * 0.01
        param_b[l] = np.zeros((layers_dims[l], 1))
    
    # per-layer caches, indexed by layer number (placeholders are overwritten below)
    activations = [X, ] + [i for i in range(L)]  # activations[l] holds a_l; activations[0] is the input X
    prev_activations = [i for i in range(L+1)]   # prev_activations[l] holds the pre-activation z_l

    # per-layer gradient caches: dL/dA_l, dL/dz_l, dL/dW_l, dL/db_l
    dA = [i for i in range(L+1)]
    dz = [i for i in range(L+1)]
    dw = [i for i in range(L+1)]
    db = [i for i in range(L+1)]

    for i in range(num_iterations):
        ### forward propagation
        for l in range(1, L+1):
            prev_activations[l] = np.dot(param_w[l], activations[l-1]) + param_b[l]
            if l < L:
                activations[l] = relu(prev_activations[l])
            else:
                activations[l] = sigmoid(prev_activations[l])

        # binary cross-entropy cost plus L2 regularization (weighted by lambd)
        cross_entropy_cost = -1/m * (np.dot(np.log(activations[L]), Y.T) \
                                     + np.dot(np.log(1-activations[L]), 1-Y.T))
        regularization_cost = 0
        for l in range(1, L+1):
            regularization_cost += np.sum(np.square(param_w[l])) * lambd/(2*m)
        cost = cross_entropy_cost + regularization_cost

        ### initialize backward propagation
        dA[L] =  np.divide(1-Y, 1-activations[L]) - np.divide(Y, activations[L])
        assert dA[L].shape == (1, m)

        ### backward propagation
        for l in reversed(range(1, L+1)):
            if l == L:
                # output layer: sigmoid'(z) = a * (1 - a)
                dz[l] = dA[l] * activations[l] * (1-activations[l])
            else:
                # hidden layers: ReLU'(z) is 1 where z > 0 and 0 otherwise
                dz[l] = dA[l].copy()
                dz[l][prev_activations[l] <= 0] = 0

            dw[l] = 1/m * np.dot(dz[l], activations[l-1].T) + param_w[l] * lambd/m
            db[l] = 1/m * np.sum(dz[l], axis=1, keepdims=True)
            dA[l-1] = np.dot(param_w[l].T, dz[l])

            assert dz[l].shape == prev_activations[l].shape
            assert dw[l].shape == param_w[l].shape
            assert db[l].shape == param_b[l].shape
            assert dA[l-1].shape == activations[l-1].shape

            # gradient descent update for layer l
            param_w[l] = param_w[l] - learning_rate * dw[l]
            param_b[l] = param_b[l] - learning_rate * db[l]

        if i % 100 == 0:
            print("cost after iteration {}: {}".format(i, cost))

    print(param_w)
    print(param_b)
neural_network(X, Y, learning_rate=0.01, num_iterations=2000, lambd=0)

# predict
def predict(X_new, param_w, param_b, threshold=0.5):

    activations = [X_new, ] + [i for i in range(L)]
    prev_activations = [i for i in range(L + 1)]
    m = X_new.shape[1]

    for l in range(1, L + 1):
        prev_activations[l] = np.dot(param_w[l], activations[l - 1]) + param_b[l]
        if l < L:
            activations[l] = relu(prev_activations[l])
        else:
            activations[l] = sigmoid(prev_activations[l])
    prediction = (activations[L] > threshold).astype("int")
    return prediction

X_new = np.random.rand(12288, 200)

print(predict(X_new, param_w, param_b, threshold=0.5))

This article was first published on ZhaoYLng Blog by @Laqudee. Please keep a link to the original when reposting.